# Install and import required libraries

In [None]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import csv
import re
import warnings
from collections import Counter

import google.colab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from flair.data import Sentence
from flair.datasets import UD_INDONESIAN
from flair.embeddings import WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings —
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Import data

In [None]:
# Attach Google Drive under /content/drive; force_remount=True asks Colab to
# redo the mount even if the drive is already mounted.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
# Load the labelled tweet dataset from Google Drive.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='warn'` keeps the previous behaviour: malformed rows are
# dropped and reported instead of aborting the read.
df_tweets = pd.read_csv(
    '/content/drive/MyDrive/TA/tweets/ruhutsitompul/done_labelled_ruhutsitompul.csv',
    delimiter=',',
    on_bad_lines='warn',
)
df_tweets

Unnamed: 0.1,Unnamed: 0,ID,Tweet Time,Convo ID,Likes Count,Retweets Count,Tweets,Tweets(rendered),in_reply_to_tweet_id,in_reply_to,...,clean_tweets_len,Month,social,historical,dehumanization,accusation,attack,loyalty,threat,Tweets.1
0,0,1585739644011892736,2022-10-27 21:06:07,1585739644011892736,460,80,Sama banget Komentar ini dgn para pendukung se...,Sama banget Komentar ini dgn para pendukung se...,,,...,11,October,1,0,0,0,0.0,0,0,Sama banget Komentar ini dgn para pendukung se...
1,1,1585737538806853632,2022-10-27 20:57:45,1585737538806853632,420,92,Ha ha ha sigundul penguasa ancol karena selama...,Ha ha ha sigundul penguasa ancol karena selama...,,,...,15,October,1,0,0,1,0.0,0,0,Ha ha ha sigundul penguasa ancol karena selama...
2,2,1584780616369049605,2022-10-25 05:35:17,1584780616369049605,343,40,Kasihan Ibu ini jadi korban akibat dicuci otak...,Kasihan Ibu ini jadi korban akibat dicuci otak...,,,...,14,October,0,0,1,1,0.0,0,0,Kasihan Ibu ini jadi korban akibat dicuci otak...
3,3,1584778048120836096,2022-10-25 05:25:05,1584778048120836096,212,23,Ha ha hakadrun pada stresssssss mengenai beber...,Ha ha hakadrun pada stresssssss mengenai beber...,,,...,18,October,1,0,1,0,0.0,0,0,Ha ha hakadrun pada stresssssss mengenai beber...
4,4,1584774254502912000,2022-10-25 05:10:00,1584774254502912000,860,63,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...,,,...,33,October,0,0,1,0,0.0,0,0,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,4178,1118666888647602177,2019-04-18 00:05:44,1118666888647602177,0,0,@NediSetiadi Kami sedih krn biar sudah kalian ...,@NediSetiadi Kami sedih krn biar sudah kalian ...,,,...,14,April,1,0,0,1,0.0,0,0,@NediSetiadi Kami sedih krn biar sudah kalian ...
4179,4179,1118665503642247170,2019-04-18 00:00:14,1118665503642247170,7,0,@TanYoana haiya Amoy ini makin stresssssss kac...,@TanYoana haiya Amoy ini makin stresssssss kac...,,,...,5,April,0,0,0,0,0.0,0,0,@TanYoana haiya Amoy ini makin stresssssss kac...
4180,4180,1118664923301572608,2019-04-17 23:57:56,1118664923301572608,1684,200,Terima kasih KPU BAWASLU DKPP sbg Penyelenggar...,Terima kasih KPU BAWASLU DKPP sbg Penyelenggar...,,,...,31,April,0,0,0,0,0.0,0,0,Terima kasih KPU BAWASLU DKPP sbg Penyelenggar...
4181,4181,1118396091148460032,2019-04-17 06:09:41,1118396091148460032,6,0,"@panca66 Ha ha ha maaf memilih diri sendiri, s...","@panca66 Ha ha ha maaf memilih diri sendiri, s...",,,...,5,April,0,0,0,0,0.0,0,0,"@panca66 Ha ha ha maaf memilih diri sendiri, s..."


In [None]:
# Keep only tweets labelled with at least one aspect, i.e. rows where any of
# the aspect columns holds a non-zero value, then renumber the rows from 0.
# NOTE(review): a missing (NaN) label also compares as "not equal to 0", so
# such rows are kept — confirm this is intended.
aspect_columns = ['social', 'historical', 'dehumanization', 'accusation', 'attack', 'loyalty', 'threat']
has_any_aspect = df_tweets[aspect_columns].ne(0).any(axis=1)
df_tweets = df_tweets[has_any_aspect].reset_index(drop=True)
df_tweets

Unnamed: 0.1,Unnamed: 0,ID,Tweet Time,Convo ID,Likes Count,Retweets Count,Tweets,Tweets(rendered),in_reply_to_tweet_id,in_reply_to,...,clean_tweets_len,Month,social,historical,dehumanization,accusation,attack,loyalty,threat,Tweets.1
0,0,1585739644011892736,2022-10-27 21:06:07,1585739644011892736,460,80,Sama banget Komentar ini dgn para pendukung se...,Sama banget Komentar ini dgn para pendukung se...,,,...,11,October,1,0,0,0,0.0,0,0,Sama banget Komentar ini dgn para pendukung se...
1,1,1585737538806853632,2022-10-27 20:57:45,1585737538806853632,420,92,Ha ha ha sigundul penguasa ancol karena selama...,Ha ha ha sigundul penguasa ancol karena selama...,,,...,15,October,1,0,0,1,0.0,0,0,Ha ha ha sigundul penguasa ancol karena selama...
2,2,1584780616369049605,2022-10-25 05:35:17,1584780616369049605,343,40,Kasihan Ibu ini jadi korban akibat dicuci otak...,Kasihan Ibu ini jadi korban akibat dicuci otak...,,,...,14,October,0,0,1,1,0.0,0,0,Kasihan Ibu ini jadi korban akibat dicuci otak...
3,3,1584778048120836096,2022-10-25 05:25:05,1584778048120836096,212,23,Ha ha hakadrun pada stresssssss mengenai beber...,Ha ha hakadrun pada stresssssss mengenai beber...,,,...,18,October,1,0,1,0,0.0,0,0,Ha ha hakadrun pada stresssssss mengenai beber...
4,4,1584774254502912000,2022-10-25 05:10:00,1584774254502912000,860,63,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...,,,...,33,October,0,0,1,0,0.0,0,0,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659,4151,1120875995962101760,2019-04-24 02:23:56,1120875995962101760,26,4,"@ustadtengkuzul Belajarlah menerima kekalahan,...","@ustadtengkuzul Belajarlah menerima kekalahan,...",,,...,12,April,1,0,0,1,0.0,0,0,"@ustadtengkuzul Belajarlah menerima kekalahan,..."
1660,4153,1120574450410217472,2019-04-23 06:25:42,1120574450410217472,1660,264,Aneh tapi Nyata ini bukan HOAX di Uk...,Aneh tapi Nyata ini bukan HOAX di Uk...,,,...,13,April,0,0,0,1,0.0,0,0,Aneh tapi Nyata ini bukan HOAX di Uk...
1661,4166,1119465028040527872,2019-04-20 04:57:15,1119465028040527872,1311,193,"Tim Sukses Pak Jokowi bukan Panglima Hukum, Pr...","Tim Sukses Pak Jokowi bukan Panglima Hukum, Pr...",,,...,24,April,1,0,0,1,0.0,0,0,"Tim Sukses Pak Jokowi bukan Panglima Hukum, Pr..."
1662,4168,1119429279811428353,2019-04-20 02:35:12,1119429279811428353,10,0,@FerdinandHutah2 Masih saja kau menggonggong s...,@FerdinandHutah2 Masih saja kau menggonggong s...,,,...,6,April,0,0,1,0,0.0,0,0,@FerdinandHutah2 Masih saja kau menggonggong s...


# POS tagging with flair

In [None]:
# Train a part-of-speech tagger on the Indonesian Universal Dependencies corpus.

# 1. Load the corpus and downsample it to a fraction of 0.1
corpus = UD_INDONESIAN().downsample(0.1)
print(corpus)

# 2. The label layer the tagger has to predict
label_type = 'upos'

# 3. Build the dictionary of labels of that type found in the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# 4. Stack the two pre-trained word embeddings ('id-crawl' first, then 'id')
embedding_types = [WordEmbeddings(name) for name in ('id-crawl', 'id')]
embeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. Sequence tagger over the stacked embeddings, with a CRF layer enabled
tagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_type=label_type,
    tag_dictionary=label_dict,
    use_crf=True,
)

# 6. Trainer bound to the tagger and the corpus
trainer = ModelTrainer(tagger, corpus)

# 7. Train; model files are written under the given directory, and the value
#    returned by train() is shown as the cell output
trainer.train(
    'resources/taggers/example-upos',
    max_epochs=10,
    mini_batch_size=32,
    learning_rate=0.1,
)

2023-03-08 10:11:08,791 Reading data from /root/.flair/datasets/ud_indonesian
2023-03-08 10:11:08,793 Train: /root/.flair/datasets/ud_indonesian/id_gsd-ud-train.conllu
2023-03-08 10:11:08,795 Dev: /root/.flair/datasets/ud_indonesian/id_gsd-ud-dev.conllu
2023-03-08 10:11:08,799 Test: /root/.flair/datasets/ud_indonesian/id_gsd-ud-test.conllu
Corpus: 448 train + 56 dev + 56 test sentences
2023-03-08 10:11:28,392 Computing label dictionary. Progress:


448it [00:00, 6848.97it/s]

2023-03-08 10:11:28,541 Dictionary created for label 'upos' with 17 values: NOUN (seen 2099 times), PROPN (seen 1768 times), PUNCT (seen 1326 times), VERB (seen 1041 times), ADP (seen 897 times), PRON (seen 538 times), ADJ (seen 392 times), NUM (seen 307 times), CCONJ (seen 302 times), ADV (seen 291 times), DET (seen 287 times), SCONJ (seen 190 times), AUX (seen 189 times), PART (seen 89 times), X (seen 7 times), SYM (seen 2 times)
Dictionary with 17 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, CCONJ, ADV, DET, SCONJ, AUX, PART, X, SYM





2023-03-08 10:12:07,736 SequenceTagger predicts: Dictionary with 17 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, CCONJ, ADV, DET, SCONJ, AUX, PART, X, SYM
2023-03-08 10:12:07,797 ----------------------------------------------------------------------------------------------------
2023-03-08 10:12:07,800 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'id-crawl'
      (embedding): Embedding(1000000, 300)
    )
    (list_embedding_1): WordEmbeddings(
      'id'
      (embedding): Embedding(300686, 300)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=600, out_features=600, bias=True)
  (rnn): LSTM(600, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=19, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)"
2023-03-08 10:12:07,802 --------------------------------------------------------------

100%|██████████| 2/2 [00:00<00:00,  5.04it/s]

2023-03-08 10:12:15,957 Evaluating as a multi-label problem: False
2023-03-08 10:12:15,988 DEV : loss 2.2457034587860107 - f1-score (micro avg)  0.2963
2023-03-08 10:12:15,998 BAD EPOCHS (no improvement): 0
2023-03-08 10:12:16,000 saving best model





2023-03-08 10:12:24,354 ----------------------------------------------------------------------------------------------------
2023-03-08 10:12:24,963 epoch 2 - iter 1/14 - loss 2.21999531 - time (sec): 0.61 - samples/sec: 1250.91 - lr: 0.100000
2023-03-08 10:12:25,493 epoch 2 - iter 2/14 - loss 2.21059221 - time (sec): 1.14 - samples/sec: 1205.83 - lr: 0.100000
2023-03-08 10:12:26,400 epoch 2 - iter 3/14 - loss 2.18311145 - time (sec): 2.04 - samples/sec: 989.09 - lr: 0.100000
2023-03-08 10:12:27,116 epoch 2 - iter 4/14 - loss 2.15194126 - time (sec): 2.76 - samples/sec: 997.19 - lr: 0.100000
2023-03-08 10:12:27,977 epoch 2 - iter 5/14 - loss 2.15916253 - time (sec): 3.62 - samples/sec: 1014.09 - lr: 0.100000
2023-03-08 10:12:28,361 epoch 2 - iter 6/14 - loss 2.14649548 - time (sec): 4.00 - samples/sec: 1083.98 - lr: 0.100000
2023-03-08 10:12:28,836 epoch 2 - iter 7/14 - loss 2.13920219 - time (sec): 4.48 - samples/sec: 1143.65 - lr: 0.100000
2023-03-08 10:12:29,386 epoch 2 - iter 8/14 

100%|██████████| 2/2 [00:00<00:00,  5.06it/s]

2023-03-08 10:12:32,384 Evaluating as a multi-label problem: False
2023-03-08 10:12:32,406 DEV : loss 1.7855215072631836 - f1-score (micro avg)  0.4246
2023-03-08 10:12:32,416 BAD EPOCHS (no improvement): 0
2023-03-08 10:12:32,418 saving best model





2023-03-08 10:12:40,891 ----------------------------------------------------------------------------------------------------
2023-03-08 10:12:41,738 epoch 3 - iter 1/14 - loss 1.71761751 - time (sec): 0.84 - samples/sec: 792.65 - lr: 0.100000
2023-03-08 10:12:42,428 epoch 3 - iter 2/14 - loss 1.73838419 - time (sec): 1.53 - samples/sec: 850.45 - lr: 0.100000
2023-03-08 10:12:42,863 epoch 3 - iter 3/14 - loss 1.69574515 - time (sec): 1.97 - samples/sec: 936.33 - lr: 0.100000
2023-03-08 10:12:43,845 epoch 3 - iter 4/14 - loss 1.68287857 - time (sec): 2.95 - samples/sec: 876.49 - lr: 0.100000
2023-03-08 10:12:44,534 epoch 3 - iter 5/14 - loss 1.69856001 - time (sec): 3.64 - samples/sec: 916.92 - lr: 0.100000
2023-03-08 10:12:44,975 epoch 3 - iter 6/14 - loss 1.70451499 - time (sec): 4.08 - samples/sec: 975.08 - lr: 0.100000
2023-03-08 10:12:45,435 epoch 3 - iter 7/14 - loss 1.71516538 - time (sec): 4.54 - samples/sec: 1029.92 - lr: 0.100000
2023-03-08 10:12:45,827 epoch 3 - iter 8/14 - lo

100%|██████████| 2/2 [00:00<00:00,  4.93it/s]

2023-03-08 10:12:49,298 Evaluating as a multi-label problem: False
2023-03-08 10:12:49,322 DEV : loss 1.4564083814620972 - f1-score (micro avg)  0.5679
2023-03-08 10:12:49,335 BAD EPOCHS (no improvement): 0
2023-03-08 10:12:49,336 saving best model





2023-03-08 10:12:57,616 ----------------------------------------------------------------------------------------------------
2023-03-08 10:12:58,347 epoch 4 - iter 1/14 - loss 1.57461754 - time (sec): 0.73 - samples/sec: 854.39 - lr: 0.100000
2023-03-08 10:12:58,906 epoch 4 - iter 2/14 - loss 1.50185501 - time (sec): 1.29 - samples/sec: 985.85 - lr: 0.100000
2023-03-08 10:12:59,592 epoch 4 - iter 3/14 - loss 1.48909616 - time (sec): 1.97 - samples/sec: 1042.54 - lr: 0.100000
2023-03-08 10:13:00,136 epoch 4 - iter 4/14 - loss 1.48979807 - time (sec): 2.52 - samples/sec: 1070.92 - lr: 0.100000
2023-03-08 10:13:01,211 epoch 4 - iter 5/14 - loss 1.46638871 - time (sec): 3.59 - samples/sec: 971.07 - lr: 0.100000
2023-03-08 10:13:01,898 epoch 4 - iter 6/14 - loss 1.45725125 - time (sec): 4.28 - samples/sec: 971.24 - lr: 0.100000
2023-03-08 10:13:02,592 epoch 4 - iter 7/14 - loss 1.44584159 - time (sec): 4.97 - samples/sec: 1003.27 - lr: 0.100000
2023-03-08 10:13:02,951 epoch 4 - iter 8/14 - 

100%|██████████| 2/2 [00:01<00:00,  1.50it/s]

2023-03-08 10:13:08,080 Evaluating as a multi-label problem: False





2023-03-08 10:13:08,296 DEV : loss 1.2352898120880127 - f1-score (micro avg)  0.6242
2023-03-08 10:13:08,337 BAD EPOCHS (no improvement): 0
2023-03-08 10:13:08,341 saving best model
2023-03-08 10:13:18,616 ----------------------------------------------------------------------------------------------------
2023-03-08 10:13:19,025 epoch 5 - iter 1/14 - loss 1.24016233 - time (sec): 0.41 - samples/sec: 1611.28 - lr: 0.100000
2023-03-08 10:13:19,620 epoch 5 - iter 2/14 - loss 1.25947358 - time (sec): 1.00 - samples/sec: 1527.57 - lr: 0.100000
2023-03-08 10:13:20,098 epoch 5 - iter 3/14 - loss 1.23307571 - time (sec): 1.48 - samples/sec: 1514.35 - lr: 0.100000
2023-03-08 10:13:20,671 epoch 5 - iter 4/14 - loss 1.24636000 - time (sec): 2.05 - samples/sec: 1431.05 - lr: 0.100000
2023-03-08 10:13:21,033 epoch 5 - iter 5/14 - loss 1.24337648 - time (sec): 2.41 - samples/sec: 1494.92 - lr: 0.100000
2023-03-08 10:13:21,588 epoch 5 - iter 6/14 - loss 1.25240603 - time (sec): 2.97 - samples/sec: 14

100%|██████████| 2/2 [00:00<00:00,  4.95it/s]

2023-03-08 10:13:25,710 Evaluating as a multi-label problem: False
2023-03-08 10:13:25,734 DEV : loss 1.0920048952102661 - f1-score (micro avg)  0.6722
2023-03-08 10:13:25,746 BAD EPOCHS (no improvement): 0
2023-03-08 10:13:25,749 saving best model





2023-03-08 10:13:34,366 ----------------------------------------------------------------------------------------------------
2023-03-08 10:13:35,031 epoch 6 - iter 1/14 - loss 1.15714320 - time (sec): 0.66 - samples/sec: 991.06 - lr: 0.100000
2023-03-08 10:13:35,839 epoch 6 - iter 2/14 - loss 1.07743281 - time (sec): 1.47 - samples/sec: 957.73 - lr: 0.100000
2023-03-08 10:13:36,413 epoch 6 - iter 3/14 - loss 1.10212811 - time (sec): 2.04 - samples/sec: 1050.61 - lr: 0.100000
2023-03-08 10:13:36,869 epoch 6 - iter 4/14 - loss 1.12531618 - time (sec): 2.50 - samples/sec: 1111.94 - lr: 0.100000
2023-03-08 10:13:37,330 epoch 6 - iter 5/14 - loss 1.12879400 - time (sec): 2.96 - samples/sec: 1162.99 - lr: 0.100000
2023-03-08 10:13:38,080 epoch 6 - iter 6/14 - loss 1.11152838 - time (sec): 3.71 - samples/sec: 1143.84 - lr: 0.100000
2023-03-08 10:13:38,496 epoch 6 - iter 7/14 - loss 1.10430067 - time (sec): 4.13 - samples/sec: 1179.27 - lr: 0.100000
2023-03-08 10:13:39,099 epoch 6 - iter 8/14 

100%|██████████| 2/2 [00:00<00:00,  4.94it/s]

2023-03-08 10:13:42,020 Evaluating as a multi-label problem: False
2023-03-08 10:13:42,052 DEV : loss 0.95794278383255 - f1-score (micro avg)  0.7052
2023-03-08 10:13:42,062 BAD EPOCHS (no improvement): 0
2023-03-08 10:13:42,064 saving best model





2023-03-08 10:13:50,368 ----------------------------------------------------------------------------------------------------
2023-03-08 10:13:50,939 epoch 7 - iter 1/14 - loss 0.99539284 - time (sec): 0.57 - samples/sec: 1078.23 - lr: 0.100000
2023-03-08 10:13:51,440 epoch 7 - iter 2/14 - loss 0.99671498 - time (sec): 1.07 - samples/sec: 1133.99 - lr: 0.100000
2023-03-08 10:13:52,220 epoch 7 - iter 3/14 - loss 0.98955226 - time (sec): 1.85 - samples/sec: 1036.63 - lr: 0.100000
2023-03-08 10:13:52,673 epoch 7 - iter 4/14 - loss 1.01125233 - time (sec): 2.30 - samples/sec: 1096.52 - lr: 0.100000
2023-03-08 10:13:53,387 epoch 7 - iter 5/14 - loss 1.00134805 - time (sec): 3.02 - samples/sec: 1104.31 - lr: 0.100000
2023-03-08 10:13:53,823 epoch 7 - iter 6/14 - loss 1.01169964 - time (sec): 3.45 - samples/sec: 1139.88 - lr: 0.100000
2023-03-08 10:13:54,426 epoch 7 - iter 7/14 - loss 1.00122691 - time (sec): 4.05 - samples/sec: 1153.74 - lr: 0.100000
2023-03-08 10:13:54,923 epoch 7 - iter 8/1

100%|██████████| 2/2 [00:00<00:00,  4.98it/s]

2023-03-08 10:13:58,031 Evaluating as a multi-label problem: False
2023-03-08 10:13:58,054 DEV : loss 0.821689248085022 - f1-score (micro avg)  0.7389
2023-03-08 10:13:58,064 BAD EPOCHS (no improvement): 0
2023-03-08 10:13:58,066 saving best model





2023-03-08 10:14:06,447 ----------------------------------------------------------------------------------------------------
2023-03-08 10:14:07,267 epoch 8 - iter 1/14 - loss 0.87497897 - time (sec): 0.82 - samples/sec: 867.05 - lr: 0.100000
2023-03-08 10:14:07,864 epoch 8 - iter 2/14 - loss 0.85254607 - time (sec): 1.41 - samples/sec: 1060.06 - lr: 0.100000
2023-03-08 10:14:08,534 epoch 8 - iter 3/14 - loss 0.88919857 - time (sec): 2.08 - samples/sec: 1046.39 - lr: 0.100000
2023-03-08 10:14:09,121 epoch 8 - iter 4/14 - loss 0.87309209 - time (sec): 2.67 - samples/sec: 1061.46 - lr: 0.100000
2023-03-08 10:14:09,804 epoch 8 - iter 5/14 - loss 0.87410511 - time (sec): 3.35 - samples/sec: 1090.12 - lr: 0.100000
2023-03-08 10:14:10,441 epoch 8 - iter 6/14 - loss 0.88066175 - time (sec): 3.99 - samples/sec: 1089.11 - lr: 0.100000
2023-03-08 10:14:10,896 epoch 8 - iter 7/14 - loss 0.89847549 - time (sec): 4.45 - samples/sec: 1132.02 - lr: 0.100000
2023-03-08 10:14:11,274 epoch 8 - iter 8/14

100%|██████████| 2/2 [00:00<00:00,  4.82it/s]

2023-03-08 10:14:14,851 Evaluating as a multi-label problem: False
2023-03-08 10:14:14,876 DEV : loss 0.7707288861274719 - f1-score (micro avg)  0.7592
2023-03-08 10:14:14,886 BAD EPOCHS (no improvement): 0
2023-03-08 10:14:14,891 saving best model





2023-03-08 10:14:23,375 ----------------------------------------------------------------------------------------------------
2023-03-08 10:14:24,008 epoch 9 - iter 1/14 - loss 0.84794997 - time (sec): 0.52 - samples/sec: 1259.08 - lr: 0.100000
2023-03-08 10:14:24,661 epoch 9 - iter 2/14 - loss 0.85249132 - time (sec): 1.17 - samples/sec: 1115.94 - lr: 0.100000
2023-03-08 10:14:25,172 epoch 9 - iter 3/14 - loss 0.86306637 - time (sec): 1.68 - samples/sec: 1127.29 - lr: 0.100000
2023-03-08 10:14:25,695 epoch 9 - iter 4/14 - loss 0.84301240 - time (sec): 2.20 - samples/sec: 1156.60 - lr: 0.100000
2023-03-08 10:14:26,506 epoch 9 - iter 5/14 - loss 0.85288160 - time (sec): 3.01 - samples/sec: 1113.68 - lr: 0.100000
2023-03-08 10:14:26,948 epoch 9 - iter 6/14 - loss 0.85650919 - time (sec): 3.46 - samples/sec: 1181.07 - lr: 0.100000
2023-03-08 10:14:27,624 epoch 9 - iter 7/14 - loss 0.83643933 - time (sec): 4.13 - samples/sec: 1159.67 - lr: 0.100000
2023-03-08 10:14:28,067 epoch 9 - iter 8/1

100%|██████████| 2/2 [00:00<00:00,  5.08it/s]

2023-03-08 10:14:31,621 Evaluating as a multi-label problem: False
2023-03-08 10:14:31,645 DEV : loss 0.6795819997787476 - f1-score (micro avg)  0.7824
2023-03-08 10:14:31,660 BAD EPOCHS (no improvement): 0
2023-03-08 10:14:31,667 saving best model





2023-03-08 10:14:40,085 ----------------------------------------------------------------------------------------------------
2023-03-08 10:14:40,628 epoch 10 - iter 1/14 - loss 0.85604394 - time (sec): 0.54 - samples/sec: 1120.56 - lr: 0.100000
2023-03-08 10:14:41,358 epoch 10 - iter 2/14 - loss 0.82250885 - time (sec): 1.27 - samples/sec: 1056.36 - lr: 0.100000
2023-03-08 10:14:41,900 epoch 10 - iter 3/14 - loss 0.80881387 - time (sec): 1.81 - samples/sec: 1094.97 - lr: 0.100000
2023-03-08 10:14:42,914 epoch 10 - iter 4/14 - loss 0.81134773 - time (sec): 2.82 - samples/sec: 963.64 - lr: 0.100000
2023-03-08 10:14:43,492 epoch 10 - iter 5/14 - loss 0.80665812 - time (sec): 3.40 - samples/sec: 992.80 - lr: 0.100000
2023-03-08 10:14:44,030 epoch 10 - iter 6/14 - loss 0.80372408 - time (sec): 3.94 - samples/sec: 1074.50 - lr: 0.100000
2023-03-08 10:14:44,636 epoch 10 - iter 7/14 - loss 0.78719529 - time (sec): 4.55 - samples/sec: 1101.30 - lr: 0.100000
2023-03-08 10:14:45,096 epoch 10 - it

100%|██████████| 2/2 [00:00<00:00,  4.76it/s]

2023-03-08 10:14:48,362 Evaluating as a multi-label problem: False
2023-03-08 10:14:48,385 DEV : loss 0.6341026425361633 - f1-score (micro avg)  0.7952
2023-03-08 10:14:48,395 BAD EPOCHS (no improvement): 0
2023-03-08 10:14:48,397 saving best model





2023-03-08 10:15:06,404 ----------------------------------------------------------------------------------------------------
2023-03-08 10:15:13,552 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, CCONJ, ADV, DET, SCONJ, AUX, PART, X, SYM, <START>, <STOP>


100%|██████████| 2/2 [00:00<00:00,  4.31it/s]

2023-03-08 10:15:15,107 Evaluating as a multi-label problem: False





2023-03-08 10:15:15,142 0.7973	0.7973	0.7973	0.7973
2023-03-08 10:15:15,144 
Results:
- F-score (micro) 0.7973
- F-score (macro) 0.6173
- Accuracy 0.7973

By class:
              precision    recall  f1-score   support

        NOUN     0.6593    0.9330    0.7726       224
       PROPN     0.9437    0.7701    0.8481       261
        VERB     0.7843    0.9231    0.8481       130
       PUNCT     0.9929    1.0000    0.9964       139
         ADP     0.8571    0.9027    0.8793       113
        PRON     0.9038    0.9792    0.9400        48
         ADV     0.3846    0.3409    0.3614        44
         DET     0.8000    0.5217    0.6316        46
         ADJ     0.6818    0.2885    0.4054        52
       CCONJ     0.9333    0.8750    0.9032        32
         NUM     0.8519    0.8214    0.8364        28
         AUX     0.2778    0.2174    0.2439        23
       SCONJ     0.5385    0.3333    0.4118        21
        PART     1.0000    0.1000    0.1818        10
         SYM     0.0000 

{'test_score': 0.797274275979557,
 'dev_score_history': [0.29632408102025504,
  0.42460615153788445,
  0.5678919729932483,
  0.6241560390097525,
  0.6721680420105026,
  0.7051762940735183,
  0.7389347336834209,
  0.7591897974493623,
  0.7824456114028507,
  0.795198799699925],
 'train_loss_history': [2.5863374136407455,
  2.0340061279924484,
  1.6599990836881426,
  1.393549691717545,
  1.2258456930832262,
  1.0848034071738433,
  0.961092068002892,
  0.8911650465938303,
  0.8332131667737789,
  0.7772904783901028],
 'dev_loss_history': [2.2457034587860107,
  1.7855215072631836,
  1.4564083814620972,
  1.2352898120880127,
  1.0920048952102661,
  0.95794278383255,
  0.821689248085022,
  0.7707288861274719,
  0.6795819997787476,
  0.6341026425361633]}

In [None]:
# Tag every cleaned tweet with the trained POS model and store the result as CSV.
# Each output row is: DataFrame index, tweet text, tagged sentence string.
tag_pos = SequenceTagger.load('resources/taggers/example-upos/best-model.pt')

# The encoding is pinned so the file does not depend on the platform default.
with open('postagged.csv', mode='w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for index, tweet in df_tweets['clean_tweets'].items():
        sentence = Sentence(tweet)
        tag_pos.predict(sentence)
        writer.writerow([index, tweet, sentence.to_tagged_string()])

2023-03-08 10:15:21,212 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, CCONJ, ADV, DET, SCONJ, AUX, PART, X, SYM, <START>, <STOP>


# 1-gram

In [None]:
# Count how often each predicted POS tag occurs across all remaining tweets.

# load the best POS tagger checkpoint saved during training
tagger = SequenceTagger.load('resources/taggers/example-upos/best-model.pt')

# Counter mapping each POS tag to the number of tokens predicted with it
tag_patterns = Counter()

# iterate over each tweet in the dataset
for tweet in df_tweets['clean_tweets']:
    # create a Flair sentence object from the tweet text
    sentence = Sentence(tweet, use_tokenizer=True)

    # use the POS tagger to predict the POS tags for the sentence
    tagger.predict(sentence)

    # tally the predicted tag of every token in the sentence
    tag_patterns.update(token.get_label().value for token in sentence)

# (tag, count) pairs ordered by count, most frequent first
ranked_tags = tag_patterns.most_common()
sorted_tag_patterns = dict(ranked_tags)

# print the top 10 most frequent POS tags
print('Top 10 most frequent POS tag patterns:')
for tag, freq in ranked_tags[:10]:
    print(f'{tag}: {freq}')


2023-03-08 10:15:43,480 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, CCONJ, ADV, DET, SCONJ, AUX, PART, X, SYM, <START>, <STOP>
Top 10 most frequent POS tag patterns:
NOUN: 12854
PROPN: 4505
VERB: 2520
ADJ: 684
ADP: 436
PRON: 317
ADV: 299
PUNCT: 269
DET: 99
NUM: 53


In [None]:
# Count how often each predicted POS tag occurs in tweets labelled "social".

# load the best POS tagger checkpoint saved during training
tagger = SequenceTagger.load('resources/taggers/example-upos/best-model.pt')

# restrict the dataset to tweets whose `social` label equals 1
social_tweets = df_tweets.loc[df_tweets['social'] == 1]

# Counter mapping each POS tag to the number of tokens predicted with it
tag_patterns = Counter()

# iterate over each tweet of the selected aspect
for tweet in social_tweets['clean_tweets']:
    # create a Flair sentence object from the tweet text
    sentence = Sentence(tweet, use_tokenizer=True)

    # use the POS tagger to predict the POS tags for the sentence
    tagger.predict(sentence)

    # tally the predicted tag of every token in the sentence
    tag_patterns.update(token.get_label().value for token in sentence)

# (tag, count) pairs ordered by count, most frequent first
ranked_tags = tag_patterns.most_common()
sorted_tag_patterns = dict(ranked_tags)

# print the top 10 most frequent POS tags
print('Top 10 most frequent POS tag patterns for Social aspect:')
for tag, freq in ranked_tags[:10]:
    print(f'{tag}: {freq}')

2023-03-08 10:16:06,316 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, CCONJ, ADV, DET, SCONJ, AUX, PART, X, SYM, <START>, <STOP>
Top 10 most frequent POS tag patterns for Social aspect:
NOUN: 6604
PROPN: 2173
VERB: 1307
ADJ: 328
ADP: 194
ADV: 139
PUNCT: 84
PRON: 79
DET: 56
NUM: 28


In [None]:
# Count how often each predicted POS tag occurs in tweets labelled "historical".

# load the best POS tagger checkpoint saved during training
tagger = SequenceTagger.load('resources/taggers/example-upos/best-model.pt')

# restrict the dataset to tweets whose `historical` label equals 1
historical_tweets = df_tweets.loc[df_tweets['historical'] == 1]

# Counter mapping each POS tag to the number of tokens predicted with it
tag_patterns = Counter()

# iterate over each tweet of the selected aspect
for tweet in historical_tweets['clean_tweets']:
    # create a Flair sentence object from the tweet text
    sentence = Sentence(tweet, use_tokenizer=True)

    # use the POS tagger to predict the POS tags for the sentence
    tagger.predict(sentence)

    # tally the predicted tag of every token in the sentence
    tag_patterns.update(token.get_label().value for token in sentence)

# (tag, count) pairs ordered by count, most frequent first
ranked_tags = tag_patterns.most_common()
sorted_tag_patterns = dict(ranked_tags)

# print the top 10 most frequent POS tags
print('Top 10 most frequent POS tag patterns for Historical aspect:')
for tag, freq in ranked_tags[:10]:
    print(f'{tag}: {freq}')

2023-03-08 10:16:19,406 SequenceTagger predicts: Dictionary with 19 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, CCONJ, ADV, DET, SCONJ, AUX, PART, X, SYM, <START>, <STOP>
Top 10 most frequent POS tag patterns for Historical aspect:
NOUN: 303
PROPN: 121
VERB: 61
ADJ: 14
ADP: 10
DET: 4
PUNCT: 3
ADV: 3
PRON: 2
