# Install and import required libraries

In [16]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
import csv
import re
import warnings
from collections import Counter

import flair
import google.colab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from flair.data import Sentence
from flair.datasets import UD_INDONESIAN
from flair.embeddings import WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from google.colab import drive
from tqdm import tqdm

# Silence every warning category for the rest of the session.
# NOTE: this is a blanket filter, so deprecation notices raised by the
# libraries used below are hidden as well.
warnings.filterwarnings("ignore")

# Import data

In [18]:
# Mount Google Drive at /content/drive so the labelled tweet CSV can be read.
# `drive` is imported together with the other dependencies in the imports cell.
# force_remount=True asks Colab to mount again even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [19]:
# Load the manually labelled tweets from the mounted Drive.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0;
# `on_bad_lines='warn'` is its replacement for error_bad_lines=False with the
# default warn_bad_lines=True: malformed rows are skipped and reported.
df_tweets = pd.read_csv(
    '/content/drive/MyDrive/TA/tweets/ruhutsitompul/done_labelled_ruhutsitompul.csv',
    delimiter=',',
    on_bad_lines='warn',
)
df_tweets

Unnamed: 0.1,Unnamed: 0,ID,Tweet Time,Convo ID,Likes Count,Retweets Count,Tweets,Tweets(rendered),in_reply_to_tweet_id,in_reply_to,...,clean_tweets_len,Month,social,historical,dehumanization,accusation,attack,loyalty,threat,Tweets.1
0,0,1585739644011892736,2022-10-27 21:06:07,1585739644011892736,460,80,Sama banget Komentar ini dgn para pendukung se...,Sama banget Komentar ini dgn para pendukung se...,,,...,11,October,1,0,0,0,0.0,0,0,Sama banget Komentar ini dgn para pendukung se...
1,1,1585737538806853632,2022-10-27 20:57:45,1585737538806853632,420,92,Ha ha ha sigundul penguasa ancol karena selama...,Ha ha ha sigundul penguasa ancol karena selama...,,,...,15,October,1,0,0,1,0.0,0,0,Ha ha ha sigundul penguasa ancol karena selama...
2,2,1584780616369049605,2022-10-25 05:35:17,1584780616369049605,343,40,Kasihan Ibu ini jadi korban akibat dicuci otak...,Kasihan Ibu ini jadi korban akibat dicuci otak...,,,...,14,October,0,0,1,1,0.0,0,0,Kasihan Ibu ini jadi korban akibat dicuci otak...
3,3,1584778048120836096,2022-10-25 05:25:05,1584778048120836096,212,23,Ha ha hakadrun pada stresssssss mengenai beber...,Ha ha hakadrun pada stresssssss mengenai beber...,,,...,18,October,1,0,1,0,0.0,0,0,Ha ha hakadrun pada stresssssss mengenai beber...
4,4,1584774254502912000,2022-10-25 05:10:00,1584774254502912000,860,63,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...,,,...,33,October,0,0,1,0,0.0,0,0,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,4178,1118666888647602177,2019-04-18 00:05:44,1118666888647602177,0,0,@NediSetiadi Kami sedih krn biar sudah kalian ...,@NediSetiadi Kami sedih krn biar sudah kalian ...,,,...,14,April,1,0,0,1,0.0,0,0,@NediSetiadi Kami sedih krn biar sudah kalian ...
4179,4179,1118665503642247170,2019-04-18 00:00:14,1118665503642247170,7,0,@TanYoana haiya Amoy ini makin stresssssss kac...,@TanYoana haiya Amoy ini makin stresssssss kac...,,,...,5,April,0,0,0,0,0.0,0,0,@TanYoana haiya Amoy ini makin stresssssss kac...
4180,4180,1118664923301572608,2019-04-17 23:57:56,1118664923301572608,1684,200,Terima kasih KPU BAWASLU DKPP sbg Penyelenggar...,Terima kasih KPU BAWASLU DKPP sbg Penyelenggar...,,,...,31,April,0,0,0,0,0.0,0,0,Terima kasih KPU BAWASLU DKPP sbg Penyelenggar...
4181,4181,1118396091148460032,2019-04-17 06:09:41,1118396091148460032,6,0,"@panca66 Ha ha ha maaf memilih diri sendiri, s...","@panca66 Ha ha ha maaf memilih diri sendiri, s...",,,...,5,April,0,0,0,0,0.0,0,0,"@panca66 Ha ha ha maaf memilih diri sendiri, s..."


In [20]:
# Keep only the tweets that carry at least one aspect label (any non-zero flag)
aspect_columns = ['social', 'historical', 'dehumanization', 'accusation', 'attack', 'loyalty', 'threat']
has_any_aspect = df_tweets[aspect_columns].ne(0).any(axis=1)

# Renumber the surviving rows from 0 so the index is contiguous again
df_tweets = df_tweets.loc[has_any_aspect].reset_index(drop=True)
df_tweets

Unnamed: 0.1,Unnamed: 0,ID,Tweet Time,Convo ID,Likes Count,Retweets Count,Tweets,Tweets(rendered),in_reply_to_tweet_id,in_reply_to,...,clean_tweets_len,Month,social,historical,dehumanization,accusation,attack,loyalty,threat,Tweets.1
0,0,1585739644011892736,2022-10-27 21:06:07,1585739644011892736,460,80,Sama banget Komentar ini dgn para pendukung se...,Sama banget Komentar ini dgn para pendukung se...,,,...,11,October,1,0,0,0,0.0,0,0,Sama banget Komentar ini dgn para pendukung se...
1,1,1585737538806853632,2022-10-27 20:57:45,1585737538806853632,420,92,Ha ha ha sigundul penguasa ancol karena selama...,Ha ha ha sigundul penguasa ancol karena selama...,,,...,15,October,1,0,0,1,0.0,0,0,Ha ha ha sigundul penguasa ancol karena selama...
2,2,1584780616369049605,2022-10-25 05:35:17,1584780616369049605,343,40,Kasihan Ibu ini jadi korban akibat dicuci otak...,Kasihan Ibu ini jadi korban akibat dicuci otak...,,,...,14,October,0,0,1,1,0.0,0,0,Kasihan Ibu ini jadi korban akibat dicuci otak...
3,3,1584778048120836096,2022-10-25 05:25:05,1584778048120836096,212,23,Ha ha hakadrun pada stresssssss mengenai beber...,Ha ha hakadrun pada stresssssss mengenai beber...,,,...,18,October,1,0,1,0,0.0,0,0,Ha ha hakadrun pada stresssssss mengenai beber...
4,4,1584774254502912000,2022-10-25 05:10:00,1584774254502912000,860,63,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...,,,...,33,October,0,0,1,0,0.0,0,0,Kok sewot dgn Pidato Sambutan Bpk Joko Widodo ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659,4151,1120875995962101760,2019-04-24 02:23:56,1120875995962101760,26,4,"@ustadtengkuzul Belajarlah menerima kekalahan,...","@ustadtengkuzul Belajarlah menerima kekalahan,...",,,...,12,April,1,0,0,1,0.0,0,0,"@ustadtengkuzul Belajarlah menerima kekalahan,..."
1660,4153,1120574450410217472,2019-04-23 06:25:42,1120574450410217472,1660,264,Aneh tapi Nyata ini bukan HOAX di Uk...,Aneh tapi Nyata ini bukan HOAX di Uk...,,,...,13,April,0,0,0,1,0.0,0,0,Aneh tapi Nyata ini bukan HOAX di Uk...
1661,4166,1119465028040527872,2019-04-20 04:57:15,1119465028040527872,1311,193,"Tim Sukses Pak Jokowi bukan Panglima Hukum, Pr...","Tim Sukses Pak Jokowi bukan Panglima Hukum, Pr...",,,...,24,April,1,0,0,1,0.0,0,0,"Tim Sukses Pak Jokowi bukan Panglima Hukum, Pr..."
1662,4168,1119429279811428353,2019-04-20 02:35:12,1119429279811428353,10,0,@FerdinandHutah2 Masih saja kau menggonggong s...,@FerdinandHutah2 Masih saja kau menggonggong s...,,,...,6,April,0,0,1,0,0.0,0,0,@FerdinandHutah2 Masih saja kau menggonggong s...


# POS tagging with flair

In [21]:
# 0. fix the random seeds so that the random downsampling and the training run
#    give the same result after a kernel restart
SEED = 42
flair.set_seed(SEED)

# 1. get the corpus (downsample(0.1) keeps a random 10% of the UD Indonesian data)
corpus = UD_INDONESIAN().downsample(0.1)
print(corpus)

# 2. what label do we want to predict?
label_type = 'upos'

# 3. make the label dictionary from the corpus
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# 4. initialize embeddings
embedding_types = [

    WordEmbeddings('id-crawl'),
    WordEmbeddings('id'),

    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# both word embeddings are combined into a single stacked embedding
embeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True)

# 6. initialize trainer
trainer = ModelTrainer(tagger, corpus)

# 7. start training; the model files are written below resources/taggers/example-upos
#    (the 1-gram cells later load best-model.pt from that folder)
trainer.train('resources/taggers/example-upos',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=10)

2023-03-08 09:53:33,625 Reading data from /root/.flair/datasets/ud_indonesian
2023-03-08 09:53:33,630 Train: /root/.flair/datasets/ud_indonesian/id_gsd-ud-train.conllu
2023-03-08 09:53:33,632 Dev: /root/.flair/datasets/ud_indonesian/id_gsd-ud-dev.conllu
2023-03-08 09:53:33,634 Test: /root/.flair/datasets/ud_indonesian/id_gsd-ud-test.conllu
Corpus: 448 train + 56 dev + 56 test sentences
2023-03-08 09:53:54,541 Computing label dictionary. Progress:


448it [00:00, 4657.29it/s]

2023-03-08 09:53:54,694 Dictionary created for label 'upos' with 18 values: NOUN (seen 2138 times), PROPN (seen 1813 times), PUNCT (seen 1398 times), VERB (seen 1031 times), ADP (seen 938 times), PRON (seen 530 times), ADJ (seen 375 times), NUM (seen 333 times), DET (seen 310 times), CCONJ (seen 283 times), ADV (seen 282 times), AUX (seen 210 times), SCONJ (seen 173 times), PART (seen 79 times), X (seen 23 times), SYM (seen 14 times), INTJ (seen 1 times)
Dictionary with 18 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, X, SYM, INTJ





2023-03-08 09:54:23,591 SequenceTagger predicts: Dictionary with 18 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, X, SYM, INTJ
2023-03-08 09:54:23,717 ----------------------------------------------------------------------------------------------------
2023-03-08 09:54:23,720 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'id-crawl'
      (embedding): Embedding(1000000, 300)
    )
    (list_embedding_1): WordEmbeddings(
      'id'
      (embedding): Embedding(300686, 300)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=600, out_features=600, bias=True)
  (rnn): LSTM(600, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=20, bias=True)
  (loss_function): ViterbiLoss()
  (crf): CRF()
)"
2023-03-08 09:54:23,722 --------------------------------------------------------

100%|██████████| 2/2 [00:00<00:00,  2.46it/s]

2023-03-08 09:54:37,924 Evaluating as a multi-label problem: False
2023-03-08 09:54:37,978 DEV : loss 2.2096564769744873 - f1-score (micro avg)  0.2363
2023-03-08 09:54:37,999 BAD EPOCHS (no improvement): 0
2023-03-08 09:54:38,007 saving best model





2023-03-08 09:54:46,756 ----------------------------------------------------------------------------------------------------
2023-03-08 09:54:47,322 epoch 2 - iter 1/14 - loss 2.24302647 - time (sec): 0.56 - samples/sec: 1242.25 - lr: 0.100000
2023-03-08 09:54:48,129 epoch 2 - iter 2/14 - loss 2.20596105 - time (sec): 1.37 - samples/sec: 1024.08 - lr: 0.100000
2023-03-08 09:54:50,259 epoch 2 - iter 3/14 - loss 2.16917738 - time (sec): 3.50 - samples/sec: 655.57 - lr: 0.100000
2023-03-08 09:54:51,199 epoch 2 - iter 4/14 - loss 2.14610577 - time (sec): 4.44 - samples/sec: 676.37 - lr: 0.100000
2023-03-08 09:54:52,105 epoch 2 - iter 5/14 - loss 2.14131741 - time (sec): 5.35 - samples/sec: 677.74 - lr: 0.100000
2023-03-08 09:54:52,792 epoch 2 - iter 6/14 - loss 2.12791373 - time (sec): 6.03 - samples/sec: 707.52 - lr: 0.100000
2023-03-08 09:54:53,791 epoch 2 - iter 7/14 - loss 2.10986136 - time (sec): 7.03 - samples/sec: 724.05 - lr: 0.100000
2023-03-08 09:54:54,525 epoch 2 - iter 8/14 - l

100%|██████████| 2/2 [00:00<00:00,  4.08it/s]

2023-03-08 09:55:00,865 Evaluating as a multi-label problem: False
2023-03-08 09:55:00,891 DEV : loss 1.6100496053695679 - f1-score (micro avg)  0.4904
2023-03-08 09:55:00,904 BAD EPOCHS (no improvement): 0
2023-03-08 09:55:00,909 saving best model





2023-03-08 09:55:09,417 ----------------------------------------------------------------------------------------------------
2023-03-08 09:55:11,660 epoch 3 - iter 1/14 - loss 1.66943724 - time (sec): 2.24 - samples/sec: 328.87 - lr: 0.100000
2023-03-08 09:55:12,276 epoch 3 - iter 2/14 - loss 1.71393862 - time (sec): 2.86 - samples/sec: 472.49 - lr: 0.100000
2023-03-08 09:55:13,006 epoch 3 - iter 3/14 - loss 1.67721717 - time (sec): 3.59 - samples/sec: 549.55 - lr: 0.100000
2023-03-08 09:55:13,897 epoch 3 - iter 4/14 - loss 1.70855370 - time (sec): 4.48 - samples/sec: 595.77 - lr: 0.100000
2023-03-08 09:55:14,350 epoch 3 - iter 5/14 - loss 1.69787653 - time (sec): 4.93 - samples/sec: 675.36 - lr: 0.100000
2023-03-08 09:55:15,409 epoch 3 - iter 6/14 - loss 1.69387164 - time (sec): 5.99 - samples/sec: 704.34 - lr: 0.100000
2023-03-08 09:55:16,226 epoch 3 - iter 7/14 - loss 1.68885034 - time (sec): 6.81 - samples/sec: 725.61 - lr: 0.100000
2023-03-08 09:55:18,367 epoch 3 - iter 8/14 - los

100%|██████████| 2/2 [00:00<00:00,  4.24it/s]

2023-03-08 09:55:22,421 Evaluating as a multi-label problem: False
2023-03-08 09:55:22,446 DEV : loss 1.3249773979187012 - f1-score (micro avg)  0.5722
2023-03-08 09:55:22,456 BAD EPOCHS (no improvement): 0
2023-03-08 09:55:22,460 saving best model





2023-03-08 09:55:32,344 ----------------------------------------------------------------------------------------------------
2023-03-08 09:55:34,548 epoch 4 - iter 1/14 - loss 1.34869105 - time (sec): 2.20 - samples/sec: 346.95 - lr: 0.100000
2023-03-08 09:55:35,288 epoch 4 - iter 2/14 - loss 1.41807657 - time (sec): 2.94 - samples/sec: 486.85 - lr: 0.100000
2023-03-08 09:55:35,610 epoch 4 - iter 3/14 - loss 1.40052773 - time (sec): 3.26 - samples/sec: 614.74 - lr: 0.100000
2023-03-08 09:55:36,008 epoch 4 - iter 4/14 - loss 1.40942120 - time (sec): 3.66 - samples/sec: 710.77 - lr: 0.100000
2023-03-08 09:55:36,718 epoch 4 - iter 5/14 - loss 1.42380340 - time (sec): 4.37 - samples/sec: 775.18 - lr: 0.100000
2023-03-08 09:55:37,129 epoch 4 - iter 6/14 - loss 1.45431645 - time (sec): 4.78 - samples/sec: 841.50 - lr: 0.100000
2023-03-08 09:55:37,637 epoch 4 - iter 7/14 - loss 1.44307304 - time (sec): 5.29 - samples/sec: 892.93 - lr: 0.100000
2023-03-08 09:55:38,102 epoch 4 - iter 8/14 - los

100%|██████████| 2/2 [00:00<00:00,  2.48it/s]

2023-03-08 09:55:45,870 Evaluating as a multi-label problem: False
2023-03-08 09:55:45,910 DEV : loss 1.1911381483078003 - f1-score (micro avg)  0.6064
2023-03-08 09:55:45,928 BAD EPOCHS (no improvement): 0
2023-03-08 09:55:45,934 saving best model





2023-03-08 09:55:55,595 ----------------------------------------------------------------------------------------------------
2023-03-08 09:55:56,195 epoch 5 - iter 1/14 - loss 1.32608558 - time (sec): 0.53 - samples/sec: 1246.87 - lr: 0.100000
2023-03-08 09:55:56,862 epoch 5 - iter 2/14 - loss 1.27770681 - time (sec): 1.20 - samples/sec: 1067.86 - lr: 0.100000
2023-03-08 09:55:57,261 epoch 5 - iter 3/14 - loss 1.31563967 - time (sec): 1.60 - samples/sec: 1190.73 - lr: 0.100000
2023-03-08 09:55:57,883 epoch 5 - iter 4/14 - loss 1.29718942 - time (sec): 2.22 - samples/sec: 1163.59 - lr: 0.100000
2023-03-08 09:55:58,356 epoch 5 - iter 5/14 - loss 1.31289486 - time (sec): 2.69 - samples/sec: 1203.20 - lr: 0.100000
2023-03-08 09:55:59,156 epoch 5 - iter 6/14 - loss 1.30032568 - time (sec): 3.49 - samples/sec: 1160.80 - lr: 0.100000
2023-03-08 09:56:01,553 epoch 5 - iter 7/14 - loss 1.28922814 - time (sec): 5.89 - samples/sec: 830.33 - lr: 0.100000
2023-03-08 09:56:02,189 epoch 5 - iter 8/14

100%|██████████| 2/2 [00:00<00:00,  3.80it/s]

2023-03-08 09:56:09,627 Evaluating as a multi-label problem: False
2023-03-08 09:56:09,655 DEV : loss 1.0377202033996582 - f1-score (micro avg)  0.6605
2023-03-08 09:56:09,667 BAD EPOCHS (no improvement): 0
2023-03-08 09:56:09,669 saving best model





2023-03-08 09:56:19,463 ----------------------------------------------------------------------------------------------------
2023-03-08 09:56:20,220 epoch 6 - iter 1/14 - loss 1.18831789 - time (sec): 0.75 - samples/sec: 711.28 - lr: 0.100000
2023-03-08 09:56:21,258 epoch 6 - iter 2/14 - loss 1.13534212 - time (sec): 1.79 - samples/sec: 677.78 - lr: 0.100000
2023-03-08 09:56:22,036 epoch 6 - iter 3/14 - loss 1.14033348 - time (sec): 2.57 - samples/sec: 705.77 - lr: 0.100000
2023-03-08 09:56:22,780 epoch 6 - iter 4/14 - loss 1.13493990 - time (sec): 3.31 - samples/sec: 758.98 - lr: 0.100000
2023-03-08 09:56:25,680 epoch 6 - iter 5/14 - loss 1.12552879 - time (sec): 6.22 - samples/sec: 539.17 - lr: 0.100000
2023-03-08 09:56:26,512 epoch 6 - iter 6/14 - loss 1.11408106 - time (sec): 7.05 - samples/sec: 595.91 - lr: 0.100000
2023-03-08 09:56:27,149 epoch 6 - iter 7/14 - loss 1.11811744 - time (sec): 7.68 - samples/sec: 644.74 - lr: 0.100000
2023-03-08 09:56:27,675 epoch 6 - iter 8/14 - los

100%|██████████| 2/2 [00:00<00:00,  4.02it/s]

2023-03-08 09:56:33,818 Evaluating as a multi-label problem: False
2023-03-08 09:56:33,844 DEV : loss 0.9188407063484192 - f1-score (micro avg)  0.6854
2023-03-08 09:56:33,856 BAD EPOCHS (no improvement): 0
2023-03-08 09:56:33,858 saving best model





2023-03-08 09:56:43,242 ----------------------------------------------------------------------------------------------------
2023-03-08 09:56:43,723 epoch 7 - iter 1/14 - loss 1.03216872 - time (sec): 0.43 - samples/sec: 1437.48 - lr: 0.100000
2023-03-08 09:56:44,427 epoch 7 - iter 2/14 - loss 1.03530881 - time (sec): 1.13 - samples/sec: 1173.34 - lr: 0.100000
2023-03-08 09:56:44,918 epoch 7 - iter 3/14 - loss 1.05185103 - time (sec): 1.62 - samples/sec: 1195.83 - lr: 0.100000
2023-03-08 09:56:45,419 epoch 7 - iter 4/14 - loss 1.04263693 - time (sec): 2.12 - samples/sec: 1193.32 - lr: 0.100000
2023-03-08 09:56:45,833 epoch 7 - iter 5/14 - loss 1.04116959 - time (sec): 2.54 - samples/sec: 1218.22 - lr: 0.100000
2023-03-08 09:56:47,730 epoch 7 - iter 6/14 - loss 1.03027454 - time (sec): 4.43 - samples/sec: 898.77 - lr: 0.100000
2023-03-08 09:56:48,378 epoch 7 - iter 7/14 - loss 1.03882745 - time (sec): 5.08 - samples/sec: 903.86 - lr: 0.100000
2023-03-08 09:56:50,683 epoch 7 - iter 8/14 

100%|██████████| 2/2 [00:00<00:00,  2.51it/s]

2023-03-08 09:56:57,083 Evaluating as a multi-label problem: False
2023-03-08 09:56:57,121 DEV : loss 0.8484406471252441 - f1-score (micro avg)  0.706
2023-03-08 09:56:57,136 BAD EPOCHS (no improvement): 0
2023-03-08 09:56:57,140 saving best model





2023-03-08 09:57:06,483 ----------------------------------------------------------------------------------------------------
2023-03-08 09:57:07,241 epoch 8 - iter 1/14 - loss 1.14998599 - time (sec): 0.72 - samples/sec: 1001.31 - lr: 0.100000
2023-03-08 09:57:07,645 epoch 8 - iter 2/14 - loss 1.02209454 - time (sec): 1.12 - samples/sec: 1196.55 - lr: 0.100000
2023-03-08 09:57:08,185 epoch 8 - iter 3/14 - loss 0.96326493 - time (sec): 1.66 - samples/sec: 1253.21 - lr: 0.100000
2023-03-08 09:57:08,991 epoch 8 - iter 4/14 - loss 0.95907254 - time (sec): 2.47 - samples/sec: 1113.39 - lr: 0.100000
2023-03-08 09:57:09,660 epoch 8 - iter 5/14 - loss 0.94891211 - time (sec): 3.14 - samples/sec: 1087.06 - lr: 0.100000
2023-03-08 09:57:10,773 epoch 8 - iter 6/14 - loss 0.97456439 - time (sec): 4.25 - samples/sec: 992.42 - lr: 0.100000
2023-03-08 09:57:11,548 epoch 8 - iter 7/14 - loss 0.96965813 - time (sec): 5.02 - samples/sec: 982.02 - lr: 0.100000
2023-03-08 09:57:12,798 epoch 8 - iter 8/14 

100%|██████████| 2/2 [00:00<00:00,  4.15it/s]

2023-03-08 09:57:19,604 Evaluating as a multi-label problem: False
2023-03-08 09:57:19,628 DEV : loss 0.7241466641426086 - f1-score (micro avg)  0.758
2023-03-08 09:57:19,640 BAD EPOCHS (no improvement): 0
2023-03-08 09:57:19,642 saving best model





2023-03-08 09:57:28,346 ----------------------------------------------------------------------------------------------------
2023-03-08 09:57:31,265 epoch 9 - iter 1/14 - loss 0.88246101 - time (sec): 2.91 - samples/sec: 290.50 - lr: 0.100000
2023-03-08 09:57:32,321 epoch 9 - iter 2/14 - loss 0.87258386 - time (sec): 3.97 - samples/sec: 389.65 - lr: 0.100000
2023-03-08 09:57:33,031 epoch 9 - iter 3/14 - loss 0.89423776 - time (sec): 4.68 - samples/sec: 458.96 - lr: 0.100000
2023-03-08 09:57:33,807 epoch 9 - iter 4/14 - loss 0.91839679 - time (sec): 5.45 - samples/sec: 518.71 - lr: 0.100000
2023-03-08 09:57:35,675 epoch 9 - iter 5/14 - loss 0.89865960 - time (sec): 7.32 - samples/sec: 502.33 - lr: 0.100000
2023-03-08 09:57:36,427 epoch 9 - iter 6/14 - loss 0.88395473 - time (sec): 8.07 - samples/sec: 548.31 - lr: 0.100000
2023-03-08 09:57:37,255 epoch 9 - iter 7/14 - loss 0.88495112 - time (sec): 8.90 - samples/sec: 577.55 - lr: 0.100000
2023-03-08 09:57:37,909 epoch 9 - iter 8/14 - los

100%|██████████| 2/2 [00:00<00:00,  4.01it/s]

2023-03-08 09:57:42,148 Evaluating as a multi-label problem: False
2023-03-08 09:57:42,174 DEV : loss 0.6458045840263367 - f1-score (micro avg)  0.7751
2023-03-08 09:57:42,184 BAD EPOCHS (no improvement): 0
2023-03-08 09:57:42,186 saving best model





2023-03-08 09:57:51,252 ----------------------------------------------------------------------------------------------------
2023-03-08 09:57:51,983 epoch 10 - iter 1/14 - loss 0.87051506 - time (sec): 0.68 - samples/sec: 1014.05 - lr: 0.100000
2023-03-08 09:57:53,861 epoch 10 - iter 2/14 - loss 0.83622774 - time (sec): 2.56 - samples/sec: 578.60 - lr: 0.100000
2023-03-08 09:57:56,246 epoch 10 - iter 3/14 - loss 0.79559221 - time (sec): 4.95 - samples/sec: 481.59 - lr: 0.100000
2023-03-08 09:57:56,648 epoch 10 - iter 4/14 - loss 0.81617253 - time (sec): 5.35 - samples/sec: 551.97 - lr: 0.100000
2023-03-08 09:57:57,343 epoch 10 - iter 5/14 - loss 0.81684204 - time (sec): 6.04 - samples/sec: 602.54 - lr: 0.100000
2023-03-08 09:57:58,100 epoch 10 - iter 6/14 - loss 0.82382208 - time (sec): 6.80 - samples/sec: 635.11 - lr: 0.100000
2023-03-08 09:57:58,750 epoch 10 - iter 7/14 - loss 0.82323499 - time (sec): 7.45 - samples/sec: 677.97 - lr: 0.100000
2023-03-08 09:57:59,412 epoch 10 - iter 8

100%|██████████| 2/2 [00:05<00:00,  2.93s/it]

2023-03-08 09:58:09,783 Evaluating as a multi-label problem: False
2023-03-08 09:58:09,815 DEV : loss 0.6197919249534607 - f1-score (micro avg)  0.7836
2023-03-08 09:58:09,826 BAD EPOCHS (no improvement): 0
2023-03-08 09:58:09,831 saving best model





2023-03-08 09:58:28,197 ----------------------------------------------------------------------------------------------------
2023-03-08 09:58:35,152 SequenceTagger predicts: Dictionary with 20 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, X, SYM, INTJ, <START>, <STOP>


100%|██████████| 2/2 [00:00<00:00,  5.07it/s]

2023-03-08 09:58:36,673 Evaluating as a multi-label problem: False





2023-03-08 09:58:36,711 0.7886	0.7886	0.7886	0.7886
2023-03-08 09:58:36,713 
Results:
- F-score (micro) 0.7886
- F-score (macro) 0.6445
- Accuracy 0.7886

By class:
              precision    recall  f1-score   support

       PROPN     0.7429    0.9579    0.8368       190
        NOUN     0.8494    0.6714    0.7500       210
       PUNCT     0.9854    1.0000    0.9926       135
        VERB     0.7760    0.9238    0.8435       105
         ADP     0.7304    0.9231    0.8155        91
        PRON     0.7500    0.9344    0.8321        61
         NUM     0.8125    0.8966    0.8525        29
         ADJ     0.8571    0.2667    0.4068        45
         ADV     0.1852    0.1724    0.1786        29
       CCONJ     1.0000    0.7857    0.8800        28
         AUX     0.5714    0.5455    0.5581        22
         DET     0.9167    0.3793    0.5366        29
       SCONJ     0.5000    0.0526    0.0952        19
        PART     0.5000    0.4000    0.4444         5

    accuracy           

{'test_score': 0.7885771543086172,
 'dev_score_history': [0.23629893238434163,
  0.4903914590747331,
  0.5722419928825623,
  0.606405693950178,
  0.6604982206405694,
  0.6854092526690392,
  0.706049822064057,
  0.7580071174377224,
  0.7750889679715303,
  0.7836298932384341],
 'train_loss_history': [2.6291418969601184,
  1.9866362692618127,
  1.5969153221682737,
  1.4051699155815753,
  1.2626137192330202,
  1.1160078270535445,
  1.024502293363269,
  0.9353549992998565,
  0.8715583325728778,
  0.8124591664909753],
 'dev_loss_history': [2.2096564769744873,
  1.6100496053695679,
  1.3249773979187012,
  1.1911381483078003,
  1.0377202033996582,
  0.9188407063484192,
  0.8484406471252441,
  0.7241466641426086,
  0.6458045840263367,
  0.6197919249534607]}

# 1-gram

In [22]:
# load the best POS tagger checkpoint written by the training cell
tagger = SequenceTagger.load('resources/taggers/example-upos/best-model.pt')

# specify the category for which to extract the most frequent POS tags
dehumanization_tweets = df_tweets.loc[df_tweets.dehumanization == 1]

# POS tag -> number of tokens that received that tag
tag_patterns = Counter()

# skip missing texts: Sentence() needs a string and an empty CSV cell is read back as NaN
for tweet in dehumanization_tweets['clean_tweets'].dropna():
    # create a Flair sentence object from the tweet text
    sentence = Sentence(tweet, use_tokenizer=True)

    # use the POS tagger to predict the POS tags for the sentence
    tagger.predict(sentence)

    # count the predicted tag of every token in the sentence
    tag_patterns.update(token.get_label().value for token in sentence)

# tags ordered by descending frequency (ties keep first-seen order)
sorted_tag_patterns = dict(tag_patterns.most_common())

# print the top 10 most frequent POS tags
print('Top 10 most frequent POS tag patterns for Dehumanization aspect:')
for tag, freq in tag_patterns.most_common(10):
    print(f'{tag}: {freq}')

2023-03-08 09:58:43,258 SequenceTagger predicts: Dictionary with 20 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, X, SYM, INTJ, <START>, <STOP>
Top 10 most frequent POS tag patterns for Dehumanization aspect:
PROPN: 4978
NOUN: 1817
VERB: 1359
PUNCT: 500
ADJ: 390
PRON: 321
ADP: 300
ADV: 266
AUX: 113
NUM: 70


In [23]:
# `tagger` is reused from the first 1-gram cell, which already loaded
# best-model.pt; reloading the checkpoint for every aspect only costs time.

# specify the category for which to extract the most frequent POS tags
accusation_tweets = df_tweets.loc[df_tweets.accusation == 1]

# POS tag -> number of tokens that received that tag
tag_patterns = Counter()

# skip missing texts: Sentence() needs a string and an empty CSV cell is read back as NaN
for tweet in accusation_tweets['clean_tweets'].dropna():
    # create a Flair sentence object from the tweet text
    sentence = Sentence(tweet, use_tokenizer=True)

    # use the POS tagger to predict the POS tags for the sentence
    tagger.predict(sentence)

    # count the predicted tag of every token in the sentence
    tag_patterns.update(token.get_label().value for token in sentence)

# tags ordered by descending frequency (ties keep first-seen order)
sorted_tag_patterns = dict(tag_patterns.most_common())

# print the top 10 most frequent POS tags
print('Top 10 most frequent POS tag patterns for Accusation aspect:')
for tag, freq in tag_patterns.most_common(10):
    print(f'{tag}: {freq}')

2023-03-08 09:58:58,237 SequenceTagger predicts: Dictionary with 20 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, X, SYM, INTJ, <START>, <STOP>
Top 10 most frequent POS tag patterns for Accusation aspect:
PROPN: 3771
NOUN: 1285
VERB: 1168
ADJ: 256
PUNCT: 214
ADP: 189
ADV: 183
PRON: 150
AUX: 112
NUM: 32


In [24]:
# `tagger` is reused from the first 1-gram cell, which already loaded
# best-model.pt; reloading the checkpoint for every aspect only costs time.

# specify the category for which to extract the most frequent POS tags
attack_tweets = df_tweets.loc[df_tweets.attack == 1]

# POS tag -> number of tokens that received that tag
tag_patterns = Counter()

# skip missing texts: Sentence() needs a string and an empty CSV cell is read back as NaN
for tweet in attack_tweets['clean_tweets'].dropna():
    # create a Flair sentence object from the tweet text
    sentence = Sentence(tweet, use_tokenizer=True)

    # use the POS tagger to predict the POS tags for the sentence
    tagger.predict(sentence)

    # count the predicted tag of every token in the sentence
    tag_patterns.update(token.get_label().value for token in sentence)

# tags ordered by descending frequency (ties keep first-seen order)
sorted_tag_patterns = dict(tag_patterns.most_common())

# print the top 10 most frequent POS tags
print('Top 10 most frequent POS tag patterns for Attack aspect:')
for tag, freq in tag_patterns.most_common(10):
    print(f'{tag}: {freq}')

2023-03-08 09:59:09,891 SequenceTagger predicts: Dictionary with 20 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, X, SYM, INTJ, <START>, <STOP>
Top 10 most frequent POS tag patterns for Attack aspect:
PROPN: 133
NOUN: 108
VERB: 86
ADJ: 22
PRON: 15
ADV: 14
ADP: 12
AUX: 9
PUNCT: 8
PART: 1


In [25]:
# `tagger` is reused from the first 1-gram cell, which already loaded
# best-model.pt; reloading the checkpoint for every aspect only costs time.

# specify the category for which to extract the most frequent POS tags
loyalty_tweets = df_tweets.loc[df_tweets.loyalty == 1]

# POS tag -> number of tokens that received that tag
tag_patterns = Counter()

# skip missing texts: Sentence() needs a string and an empty CSV cell is read back as NaN
for tweet in loyalty_tweets['clean_tweets'].dropna():
    # create a Flair sentence object from the tweet text
    sentence = Sentence(tweet, use_tokenizer=True)

    # use the POS tagger to predict the POS tags for the sentence
    tagger.predict(sentence)

    # count the predicted tag of every token in the sentence
    tag_patterns.update(token.get_label().value for token in sentence)

# tags ordered by descending frequency (ties keep first-seen order)
sorted_tag_patterns = dict(tag_patterns.most_common())

# print the top 10 most frequent POS tags
print('Top 10 most frequent POS tag patterns for Loyalty aspect:')
for tag, freq in tag_patterns.most_common(10):
    print(f'{tag}: {freq}')

2023-03-08 09:59:17,777 SequenceTagger predicts: Dictionary with 20 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, X, SYM, INTJ, <START>, <STOP>
Top 10 most frequent POS tag patterns for Loyalty aspect:
PROPN: 287
VERB: 90
NOUN: 88
PRON: 17
AUX: 15
ADV: 14
ADP: 12
PUNCT: 11
ADJ: 10
NUM: 5


In [26]:
# `tagger` is reused from the first 1-gram cell, which already loaded
# best-model.pt; reloading the checkpoint for every aspect only costs time.

# specify the category for which to extract the most frequent POS tags
threat_tweets = df_tweets.loc[df_tweets.threat == 1]

# NOTE(review): this reports the size of df_tweets (every tweet with at least
# one aspect), not of threat_tweets — confirm that this is the intended count.
print(f'Number of tweets in filtered dataframe: {len(df_tweets)}')

# POS tag -> number of tokens that received that tag
tag_patterns = Counter()

# skip missing texts: Sentence() needs a string and an empty CSV cell is read back as NaN
for tweet in threat_tweets['clean_tweets'].dropna():
    # create a Flair sentence object from the tweet text
    sentence = Sentence(tweet, use_tokenizer=True)

    # use the POS tagger to predict the POS tags for the sentence
    tagger.predict(sentence)

    # count the predicted tag of every token in the sentence
    tag_patterns.update(token.get_label().value for token in sentence)

# tags ordered by descending frequency (ties keep first-seen order)
sorted_tag_patterns = dict(tag_patterns.most_common())

# print the top 10 most frequent POS tags
print('Top 10 most frequent POS tag patterns for Threat aspect:')
for tag, freq in tag_patterns.most_common(10):
    print(f'{tag}: {freq}')

2023-03-08 10:00:13,245 SequenceTagger predicts: Dictionary with 20 tags: <unk>, NOUN, PROPN, PUNCT, VERB, ADP, PRON, ADJ, NUM, DET, CCONJ, ADV, AUX, SCONJ, PART, X, SYM, INTJ, <START>, <STOP>
Number of tweets in filtered dataframe: 1664
Top 10 most frequent POS tag patterns for Threat aspect:
PROPN: 472
VERB: 202
NOUN: 193
ADJ: 28
ADP: 27
PRON: 25
PUNCT: 20
AUX: 19
ADV: 15
NUM: 6
