In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from
# /content/drive/MyDrive/gold-random-json.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install flair==0.11.3 &> /dev/null

In [None]:
import os
from glob import glob

# One entry per sub-folder of the dataset root; the preprocessing cell expects
# each folder to contain train.jsonl and dev.jsonl.
# glob returns matches in arbitrary order, so sort them to get a reproducible
# corpus order, and keep directories only so that stray files in the Drive
# folder are not mistaken for dataset folders.
dataset = sorted(
    path
    for path in glob("/content/drive/MyDrive/gold-random-json/*")
    if os.path.isdir(path)
)

## Data Preprocessing

In [None]:
import json
from itertools import product
from flair.data import Sentence
from flair.tokenization import TokenizerWrapper
from nltk import wordpunct_tokenize

# Build the tokenizer wrapper once and reuse it for every tweet.
tokenizer = TokenizerWrapper(wordpunct_tokenize)

train_corpus = []
dev_corpus = []
# Explicit split-name -> list mapping (replaces the former globals() lookup).
corpus_by_split = {"train": train_corpus, "dev": dev_corpus}

# Number of gold mentions that could not be converted into a labelled span.
skipped_mentions = 0

for folder, corpus_type in product(dataset, ["train", "dev"]):
    # The context manager closes each file; the JSONL data is read as UTF-8.
    with open(folder + '/' + corpus_type + '.jsonl', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            obj = json.loads(line)
            sentence = Sentence(obj["text"], use_tokenizer=tokenizer)

            for span_labels in obj["location_mentions"]:
                s, e = span_labels["start_offset"], span_labels["end_offset"]
                # Indices of the tokens that lie entirely inside the
                # character range [s, e] of the gold mention.
                inside = [
                    i for i, token in enumerate(sentence)
                    if s <= token.start_pos and token.end_pos <= e
                ]
                label = span_labels.get("type")
                # A mention whose offsets match no whole token, or that has
                # no type, cannot be labelled. Skip it explicitly and count
                # it instead of hiding the failure behind a bare `except`.
                if not inside or label is None:
                    skipped_mentions += 1
                    continue
                # The span runs from the first to the last matching token.
                sentence[inside[0]:inside[-1] + 1].add_label("ner", label)
            corpus_by_split[corpus_type].append(sentence)

print(f"train: {len(train_corpus)} sentences, dev: {len(dev_corpus)} sentences, "
      f"skipped mentions: {skipped_mentions}")
    
    

## Training

In [None]:
import random

import torch
from flair.models import SequenceTagger
from flair.embeddings import WordEmbeddings,FlairEmbeddings,StackedEmbeddings,TransformerWordEmbeddings
from flair.trainers import ModelTrainer
from flair.data import Corpus

# Seed the Python and torch RNGs before any random sampling happens, so that
# a Restart & Run All is intended to reproduce the same subset and training run.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Train and dev splits come from the preprocessing cell. No test split is
# passed here.
# NOTE(review): the training log shows a final test evaluation, so flair
# appears to sample a test split itself when none is given - confirm against
# the Corpus documentation (`sample_missing_splits`).
corpus = Corpus(train=train_corpus, dev=dev_corpus)

# Keep 35% of the data to shorten training; the trailing `;` suppresses the
# Corpus repr that would otherwise be shown as cell output.
corpus.downsample(0.35);

<flair.data.Corpus at 0x7f7625fc0850>

### GloVe + Flair embeddings + BiLSTM + CRF

In [None]:
label_type = 'ner' # annotation layer to predict; the preprocessing cell stores the location spans under this name


label_dict = corpus.make_label_dictionary(label_type=label_type) # label dictionary built from the 'ner' labels found in the corpus
print(label_dict)

# Token-level embeddings: GloVe word vectors plus forward and backward Flair
# embeddings.
embedding_types = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]
# StackedEmbeddings combines the three embeddings above into a single
# embedding object that is handed to the tagger.

embeddings = StackedEmbeddings(embeddings=embedding_types)

# Sequence tagger over the stacked embeddings, with a CRF layer enabled
# (use_crf=True) and a hidden size of 256.
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=label_dict,
                        tag_type=label_type,
                        use_crf=True)

trainer = ModelTrainer(tagger, corpus)

# Checkpoints and logs are written under resources/taggers/sota-ner-flair;
# the inference cell loads its model from that directory.
trainer.train('resources/taggers/sota-ner-flair',
              learning_rate=0.08,
              mini_batch_size=32,
              max_epochs=16)

2023-04-08 23:32:22,977 Computing label dictionary. Progress:


4534it [00:00, 48000.84it/s]

2023-04-08 23:32:23,118 Dictionary created for label 'ner' with 13 values: Country (seen 1594 times), State (seen 1398 times), City/town (seen 1219 times), Island (seen 307 times), Human-made Point-of-Interest (seen 78 times), County (seen 70 times), Natural Point-of-Interest (seen 53 times), District (seen 49 times), Continent (seen 36 times), Neighborhood (seen 30 times), Road/street (seen 19 times), Other locations (seen 14 times)
Dictionary with 13 tags: <unk>, Country, State, City/town, Island, Human-made Point-of-Interest, County, Natural Point-of-Interest, District, Continent, Neighborhood, Road/street, Other locations





2023-04-08 23:32:23,769 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpk43ba7h8


100%|██████████| 160000128/160000128 [00:07<00:00, 20487359.02B/s]

2023-04-08 23:32:31,996 copying /tmp/tmpk43ba7h8 to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2023-04-08 23:32:32,270 removing temp file /tmp/tmpk43ba7h8
2023-04-08 23:32:32,717 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmpej18pkwd


100%|██████████| 21494764/21494764 [00:01<00:00, 11539629.95B/s]

2023-04-08 23:32:35,067 copying /tmp/tmpej18pkwd to cache at /root/.flair/embeddings/glove.gensim
2023-04-08 23:32:35,124 removing temp file /tmp/tmpej18pkwd





2023-04-08 23:32:42,893 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmp2o2rjgi2


100%|██████████| 73034624/73034624 [00:03<00:00, 18370146.69B/s]

2023-04-08 23:32:47,273 copying /tmp/tmp2o2rjgi2 to cache at /root/.flair/embeddings/news-forward-0.4.1.pt





2023-04-08 23:32:47,344 removing temp file /tmp/tmp2o2rjgi2
2023-04-08 23:32:59,024 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-backward-0.4.1.pt not found in cache, downloading to /tmp/tmpr0h3ppqa


100%|██████████| 73034575/73034575 [00:03<00:00, 18532360.66B/s]

2023-04-08 23:33:03,370 copying /tmp/tmpr0h3ppqa to cache at /root/.flair/embeddings/news-backward-0.4.1.pt





2023-04-08 23:33:03,458 removing temp file /tmp/tmpr0h3ppqa
2023-04-08 23:33:03,696 SequenceTagger predicts: Dictionary with 49 tags: O, S-Country, B-Country, E-Country, I-Country, S-State, B-State, E-State, I-State, S-City/town, B-City/town, E-City/town, I-City/town, S-Island, B-Island, E-Island, I-Island, S-Human-made Point-of-Interest, B-Human-made Point-of-Interest, E-Human-made Point-of-Interest, I-Human-made Point-of-Interest, S-County, B-County, E-County, I-County, S-Natural Point-of-Interest, B-Natural Point-of-Interest, E-Natural Point-of-Interest, I-Natural Point-of-Interest, S-District, B-District, E-District, I-District, S-Continent, B-Continent, E-Continent, I-Continent, S-Neighborhood, B-Neighborhood, E-Neighborhood, I-Neighborhood, S-Road/street, B-Road/street, E-Road/street, I-Road/street, S-Other locations, B-Other locations, E-Other locations, I-Other locations
2023-04-08 23:33:04,081 ------------------------------------------------------------------------------------

100%|██████████| 23/23 [00:11<00:00,  1.95it/s]

2023-04-08 23:34:16,409 Evaluating as a multi-label problem: False
2023-04-08 23:34:16,428 DEV : loss 0.13423362374305725 - f1-score (micro avg)  0.5864
2023-04-08 23:34:16,515 BAD EPOCHS (no improvement): 0
2023-04-08 23:34:16,516 saving best model





2023-04-08 23:34:18,337 ----------------------------------------------------------------------------------------------------
2023-04-08 23:34:20,573 epoch 2 - iter 14/142 - loss 0.14191576 - samples/sec: 200.65 - lr: 0.080000
2023-04-08 23:34:23,222 epoch 2 - iter 28/142 - loss 0.14258659 - samples/sec: 169.38 - lr: 0.080000
2023-04-08 23:34:25,919 epoch 2 - iter 42/142 - loss 0.14218087 - samples/sec: 166.32 - lr: 0.080000
2023-04-08 23:34:27,952 epoch 2 - iter 56/142 - loss 0.14029236 - samples/sec: 220.66 - lr: 0.080000
2023-04-08 23:34:30,027 epoch 2 - iter 70/142 - loss 0.13700418 - samples/sec: 216.23 - lr: 0.080000
2023-04-08 23:34:32,297 epoch 2 - iter 84/142 - loss 0.13456363 - samples/sec: 197.69 - lr: 0.080000
2023-04-08 23:34:34,381 epoch 2 - iter 98/142 - loss 0.13107448 - samples/sec: 215.17 - lr: 0.080000
2023-04-08 23:34:36,650 epoch 2 - iter 112/142 - loss 0.12889636 - samples/sec: 197.70 - lr: 0.080000
2023-04-08 23:34:39,350 epoch 2 - iter 126/142 - loss 0.12794297 -

100%|██████████| 23/23 [00:03<00:00,  5.95it/s]

2023-04-08 23:34:45,898 Evaluating as a multi-label problem: False
2023-04-08 23:34:45,917 DEV : loss 0.07759781181812286 - f1-score (micro avg)  0.763





2023-04-08 23:34:45,999 BAD EPOCHS (no improvement): 0
2023-04-08 23:34:46,001 saving best model
2023-04-08 23:34:47,828 ----------------------------------------------------------------------------------------------------
2023-04-08 23:34:49,991 epoch 3 - iter 14/142 - loss 0.12297922 - samples/sec: 207.96 - lr: 0.080000
2023-04-08 23:34:52,360 epoch 3 - iter 28/142 - loss 0.11305208 - samples/sec: 189.31 - lr: 0.080000
2023-04-08 23:34:55,061 epoch 3 - iter 42/142 - loss 0.10976290 - samples/sec: 166.32 - lr: 0.080000
2023-04-08 23:34:57,276 epoch 3 - iter 56/142 - loss 0.10951057 - samples/sec: 202.97 - lr: 0.080000
2023-04-08 23:34:59,298 epoch 3 - iter 70/142 - loss 0.10977638 - samples/sec: 221.90 - lr: 0.080000
2023-04-08 23:35:01,621 epoch 3 - iter 84/142 - loss 0.10858500 - samples/sec: 193.05 - lr: 0.080000
2023-04-08 23:35:03,691 epoch 3 - iter 98/142 - loss 0.10802018 - samples/sec: 216.75 - lr: 0.080000
2023-04-08 23:35:05,812 epoch 3 - iter 112/142 - loss 0.10710995 - samp

100%|██████████| 23/23 [00:04<00:00,  5.02it/s]

2023-04-08 23:35:16,128 Evaluating as a multi-label problem: False
2023-04-08 23:35:16,145 DEV : loss 0.06737015396356583 - f1-score (micro avg)  0.7909





2023-04-08 23:35:16,225 BAD EPOCHS (no improvement): 0
2023-04-08 23:35:16,226 saving best model
2023-04-08 23:35:18,093 ----------------------------------------------------------------------------------------------------
2023-04-08 23:35:20,364 epoch 4 - iter 14/142 - loss 0.09900058 - samples/sec: 197.67 - lr: 0.080000
2023-04-08 23:35:23,175 epoch 4 - iter 28/142 - loss 0.09719023 - samples/sec: 159.51 - lr: 0.080000
2023-04-08 23:35:26,076 epoch 4 - iter 42/142 - loss 0.09620804 - samples/sec: 154.67 - lr: 0.080000
2023-04-08 23:35:28,100 epoch 4 - iter 56/142 - loss 0.09692831 - samples/sec: 222.17 - lr: 0.080000
2023-04-08 23:35:30,127 epoch 4 - iter 70/142 - loss 0.09598469 - samples/sec: 221.39 - lr: 0.080000
2023-04-08 23:35:32,192 epoch 4 - iter 84/142 - loss 0.09409871 - samples/sec: 217.26 - lr: 0.080000
2023-04-08 23:35:34,315 epoch 4 - iter 98/142 - loss 0.09277843 - samples/sec: 211.34 - lr: 0.080000
2023-04-08 23:35:36,378 epoch 4 - iter 112/142 - loss 0.09136544 - samp

100%|██████████| 23/23 [00:03<00:00,  5.84it/s]

2023-04-08 23:35:45,938 Evaluating as a multi-label problem: False
2023-04-08 23:35:45,956 DEV : loss 0.06296397000551224 - f1-score (micro avg)  0.797





2023-04-08 23:35:46,044 BAD EPOCHS (no improvement): 0
2023-04-08 23:35:46,046 saving best model
2023-04-08 23:35:47,907 ----------------------------------------------------------------------------------------------------
2023-04-08 23:35:50,106 epoch 5 - iter 14/142 - loss 0.08139902 - samples/sec: 204.77 - lr: 0.080000
2023-04-08 23:35:52,456 epoch 5 - iter 28/142 - loss 0.08165926 - samples/sec: 190.91 - lr: 0.080000
2023-04-08 23:35:55,262 epoch 5 - iter 42/142 - loss 0.08607891 - samples/sec: 159.88 - lr: 0.080000
2023-04-08 23:35:57,630 epoch 5 - iter 56/142 - loss 0.08454541 - samples/sec: 189.53 - lr: 0.080000
2023-04-08 23:35:59,647 epoch 5 - iter 70/142 - loss 0.08428696 - samples/sec: 222.39 - lr: 0.080000
2023-04-08 23:36:01,758 epoch 5 - iter 84/142 - loss 0.08605561 - samples/sec: 212.49 - lr: 0.080000
2023-04-08 23:36:03,831 epoch 5 - iter 98/142 - loss 0.08531961 - samples/sec: 216.47 - lr: 0.080000
2023-04-08 23:36:05,912 epoch 5 - iter 112/142 - loss 0.08666522 - samp

100%|██████████| 23/23 [00:03<00:00,  5.80it/s]

2023-04-08 23:36:15,650 Evaluating as a multi-label problem: False
2023-04-08 23:36:15,674 DEV : loss 0.05976879969239235 - f1-score (micro avg)  0.8104





2023-04-08 23:36:15,763 BAD EPOCHS (no improvement): 0
2023-04-08 23:36:15,765 saving best model
2023-04-08 23:36:17,613 ----------------------------------------------------------------------------------------------------
2023-04-08 23:36:19,749 epoch 6 - iter 14/142 - loss 0.08654735 - samples/sec: 210.23 - lr: 0.080000
2023-04-08 23:36:21,797 epoch 6 - iter 28/142 - loss 0.08301345 - samples/sec: 219.05 - lr: 0.080000
2023-04-08 23:36:24,366 epoch 6 - iter 42/142 - loss 0.08036761 - samples/sec: 174.89 - lr: 0.080000
2023-04-08 23:36:27,064 epoch 6 - iter 56/142 - loss 0.07748057 - samples/sec: 166.30 - lr: 0.080000
2023-04-08 23:36:29,023 epoch 6 - iter 70/142 - loss 0.08016485 - samples/sec: 228.99 - lr: 0.080000
2023-04-08 23:36:31,376 epoch 6 - iter 84/142 - loss 0.08031839 - samples/sec: 190.64 - lr: 0.080000
2023-04-08 23:36:33,415 epoch 6 - iter 98/142 - loss 0.08116702 - samples/sec: 220.02 - lr: 0.080000
2023-04-08 23:36:35,494 epoch 6 - iter 112/142 - loss 0.08149917 - samp

100%|██████████| 23/23 [00:05<00:00,  4.56it/s]

2023-04-08 23:36:45,884 Evaluating as a multi-label problem: False
2023-04-08 23:36:45,905 DEV : loss 0.055345818400382996 - f1-score (micro avg)  0.8339





2023-04-08 23:36:45,998 BAD EPOCHS (no improvement): 0
2023-04-08 23:36:45,999 saving best model
2023-04-08 23:36:47,851 ----------------------------------------------------------------------------------------------------
2023-04-08 23:36:50,016 epoch 7 - iter 14/142 - loss 0.06597243 - samples/sec: 207.28 - lr: 0.080000
2023-04-08 23:36:52,123 epoch 7 - iter 28/142 - loss 0.06749176 - samples/sec: 213.04 - lr: 0.080000
2023-04-08 23:36:54,733 epoch 7 - iter 42/142 - loss 0.07455789 - samples/sec: 172.09 - lr: 0.080000
2023-04-08 23:36:57,388 epoch 7 - iter 56/142 - loss 0.07454072 - samples/sec: 168.97 - lr: 0.080000
2023-04-08 23:36:59,430 epoch 7 - iter 70/142 - loss 0.07513979 - samples/sec: 219.69 - lr: 0.080000
2023-04-08 23:37:01,378 epoch 7 - iter 84/142 - loss 0.07673554 - samples/sec: 230.28 - lr: 0.080000
2023-04-08 23:37:03,672 epoch 7 - iter 98/142 - loss 0.07530178 - samples/sec: 195.58 - lr: 0.080000
2023-04-08 23:37:05,801 epoch 7 - iter 112/142 - loss 0.07604762 - samp

100%|██████████| 23/23 [00:04<00:00,  5.23it/s]

2023-04-08 23:37:15,687 Evaluating as a multi-label problem: False
2023-04-08 23:37:15,707 DEV : loss 0.05467117205262184 - f1-score (micro avg)  0.8321





2023-04-08 23:37:15,797 BAD EPOCHS (no improvement): 1
2023-04-08 23:37:15,799 ----------------------------------------------------------------------------------------------------
2023-04-08 23:37:17,996 epoch 8 - iter 14/142 - loss 0.07703382 - samples/sec: 204.56 - lr: 0.080000
2023-04-08 23:37:19,968 epoch 8 - iter 28/142 - loss 0.07456923 - samples/sec: 227.46 - lr: 0.080000
2023-04-08 23:37:21,957 epoch 8 - iter 42/142 - loss 0.07321340 - samples/sec: 225.58 - lr: 0.080000
2023-04-08 23:37:24,483 epoch 8 - iter 56/142 - loss 0.07219899 - samples/sec: 177.63 - lr: 0.080000
2023-04-08 23:37:27,320 epoch 8 - iter 70/142 - loss 0.07236620 - samples/sec: 158.15 - lr: 0.080000
2023-04-08 23:37:29,351 epoch 8 - iter 84/142 - loss 0.07339063 - samples/sec: 221.40 - lr: 0.080000
2023-04-08 23:37:31,739 epoch 8 - iter 98/142 - loss 0.07081072 - samples/sec: 187.79 - lr: 0.080000
2023-04-08 23:37:33,795 epoch 8 - iter 112/142 - loss 0.07160510 - samples/sec: 218.11 - lr: 0.080000
2023-04-08 

100%|██████████| 23/23 [00:05<00:00,  4.29it/s]

2023-04-08 23:37:43,701 Evaluating as a multi-label problem: False
2023-04-08 23:37:43,718 DEV : loss 0.052561625838279724 - f1-score (micro avg)  0.8433





2023-04-08 23:37:43,806 BAD EPOCHS (no improvement): 0
2023-04-08 23:37:43,807 saving best model
2023-04-08 23:37:45,635 ----------------------------------------------------------------------------------------------------
2023-04-08 23:37:47,902 epoch 9 - iter 14/142 - loss 0.06938317 - samples/sec: 198.07 - lr: 0.080000
2023-04-08 23:37:49,874 epoch 9 - iter 28/142 - loss 0.07279933 - samples/sec: 227.48 - lr: 0.080000
2023-04-08 23:37:51,959 epoch 9 - iter 42/142 - loss 0.06883445 - samples/sec: 215.13 - lr: 0.080000
2023-04-08 23:37:54,384 epoch 9 - iter 56/142 - loss 0.06692482 - samples/sec: 184.96 - lr: 0.080000
2023-04-08 23:37:57,053 epoch 9 - iter 70/142 - loss 0.06959046 - samples/sec: 168.10 - lr: 0.080000
2023-04-08 23:37:59,330 epoch 9 - iter 84/142 - loss 0.06926821 - samples/sec: 197.42 - lr: 0.080000
2023-04-08 23:38:01,368 epoch 9 - iter 98/142 - loss 0.07008053 - samples/sec: 220.11 - lr: 0.080000
2023-04-08 23:38:03,610 epoch 9 - iter 112/142 - loss 0.06845946 - samp

100%|██████████| 23/23 [00:06<00:00,  3.63it/s]

2023-04-08 23:38:14,273 Evaluating as a multi-label problem: False
2023-04-08 23:38:14,291 DEV : loss 0.05071743205189705 - f1-score (micro avg)  0.8459





2023-04-08 23:38:14,373 BAD EPOCHS (no improvement): 0
2023-04-08 23:38:14,378 saving best model
2023-04-08 23:38:16,202 ----------------------------------------------------------------------------------------------------
2023-04-08 23:38:18,575 epoch 10 - iter 14/142 - loss 0.06961368 - samples/sec: 189.36 - lr: 0.080000
2023-04-08 23:38:20,545 epoch 10 - iter 28/142 - loss 0.06859121 - samples/sec: 227.67 - lr: 0.080000
2023-04-08 23:38:22,553 epoch 10 - iter 42/142 - loss 0.06594987 - samples/sec: 223.41 - lr: 0.080000
2023-04-08 23:38:25,019 epoch 10 - iter 56/142 - loss 0.06694907 - samples/sec: 181.93 - lr: 0.080000
2023-04-08 23:38:27,744 epoch 10 - iter 70/142 - loss 0.06548666 - samples/sec: 164.80 - lr: 0.080000
2023-04-08 23:38:29,926 epoch 10 - iter 84/142 - loss 0.06450865 - samples/sec: 205.68 - lr: 0.080000
2023-04-08 23:38:32,114 epoch 10 - iter 98/142 - loss 0.06528491 - samples/sec: 205.01 - lr: 0.080000
2023-04-08 23:38:34,152 epoch 10 - iter 112/142 - loss 0.0641029

100%|██████████| 23/23 [00:05<00:00,  4.20it/s]

2023-04-08 23:38:44,146 Evaluating as a multi-label problem: False
2023-04-08 23:38:44,163 DEV : loss 0.05003725364804268 - f1-score (micro avg)  0.8388





2023-04-08 23:38:44,247 BAD EPOCHS (no improvement): 1
2023-04-08 23:38:44,248 ----------------------------------------------------------------------------------------------------
2023-04-08 23:38:46,419 epoch 11 - iter 14/142 - loss 0.05655616 - samples/sec: 206.82 - lr: 0.080000
2023-04-08 23:38:48,512 epoch 11 - iter 28/142 - loss 0.06260848 - samples/sec: 214.40 - lr: 0.080000
2023-04-08 23:38:50,570 epoch 11 - iter 42/142 - loss 0.06365982 - samples/sec: 217.97 - lr: 0.080000
2023-04-08 23:38:52,876 epoch 11 - iter 56/142 - loss 0.06225945 - samples/sec: 194.46 - lr: 0.080000
2023-04-08 23:38:55,385 epoch 11 - iter 70/142 - loss 0.06268702 - samples/sec: 178.80 - lr: 0.080000
2023-04-08 23:38:58,055 epoch 11 - iter 84/142 - loss 0.06100859 - samples/sec: 168.30 - lr: 0.080000
2023-04-08 23:39:00,199 epoch 11 - iter 98/142 - loss 0.06037663 - samples/sec: 209.34 - lr: 0.080000
2023-04-08 23:39:02,226 epoch 11 - iter 112/142 - loss 0.06172295 - samples/sec: 221.31 - lr: 0.080000
202

100%|██████████| 23/23 [00:04<00:00,  4.81it/s]

2023-04-08 23:39:11,487 Evaluating as a multi-label problem: False





2023-04-08 23:39:11,516 DEV : loss 0.049998342990875244 - f1-score (micro avg)  0.8297
2023-04-08 23:39:11,667 BAD EPOCHS (no improvement): 2
2023-04-08 23:39:11,669 ----------------------------------------------------------------------------------------------------
2023-04-08 23:39:14,424 epoch 12 - iter 14/142 - loss 0.06227691 - samples/sec: 162.96 - lr: 0.080000
2023-04-08 23:39:16,354 epoch 12 - iter 28/142 - loss 0.06304169 - samples/sec: 232.44 - lr: 0.080000
2023-04-08 23:39:18,392 epoch 12 - iter 42/142 - loss 0.06269613 - samples/sec: 220.13 - lr: 0.080000
2023-04-08 23:39:20,573 epoch 12 - iter 56/142 - loss 0.06251657 - samples/sec: 205.68 - lr: 0.080000
2023-04-08 23:39:22,613 epoch 12 - iter 70/142 - loss 0.06106749 - samples/sec: 220.19 - lr: 0.080000
2023-04-08 23:39:24,878 epoch 12 - iter 84/142 - loss 0.06085046 - samples/sec: 198.08 - lr: 0.080000
2023-04-08 23:39:27,485 epoch 12 - iter 98/142 - loss 0.06112841 - samples/sec: 172.32 - lr: 0.080000
2023-04-08 23:39:29

100%|██████████| 23/23 [00:04<00:00,  4.86it/s]

2023-04-08 23:39:39,298 Evaluating as a multi-label problem: False





2023-04-08 23:39:39,329 DEV : loss 0.05013889819383621 - f1-score (micro avg)  0.8273
2023-04-08 23:39:39,481 BAD EPOCHS (no improvement): 3
2023-04-08 23:39:39,485 ----------------------------------------------------------------------------------------------------
2023-04-08 23:39:42,332 epoch 13 - iter 14/142 - loss 0.04940157 - samples/sec: 157.68 - lr: 0.080000
2023-04-08 23:39:45,046 epoch 13 - iter 28/142 - loss 0.05192514 - samples/sec: 165.38 - lr: 0.080000
2023-04-08 23:39:47,117 epoch 13 - iter 42/142 - loss 0.05563411 - samples/sec: 216.61 - lr: 0.080000
2023-04-08 23:39:49,161 epoch 13 - iter 56/142 - loss 0.05618548 - samples/sec: 219.46 - lr: 0.080000
2023-04-08 23:39:51,234 epoch 13 - iter 70/142 - loss 0.05688547 - samples/sec: 216.44 - lr: 0.080000
2023-04-08 23:39:53,276 epoch 13 - iter 84/142 - loss 0.05812942 - samples/sec: 219.75 - lr: 0.080000
2023-04-08 23:39:55,640 epoch 13 - iter 98/142 - loss 0.05814595 - samples/sec: 189.76 - lr: 0.080000
2023-04-08 23:39:58,

100%|██████████| 23/23 [00:03<00:00,  5.98it/s]

2023-04-08 23:40:06,882 Evaluating as a multi-label problem: False
2023-04-08 23:40:06,900 DEV : loss 0.048486653715372086 - f1-score (micro avg)  0.8407





2023-04-08 23:40:06,983 Epoch    13: reducing learning rate of group 0 to 4.0000e-02.
2023-04-08 23:40:06,986 BAD EPOCHS (no improvement): 4
2023-04-08 23:40:06,987 ----------------------------------------------------------------------------------------------------
2023-04-08 23:40:09,066 epoch 14 - iter 14/142 - loss 0.05248401 - samples/sec: 216.06 - lr: 0.040000
2023-04-08 23:40:11,746 epoch 14 - iter 28/142 - loss 0.05320896 - samples/sec: 167.54 - lr: 0.040000
2023-04-08 23:40:14,438 epoch 14 - iter 42/142 - loss 0.05410345 - samples/sec: 166.98 - lr: 0.040000
2023-04-08 23:40:16,600 epoch 14 - iter 56/142 - loss 0.05381998 - samples/sec: 207.55 - lr: 0.040000
2023-04-08 23:40:18,624 epoch 14 - iter 70/142 - loss 0.05358808 - samples/sec: 221.55 - lr: 0.040000
2023-04-08 23:40:20,756 epoch 14 - iter 84/142 - loss 0.05326517 - samples/sec: 210.75 - lr: 0.040000
2023-04-08 23:40:22,796 epoch 14 - iter 98/142 - loss 0.05244824 - samples/sec: 220.37 - lr: 0.040000
2023-04-08 23:40:25,

100%|██████████| 23/23 [00:03<00:00,  5.94it/s]

2023-04-08 23:40:34,514 Evaluating as a multi-label problem: False
2023-04-08 23:40:34,538 DEV : loss 0.04655779153108597 - f1-score (micro avg)  0.8391





2023-04-08 23:40:34,621 BAD EPOCHS (no improvement): 1
2023-04-08 23:40:34,622 ----------------------------------------------------------------------------------------------------
2023-04-08 23:40:36,768 epoch 15 - iter 14/142 - loss 0.05360477 - samples/sec: 209.20 - lr: 0.040000
2023-04-08 23:40:38,811 epoch 15 - iter 28/142 - loss 0.04901628 - samples/sec: 219.82 - lr: 0.040000
2023-04-08 23:40:41,261 epoch 15 - iter 42/142 - loss 0.04938773 - samples/sec: 183.08 - lr: 0.040000
2023-04-08 23:40:43,969 epoch 15 - iter 56/142 - loss 0.05172510 - samples/sec: 165.95 - lr: 0.040000
2023-04-08 23:40:46,120 epoch 15 - iter 70/142 - loss 0.05262092 - samples/sec: 209.16 - lr: 0.040000
2023-04-08 23:40:48,202 epoch 15 - iter 84/142 - loss 0.05216020 - samples/sec: 215.44 - lr: 0.040000
2023-04-08 23:40:50,258 epoch 15 - iter 98/142 - loss 0.05184702 - samples/sec: 218.21 - lr: 0.040000
2023-04-08 23:40:52,357 epoch 15 - iter 112/142 - loss 0.05242530 - samples/sec: 213.74 - lr: 0.040000
202

100%|██████████| 23/23 [00:05<00:00,  4.36it/s]

2023-04-08 23:41:03,007 Evaluating as a multi-label problem: False
2023-04-08 23:41:03,029 DEV : loss 0.04661393538117409 - f1-score (micro avg)  0.8408





2023-04-08 23:41:03,128 BAD EPOCHS (no improvement): 2
2023-04-08 23:41:03,130 ----------------------------------------------------------------------------------------------------
2023-04-08 23:41:05,281 epoch 16 - iter 14/142 - loss 0.04384619 - samples/sec: 208.90 - lr: 0.040000
2023-04-08 23:41:07,581 epoch 16 - iter 28/142 - loss 0.04290437 - samples/sec: 195.10 - lr: 0.040000
2023-04-08 23:41:09,756 epoch 16 - iter 42/142 - loss 0.04463130 - samples/sec: 206.30 - lr: 0.040000
2023-04-08 23:41:12,345 epoch 16 - iter 56/142 - loss 0.04649082 - samples/sec: 173.43 - lr: 0.040000
2023-04-08 23:41:15,050 epoch 16 - iter 70/142 - loss 0.04604770 - samples/sec: 165.98 - lr: 0.040000
2023-04-08 23:41:17,081 epoch 16 - iter 84/142 - loss 0.04777234 - samples/sec: 221.14 - lr: 0.040000
2023-04-08 23:41:19,135 epoch 16 - iter 98/142 - loss 0.04897748 - samples/sec: 218.34 - lr: 0.040000
2023-04-08 23:41:21,253 epoch 16 - iter 112/142 - loss 0.04986477 - samples/sec: 211.82 - lr: 0.040000
202

100%|██████████| 23/23 [00:05<00:00,  4.34it/s]

2023-04-08 23:41:31,170 Evaluating as a multi-label problem: False
2023-04-08 23:41:31,188 DEV : loss 0.049965061247348785 - f1-score (micro avg)  0.8421





2023-04-08 23:41:31,273 BAD EPOCHS (no improvement): 3
2023-04-08 23:41:33,057 ----------------------------------------------------------------------------------------------------
2023-04-08 23:41:33,062 loading file resources/taggers/sota-ner-flair/best-model.pt
2023-04-08 23:41:34,412 SequenceTagger predicts: Dictionary with 51 tags: O, S-Country, B-Country, E-Country, I-Country, S-State, B-State, E-State, I-State, S-City/town, B-City/town, E-City/town, I-City/town, S-Island, B-Island, E-Island, I-Island, S-Human-made Point-of-Interest, B-Human-made Point-of-Interest, E-Human-made Point-of-Interest, I-Human-made Point-of-Interest, S-County, B-County, E-County, I-County, S-Natural Point-of-Interest, B-Natural Point-of-Interest, E-Natural Point-of-Interest, I-Natural Point-of-Interest, S-District, B-District, E-District, I-District, S-Continent, B-Continent, E-Continent, I-Continent, S-Neighborhood, B-Neighborhood, E-Neighborhood, I-Neighborhood, S-Road/street, B-Road/street, E-Road/st

100%|██████████| 16/16 [00:06<00:00,  2.42it/s]

2023-04-08 23:41:41,347 Evaluating as a multi-label problem: False
2023-04-08 23:41:41,372 0.8195	0.7837	0.8012	0.7054
2023-04-08 23:41:41,373 
Results:
- F-score (micro) 0.8012
- F-score (macro) 0.3995
- Accuracy 0.7054

By class:
                              precision    recall  f1-score   support

                     Country     0.9036    0.9146    0.9091       164
                       State     0.9216    0.8598    0.8896       164
                   City/town     0.5827    0.7629    0.6607        97
                      Island     0.8077    0.7241    0.7636        29
                      County     0.8571    0.4286    0.5714        14
   Natural Point-of-Interest     1.0000    0.2000    0.3333        10
Human-made Point-of-Interest     0.0000    0.0000    0.0000        10
                    District     0.0000    0.0000    0.0000         8
                 Road/street     0.0000    0.0000    0.0000         4
                   Continent     1.0000    0.5000    0.6667        




{'test_score': 0.8012170385395536,
 'dev_score_history': [0.5864156018829859,
  0.7629733520336606,
  0.7909215955983495,
  0.7969507969507968,
  0.8103683492496591,
  0.8338804990151019,
  0.8321263989466755,
  0.8433420365535248,
  0.8459016393442623,
  0.8387942332896461,
  0.8297455968688845,
  0.8272727272727273,
  0.8407310704960836,
  0.8390501319261214,
  0.8407894736842105,
  0.8421052631578947],
 'train_loss_history': [0.29591518258124644,
  0.12780638969223454,
  0.10420612382003334,
  0.09320439881275348,
  0.08493016419369469,
  0.08005953663002992,
  0.07538931812670736,
  0.07131873515221727,
  0.06775897024080119,
  0.06528490034178705,
  0.06233132578392103,
  0.06019205521856355,
  0.05859153641687127,
  0.05340869543243447,
  0.05241081952297504,
  0.05170304166125478],
 'dev_loss_history': [0.13423362374305725,
  0.07759781181812286,
  0.06737015396356583,
  0.06296397000551224,
  0.05976879969239235,
  0.055345818400382996,
  0.05467117205262184,
  0.05256162583827

In [None]:
# del tagger,trainer,embeddings

### Transformer

In [None]:
# label_type = 'ner'

# label_dict = corpus.make_label_dictionary(label_type=label_type)
# print(label_dict)

# embeddings = TransformerWordEmbeddings(model='xlm-roberta-large',
#                                        layers="-1",
#                                        subtoken_pooling="first",
#                                        fine_tune=True,
#                                        use_context=True,
#                                        )

# tagger = SequenceTagger(hidden_size=256,
#                         embeddings=embeddings,
#                         tag_dictionary=label_dict,
#                         tag_type='ner',
#                         use_crf=False,
#                         use_rnn=False,
#                         reproject_embeddings=False,
#                         )

# trainer = ModelTrainer(tagger, corpus)

# trainer.fine_tune('resources/taggers/sota-ner-flair',
#               learning_rate=0.00085,
#               mini_batch_size=16,
#               max_epochs=8)

## INFERENCE

In [None]:
# Load the checkpoint with the best dev score. The training log shows that the
# trainer reloads best-model.pt for its final test evaluation, so predictions
# made with this checkpoint correspond to the reported metrics (final-model.pt
# is the state after the last epoch, which scored lower on dev).
model = SequenceTagger.load('./resources/taggers/sota-ner-flair/best-model.pt') #loading model

2023-04-08 23:41:41,407 loading file ./resources/taggers/sota-ner-flair/final-model.pt
2023-04-08 23:41:43,082 SequenceTagger predicts: Dictionary with 51 tags: O, S-Country, B-Country, E-Country, I-Country, S-State, B-State, E-State, I-State, S-City/town, B-City/town, E-City/town, I-City/town, S-Island, B-Island, E-Island, I-Island, S-Human-made Point-of-Interest, B-Human-made Point-of-Interest, E-Human-made Point-of-Interest, I-Human-made Point-of-Interest, S-County, B-County, E-County, I-County, S-Natural Point-of-Interest, B-Natural Point-of-Interest, E-Natural Point-of-Interest, I-Natural Point-of-Interest, S-District, B-District, E-District, I-District, S-Continent, B-Continent, E-Continent, I-Continent, S-Neighborhood, B-Neighborhood, E-Neighborhood, I-Neighborhood, S-Road/street, B-Road/street, E-Road/street, I-Road/street, S-Other locations, B-Other locations, E-Other locations, I-Other locations, <START>


In [None]:
folder_to_check = '/content/drive/MyDrive/gold-random-json/kerala_floods_2018/' # END PATH WITH / symbol 

import os

# Same tokenization as used for training, built once for all tweets.
inference_tokenizer = TokenizerWrapper(wordpunct_tokenize)

# Open the output once, in write mode, so that re-running this cell replaces
# the previous predictions instead of appending duplicates to them.
with open("prediction.jsonl", 'w') as out:
    for file in sorted(glob(folder_to_check + "*")):
        # Filter on the file name only; matching against the whole path would
        # also accept every file of a folder whose path contains "train".
        if "train" not in os.path.basename(file):
            continue
        with open(file, 'r', encoding='utf-8') as tweets:
            for line in tweets:
                if not line.strip():
                    continue  # tolerate blank lines
                obj = json.loads(line)
                s = Sentence(obj["text"], use_tokenizer=inference_tokenizer)
                model.predict(s) #model is being used to predict on sentence s
                outp = {
                    "tweet_id": obj["tweet_id"],
                    # One entry per predicted label: surface text and
                    # character offsets of the labelled span.
                    "location_mentions": [
                        {
                            "text": e.data_point.text,
                            "start_offset": e.data_point.start_position,
                            "end_offset": e.data_point.end_position,
                        }
                        for e in s.labels
                    ],
                }
                # Serialize exactly once: the previous
                # json.dump(json.dumps(outp), out) wrote a JSON *string* that
                # itself contained JSON, so readers got a str, not a dict.
                out.write(json.dumps(outp))
                out.write('\n')

In [None]:
numobs = 5


def show_first_records(path, limit):
    """Print the first `limit` JSON lines of the JSONL file at `path`.

    Each record is parsed with json.loads and printed, followed by a blank
    line. The file is closed when the function returns.
    """
    with open(path, 'r') as handle:
        for index, line in enumerate(handle):
            if index >= limit:
                break
            print(json.loads(line))
            print()


print("\033[1m Predictions \033[0m")
show_first_records("prediction.jsonl", numobs)

# The ground-truth records were previously iterated with the print commented
# out, so nothing appeared under this header; show them in the same format as
# the predictions so the two can be compared side by side.
print('\033[1m Ground Truth \033[0m')
show_first_records(folder_to_check + "train.jsonl", numobs)

[1m Predictions [0m
{"tweet_id": "1032071697221005312", "location_mentions": [{"text": "Kerala", "start_offset": 58, "end_offset": 64}, {"text": "Kerala", "start_offset": 204, "end_offset": 210}, {"text": "Karnataka", "start_offset": 223, "end_offset": 232}]}

{"tweet_id": "1034334390195822592", "location_mentions": []}

{"tweet_id": "1030975049510137856", "location_mentions": [{"text": "Kerala", "start_offset": 44, "end_offset": 50}]}

{"tweet_id": "1034481352341692417", "location_mentions": [{"text": "Kerala", "start_offset": 107, "end_offset": 113}, {"text": "India", "start_offset": 259, "end_offset": 264}]}

{"tweet_id": "1031529205673078784", "location_mentions": [{"text": "Kerala", "start_offset": 0, "end_offset": 6}, {"text": "india", "start_offset": 93, "end_offset": 98}, {"text": "kerala", "start_offset": 100, "end_offset": 106}, {"text": "Kerala", "start_offset": 208, "end_offset": 214}]}

[1m Ground Truth [0m


In [None]:
import pickle

# Write through a context manager so the file is flushed and closed even if
# pickling fails part-way.
# NOTE(review): a pickle should only ever be loaded from a trusted source
# (unpickling can execute arbitrary code); the checkpoints written by the
# trainer can be reloaded with SequenceTagger.load instead.
with open('model-loc.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)