In [None]:
# Install flair into the environment the kernel is running in. `-q` keeps the
# log short but, unlike redirecting everything to /dev/null, still shows errors
# if the installation fails.
# NOTE(review): the version is unpinned; consider pinning the flair release this
# notebook was developed against so the API used below stays available.
%pip install -q flair

In [None]:
from glob import glob
# Each entry is expected to be a folder holding train.jsonl and dev.jsonl (that
# is how the preprocessing cell reads it). glob() returns entries in arbitrary
# order, so sort them to keep the order in which sentences are collected
# reproducible between runs.
dataset = sorted(glob("../input/twitter-loc/gold-random-json/*"))

## Data Preprocessing

In [None]:
import json
from itertools import product
from flair.data import Sentence
from flair.tokenization import TokenizerWrapper
from nltk import wordpunct_tokenize

def _token_span(tokens, start, end):
    """Map a character range onto token indices.

    Args:
        tokens: iterable of tokens exposing ``start_pos`` and ``end_pos``
            character offsets (a flair ``Sentence`` is iterated this way below).
        start: character offset where the mention begins.
        end: character offset where the mention ends.

    Returns:
        ``(first, last)`` indices of the first and last token lying completely
        inside ``[start, end]``, or ``None`` when no token fits.
    """
    inside = [
        idx for idx, tok in enumerate(tokens)
        if start <= tok.start_pos and tok.end_pos <= end
    ]
    if not inside:
        return None
    return inside[0], inside[-1]


# One list of labelled sentences per split, filled from every dataset folder.
corpora = {"train": [], "dev": []}
# Mentions whose character offsets do not cover any complete token cannot be
# turned into a span; count them instead of hiding them behind a bare except.
skipped_mentions = 0

for folder, corpus_type in product(dataset, ("train", "dev")):
    # The context manager closes each file as soon as it has been read.
    with open(folder + '/' + corpus_type + '.jsonl', encoding="utf-8") as handle:
        for line in handle:
            obj = json.loads(line)
            sentence = Sentence(obj["text"], use_tokenizer=TokenizerWrapper(wordpunct_tokenize))

            for span_labels in obj["location_mentions"]:
                bounds = _token_span(sentence, span_labels["start_offset"], span_labels["end_offset"])
                if bounds is None:
                    skipped_mentions += 1
                    continue
                first, last = bounds
                # Label the token span covering the mention with its location type.
                sentence[first:last + 1].add_label("ner", span_labels["type"])

            corpora[corpus_type].append(sentence)

# Names used by the training cell.
train_corpus = corpora["train"]
dev_corpus = corpora["dev"]

print(f"train sentences: {len(train_corpus)}, dev sentences: {len(dev_corpus)}, "
      f"mentions skipped (no aligned token): {skipped_mentions}")
    
    

## Training

In [None]:
from flair.models import SequenceTagger
from flair.embeddings import WordEmbeddings,FlairEmbeddings,StackedEmbeddings,TransformerWordEmbeddings
from flair.trainers import ModelTrainer
from flair.data import Corpus

# Bundle the prepared sentence lists into a flair Corpus: the first list is the
# training split and the second one the dev (validation) split. No test split
# is passed here.
corpus = Corpus(train=train_corpus, dev=dev_corpus)

# Shrink the corpus to 35% of its size.
corpus.downsample(0.35)

### GloVe + Flair embeddings + BiLSTM + CRF

In [None]:
# The preprocessing cell stores every location span under the 'ner' label type,
# so that is the label the tagger is trained to predict.
label_type = 'ner'

# Collect the labels that occur in the corpus for this label type.
label_dict = corpus.make_label_dictionary(label_type=label_type)
print(label_dict)

# Word-level embeddings to combine: GloVe vectors plus the forward and backward
# Flair embeddings.
embedding_types = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

# StackedEmbeddings combines the three embeddings above into one.
embeddings = StackedEmbeddings(embeddings=embedding_types)

# Sequence tagger over the stacked embeddings with a CRF enabled.
tagger = SequenceTagger(
    embeddings=embeddings,
    hidden_size=256,
    tag_type=label_type,
    tag_dictionary=label_dict,
    use_crf=True,
)

trainer = ModelTrainer(tagger, corpus)

# Training output goes to this folder; the inference cell loads
# 'final-model.pt' from the same location.
trainer.train(
    'resources/taggers/sota-ner-flair',
    max_epochs=16,
    mini_batch_size=32,
    learning_rate=0.08,
)

In [None]:
# del tagger,trainer,embeddings

### Transformer

In [None]:
# label_type = 'ner'

# label_dict = corpus.make_label_dictionary(label_type=label_type)
# print(label_dict)

# embeddings = TransformerWordEmbeddings(model='xlm-roberta-large',
#                                        layers="-1",
#                                        subtoken_pooling="first",
#                                        fine_tune=True,
#                                        use_context=True,
#                                        )

# tagger = SequenceTagger(hidden_size=256,
#                         embeddings=embeddings,
#                         tag_dictionary=label_dict,
#                         tag_type='ner',
#                         use_crf=False,
#                         use_rnn=False,
#                         reproject_embeddings=False,
#                         )

# trainer = ModelTrainer(tagger, corpus)

# trainer.fine_tune('resources/taggers/sota-ner-flair',
#               learning_rate=0.00085,
#               mini_batch_size=16,
#               max_epochs=8)

## INFERENCE

In [None]:
# Load the trained tagger from the output folder given to trainer.train() above.
# NOTE(review): 'final-model.pt' is expected to have been written there by the
# training cell — run that cell first on a fresh kernel.
model = SequenceTagger.load('./resources/taggers/sota-ner-flair/final-model.pt') # tagger used for inference below

In [None]:
folder_to_check = '../input/twitter-loc/gold-random-json/kerala_floods_2018/' # END PATH WITH / symbol 

import os

# Open the output once, in write mode: re-running this cell then replaces the
# previous predictions instead of appending duplicates to them.
with open("prediction.jsonl", 'w') as out:
    for file in glob(folder_to_check + "*"):
        # Decide on the file name only, so a folder whose path happens to
        # contain "train" does not let every file through the filter.
        if "train" not in os.path.basename(file):
            continue
        with open(file, 'r', encoding="utf-8") as source:
            for line in source:
                obj = json.loads(line)
                # Tokenize exactly as in the preprocessing cell.
                s = Sentence(obj["text"], use_tokenizer=TokenizerWrapper(wordpunct_tokenize))
                model.predict(s)  # attaches the predicted labels to `s`
                outp = {
                    "tweet_id": obj["tweet_id"],
                    "location_mentions": [
                        {
                            "text": e.data_point.text,
                            "start_offset": e.data_point.start_position,
                            "end_offset": e.data_point.end_position,
                        }
                        for e in s.labels
                    ],
                }
                # Serialize once: each line is a JSON object. Encoding twice
                # (json.dump of a json.dumps string) would store a quoted JSON
                # string that readers have to decode a second time.
                out.write(json.dumps(outp))
                out.write('\n')

In [None]:
numobs = 5  # number of records to show from each file


def _print_head(path, limit):
    """Print the first `limit` JSON lines of `path`.

    Each line is parsed with json.loads and printed, followed by a blank line.
    The file is closed when done, and nothing is printed when `limit` is zero
    or negative.
    """
    with open(path, 'r') as handle:
        for count, line in enumerate(handle):
            if count >= limit:
                break
            print(json.loads(line))
            print()


print("\033[1m Predictions \033[0m")
_print_head("prediction.jsonl", numobs)

# Show the matching gold records as well, so the header is followed by the data
# the predictions are meant to be compared against.
print('\033[1m Ground Truth \033[0m')
_print_head(folder_to_check + "train.jsonl", numobs)