# Imports

In [1]:
import sys
sys.path.append("..")

In [2]:
import numpy as np
import pandas as pd
from src import FlairDataset
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook
%matplotlib inline

In [3]:
path = "../data/external/real_fake_disaster/"

In [4]:
dataset = FlairDataset.csv_classification(
    data_folder=path, filename='data', column_mapping=['text', 'label'])

2020-04-05 15:12:06,797 Reading data from /home/uv/Documents/meetup(in_progress)/entiretydotai/data/interim
2020-04-05 15:12:06,798 Train: /home/uv/Documents/meetup(in_progress)/entiretydotai/data/interim/train.csv
2020-04-05 15:12:06,799 Dev: /home/uv/Documents/meetup(in_progress)/entiretydotai/data/interim/valid.csv
2020-04-05 15:12:06,800 Test: /home/uv/Documents/meetup(in_progress)/entiretydotai/data/interim/test.csv


# Tag Prediction

In [5]:
from flair.data import Sentence
from flair.models import SequenceTagger
# load the NER tagger

tagger_ner = SequenceTagger.load('ner-fast')
tagger_pos = SequenceTagger.load('pos-fast')

2020-04-05 15:12:07,899 loading file /home/uv/.flair/models/en-ner-fast-conll03-v0.4.pt
2020-04-05 15:12:09,026 loading file /home/uv/.flair/models/en-pos-ontonotes-fast-v0.4.pt


In [22]:
test = dataset.train_data[:10][['label', 'text']]
test

Unnamed: 0,label,text
6859,0,@AshGhebranious civil rights continued in the ...
6751,0,Dakota Skye gets horny with some porn then get...
7281,1,Richard returns after whirlwind few days http:...
909,0,Bloody Mary in the sink. Beet juice http://t.c...
5939,0,@Real_Liam_Payne I SCREAMED AT THE TOP OF MY L...
4419,0,Mom is hijacking my account to earn MCR STATUS...
5418,0,Just realized that maybe it not normal to sit ...
3440,0,Im Dead!!! My two Loves in 1 photo! My Heart e...
1905,0,Nick Williams just hit another bomb. Just crus...
3416,0,Philadelphia EaglesÛª Jordan Matthews Is Goin...


In [26]:
def ner_tag(row):
    sentence = Sentence(row['text'],use_tokenizer=True)
    temp = tagger_ner.predict(sentence)
    row['ner_tag'] = sentence.to_tagged_string()
    return row

In [9]:
def pos_tag(row):
    sentence = Sentence(row['text'],use_tokenizer=True)
    temp = tagger_pos.predict(sentence)
    row['pos_tag'] = sentence.to_tagged_string()
    return row

In [10]:
test = test.apply(ner_tag, axis=1)

In [11]:
test

Unnamed: 0,label,text,ner_tag
6859,0,@AshGhebranious civil rights continued in the ...,@ AshGhebranious <S-MISC> civil rights continu...
6751,0,Dakota Skye gets horny with some porn then get...,Dakota <B-PER> Skye <E-PER> gets horny with so...
7281,1,Richard returns after whirlwind few days http:...,Richard <S-PER> returns after whirlwind few da...
909,0,Bloody Mary in the sink. Beet juice http://t.c...,Bloody Mary <S-PER> in the sink . Beet juice h...
5939,0,@Real_Liam_Payne I SCREAMED AT THE TOP OF MY L...,@ Real _ Liam <S-PER> _ Payne <S-PER> I SCREAM...
4419,0,Mom is hijacking my account to earn MCR STATUS...,Mom <S-PER> is hijacking my account to earn MC...
5418,0,Just realized that maybe it not normal to sit ...,Just realized that maybe it not normal to sit ...
3440,0,Im Dead!!! My two Loves in 1 photo! My Heart e...,Im Dead !! ! My two Loves in 1 photo ! My Hear...
1905,0,Nick Williams just hit another bomb. Just crus...,Nick <B-PER> Williams <E-PER> just hit another...
3416,0,Philadelphia EaglesÛª Jordan Matthews Is Goin...,Philadelphia <B-ORG> Eagles <E-ORG> ‰ Û ª Jord...


In [12]:
test = test.apply(pos_tag, axis=1)

In [13]:
# used with flair get_spans
# tags are not in format similar to BILOU 
# we get the score for each tags
def ner_tags_updated(rows,):
    sentence = Sentence(rows["text"],use_tokenizer=True)
    temp = tagger_ner.predict(sentence)
    text = sentence.to_tokenized_string().split(" ")
    entity_tagged = sentence.get_spans('ner')
    tagged_text = [ent.text for ent in entity_tagged]
    tagged_label = [ent.tag for ent in entity_tagged]
    tagged_score = [ent.score for ent in entity_tagged]
    corpus = []
    cleaned_ner_tag = []
    score = []
    for i in text:
        if i in tagged_text:
            corpus.append(i)
            index = tagged_text.index(i)
            cleaned_ner_tag.append(tagged_label[index])
            score.append(round(tagged_score[index],2))
            
        else:
            corpus.append(i)
            cleaned_ner_tag.append("NA")
            score.append(np.NaN)
    rows["updated_ner_corpus"] = corpus
    rows["updated_cleaned_ner"] = cleaned_ner_tag
    rows["updated_ner_score"] = score
    return rows

In [14]:
test = test.apply(ner_tags_updated,axis=1)

Unnamed: 0,label,text,ner_tag,pos_tag,updated_ner_corpus,updated_cleaned_ner,updated_ner_score
6859,0,@AshGhebranious civil rights continued in the ...,@ AshGhebranious <S-MISC> civil rights continu...,@ <SYM> AshGhebranious <ADJ> civil <ADJ> right...,"[@, AshGhebranious, civil, rights, continued, ...","[NA, MISC, NA, NA, NA, NA, NA, NA, NA, NA, NA,...","[nan, 0.7, nan, nan, nan, nan, nan, nan, nan, ..."
6751,0,Dakota Skye gets horny with some porn then get...,Dakota <B-PER> Skye <E-PER> gets horny with so...,Dakota <PROPN> Skye <PROPN> gets <VERB> horny ...,"[Dakota, Skye, gets, horny, with, some, porn, ...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
7281,1,Richard returns after whirlwind few days http:...,Richard <S-PER> returns after whirlwind few da...,Richard <PROPN> returns <VERB> after <ADP> whi...,"[Richard, returns, after, whirlwind, few, days...","[PER, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...","[0.98, nan, nan, nan, nan, nan, nan, nan, nan,..."
909,0,Bloody Mary in the sink. Beet juice http://t.c...,Bloody Mary <S-PER> in the sink . Beet juice h...,Bloody <ADJ> Mary <PROPN> in <ADP> the <DET> s...,"[Bloody, Mary, in, the, sink, ., Beet, juice, ...","[NA, PER, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...","[nan, 0.64, nan, nan, nan, nan, nan, nan, nan,..."
5939,0,@Real_Liam_Payne I SCREAMED AT THE TOP OF MY L...,@ Real _ Liam <S-PER> _ Payne <S-PER> I SCREAM...,@ <SYM> Real <PROPN> _ <SYM> Liam <PROPN> _ <S...,"[@, Real, _, Liam, _, Payne, I, SCREAMED, AT, ...","[NA, NA, NA, PER, NA, PER, NA, NA, NA, NA, NA,...","[nan, nan, nan, 0.64, nan, 0.99, nan, nan, nan..."
4419,0,Mom is hijacking my account to earn MCR STATUS...,Mom <S-PER> is hijacking my account to earn MC...,Mom <PROPN> is <VERB> hijacking <VERB> my <PRO...,"[Mom, is, hijacking, my, account, to, earn, MC...","[PER, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...","[0.92, nan, nan, nan, nan, nan, nan, nan, nan,..."
5418,0,Just realized that maybe it not normal to sit ...,Just realized that maybe it not normal to sit ...,Just <ADV> realized <VERB> that <ADP> maybe <A...,"[Just, realized, that, maybe, it, not, normal,...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
3440,0,Im Dead!!! My two Loves in 1 photo! My Heart e...,Im Dead !! ! My two Loves in 1 photo ! My Hear...,Im <VERB> Dead <ADJ> !! <PUNCT> ! <PUNCT> My <...,"[Im, Dead, !!, !, My, two, Loves, in, 1, photo...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1905,0,Nick Williams just hit another bomb. Just crus...,Nick <B-PER> Williams <E-PER> just hit another...,Nick <PROPN> Williams <PROPN> just <ADV> hit <...,"[Nick, Williams, just, hit, another, bomb, ., ...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA]","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
3416,0,Philadelphia EaglesÛª Jordan Matthews Is Goin...,Philadelphia <B-ORG> Eagles <E-ORG> ‰ Û ª Jord...,Philadelphia <PROPN> Eagles <PROPN> ‰ <SYM> Û ...,"[Philadelphia, Eagles, ‰, Û, ª, Jordan, Matthe...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


In [15]:
import re
def clean_tags(rows,):
    for j in ['<', '>']:
        rows = str(rows).replace(j, "")
        rows = re.sub(' +', ' ', str(rows))
        rows = str(rows).strip()
    return rows

In [16]:
def extract_pos_tags(rows,):
    text = rows['pos_tag'].split(" ")
    corpus = [i for i in text if not i.strip().startswith("<")]
    tags = [clean_tags(i) for i in text if i.strip().startswith("<")]
    if len(corpus)== len(tags):
        rows['pos_corpus'] = corpus
        rows['cleaned_pos_tags'] = tags
    return rows
        
    

In [17]:
test = test.apply(extract_pos_tags,axis=1)
test

Unnamed: 0,label,text,ner_tag,pos_tag,pos_corpus,cleaned_pos_tags
6859,0,@AshGhebranious civil rights continued in the ...,@ AshGhebranious <S-MISC> civil rights continu...,@ <SYM> AshGhebranious <ADJ> civil <ADJ> right...,"[@, AshGhebranious, civil, rights, continued, ...","[SYM, ADJ, ADJ, NOUN, VERB, ADP, DET, NOUN, PU..."
6751,0,Dakota Skye gets horny with some porn then get...,Dakota <B-PER> Skye <E-PER> gets horny with so...,Dakota <PROPN> Skye <PROPN> gets <VERB> horny ...,"[Dakota, Skye, gets, horny, with, some, porn, ...","[PROPN, PROPN, VERB, ADJ, ADP, DET, NOUN, ADV,..."
7281,1,Richard returns after whirlwind few days http:...,Richard <S-PER> returns after whirlwind few da...,Richard <PROPN> returns <VERB> after <ADP> whi...,"[Richard, returns, after, whirlwind, few, days...","[PROPN, VERB, ADP, ADJ, ADJ, NOUN, X, X, X, SY..."
909,0,Bloody Mary in the sink. Beet juice http://t.c...,Bloody Mary <S-PER> in the sink . Beet juice h...,Bloody <ADJ> Mary <PROPN> in <ADP> the <DET> s...,"[Bloody, Mary, in, the, sink, ., Beet, juice, ...","[ADJ, PROPN, ADP, DET, NOUN, PUNCT, NOUN, NOUN..."
5939,0,@Real_Liam_Payne I SCREAMED AT THE TOP OF MY L...,@ Real _ Liam <S-PER> _ Payne <S-PER> I SCREAM...,@ <SYM> Real <PROPN> _ <SYM> Liam <PROPN> _ <S...,"[@, Real, _, Liam, _, Payne, I, SCREAMED, AT, ...","[SYM, PROPN, SYM, PROPN, SYM, PROPN, PRON, VER..."
4419,0,Mom is hijacking my account to earn MCR STATUS...,Mom <S-PER> is hijacking my account to earn MC...,Mom <PROPN> is <VERB> hijacking <VERB> my <PRO...,"[Mom, is, hijacking, my, account, to, earn, MC...","[PROPN, VERB, VERB, PRON, NOUN, PART, VERB, PR..."
5418,0,Just realized that maybe it not normal to sit ...,Just realized that maybe it not normal to sit ...,Just <ADV> realized <VERB> that <ADP> maybe <A...,"[Just, realized, that, maybe, it, not, normal,...","[ADV, VERB, ADP, ADV, PRON, ADV, ADJ, PART, VE..."
3440,0,Im Dead!!! My two Loves in 1 photo! My Heart e...,Im Dead !! ! My two Loves in 1 photo ! My Hear...,Im <VERB> Dead <ADJ> !! <PUNCT> ! <PUNCT> My <...,"[Im, Dead, !!, !, My, two, Loves, in, 1, photo...","[VERB, ADJ, PUNCT, PUNCT, PRON, NUM, NOUN, ADP..."
1905,0,Nick Williams just hit another bomb. Just crus...,Nick <B-PER> Williams <E-PER> just hit another...,Nick <PROPN> Williams <PROPN> just <ADV> hit <...,"[Nick, Williams, just, hit, another, bomb, ., ...","[PROPN, PROPN, ADV, VERB, DET, NOUN, PUNCT, AD..."
3416,0,Philadelphia EaglesÛª Jordan Matthews Is Goin...,Philadelphia <B-ORG> Eagles <E-ORG> ‰ Û ª Jord...,Philadelphia <PROPN> Eagles <PROPN> ‰ <SYM> Û ...,"[Philadelphia, Eagles, ‰, Û, ª, Jordan, Matthe...","[PROPN, PROPN, SYM, SYM, NUM, PROPN, PROPN, VE..."


In [19]:
# if used Sentence.to_tagged_string()
def extract_ner_tags(rows,):
    text = rows['ner_tag'].split(" ")
    #print(text)
    tot_words = len(text)
    #print(tot_words)
    words = []
    tags = []
    for i,wd in enumerate(text):
        if wd.startswith("<"):
            continue
#         print(words)
#         print(tags)
        if i+1 < tot_words:
            #print(i)
            if text[i+1].startswith("<"):
#                 print(wd)
                words.append(wd)
                tags.append(clean_tags(text[i+1]))
            else:
                #print(wd)
                words.append(wd)
                tags.append("NA")
        
        else:
            if not text[i].startswith("<"):
                words.append(wd)
                tags.append("NA")
                
    if len(words) == len(tags):
        rows['ner_corpus'] = words
        rows['cleaned_ner_tags'] = tags
    return rows

In [20]:
test = test.apply(extract_ner_tags,axis = 1)

In [21]:
test

Unnamed: 0,label,text,ner_tag,pos_tag,pos_corpus,cleaned_pos_tags,ner_corpus,cleaned_ner_tags
6859,0,@AshGhebranious civil rights continued in the ...,@ AshGhebranious <S-MISC> civil rights continu...,@ <SYM> AshGhebranious <ADJ> civil <ADJ> right...,"[@, AshGhebranious, civil, rights, continued, ...","[SYM, ADJ, ADJ, NOUN, VERB, ADP, DET, NOUN, PU...","[@, AshGhebranious, civil, rights, continued, ...","[NA, S-MISC, NA, NA, NA, NA, NA, NA, NA, NA, N..."
6751,0,Dakota Skye gets horny with some porn then get...,Dakota <B-PER> Skye <E-PER> gets horny with so...,Dakota <PROPN> Skye <PROPN> gets <VERB> horny ...,"[Dakota, Skye, gets, horny, with, some, porn, ...","[PROPN, PROPN, VERB, ADJ, ADP, DET, NOUN, ADV,...","[Dakota, Skye, gets, horny, with, some, porn, ...","[B-PER, E-PER, NA, NA, NA, NA, NA, NA, NA, NA,..."
7281,1,Richard returns after whirlwind few days http:...,Richard <S-PER> returns after whirlwind few da...,Richard <PROPN> returns <VERB> after <ADP> whi...,"[Richard, returns, after, whirlwind, few, days...","[PROPN, VERB, ADP, ADJ, ADJ, NOUN, X, X, X, SY...","[Richard, returns, after, whirlwind, few, days...","[S-PER, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA..."
909,0,Bloody Mary in the sink. Beet juice http://t.c...,Bloody Mary <S-PER> in the sink . Beet juice h...,Bloody <ADJ> Mary <PROPN> in <ADP> the <DET> s...,"[Bloody, Mary, in, the, sink, ., Beet, juice, ...","[ADJ, PROPN, ADP, DET, NOUN, PUNCT, NOUN, NOUN...","[Bloody, Mary, in, the, sink, ., Beet, juice, ...","[NA, S-PER, NA, NA, NA, NA, NA, NA, NA, NA, NA..."
5939,0,@Real_Liam_Payne I SCREAMED AT THE TOP OF MY L...,@ Real _ Liam <S-PER> _ Payne <S-PER> I SCREAM...,@ <SYM> Real <PROPN> _ <SYM> Liam <PROPN> _ <S...,"[@, Real, _, Liam, _, Payne, I, SCREAMED, AT, ...","[SYM, PROPN, SYM, PROPN, SYM, PROPN, PRON, VER...","[@, Real, _, Liam, _, Payne, I, SCREAMED, AT, ...","[NA, NA, NA, S-PER, NA, S-PER, NA, NA, NA, NA,..."
4419,0,Mom is hijacking my account to earn MCR STATUS...,Mom <S-PER> is hijacking my account to earn MC...,Mom <PROPN> is <VERB> hijacking <VERB> my <PRO...,"[Mom, is, hijacking, my, account, to, earn, MC...","[PROPN, VERB, VERB, PRON, NOUN, PART, VERB, PR...","[Mom, is, hijacking, my, account, to, earn, MC...","[S-PER, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA..."
5418,0,Just realized that maybe it not normal to sit ...,Just realized that maybe it not normal to sit ...,Just <ADV> realized <VERB> that <ADP> maybe <A...,"[Just, realized, that, maybe, it, not, normal,...","[ADV, VERB, ADP, ADV, PRON, ADV, ADJ, PART, VE...","[Just, realized, that, maybe, it, not, normal,...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N..."
3440,0,Im Dead!!! My two Loves in 1 photo! My Heart e...,Im Dead !! ! My two Loves in 1 photo ! My Hear...,Im <VERB> Dead <ADJ> !! <PUNCT> ! <PUNCT> My <...,"[Im, Dead, !!, !, My, two, Loves, in, 1, photo...","[VERB, ADJ, PUNCT, PUNCT, PRON, NUM, NOUN, ADP...","[Im, Dead, !!, !, My, two, Loves, in, 1, photo...","[NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N..."
1905,0,Nick Williams just hit another bomb. Just crus...,Nick <B-PER> Williams <E-PER> just hit another...,Nick <PROPN> Williams <PROPN> just <ADV> hit <...,"[Nick, Williams, just, hit, another, bomb, ., ...","[PROPN, PROPN, ADV, VERB, DET, NOUN, PUNCT, AD...","[Nick, Williams, just, hit, another, bomb, ., ...","[B-PER, E-PER, NA, NA, NA, NA, NA, NA, NA, NA]"
3416,0,Philadelphia EaglesÛª Jordan Matthews Is Goin...,Philadelphia <B-ORG> Eagles <E-ORG> ‰ Û ª Jord...,Philadelphia <PROPN> Eagles <PROPN> ‰ <SYM> Û ...,"[Philadelphia, Eagles, ‰, Û, ª, Jordan, Matthe...","[PROPN, PROPN, SYM, SYM, NUM, PROPN, PROPN, VE...","[Philadelphia, Eagles, ‰, Û, ª, Jordan, Matthe...","[B-ORG, E-ORG, NA, NA, NA, B-PER, E-PER, NA, N..."


In [66]:
test.ner_corpus == test.pos_corpus

6859    True
6751    True
7281    True
909     True
5939    True
4419    True
5418    True
3440    True
1905    True
3416    True
dtype: bool

# Understanding Conll data fromat for training our own corpus

## Loading conll dataset

The data file contains one word per line, with empty lines representing sentence boundaries.

In [20]:
with open('../data/external/pos_tag_retraining/conll.train', 'r') as f:
    txt = f.read()

In [21]:
txt.split("\n")[:10]

['-DOCSTART- -X- -X- O',
 '',
 'EU NNP I-NP I-ORG',
 'rejects VBZ I-VP O',
 'German JJ I-NP I-MISC',
 'call NN I-NP O',
 'to TO I-VP O',
 'boycott VB I-VP O',
 'British JJ I-NP I-MISC',
 'lamb NN I-NP O']

## Preprocessing

In [22]:
txt = txt.split("\n")

In [23]:
txt = [x for x in txt if x != '-DOCSTART- -X- -X- O']

In [24]:
txt[:10]

['',
 'EU NNP I-NP I-ORG',
 'rejects VBZ I-VP O',
 'German JJ I-NP I-MISC',
 'call NN I-NP O',
 'to TO I-VP O',
 'boycott VB I-VP O',
 'British JJ I-NP I-MISC',
 'lamb NN I-NP O',
 '. . O O']

In [25]:
# Initialize empty list for storing words
words = []
# initialize empty list for storing sentences #
corpus = []

for i in tqdm_notebook(txt):
    if i == '':
        ## previous words form a sentence ##
        corpus.append(' '.join(words))
        ## Refresh Word list ##
        words = []
    else:
       ## word at index 0 ##
        words.append(i.split()[0])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=218609.0), HTML(value='')))




In [26]:
corpus[:10]

['',
 'EU rejects German call to boycott British lamb .',
 'Peter Blackburn',
 'BRUSSELS 1996-08-22',
 'The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .',
 "Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .",
 '" We do n\'t support any such recommendation because we do n\'t see any grounds for it , " the Commission \'s chief spokesman Nikolaus van der Pas told a news briefing .',
 'He said further scientific study was required and if it was found that action was needed it should be taken by the European Union .',
 'He said a proposal last month by EU Farm Commissioner Franz Fischler to ban sheep brains , spleens and spinal cords from the human and animal food chains was a highly specific 

In [27]:
corpus = [x for x in corpus if x != '']

In [28]:
# Initialize empty list for storing word pos
w_pos = []
#initialize empty list for storing sentence pos #
POS = []
for i in tqdm_notebook(txt):
  ## blank sentence = new line ##
    if i == '':
        ## previous words form a sentence POS ##
        POS.append(' '.join(w_pos))
    ## Refresh words list ##
        w_pos = []
    else:
        ## pos tag from index 1 ##
        w_pos.append(i.split()[1])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=218609.0), HTML(value='')))




In [29]:
POS = [x for x in POS if x != '']

## Flair Prediction

In [30]:
f_pos = []
for i in tqdm_notebook(corpus[:10]):
    sentence = Sentence(i)
    tagger_pos.predict(sentence)
  ## append tagged sentence ##
    f_pos.append(sentence.to_tagged_string())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [31]:
f_pos[:1]

['EU <PROPN> rejects <VERB> German <ADJ> call <NOUN> to <PART> boycott <VERB> British <ADJ> lamb <NOUN> . <PUNCT>']

In [32]:
for i in tqdm_notebook(range(len(f_pos))):
    ## for every words ith sentence ##
    for j in corpus[i].split():
        ## replace that word from ith sentence in f_pos ##
        f_pos[i] = str(f_pos[i]).replace(j, "", 1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [33]:
f_pos[:1]

[' <PROPN>  <VERB>  <ADJ>  <NOUN>  <PART>  <VERB>  <ADJ>  <NOUN>  <PUNCT>']

In [34]:
f_pos = [clean_tags(i) for i in tqdm_notebook(f_pos)]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [35]:
f_pos[:1]

['PROPN VERB ADJ NOUN PART VERB ADJ NOUN PUNCT']

In [41]:
corpus[:1],f_pos[:1]

(['EU rejects German call to boycott British lamb .'],
 ['PROPN VERB ADJ NOUN PART VERB ADJ NOUN PUNCT'])

In [70]:
f_pos[:1][0].split(" ")[2]

'ADJ'

In [91]:
f_pos[0]

'PROPN VERB ADJ NOUN PART VERB ADJ NOUN PUNCT'

In [92]:
corpus[0]

'EU rejects German call to boycott British lamb .'

# Converting our dataset in conll dataset format

In [345]:
test.columns

Index(['label', 'text', 'ner_tag', 'pos_tag', 'pos_corpus', 'cleaned_pos_tags',
       'ner_corpus', 'cleaned_ner_tags'],
      dtype='object')

In [348]:
#pos_tags
train_corpus = ""
for row in tqdm_notebook(test.itertuples()):
    words = ""
    corpus = row.pos_corpus
    pos_tags = row.cleaned_pos_tags
    ner_tags = row.cleaned_ner_tags
    for j, word in enumerate(corpus):
        txt_tag = str(word) + " " + pos_tags[j] + " " + ner_tags[j]
        words = words + "\n" + txt_tag
    train_corpus = train_corpus + "\n" + words
train_corpus = train_corpus[2:]    
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [349]:
print(train_corpus)

@AshGhebranious ADJ NA
civil ADJ NA
rights NOUN NA
continued VERB NA
in ADP NA
the DET NA
60s. NOUN NA
And CCONJ NA
what PRON NA
about ADP NA
trans-generational ADJ NA
trauma? NOUN NA
if ADP NA
anything NOUN NA
we PRON NA
should AUX NA
listen VERB NA
to ADP NA
the DET NA
Americans. PROPN S-MISC

Dakota PROPN B-PER
Skye PROPN NA
gets VERB E-PER
horny ADJ NA
with ADP NA
some DET NA
porn NOUN NA
then ADV NA
gets VERB NA
her PRON NA
juicy ADJ NA
pussy NOUN NA
pounded VERB NA
http://t.co/qew4c5M1xd NOUN NA
View NOUN NA
and CCONJ NA
download VERB NA
video NOUN NA

Richard PROPN S-PER
returns VERB NA
after ADP NA
whirlwind ADJ NA
few ADJ NA
days NOUN NA
http://t.co/L8W30WFW3R NOUN NA
#MLB NOUN NA

Bloody ADJ NA
Mary PROPN S-PER
in ADP NA
the DET NA
sink. NOUN NA
Beet NOUN NA
juice NOUN NA
http://t.co/LUigmHMa1i X NA

@Real_Liam_Payne INTJ NA
I PRON NA
SCREAMED VERB NA
AT ADP NA
THE DET NA
TOP NOUN NA
OF ADP NA
MY PRON NA
LUNGS NOUN NA
WHEN ADV NA
YOU PRON NA
SAID VERB NA
YOU PRON NA
GUYS NOUN

# Retraining POS Tag

In [208]:
tagger = SequenceTagger.load("pos-fast")

2020-04-04 23:53:17,935 loading file /home/uv/.flair/models/en-pos-ontonotes-fast-v0.4.pt


In [351]:
with open("../data/interim/pos_train.txt",'w') as f:
    f.write(train_corpus)

In [213]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
columns = {0: 'text', 1: 'pos', 2: 'ner'}
data_folder = "../data/interim"
corpus: Corpus = ColumnCorpus(data_folder, columns, train_file='pos_train.txt')

2020-04-04 23:55:02,573 Reading data from ../data/interim
2020-04-04 23:55:02,575 Train: ../data/interim/pos_train.txt
2020-04-04 23:55:02,578 Dev: None
2020-04-04 23:55:02,580 Test: None


In [220]:
tag_type = 'pos'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [221]:
vars(tag_dictionary)

{'item2idx': {b'<unk>': 0,
  b'O': 1,
  b'PROPN': 2,
  b'VERB': 3,
  b'PART': 4,
  b'ADP': 5,
  b'NUM': 6,
  b'NOUN': 7,
  b'INTJ': 8,
  b'PRON': 9,
  b'DET': 10,
  b'ADV': 11,
  b'AUX': 12,
  b'X': 13,
  b'ADJ': 14,
  b'CCONJ': 15,
  b'PUNCT': 16,
  b'<START>': 17,
  b'<STOP>': 18},
 'idx2item': [b'<unk>',
  b'O',
  b'PROPN',
  b'VERB',
  b'PART',
  b'ADP',
  b'NUM',
  b'NOUN',
  b'INTJ',
  b'PRON',
  b'DET',
  b'ADV',
  b'AUX',
  b'X',
  b'ADJ',
  b'CCONJ',
  b'PUNCT',
  b'<START>',
  b'<STOP>'],
 'multi_label': False}

In [223]:
from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('../models/retraining/pos/flair_pos_test',
              train_with_dev=False, max_epochs=1)

2020-04-04 23:59:29,013 ----------------------------------------------------------------------------------------------------
2020-04-04 23:59:29,018 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2048, out_features=2048, bias=True)
  (rnn): LSTM(2048, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_feature

{'test_score': 1.0,
 'dev_score_history': [1.0],
 'train_loss_history': [2.442347288131714],
 'dev_loss_history': [tensor(1.2042)]}

# Retraining NER Tags

In [350]:
tagger = SequenceTagger.load("ner-fast")

2020-04-05 00:53:27,397 loading file /home/uv/.flair/models/en-ner-fast-conll03-v0.4.pt


In [353]:
corpus: Corpus = ColumnCorpus(data_folder, columns, train_file='pos_train.txt')

2020-04-05 00:54:15,678 Reading data from ../data/interim
2020-04-05 00:54:15,683 Train: ../data/interim/pos_train.txt
2020-04-05 00:54:15,686 Dev: None
2020-04-05 00:54:15,688 Test: None


In [354]:
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

In [355]:
vars(tag_dictionary)

{'item2idx': {b'<unk>': 0,
  b'O': 1,
  b'NA': 2,
  b'S-PER': 3,
  b'B-PER': 4,
  b'E-PER': 5,
  b'S-MISC': 6,
  b'S-ORG': 7,
  b'<START>': 8,
  b'<STOP>': 9},
 'idx2item': [b'<unk>',
  b'O',
  b'NA',
  b'S-PER',
  b'B-PER',
  b'E-PER',
  b'S-MISC',
  b'S-ORG',
  b'<START>',
  b'<STOP>'],
 'multi_label': False}

In [356]:
from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('../models/retraining/ner/flair_ner_test',
              train_with_dev=False, max_epochs=1)

2020-04-05 00:55:11,586 ----------------------------------------------------------------------------------------------------
2020-04-05 00:55:11,588 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings('glove')
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.25, inplace=False)
        (encoder): Embedding(275, 100)
        (rnn): LSTM(100, 1024)
        (decoder): Linear(in_features=1024, out_features=275, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=2148, out_features=2148, bias=True)
  (rnn): LSTM(2148, 256, batch_first=True, b

{'test_score': 0.1111,
 'dev_score_history': [0.1333],
 'train_loss_history': [187.69935607910156],
 'dev_loss_history': [tensor(148.3703)]}