In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy

In [None]:
# Raw news-category dataset; lines=True tells pandas the file holds one JSON record per line.
dataset_path = './annotate_rehearsal/News_Category_Dataset_v2.json'
df = pd.read_json(dataset_path, lines=True)

In [125]:
# Seed NumPy's global RNG so the shuffle performed later in the notebook is repeatable.
SEED = 42
np.random.seed(SEED)

In [127]:
# Row labels of `df` as a plain Python list (despite the name, this is not a random generator).
rng = df.index.tolist()

In [128]:
# Shuffle the row labels in place. The resulting order depends on the current state of
# NumPy's global RNG, so re-running this cell on its own produces a different permutation.
np.random.shuffle(rng)

In [129]:
# Carve two consecutive, non-overlapping slices out of the shuffled labels:
# the first 1000 labels feed the train frame, the next 1000 the test frame.
train_labels = rng[:1000]
test_labels = rng[1000:2000]
df1 = df.loc[train_labels].copy()  # train
df2 = df.loc[test_labels].copy()  # test

In [130]:
# Build the text to annotate for both splits: headline and short description joined by '. '.
for frame in (df1, df2):  # train, test
    frame['text'] = frame['headline'] + '. ' + frame['short_description']

In [131]:
# spaCy pipeline used by the rest of the notebook to detect entities.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

In [132]:
# Give both splits a fresh default index; the shuffled labels are kept as a regular column.
df1 = df1.reset_index()
df2 = df2.reset_index()

In [133]:
def generate_examples(df, pipeline=None):
    """Pre-annotate texts with the entities found by a spaCy-style pipeline.

    Args:
        df: Object whose ``df['text']`` is an iterable of strings, e.g. a
            DataFrame with a ``text`` column.
        pipeline: Object exposing ``pipe(texts)`` that yields docs with
            ``.text`` and ``.ents`` attributes. When omitted, the
            notebook-level ``nlp`` pipeline is used.

    Returns:
        A list of dicts ``{'text': ..., 'label': [[start_char, end_char, label], ...]}``,
        one per text in which at least one entity was found. Texts without
        entities are skipped, so the result can be shorter than the input.
    """
    if pipeline is None:
        # Default to the globally loaded model so existing one-argument calls keep working.
        pipeline = nlp

    examples = []
    for doc in pipeline.pipe(df['text']):
        if len(doc.ents) == 0:
            continue  # nothing to annotate for this text
        spans = [[ent.start_char, ent.end_char, ent.label_] for ent in doc.ents]
        # Key order matters: it determines the column order of a DataFrame built from the result.
        examples.append({'text': doc.text, 'label': spans})
    return examples

In [134]:
# Annotated train examples: one row per train text in which at least one entity was detected.
train_examples = generate_examples(df1)
train_df = pd.DataFrame(train_examples)
train_df.shape

(840, 2)

In [143]:
# Preview the first rows of the annotated train examples (text plus [start, end, label] spans).
train_df.head()

Unnamed: 0,text,label
0,Shooting Rampage Results In 'Mass Casualties' ...,"[[29, 45, ORG], [49, 70, ORG], [72, 79, GPE], ..."
1,"Donald Trump Lashes Out At CNN, ABC Over Repor...","[[0, 19, PERSON], [27, 30, ORG], [32, 35, ORG]..."
2,Model Kate Moss Walks Louis Vuitton RTW Fall 2...,"[[0, 39, PERSON], [69, 73, PERSON], [121, 138,..."
3,7 Ways Single Moms Cope With Loneliness. “Am I...,"[[0, 1, CARDINAL], [99, 107, PERSON]]"
4,Sen. Sherrod Brown: Steve Bannon 'Is A White S...,"[[5, 18, PERSON], [20, 34, PERSON], [84, 100, ..."


In [135]:
# Annotated test examples: one row per test text in which at least one entity was detected.
test_examples = generate_examples(df2)
test_df = pd.DataFrame(test_examples)
test_df.shape

(831, 2)

In [144]:
# Preview the first rows of the annotated test examples (text plus [start, end, label] spans).
test_df.head()

Unnamed: 0,text,label
0,"Why Your Body, Mind and Soul Will Love the Cen...","[[261, 272, DATE]]"
1,"11 Ways To Turn Up A Down Day. Logy, low, and ...","[[0, 2, CARDINAL]]"
2,"A Thank You Note to the Serengeti. ""Are you na...","[[24, 33, PERSON], [205, 216, PERSON], [224, 2..."
3,Progressive Canadian Leader: Justin Trudeau Sh...,"[[12, 20, NORP], [29, 43, PERSON], [65, 77, PE..."
4,Hundreds of Toddlers Said to Be Taken Into Cus...,"[[0, 8, CARDINAL], [12, 20, ORG], [58, 64, GPE]]"


In [136]:
# Export the annotated train examples as JSON Lines: the first 200 rows, then the full frame.
json_lines_opts = dict(orient='records', lines=True)
train_df.iloc[:200].to_json('nlp_rehearsal_200.json', **json_lines_opts)
train_df.to_json('nlp_rehearsal_1000.json', **json_lines_opts)

In [137]:
# Export the annotated test examples as JSON Lines (one record per line).
test_df.to_json(
    'test_nlp_rehearsal_1000.json',
    orient='records',
    lines=True,
)

In [138]:
# Pick one annotated train example to inspect.
# The original read from `examples_df`, which is never defined in this notebook and
# raised NameError on a fresh kernel; `train_df` is the annotated train frame, and its
# row 2 is the example shown in the preview and in the cells that follow.
idx = 2
txt = train_df.loc[idx, 'text']

In [139]:
# Entities the default spaCy model finds in the original, unmodified text.
original_doc = nlp(txt)
displacy.render(original_doc, style='ent')

In [140]:
# Show the raw text stored at label `idx` of the train split.
df1.loc[idx, 'text']

'Model Kate Moss Walks Louis Vuitton RTW Fall 2013. Golden girl model Kate Moss wowed in her usual chic way as she walked the Louis Vuitton Ready-To-Wear Fall Collection 2013 fashion show. Moss continues to give us a close look at her ability to inspire while wearing all things gorgeous. Yes, absolutely Louis Vuitton gorgeous.'

In [141]:
# The trailing semicolon suppresses the cell's output.
df1.head();

In [142]:
# Entities spaCy finds once the headline capitalization is changed ('Walks' -> 'walks').
modified_txt = 'Model Kate Moss walks Louis Vuitton RTW Fall 2013. Golden girl model Kate Moss wowed in her usual chic way as she walked the Louis Vuitton Ready-To-Wear Fall Collection 2013 fashion show. Moss continues to give us a close look at her ability to inspire while wearing all things gorgeous. Yes, absolutely Louis Vuitton gorgeous.'
modified_doc = nlp(modified_txt)
displacy.render(modified_doc, style='ent')

In [115]:
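# Disabled scratch check: among the first 200 annotated train texts, print the
# position and parsed doc of every text that contains at least one GPE entity.
# for idx, doc in enumerate(nlp.pipe(train_df['text'][:200])):
#     for ent in doc.ents:
#         if ent.label_ == 'GPE':
#             print(idx, doc)
#             break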