Named Entity extraction - Corpus to .CSV file for Amazon Mechanical Turk Annotation
--

**This .ipynb notebook is used for extraction is primarily used to extract the named entities from the corpus and load it to .csv file.**


Import required Libraries
--


In [1]:
import re
import json
import spacy
from spacy import displacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from collections import defaultdict
import pandas as pd
import numpy as np

Variabe Declaration
--

In [2]:
labels = ["PERSON", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "TIME", "LAW", "NORP"]
path = f"/Users/gurpreetbedi/Downloads/COLX_523_Group2/transcripts/en/filtered/filtered_annotated_ted_talks_en.json"
talks_id, paras_id, before, entities, labels_, after, entities_id, temp  = ([] for _ in range(8))
talks, title, talker, ents = ({} for _ in range(4))

Preprocess the talks
--

In [3]:
def preprocess_talk(content):
    text = {}
    ents = {}
    title = content["title"]
    author = content["talker"]
    for i in range(len(content["text"])):
        text[i] = content["text"][i]["text"]
        for j in range(len(content["text"][i]["ents"])):
            ents[j] = content["text"][i]["ents"][j]
    return text, title, author, ents

To get the context before the entity
--

In [4]:
def get_before_context(para, start):
    '''takes the paragraph and entitty start index and returns the context before the entity
    '''
    bef = []
    before_patt = re.compile("[^.]+$")
    bef_ent = before_patt.search(para[:start])
    check = bef_ent
    if check:
        bef.append(bef_ent.group(0))
    else:
        bef.append(para[:start])
    return bef

To get th context after the entity
--

In [5]:
def get_after_context(para, end):
    '''takes the paragraph and entitty end index and returns the context after the entity
    '''    
    aft = []
    after_patt = re.compile("[^.]*")
    aft_ent = after_patt.search(para[end+1:]) 
    check = aft_ent
    if check:
        aft.append(aft_ent.group(0))
    else:
        aft.append(para[end+1:])    
    return aft

Grouping same labels
--

In [6]:
def get_labels(label):
    '''takes the label and return similar grouped labels
    '''        
    lab = []
    if label == "FAC":
        lab.append("ORG")
    elif label == "GPE":
        lab.append("LOC")
    else:
        lab.append(label) 
    return lab

In [7]:
def flatten_all(df, df_cols, fill_value=''):
    '''flatten the lists csv load
    '''        
    if df_cols and not isinstance(df_cols, list):
        df_cols = [df_cols]
    idx_cols = df.columns.difference(df_cols)
    lens = df[df_cols[0]].str.len()
    if (lens > 0).all():
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[df_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in df_cols}).loc[:, df.columns]
    else:
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[df_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in df_cols}).append(df.loc[lens==0, idx_cols]).fillna(fill_value).loc[:, df.columns]

Create CSV from the extracted dataset
--

In [8]:
def create_csv(talks_id, paras_id, entities_id, before, entities, after, labels_):
    '''takes the required fileds as argumensts and returns the pandas dataframe
    '''    
    ids = {"talk_id" : talks_id,
          "para_id" : paras_id,
          "entity_id" : entities_id,
          "before" : before,
          "entities" : entities,
          "after" : after,
          "label" : labels_}

    df = pd.DataFrame(data=ids)
    final_df = flatten_all(df, df_cols=list(df.columns))
    return final_df

In [9]:
with open(path) as json_file:
    contents = json.load(json_file)

num_of_talks = -1

processed_talk = {}
for i in range(len(contents[:num_of_talks])):
    talks[i], title[i], talker[i], ents[i]= preprocess_talk(contents[:num_of_talks][i])
    
assert len(talks) == len(ents), "There are missing entities, please recheck!"

for talk_id, text in talks.items():
    for para_id, para in text.items():
        entity_id, label, t_id, p_id, before_cont, after_cont, entity = ([] for _ in range(7))
        for e_id, ents_ in ents[talk_id].items():
            if ents_["label"] in labels:
                start = ents_["start"]
                end = ents_["end"]
                if ents_["text"] == para[start:end] and (ents_["text"],ents_["label"]) not in temp:
                    entity_id.append(e_id)        
                    entity.append(ents_["text"])   
                    before_cont.append(get_before_context(para, start))
                    after_cont.append(get_after_context(para, end))
                    t_id.append(talk_id)
                    p_id.append(para_id)  
                    label.append(get_labels(ents_["label"]))
                    temp.append((ents_["text"], ents_["label"]))

        if entity_id:
            entities_id.append(entity_id)
        if t_id:
            talks_id.append(t_id)
        if p_id:
            paras_id.append(p_id)
        if entity:
            entities.append(entity)            
        if before_cont:
            before.append(before_cont)
        if after_cont:
            after.append(after_cont)
        if label:
            labels_.append(label)     

assert len(talks_id) == len(paras_id) == len(entities_id) == len(before) == len(entities) == len(after) == len(labels_), "Length mismatch, please check as all should be the same!"

annotation_df = create_csv(talks_id, paras_id, entities_id, before, entities, after, labels_)

In [10]:
print(f"Total number of rows extracted : {annotation_df.shape[0]}")

Total number of rows extracted : 1550


Top 60 records from the dataframe
--

In [11]:
annotation_df.head(60)

Unnamed: 0,talk_id,para_id,entity_id,before,entities,after,label
0,0,0,0,"Aesthetically speaking, there's a world of di...",Beethoven,"and Justin Bieber, but both artists have used ...",PERSON
1,0,0,1,"Aesthetically speaking, there's a world of di...",Justin Bieber,but both artists have used the same building ...,PERSON
2,0,0,5,In the,Western,"music tradition, pitches are named after the f...",NORP
3,0,0,16,The point is that just like the second hand o...,one minute,"into sixty seconds, with each second just as l...",TIME
4,0,0,17,The point is that just like the second hand o...,sixty seconds,with each second just as long as every other ...,TIME
5,1,0,1,Which ends up being heard as the main beat is...,Rubin,"s vase, can be reversed depending on cultural ...",PERSON
6,1,0,10,This is the basis of the music of Whirling De...,Latin American,"rhythms, such as Joropo, and even Bach's famou...",NORP
7,1,0,11,This is the basis of the music of Whirling De...,Joropo,and even Bach's famous Chaconne,NORP
8,1,0,12,This is the basis of the music of Whirling De...,Bach,s famous Chaconne,PERSON
9,1,0,13,This is the basis of the music of Whirling De...,Chaconne,Now if we remember Rubin's vase and hear the ...,PERSON


Writing Datafram to .csv file for Amazon Mechanical Turk
--

In [12]:
annotation_df.to_csv("/Users/gurpreetbedi/Downloads/COLX_523_Group2/transcripts/en/annotated/Annotation.csv", index=False)

Testing for Unique Entity-Label pairs using SpaCy Display render feature
--

**Note: Extract Named entity is being used from [process_corpus.ipynb](https://github.ubc.ca/iameleve/COLX_523_Group2/blob/master/src/process_corpus.ipynb) for testing purposes.**

In [13]:
def extract_named_entity(talk, title=None, talker=None):
    ne_lex = {}
    doc = nlp(talk)
    ents = []
    ne_lex["text"] = talk            
    for ent in doc.ents:
        if ent.label_ in labels:
            ent_lex = {}
            ent_lex["start"] = ent.start_char
            ent_lex["end"] = ent.end_char
            ent_lex["text"] = ent.text
            ent_lex["label"] = ent.label_
            ents.append(ent_lex)
    ne_lex["ents"] = ents
    ne_lex["title"] = title
    ne_lex["talker"] = talker

    return ne_lex

Below are the test cases
--

In [14]:
dict_ = extract_named_entity(talks[11][0])
set([(ents["text"], ents["label"]) for ents in dict_["ents"]])

{("B. It's", 'PERSON'),
 ('Christian Rudder', 'PERSON'),
 ('OkCupid', 'GPE'),
 ('OkCupid', 'ORG'),
 ('OkCupid', 'PERSON'),
 ('the United States', 'GPE')}

In [15]:
annotation_df.query("talk_id==11").query("para_id==0")[["entities", "label"]]

Unnamed: 0,entities,label
109,Christian Rudder,PERSON
110,OkCupid,LOC
111,the United States,LOC
112,OkCupid,ORG
113,OkCupid,PERSON
114,B. It's,PERSON


In [16]:
dict_ = extract_named_entity(talks[44][14])
set([(ents["text"], ents["label"]) for ents in dict_["ents"]])

{('Chiho', 'PERSON'),
 ('Mick Jagger', 'PERSON'),
 ('Nirvana', 'PERSON'),
 ('TED', 'ORG')}

In [17]:
annotation_df.query("talk_id==44").query("para_id==14")[["entities", "label"]]

Unnamed: 0,entities,label
289,Mick Jagger,PERSON
290,Chiho,PERSON
291,TED,ORG


In [18]:
dict_ = extract_named_entity(talks[1][0])
set([(ents["text"], ents["label"]) for ents in dict_["ents"]])

{('Argentinian', 'NORP'),
 ('Bach', 'PERSON'),
 ('Brazilian', 'NORP'),
 ('Chacarera', 'ORG'),
 ('Chaconne', 'PERSON'),
 ('Cuban', 'NORP'),
 ('Joropo', 'NORP'),
 ('Latin American', 'NORP'),
 ('Middle Eastern', 'LOC'),
 ('Northern Romanian', 'LOC'),
 ('Persian', 'NORP'),
 ('Puerto Rican', 'NORP'),
 ('Quechua', 'GPE'),
 ('Rubin', 'PERSON')}

In [19]:
annotation_df.query("talk_id==1").query("para_id==0")[["entities", "label"]]

Unnamed: 0,entities,label
5,Rubin,PERSON
6,Latin American,NORP
7,Joropo,NORP
8,Bach,PERSON
9,Chaconne,PERSON
10,Chacarera,ORG
11,Quechua,LOC
12,Persian,NORP
13,Cuban,NORP
14,Puerto Rican,NORP


**So, the data is matching to the the filtered data set!!**