In [122]:
import spacy
import pandas as pd
import numpy as np
from spacy import displacy
import networkx as nx
from nltk.corpus import stopwords
import nltk
# nltk.download('stopwords')

In [123]:
stopwords= stopwords.words('english')

In [5]:
# !python -m spacy download en_core_web_sm
import en_core_web_sm
nlp = en_core_web_sm.load()

# get sentences and relations dataframe

In [108]:
def extract_sentences_and_relations(file):
    '''
    read the input file line by line and extract sentences and relations from it. 
    '''
    sentences=[]
    relations=[]

    with open(file) as f:
        i=1
        for line in f:
            # store sentence from line 1
            if i==1:
                sen=line.split('"')[1]
                sentences.append(sen)

            # store relation from line 2
            elif i==2:
                # remove extra white spaces at the end
                relation= line.strip()
                relations.append(relation)
                
            elif i%4==0:
                i=0

            i+=1
    return sentences, relations

In [34]:
train_file= "train.txt"
test_file= "test.txt"
train_sent, train_rel= extract_sentences_and_relations(train_file)
test_sent, test_rel= extract_sentences_and_relations(test_file)

In [111]:
tr= "semeval_train.txt"
te= "semeval_test.txt"
tr_sen, tr_rel= extract_sentences_and_relations(tr)
te_sen, te_rel= extract_sentences_and_relations(te)

In [114]:
tr_rel[0]

'Component-Whole(e2,e1)'

In [112]:
len(te_sen)

2717

In [35]:
train = {'Sentences': train_sent, 'Relations': train_rel}
test = {'Sentences': test_sent, 'Relations': test_rel}
train_df= pd.DataFrame(train, columns=['Sentences', 'Relations'])
test_df= pd.DataFrame(test, columns=['Sentences', 'Relations'])

In [36]:
train_df.shape

(17641, 2)

In [37]:
train_df.head()

Unnamed: 0,Sentences,Relations
0,<e1> Thom Yorke </e1> of <e2> Radiohead </e2>...,"per:employee_of(e1,e2)"
1,<e1> Leland High School </e1> is a public hig...,"org:city_of_headquarters(e1,e2)"
2,The 2008 Ohio Bobcats football team represent...,"org:members(e2,e1)"
3,<e1> Holy Cross High School </e1> is a Cathol...,"org:founded_by(e1,e2)"
4,Hastings was unable to confirm news reports t...,"per:employee_of(e2,e1)"


In [38]:
len(train_df.Relations.unique())

37

In [39]:
test_df.shape

(3405, 2)

In [40]:
test_df.head()

Unnamed: 0,Sentences,Relations
0,After returning to the U.K. she attended the ...,"org:stateorprovince_of_headquarters(e2,e1)"
1,Supported by their own buying staff <e1> Mars...,"org:stateorprovince_of_headquarters(e1,e2)"
2,The California Department of Alcohol and Drug...,"org:stateorprovince_of_headquarters(e2,e1)"
3,But but <e1> Aetna </e1> 's headquarters are ...,"org:stateorprovince_of_headquarters(e1,e2)"
4,<e1> Singapore Airlines </e1> ( <e2> SIA </e2...,"org:alternate_names(e1,e2)"


In [41]:
len(test_df.Relations.unique())

37

# extract e1, e2, and its position.

In [32]:
def get_entity_index(sen):
    sen_list= sen.split()

    for i, word in enumerate(sen_list):
        if word=='<e1>':
            start1=i

        elif word=='</e1>':
            end1=i

        if word=='<e2>':
            start2=i

        elif word=='</e2>':
            end2=i
            
    # get e1 and e2
    e1= " ".join(sen_list[start1+1 : end1])
    e2= " ".join(sen_list[start2+1 : end2])
    
    return e1, e2, [start1, end1, start2, end2]

In [33]:
train_df[['e1', 'e2', 'position']]= train_df.Sentences.apply(lambda sen: get_entity_index(sen)).apply(pd.Series)
test_df[['e1', 'e2', 'position']]= test_df.Sentences.apply(lambda sen: get_entity_index(sen)).apply(pd.Series)

In [44]:
train_df.head()

Unnamed: 0,Sentences,Relations,e1,e2,position
0,<e1> Thom Yorke </e1> of <e2> Radiohead </e2>...,"per:employee_of(e1,e2)",Thom Yorke,Radiohead,"[0, 3, 5, 7]"
1,<e1> Leland High School </e1> is a public hig...,"org:city_of_headquarters(e1,e2)",Leland High School,San Jose,"[0, 4, 16, 19]"
2,The 2008 Ohio Bobcats football team represent...,"org:members(e2,e1)",Ohio University,NCAA,"[7, 10, 14, 16]"
3,<e1> Holy Cross High School </e1> is a Cathol...,"org:founded_by(e1,e2)",Holy Cross High School,Congregation of Holy Cross,"[0, 5, 19, 24]"
4,Hastings was unable to confirm news reports t...,"per:employee_of(e2,e1)",Democratic,Bill Gwatney,"[13, 15, 18, 21]"


# remove missing entity sentences

In [45]:
# replace empty e1 and e2 with na and then dropna 
train_df.replace('', np.nan, inplace=True)
train_df.dropna(inplace=True)

test_df.replace('', np.nan, inplace=True)
test_df.dropna(inplace=True)

In [46]:
train_df.shape

(17521, 5)

In [47]:
test_df.shape

(3379, 5)

In [48]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

# extract nlp features from sentence, pos, enr, e1, e2, root, words in between 

In [49]:
def get_sen_without_entity(sen):
    sen_list= sen.split()
    sen_without_entity= " ".join([token for token in sen_list if token not in {'<e1>','</e1>', '<e2>', '</e2>'}]) 
    return sen_without_entity

def get_pos(sen_without_entity):
    sen_pos = [token.pos_ for token in nlp(sen_without_entity)]
    return sen_pos

def get_root(entity):
    # create a span object that has property .root
    doc = nlp(entity)
    sen= list(doc.sents)[0]
    return str(sen.root)

def get_enr(entity):
    for ent in nlp(entity).ents:
        return str(ent.label_)

def shortest_dep_path(sen, root_e1, root_e2):
    doc = nlp(sen)
    
    #print dependency tree 
    #displacy.render(doc,jupyter=True)

    # Load spacy's dependency tree into a networkx graph
    edges = []
    for token in doc:
        for child in token.children:
            edges.append(('{0}'.format(token.lower_),
                          '{0}'.format(child.lower_)))
            
    graph = nx.Graph(edges)
    entity1 = root_e1.lower()
    entity2 = root_e2.lower()
    
    try:
        out = str(" ".join(nx.shortest_path(graph, source=entity1, target=entity2)[1:-1]))
        
    except (nx.NetworkXNoPath,  nx.NodeNotFound) as e:
        out= None
    
    return out

In [50]:
def extract(df):
    df['sen_without_entity']= df.Sentences.apply(get_sen_without_entity)
    df['sen_pos']= df.sen_without_entity.apply(get_pos)
    df['pos_e1']= df.apply(lambda row: str(row.sen_pos[row.position[0]]), axis=1)
    df['pos_e2']= df.apply(lambda row: str(row.sen_pos[row.position[2]-2]), axis=1)
    df['enr_e1']= df.e1.apply(get_enr)
    df['enr_e2']= df.e2.apply(get_enr)
    df['root_e1']= df.e1.apply(get_root)
    df['root_e2']= df.e2.apply(get_root)
    df['shortest_dep_path'] = df.apply(lambda row: shortest_dep_path(row.sen_without_entity, row.root_e1, row.root_e2), axis=1)
    df['shortest_dep_path'] = df.apply(lambda row: shortest_dep_path(row.sen_without_entity, row.root_e1, row.root_e2), axis=1)
    
    return df

In [51]:
train_df= extract(train_df)
test_df= extract(test_df)

In [52]:
# drop missing shortest dep path rows
train_df.dropna(subset=['shortest_dep_path'], inplace= True)
test_df.dropna(subset=['shortest_dep_path'], inplace= True)

In [53]:
train_df.head()

Unnamed: 0,Sentences,Relations,e1,e2,position,sen_without_entity,sen_pos,pos_e1,pos_e2,enr_e1,enr_e2,root_e1,root_e2,shortest_dep_path
0,<e1> Thom Yorke </e1> of <e2> Radiohead </e2>...,"per:employee_of(e1,e2)",Thom Yorke,Radiohead,"[0, 3, 5, 7]",Thom Yorke of Radiohead has included the + for...,"[PROPN, PROPN, ADP, PROPN, AUX, VERB, DET, NUM...",PROPN,PROPN,,,Yorke,Radiohead,of
1,<e1> Leland High School </e1> is a public hig...,"org:city_of_headquarters(e1,e2)",Leland High School,San Jose,"[0, 4, 16, 19]",Leland High School is a public high school loc...,"[PROPN, PROPN, PROPN, AUX, DET, ADJ, ADJ, NOUN...",PROPN,PROPN,ORG,GPE,School,Jose,district
2,The 2008 Ohio Bobcats football team represent...,"org:members(e2,e1)",Ohio University,NCAA,"[7, 10, 14, 16]",The 2008 Ohio Bobcats football team represente...,"[DET, NUM, PROPN, PROPN, NOUN, NOUN, VERB, PRO...",PROPN,PROPN,ORG,ORG,University,NCAA,represented team the division
3,<e1> Holy Cross High School </e1> is a Cathol...,"org:founded_by(e1,e2)",Holy Cross High School,Congregation of Holy Cross,"[0, 5, 19, 24]",Holy Cross High School is a Catholic secondary...,"[PROPN, PROPN, PROPN, PROPN, AUX, DET, ADJ, AD...",PROPN,PROPN,ORG,,School,Congregation,founded by
4,Hastings was unable to confirm news reports t...,"per:employee_of(e2,e1)",Democratic,Bill Gwatney,"[13, 15, 18, 21]",Hastings was unable to confirm news reports th...,"[PROPN, AUX, ADJ, PART, VERB, NOUN, VERB, SCON...",PART,NOUN,,PERSON,Democratic,Gwatney,party chairman


In [54]:
test_df.head()

Unnamed: 0,Sentences,Relations,e1,e2,position,sen_without_entity,sen_pos,pos_e1,pos_e2,enr_e1,enr_e2,root_e1,root_e2,shortest_dep_path
0,After returning to the U.K. she attended the ...,"org:stateorprovince_of_headquarters(e2,e1)",Isle of Wight,Ryde School with Upper Chine,"[16, 20, 32, 38]",After returning to the U.K. she attended the i...,"[ADP, VERB, ADP, DET, PROPN, PRON, VERB, DET, ...",PROPN,PROPN,,,Isle,School,the
1,Supported by their own buying staff <e1> Mars...,"org:stateorprovince_of_headquarters(e1,e2)",Mars,Maryland,"[6, 8, 25, 27]",Supported by their own buying staff Mars purch...,"[VERB, ADP, DET, ADJ, NOUN, NOUN, PROPN, NOUN,...",PROPN,PROPN,LOC,GPE,Mars,Maryland,purchases produce seafood from growers
2,The California Department of Alcohol and Drug...,"org:stateorprovince_of_headquarters(e2,e1)",California,substance abuse,"[13, 15, 20, 23]",The California Department of Alcohol and Drug ...,"[DET, PROPN, PROPN, ADP, PROPN, CCONJ, PROPN, ...",PROPN,NOUN,GPE,,California,abuse,agency concerned with prevention
3,But but <e1> Aetna </e1> 's headquarters are ...,"org:stateorprovince_of_headquarters(e1,e2)",Aetna,Connecticut,"[2, 4, 9, 11]",But but Aetna 's headquarters are in Connectic...,"[CCONJ, CCONJ, PROPN, PART, NOUN, AUX, ADP, PR...",PROPN,PROPN,ORG,GPE,Aetna,Connecticut,headquarters are in
4,<e1> Singapore Airlines </e1> ( <e2> SIA </e2...,"org:alternate_names(e1,e2)",Singapore Airlines,SIA,"[0, 3, 5, 7]",Singapore Airlines ( SIA ) said Wednesday it w...,"[PROPN, PROPN, PUNCT, PROPN, PUNCT, VERB, PROP...",PROPN,PROPN,ORG,,Airlines,SIA,


In [55]:
train_df.shape

(17255, 14)

In [56]:
test_df.shape

(3331, 14)

In [57]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [31]:
train_df= pd.read_csv('train.csv')
test_df= pd.read_csv('test.csv')

# get words in between in lemma form and after removing stop words

In [105]:
def get_words_in_between(sen):
    '''
    get the words in between entities which are not stop words
    '''
    words= sen.sen_without_entity.split()
    words_in_between= words[sen.position[1]-1: sen.position[2]-2]
    return " ".join([word for word in words_in_between if word not in stopwords])    

In [106]:
train_df['words_in_between']= train_df.apply(get_words_in_between, axis=1)
test_df['words_in_between']= test_df.apply(get_words_in_between, axis=1)

In [107]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [95]:
def get_sen_without_entity(sen):
    '''
    remove entity tags from the sentence and get its lemma form, return string
    '''
    sen_list= sen.split()
    sen_without_entity= " ".join([token for token in sen_list if token not in {'<e1>','</e1>', '<e2>', '</e2>'}]) 
    words=[str(token.lemma_) for token in nlp(sen_without_entity)]
    return " ".join(words)

In [98]:
train_df['sen_without_entity']= train_df.Sentences.apply(get_sen_without_entity)
test_df['sen_without_entity']= test_df.Sentences.apply(get_sen_without_entity)

In [102]:
train_df.head()

Unnamed: 0,Sentences,Relations,e1,e2,position,sen_without_entity,sen_pos,pos_e1,pos_e2,enr_e1,enr_e2,root_e1,root_e2,shortest_dep_path,words_in_between
0,<e1> Thom Yorke </e1> of <e2> Radiohead </e2>...,"per:employee_of(e1,e2)",Thom Yorke,Radiohead,"[0, 3, 5, 7]",Thom Yorke of Radiohead have include the + for...,"['PROPN', 'PROPN', 'ADP', 'PROPN', 'AUX', 'VER...",PROPN,PROPN,,,Yorke,Radiohead,of,[]
1,<e1> Leland High School </e1> is a public hig...,"org:city_of_headquarters(e1,e2)",Leland High School,San Jose,"[0, 4, 16, 19]",Leland High School be a public high school loc...,"['PROPN', 'PROPN', 'PROPN', 'AUX', 'DET', 'ADJ...",PROPN,PROPN,ORG,GPE,School,Jose,district,"[public, high, school, locate, Almaden, Valley]"
2,The 2008 Ohio Bobcats football team represent...,"org:members(e2,e1)",Ohio University,NCAA,"[7, 10, 14, 16]",the 2008 Ohio Bobcats football team represent ...,"['DET', 'NUM', 'PROPN', 'PROPN', 'NOUN', 'NOUN...",PROPN,PROPN,ORG,ORG,University,NCAA,represented team the division,[2008]
3,<e1> Holy Cross High School </e1> is a Cathol...,"org:founded_by(e1,e2)",Holy Cross High School,Congregation of Holy Cross,"[0, 5, 19, 24]",Holy Cross High School be a catholic secondary...,"['PROPN', 'PROPN', 'PROPN', 'PROPN', 'AUX', 'D...",PROPN,PROPN,ORG,,School,Congregation,founded by,"[catholic, secondary, school, found, Waterbury..."
4,Hastings was unable to confirm news reports t...,"per:employee_of(e2,e1)",Democratic,Bill Gwatney,"[13, 15, 18, 21]",Hastings be unable to confirm news report that...,"['PROPN', 'AUX', 'ADJ', 'PART', 'VERB', 'NOUN'...",PART,NOUN,,PERSON,Democratic,Gwatney,party chairman,"[Democratic, Party]"


In [103]:
print(train_df.shape, test_df.shape)

(17255, 15) (3331, 15)


In [125]:
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)