In [285]:
#pip install spacy
import pandas as pd
import spacy
import networkx as nx
from spacy.lang.en import English
import en_core_web_sm
# Shared spaCy pipeline used by every helper below that calls nlp(...).
nlp = en_core_web_sm.load()
from nltk.corpus import wordnet as wn
import pickle
from gensim.models import Word2Vec
from nltk.corpus import stopwords
# NOTE: rebinds the name `stopwords` from the imported corpus reader to the
# list returned by .words('english'); re-running only this line raises.
stopwords= stopwords.words('english')
import numpy as np
from time import time

In [73]:
def get_entity_index(sen):
    '''
    Locate the <e1>/<e2> entity markers in a whitespace-tokenised sentence.

    Parameters
    ----------
    sen : str
        Sentence in which each entity is wrapped in stand-alone marker
        tokens, e.g. "... <e1> Guyana </e1> ... <e2> Bharrat Jagdeo </e2> ...".
        Markers are only recognised when separated from neighbouring
        words by whitespace.

    Returns
    -------
    tuple
        (e1, e2, [start1, end1, start2, end2]) where e1/e2 are the entity
        texts and the list holds the indices of the four markers in
        sen.split().

    Raises
    ------
    ValueError
        If one or more of the four markers is absent from the sentence.
    '''
    markers = ('<e1>', '</e1>', '<e2>', '</e2>')
    sen_list = sen.split()

    # Record the index of each marker; if a marker is repeated, the last
    # occurrence wins.
    found = {}
    for i, word in enumerate(sen_list):
        if word in markers:
            found[word] = i

    # Fail with an explicit message instead of an UnboundLocalError later on.
    missing = [marker for marker in markers if marker not in found]
    if missing:
        raise ValueError(
            'sentence is missing entity marker(s): ' + ', '.join(missing)
        )

    start1, end1, start2, end2 = (found[marker] for marker in markers)

    # The entity text is whatever sits strictly between its two markers.
    e1 = " ".join(sen_list[start1 + 1:end1])
    e2 = " ".join(sen_list[start2 + 1:end2])

    return e1, e2, [start1, end1, start2, end2]

def get_sen_without_entity(sen):
    '''
    Return the sentence with the four entity marker tokens removed.

    The sentence is split on whitespace and re-joined with single spaces,
    so leading/trailing/repeated whitespace is normalised as a side effect.
    '''
    markers = ('<e1>', '</e1>', '<e2>', '</e2>')
    kept = []
    for piece in sen.split():
        if piece in markers:
            continue
        kept.append(piece)
    return " ".join(kept)

def get_ner(entity):
    '''
    Return the label of the first named entity spaCy finds in `entity`.

    Returns None when the pipeline recognises no entity in the text.
    '''
    first_ent = next(iter(nlp(entity).ents), None)
    if first_ent is None:
        return None
    return str(first_ent.label_)

def shortest_dep_path(tokens, root_e1, root_e2):
    '''
    Return the words on the shortest dependency path between two entity roots.

    An undirected graph is built with one edge from every token to each of
    its dependency children. Nodes are the lower-cased token texts, so
    repeated words in the sentence share a single node.

    Returns the interior nodes of the path (both endpoints excluded) joined
    by spaces, or None when no path exists or either root is not in the
    graph.
    '''
    dependency_edges = [
        (str(head.lower_), str(kid.lower_))
        for head in tokens
        for kid in head.children
    ]
    dep_graph = nx.Graph(dependency_edges)

    source_node = root_e1.lower()
    target_node = root_e2.lower()

    try:
        path = nx.shortest_path(dep_graph, source=source_node, target=target_node)
        return str(" ".join(path[1:-1]))
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return None

def extracting_sysNet(sen_without_entity):
    '''
    Collect WordNet relations for every whitespace-separated word.

    Returns four dicts, in this order: hypernyms, hyponyms, part-meronyms,
    part-holonyms. Each maps a word to the flat list of lemma names gathered
    over all of that word's synsets; words with nothing to report map to an
    empty list. A word that occurs several times in the sentence is looked
    up once per occurrence, so its lemma names are repeated accordingly.
    '''
    vocabulary = sen_without_entity.split()

    hypernym_map = {term: [] for term in vocabulary}
    hyponym_map = {term: [] for term in vocabulary}
    meronym_map = {term: [] for term in vocabulary}
    holonym_map = {term: [] for term in vocabulary}

    for term in vocabulary:
        for synset in wn.synsets(term):
            # Hypernyms
            for parent in synset.hypernyms():
                hypernym_map[term].extend(parent.lemma_names())
            # Hyponyms
            for child in synset.hyponyms():
                hyponym_map[term].extend(child.lemma_names())
            # Part meronyms
            for part in synset.part_meronyms():
                meronym_map[term].extend(part.lemma_names())
            # Part holonyms
            for whole in synset.part_holonyms():
                holonym_map[term].extend(whole.lemma_names())

    return hypernym_map, hyponym_map, meronym_map, holonym_map

In [133]:
def tokenize(sen):
    '''
    Run the spaCy pipeline on `sen` and return its tokens as a list.
    '''
    return list(nlp(sen))
        
def lemmatize(tokens):
    '''
    Return the `lemma_` attribute of each token, preserving order.
    '''
    collected = []
    for tok in tokens:
        collected.append(tok.lemma_)
    return collected
        
def get_pos_sen(tokens):
    '''
    Return the `pos_` attribute of each token, preserving order.
    '''
    tags = []
    for tok in tokens:
        tags.append(tok.pos_)
    return tags

# get root
def get_root(entity):
    '''
    Return the text of the syntactic root of the first sentence spaCy
    finds in `entity`.

    Only the first sentence is considered; text for which spaCy yields no
    sentence raises IndexError.
    '''
    sentences = list(nlp(entity).sents)
    # Each sentence is a span, which exposes its head word as `.root`.
    return str(sentences[0].root)

def get_words_in_between(sen_without_entity, position):
    '''
    Return the non-stop-words lying between the two entities, space-joined.

    `position` holds marker indices measured on the tagged sentence
    ([start1, end1, start2, end2]). The offsets -1 and -2 translate them to
    the untagged sentence: the two e1 markers, and then the <e2> marker,
    have been removed before those points.
    NOTE(review): the slice assumes e1 appears before e2 -- confirm for
    sentences where the order is reversed.
    '''
    words = sen_without_entity.split()
    first = position[1] - 1
    last = position[2] - 2

    kept = []
    for word in words[first:last]:
        if word in stopwords:
            continue
        kept.append(word)
    return " ".join(kept)

In [134]:
# sen= "The 2008 Ohio Bobcats football team represented <e1> Ohio University </e1> during the 2008 <e2> NCAA </e2> Division I FBS football season . "

In [260]:
# Example input: each entity is wrapped in stand-alone, whitespace-separated
# <e1>...</e1> / <e2>...</e2> marker tokens.
sen= " On Wednesday <e1> Guyana </e1> 's President <e2> Bharrat Jagdeo </e2> publicly questioned why it's taking so long to get its first installment of funds under a $ 250 million forest conservation agreement with Norway . "

In [261]:
# extracting entity and its position
e1, e2, position= get_entity_index(sen)
# removing entity tags
sen_without_entity= get_sen_without_entity(sen)
# tokenize, lemmatize, and POS tags from spacy
tokens= tokenize(sen_without_entity)
lemmas= lemmatize(tokens)
pos_sen= get_pos_sen(tokens)
# POS tag of the first word of each entity. `position` indexes the tagged
# sentence: once the markers are removed, the first e1 word sits at the old
# <e1> index, and the first e2 word sits two slots before the old <e2> index
# (the two e1 markers before it are gone).
# NOTE(review): assumes e1 precedes e2 and that spaCy's tokens line up with
# whitespace tokens up to e2 -- confirm for other inputs.
pos_e1= pos_sen[position[0]]
pos_e2= pos_sen[position[2]-2]
# root e1 and root e2
root_e1= get_root(e1)
root_e2= get_root(e2)
# SDP
SDP= shortest_dep_path(tokens, root_e1, root_e2)
# SysNet (second result is the hyponym dict; name fixed from the
# `word_features_hypernomy` typo)
word_features_hypernymy, word_features_hyponymy, word_features_meronym, word_features_holonymy= extracting_sysNet(sen_without_entity)
# get NER of e1 and e2
enr_e1, enr_e2= get_ner(e1), get_ner(e2)
# get words in between 
words_in_between= get_words_in_between(sen_without_entity, position)

In [262]:
# Dump every extracted feature for a quick visual check.
print('entities: ', e1, e2)
for label, value in (
    ('tokens: ', tokens),
    ('lemmas: ', lemmas),
    ('pos: ', pos_sen),
    ('SDP: ', SDP),
    ('word_features_hypernymy: ', word_features_hypernymy),
    ('enr_e1: ', enr_e1),
    ('enr_e2: ', enr_e2),
    ('pos_e1: ', pos_e1),
    ('pos_e2: ', pos_e2),
    ('words in between: ', words_in_between),
):
    print(label, value)

entities:  Guyana Bharrat Jagdeo
tokens:  [On, Wednesday, Guyana, 's, President, Bharrat, Jagdeo, publicly, questioned, why, it, 's, taking, so, long, to, get, its, first, installment, of, funds, under, a, $, 250, million, forest, conservation, agreement, with, Norway, .]
lemmas:  ['on', 'Wednesday', 'Guyana', "'s", 'President', 'Bharrat', 'Jagdeo', 'publicly', 'question', 'why', '-PRON-', 'be', 'take', 'so', 'long', 'to', 'get', '-PRON-', 'first', 'installment', 'of', 'fund', 'under', 'a', '$', '250', 'million', 'forest', 'conservation', 'agreement', 'with', 'Norway', '.']
pos:  ['ADP', 'PROPN', 'PROPN', 'PART', 'PROPN', 'PROPN', 'PROPN', 'ADV', 'VERB', 'ADV', 'PRON', 'AUX', 'VERB', 'ADV', 'ADV', 'PART', 'AUX', 'DET', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'ADP', 'DET', 'SYM', 'NUM', 'NUM', 'NOUN', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'PUNCT']
SDP:  president bharrat
word_features_hypernymy:  {'On': [], 'Wednesday': ['weekday'], 'Guyana': [], "'s": [], 'President': ['corporate_executive', 'business

In [263]:
# One-row frame holding the four categorical features: the POS tag and the
# NER label of each entity.
pos_enr = pd.DataFrame(
    [[pos_e1, pos_e2, enr_e1, enr_e2]],
    columns=['pos_e1', 'pos_e2', 'enr_e1', 'enr_e2'],
)

# encode these features

In [264]:
# Load the pickled encoder object. The `with` block closes the file even if
# unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('ce.obj', 'rb') as file:
    ce= pickle.load(file)

In [265]:
# Apply the unpickled encoder to the categorical features.
# (presumably a fitted category encoder that expands each column into
# numbered sub-columns -- confirm against the training notebook)
pos_enr_enc= ce.transform(pos_enr)

In [266]:
# Remove the four '*_0' columns produced by the encoder.
# Reassigning (instead of inplace=True) avoids mutating the frame returned by
# ce.transform and follows pandas' recommended style.
pos_enr_enc= pos_enr_enc.drop(columns=['pos_e1_0', 'pos_e2_0', 'enr_e1_0', 'enr_e2_0'])

In [267]:
# Sanity check: (rows, columns) of the encoded categorical features.
pos_enr_enc.shape

(1, 18)

# word encoding

In [268]:
# Load the saved Word2Vec model from disk; encode() uses it for word lookups.
word_model= Word2Vec.load('word_model.bin')

In [269]:
def encode(sen, model=None):
    '''
    Sum the word vectors of every in-vocabulary word of `sen` into one
    fixed-size vector.

    Parameters
    ----------
    sen : str or None
        Whitespace-separated text. None (e.g. no dependency path was found)
        yields an all-zero vector instead of being embedded as the literal
        word "None".
    model : optional
        Word2Vec model or keyed-vectors object to look words up in.
        Defaults to the notebook-level `word_model`.

    Returns
    -------
    pandas.DataFrame
        A single row with one column per embedding dimension
        (columns labelled 0 .. vector_size-1).
    '''
    if model is None:
        model = word_model

    # Membership tests and lookups on the Word2Vec model itself are
    # deprecated in gensim 3.x and removed in 4.x; the vectors live on `.wv`.
    # Fall back to the object itself when it already is a keyed-vectors store.
    vectors = getattr(model, 'wv', model)

    # Size the accumulator from the model rather than hard-coding 100.
    encoding = np.zeros(vectors.vector_size)

    if sen is None:
        return pd.DataFrame(encoding.reshape(1, -1))

    for word in str(sen).split():
        if word in vectors:
            encoding += vectors[word]
    return pd.DataFrame(encoding.reshape(1, -1))

# entity embedding

In [270]:
# Embed each entity's text and label the columns by entity before joining
# the two blocks side by side.
e1_enc = encode(e1).add_prefix('e1_')
e2_enc = encode(e2).add_prefix('e2_')
entity_enc = pd.concat([e1_enc, e2_enc], axis=1)

  
  if __name__ == '__main__':


In [271]:
# Display the combined e1/e2 embedding row.
entity_enc

Unnamed: 0,e1_0,e1_1,e1_2,e1_3,e1_4,e1_5,e1_6,e1_7,e1_8,e1_9,...,e2_90,e2_91,e2_92,e2_93,e2_94,e2_95,e2_96,e2_97,e2_98,e2_99
0,0.11142,-0.266119,-0.156917,-0.087371,0.185912,0.275955,0.140905,0.228886,0.107982,-0.241018,...,0.035319,-0.540367,-0.33716,0.092133,0.240597,-0.294627,-0.022181,-0.121581,0.099459,0.011981


# SDP embedding

In [272]:
# Embed the shortest-dependency-path words and prefix the columns.
sdp_enc = encode(SDP).add_prefix('sdp_')

  
  if __name__ == '__main__':


In [273]:
# Display the SDP embedding row.
sdp_enc

Unnamed: 0,sdp_0,sdp_1,sdp_2,sdp_3,sdp_4,sdp_5,sdp_6,sdp_7,sdp_8,sdp_9,...,sdp_90,sdp_91,sdp_92,sdp_93,sdp_94,sdp_95,sdp_96,sdp_97,sdp_98,sdp_99
0,0.945116,0.101658,-0.713092,-1.217193,-0.212802,0.113164,0.885969,-0.641224,0.833309,-0.7244,...,-0.178008,-2.576855,-1.642165,1.138858,1.18302,-1.160019,-0.103197,-0.821332,1.523956,1.52422


# words in between embedding

In [274]:
# Embed the non-stop-words found between the two entities and prefix the
# columns.
words_bet_enc = encode(words_in_between).add_prefix('bet_')

  
  if __name__ == '__main__':


In [275]:
# Display the words-in-between embedding row.
words_bet_enc

Unnamed: 0,bet_0,bet_1,bet_2,bet_3,bet_4,bet_5,bet_6,bet_7,bet_8,bet_9,...,bet_90,bet_91,bet_92,bet_93,bet_94,bet_95,bet_96,bet_97,bet_98,bet_99
0,1.508099,0.128895,-2.242232,-1.71071,-0.234733,1.241015,0.339153,-1.767375,2.063224,-2.154587,...,0.271633,-4.997834,-3.424843,1.567164,2.163761,-2.835466,-0.259345,-1.243096,2.618251,2.045603


# encode root words

In [276]:
# Embed the syntactic root word of each entity, prefix the columns per
# entity, then join the two blocks side by side.
root_e1_enc = encode(root_e1).add_prefix('root_e1_')
root_e2_enc = encode(root_e2).add_prefix('root_e2_')
root_enc = pd.concat([root_e1_enc, root_e2_enc], axis=1)

  
  if __name__ == '__main__':


In [277]:
# Display the combined root-word embedding row.
root_enc

Unnamed: 0,root_e1_0,root_e1_1,root_e1_2,root_e1_3,root_e1_4,root_e1_5,root_e1_6,root_e1_7,root_e1_8,root_e1_9,...,root_e2_90,root_e2_91,root_e2_92,root_e2_93,root_e2_94,root_e2_95,root_e2_96,root_e2_97,root_e2_98,root_e2_99
0,0.11142,-0.266119,-0.156917,-0.087371,0.185912,0.275955,0.140905,0.228886,0.107982,-0.241018,...,0.020098,-0.266751,-0.168431,0.040973,0.119067,-0.143669,-0.004295,-0.056671,0.036887,-0.002716


# concat all features together

In [278]:
# Assemble the full feature row. The block order here fixes the column order
# of the resulting frame.
pos_enr_e1e2_root_between = pd.concat(
    [pos_enr_enc, entity_enc, root_enc, words_bet_enc, sdp_enc],
    axis=1,
)

In [279]:
# Sanity check: (rows, columns) of the assembled feature frame.
pos_enr_e1e2_root_between.shape

(1, 618)

In [280]:
# Load the pickled column labels. The `with` block closes the file handle,
# which the bare open() call never did.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('col.bin', "rb") as cols_file:
    cols = pickle.load(cols_file)

In [281]:
# Relabel the feature columns with the labels loaded from 'col.bin'
# (presumably the column names the model was trained with -- confirm).
# pandas raises if the number of labels differs from the number of columns.
pos_enr_e1e2_root_between.columns= cols

# predict using best model

In [282]:
# Load the pickled model. The `with` block closes the file handle, which the
# bare open() call never did.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('xgboost_model.bin', "rb") as model_file:
    xgb_model = pickle.load(model_file)

In [289]:
# Time only the predict call; printing is kept outside the measured window so
# stdout latency does not inflate the reported figure.
start_time= time()
prediction= xgb_model.predict(pos_enr_e1e2_root_between)[0]
end_time= time()
print('Model prediction: ', prediction)
print('Time taken to predict: ', end_time-start_time)

Model prediction:  per:employee_of(e1,e2)
Time taken to predict:  0.009009361267089844
