# Library Installation

In [1]:
# !pip install spacy
# !python -m spacy download en_core_web_sm
# !pip install datasets
# !pip install wordninja
# !pip install textblob
# !pip install nltk
# !pip install sentence-transformers
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# !pip install swifter
# !pip install pyspellchecker

# Import Libraries + Load Models

In [2]:
import pandas as pd, numpy as np
import re
import swifter
import gensim.downloader as api
import spacy
import en_core_web_sm
import string
import wordninja
import warnings


from datasets import load_dataset
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from textblob import Word
from nltk.corpus import wordnet
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.exceptions import UndefinedMetricWarning

# suppress UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
# to convert sentences to vector # runs fast
sent2vec_model = SentenceTransformer("all-MiniLM-L6-v2")
# to convert word to vector
word2vec_model = api.load("word2vec-google-news-300")

nlp = spacy.load('en_core_web_sm')

# Fetch Dataset

In [3]:
load_dataset("sem_eval_2010_task_8")

DatasetDict({
    train: Dataset({
        features: ['sentence', 'relation'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence', 'relation'],
        num_rows: 2717
    })
})

NOTE: This dataset doesnot contain any validation set, hence validation is splitting the training set

In [4]:
train_df = pd.DataFrame(load_dataset("sem_eval_2010_task_8", split = "train"))

In [5]:
test_df = pd.DataFrame(load_dataset("sem_eval_2010_task_8", split = "test"))

# Preprocess Data

Functions to extract features, improve data quality and more

### 1. Extract Entities

In [6]:
train_df.iloc[0]['sentence']

'The system as described above has its greatest application in an arrayed <e1>configuration</e1> of antenna <e2>elements</e2>.'

Sentence contains entities enclosed in '<>'

In [7]:
def extract_entities(sentence):
    try:
        e1 = re.search(r'<e1>(.*?)</e1>', sentence).group(1).lower().strip()
        e2 = re.search(r'<e2>(.*?)</e2>', sentence).group(1).lower().strip()
    except:
        # raise error if entities are not enclosed in '<>' in sentence
        raise ValueError('Sentence passed is not in correct format')
    return pd.Series([e1, e2], index=['e1', 'e2'])

### 2. Text to Vectors

In [8]:
spell_checker = SpellChecker(distance=1)
lemmatizer = WordNetLemmatizer()
count = 0

def text_to_vectors(text,sentence='',flag=0):
    global count
    flag = 0
    return_text = ''
    
    try:
        text = int(text[0])
        text = 'number'
    except:
        pass

    if text == None:
        return_vector = text
        
    elif text in word2vec_model: # 1. search text in word2vec
        return_text = text
    
    elif text.capitalize() in word2vec_model: # 2. search capitalised text in word2vec
        return_text = text.capitalize()
        
    elif text.replace('-','') in word2vec_model: # 3. remove hyphen and search
        return_text = text.replace('-','')
        
    elif str(lemmatizer.lemmatize(text, pos='n')) in word2vec_model: # 4. convert to singular and then search
        return_text = str(lemmatizer.lemmatize(text, pos='n'))
        
    elif str(lemmatizer.lemmatize(text, pos='n')).capitalize() in word2vec_model: # 5. convert to singular & capitalise and then search
        return_text = str(lemmatizer.lemmatize(text, pos='n')).capitalize()
        
    elif str(Word(text).lemmatize()) in word2vec_model: # 6. convert past to present and search
        return_text = str(Word(text).lemmatize())
    
    elif text.replace('-',' ').split()[-1] in word2vec_model: # 7. if >1 words, search last word
        return_text = text.replace('-',' ').split()[-1]
        
    elif text.replace('-',' ').split()[0] in word2vec_model: # 8. if >1 words, search first word
        return_text = text.replace('-',' ').split()[0]
        
    # some custom defined rules
    # 1. 'er' cases
    elif str(Word(text).lemmatize())[-2:]=='er' and str(Word(text).lemmatize())[:-2] in word2vec_model: # 9. remove last 'er' of word and search
        return_text = str(Word(text).lemmatize())[:-2]
        
    # 2. 'ment' cases
    elif str(Word(text).lemmatize())[-4:]=='ment' and str(Word(text).lemmatize())[:-4] in word2vec_model: # 10. remove last 'ment' of word and search
        return_text = str(Word(text).lemmatize())[:-4]
    
    elif spell_checker.correction(text) in word2vec_model: # 11. correct spelling, if any, and search
        return_text = spell_checker.correction(text)
    
    elif wordninja.split(nlp(text)[0].lemma_)!=[] and max(wordninja.split(nlp(text)[0].lemma_), key=len) in word2vec_model: # 12. split words into segment and search the longest word
        return_text = max(wordninja.split(nlp(text)[0].lemma_), key=len)

    elif wordninja.split(nlp(text)[0].lemma_)!=[] and str(wordninja.split(nlp(text)[0].lemma_)[-1]) in word2vec_model: # 13. split words into segment and search the last word
        return_text = str(wordninja.split(nlp(text)[0].lemma_)[-1])
    
    elif wordninja.split(nlp(text)[0].lemma_)!=[] and str(wordninja.split(nlp(text)[0].lemma_)[0]) in word2vec_model: # 14. split words into segment and search the first word
        return_text = str(wordninja.split(nlp(text)[0].lemma_)[0])

    else:
        print ('Word:"'+text+'" NOT FOUND')
        pass    
    
    if return_text =='':
        count = count+1
        flag = flag+1
        return_text = text
        return_vector = np.zeros(word2vec_model.vector_size)
    elif return_text == None:
        return_vector = np.zeros(word2vec_model.vector_size)
    else:
        return_vector = word2vec_model[return_text]
        
    if sentence=='':
        return return_vector
    else:
        return_sentence = sentence.replace(text, return_text)
        match = re.search(r"<e1>(.*?)</e2>", return_sentence)
        if match:
            trimmed_sentence = (match.group(1))  # trim sentence between 2 entities
        else:
            trimmed_sentence = None
        # remove entity tags '<>'
        return_sentence = re.sub(r"<[^>]+>", "",return_sentence)
        trimmed_sentence = re.sub(r"<[^>]+>", "",trimmed_sentence)
        # remove punctuations
        punctuation_set = ''.join(char for char in string.punctuation)
        return_sentence = return_sentence.translate(str.maketrans('', '', punctuation_set))
        trimmed_sentence = trimmed_sentence.translate(str.maketrans('', '', punctuation_set))
        # remove extra spaces
        return_sentence = re.sub(r"\s+", " ", return_sentence).strip()
        trimmed_sentence = re.sub(r"\s+", " ", trimmed_sentence).strip()
        
    
        return pd.Series([return_vector,flag, return_sentence, return_text, trimmed_sentence], index=['e_vector', 'flag', 'corrected_sentence', 'e', 'trimmed_sentence'])
    

### 3. Custom Entity Disambiguation 

In [9]:
def segment_synsets(word):
    words = word.replace('-',' ').split(' ')
    new_words = []
    
    # segment words and check for synset
    # 1.
    for item in words:
        new_words = new_words + wordninja.split(nlp(item)[0].lemma_)
     
    # 2.
    for j in words:
        for i in range(1,len(j)):
            prefix = j[:i]
            suffix = j[i:]
            if prefix in word2vec_model and suffix in word2vec_model:

                new_words = [prefix,suffix] + new_words
            elif prefix in word2vec_model and len(prefix)>=5:
                new_words = [prefix] + new_words
            elif suffix in word2vec_model and len(suffix)>=5:
                new_words = [suffix] + new_words

    new_words = [item for item in new_words if len(item) >= 3]
        
    synsets = []
    for item in new_words:
        item_synset = wordnet.synsets(item)
        synsets = synsets + item_synset
    return synsets

    

def filter_main_tokens(sentence):

    doc = nlp(sentence)
    # filtering based on POS tags
    filtered_tokens = [token.text for token in doc if token.pos_ in ("NOUN", "VERB", "ADJ")]

    return filtered_tokens
    

def disambiguate_entity_in_sentence(sentence, word, flag):
    
    # main tokens of the sentence
    sentence_tokens = filter_main_tokens(sentence.replace(word,''))
    
    # fetch synsets for the word
    synsets = wordnet.synsets(word.replace(' ','_'))
    
    # handle british vs american english
    if len(synsets)==0 and word.count('s')==1:
        synsets = wordnet.synsets(word.replace('s','z'))
    if len(synsets)==0 and word.count('z')==1:
        synsets = wordnet.synsets(word.replace('z','s'))
        
    # custom synset extraction by segmenting words
    if len(synsets)==0:
        synsets = segment_synsets(word)
    
    if len(synsets)==0:
        print ('Synset for "'+word+'" NOT FOUND')  
    
    # fetch the most relatable synset based on sentence
    scores = {}
    for synset in synsets:
        synset_tokens = filter_main_tokens(synset.definition())

        synset_embeddings = [word2vec_model[token] for token in synset_tokens if token in word2vec_model]
        
        if len(synset_embeddings)>0:
            avg_synset_embedding = np.mean(synset_embeddings, axis=0)
            
            # calculate similarity score based on cosine similarity between avg_synset_embedding and each token in sentence
            similarity_scores = [np.dot(avg_synset_embedding, word2vec_model[token])/(np.linalg.norm(avg_synset_embedding)*np.linalg.norm(word2vec_model[token]))
                                 for token in sentence_tokens if token in word2vec_model]
            scores[synset] = np.mean(similarity_scores)
        
        else:
            scores[synset] = 0
    
    # extract the highest score synset
    if len(synsets)>0:
        best_synset = max(scores, key=scores.get)
        
    else:
        best_synset = wordnet.synsets('unavailable')[0]
        flag = flag+1 #update warning flag
    e_definition = best_synset.definition()
    
    return pd.Series([best_synset, e_definition, flag], index=['e_synset', 'e_definition', 'flag'])


In [10]:
def get_hypernym(synset1, synset2):
    common_hypernym = synset1.lowest_common_hypernyms(synset2)
    return common_hypernym[0].lemmas()[0].name() if common_hypernym else None

### 4. Extract Features

In [11]:
def extract_features(e1, e2, sentence):

    
    doc = nlp(sentence)

    e1_pos, e2_pos = None, None
    e1_dep_token, e2_dep_token = 'NA.', 'NA.'
    e1_prev_token, e2_prev_token = 'NA.', 'NA.'
    e1_post_token, e2_post_token = 'NA.', 'NA.'
    
    e1_dep_noun, e1_dep_adj, e1_dep_verb, e1_dep_prep, e1_dep_subj, e1_dep_obj = 0, 0, 0, 0, 0, 0
    e2_dep_noun, e2_dep_adj, e2_dep_verb, e2_dep_prep, e2_dep_subj, e2_dep_obj = 0, 0, 0, 0, 0, 0
    num_verbs, num_nouns, num_prep, num_adj, num_words_btwn = 0, 0, 0, 0, 0 
    start_inbetween_count = False
    
    word_count = 0
    e1_position_set = e2_position_set = False
    e1_post_memory = e2_post_memory = False
    memory = 'NA.'
    e1_position = e2_position = -1
    for token in doc:
        word_count = word_count + 1
        
        if e1_post_memory == True:
            e1_post_token = str(token)
        e1_post_memory = False
        
        if e2_post_memory == True:
            e2_post_token = str(token)
        e2_post_memory = False
        
        # entity 1 dependency distribution
        if str(token) in e1.split():
            e1_pos = token.pos_
            e1_dep_noun = 1 if token.dep_[0]=='n' else 0
            e1_dep_adj = 1 if token.dep_[0]=='a' else 0
            e1_dep_verb = 1 if token.dep_[0]=='v' else 0
            e1_dep_prep = 1 if token.dep_[0]=='p' else 0
            e1_dep_subj = 1 if 'subj' in token.dep_ else 0
            e1_dep_obj = 1 if 'obj' in token.dep_ else 0
            e1_dep_token = str(token.head)
            
            if e1_prev_token == 'NA.':
                e1_prev_token = memory
                
            if e1_position_set == False:
                e1_position = word_count
                e1_position_set = True
                
            e1_post_memory = True
            
        # entity 2 dependency distribution    
        if str(token) in e2.split(): 
            e2_pos = token.pos_
            e2_dep_noun = 1 if token.dep_[0]=='n' else 0
            e2_dep_adj = 1 if token.dep_[0]=='a' else 0
            e2_dep_verb = 1 if token.dep_[0]=='v' else 0
            e2_dep_prep = 1 if token.dep_[0]=='p' else 0
            e2_dep_subj = 1 if 'subj' in token.dep_ else 0
            e2_dep_obj = 1 if 'obj' in token.dep_ else 0
            e2_dep_token = str(token.head)
            
            if e2_prev_token == 'NA.':
                e2_prev_token = memory
                
            if e2_position_set == False:
                e2_position = word_count
                e2_position_set = True
                
            e2_post_memory = True
            
        if (str(token) in [e1.split()[-1],e2.split()[-1]] and start_inbetween_count==False) or (str(token) in [e1.split()[0],e2.split()[0]] and start_inbetween_count==True):
            start_inbetween_count = not(start_inbetween_count)
        
        # words distribution between entities
        if start_inbetween_count==True:
            num_verbs = num_verbs+1 if token.pos_=='VERB' else num_verbs
            num_nouns = num_nouns+1 if token.pos_=='NOUN' else num_nouns
            num_prep = num_prep+1 if token.pos_=='ADP' else num_prep
            num_adj = num_adj+1 if token.pos_=='ADJ' else num_adj
            num_words_btwn = num_words_btwn+1 
            
        memory = str(token)

    # entity position in sentence
    e1_start = sentence.find(e1)
    e1_end = e1_start + len(e1)
    e2_start = sentence.find(e2)
    e2_end = e2_start + len(e2)

    
    return pd.Series([e1_pos, e1_dep_noun, e1_dep_adj, e1_dep_verb, e1_dep_prep, e1_dep_subj, e1_dep_obj, e1_dep_token, e1_prev_token, e1_post_token, e1_start, e1_end, e1_position, e2_pos, e2_dep_noun, e2_dep_adj, e2_dep_verb, e2_dep_prep, e2_dep_subj, e2_dep_obj, e2_dep_token, e2_prev_token, e2_post_token, e2_start, e2_end, e2_position, num_verbs, num_nouns, num_prep, num_adj, num_words_btwn], index=['e1_pos', 'e1_dep_noun', 'e1_dep_adj', 'e1_dep_verb', 'e1_dep_prep', 'e1_dep_subj', 'e1_dep_obj', 'e1_dep_token', 'e1_prev_token', 'e1_post_token', 'e1_start', 'e1_end', 'e1_position', 'e2_pos', 'e2_dep_noun', 'e2_dep_adj', 'e2_dep_verb', 'e2_dep_prep', 'e2_dep_subj', 'e2_dep_obj', 'e2_dep_token', 'e2_prev_token', 'e2_post_token', 'e2_start', 'e2_end', 'e2_position', 'num_verbs', 'num_nouns', 'num_prep', 'num_adj', 'num_words_btwn'])


### 5. Sentence Tuning

In [12]:
def tune_sentence(new_sentence, e1, e2, e1_def, e2_def):
    
    doc = nlp(new_sentence)
    return_sentence = ''
    prev_word_pos = ''
    prev_word = ''
    sentence_entry = False
    
    # strip sentence between entities and select important tokens
    for word in doc:
        if str(word) == e1 and sentence_entry == False:
            sentence_entry = True
            end_type = e2
        if str(word) == e2 and sentence_entry == False:
            sentence_entry = True
            end_type = e1
            
        if (str(word) == e1 or str(word) == e2 or word.pos_ in ['ADP', 'AUX','VERB', 'X']) and sentence_entry==True:

            if word.pos_ =='ADP' and not(prev_word_pos in ['AUX','VERB', 'X'] or prev_word == e1 or prev_word == e2):
                pass
            else:
                return_sentence = return_sentence + ' ' + word.text
            
            prev_word_pos = word.pos_
            prev_word = str(word)
            
            if end_type == str(word):
                sentence_entry = False
                break
                
    # extract entity definition tokens
    def definition_tokens(doc):
        new_def = ''
        for word in doc:
            if (word.pos_ in ['AUX','VERB', 'X', 'NOUN']):
                new_def = new_def + ' ' + word.text
        return new_def
    
    new_e1_def = definition_tokens(nlp(e1_def))
    new_e2_def = definition_tokens(nlp(e2_def))
                  
    return pd.Series([return_sentence[1:], return_sentence[1:].strip()+' where '+e1+' is the '+new_e1_def.strip()+' and '+e2+' is the '+new_e2_def.strip()], index=['custom_sentence', 'corrected_sentence'])
                                                                                                                                          
                                                                                                                                          

### Apply All Functions (1. + 2. + 3. + 4. + 5.)

In [13]:
def preprocess_data(df_original, train=True):
    df = df_original.copy()
    df[['e1','e2']] = df['sentence'].swifter.apply(extract_entities)
    df['warning_flags'] = 0
    df[['e1_vector','warning_flags','corrected_sentence','corrected_e1','trimmed_sentence']] = df.swifter.apply(lambda x: text_to_vectors(x['e1'],x['sentence'],x['warning_flags']),axis=1)
    df[['e2_vector','warning_flags','corrected_sentence','corrected_e2','trimmed_sentence']] = df.swifter.apply(lambda x: text_to_vectors(x['e2'],x['sentence'],x['warning_flags']),axis=1)
    df[['e1_synset','e1_definition','warning_flags']] = df.swifter.apply(lambda x: disambiguate_entity_in_sentence(x['corrected_sentence'],x['corrected_e1'],x['warning_flags']), axis=1)
    df[['e2_synset','e2_definition','warning_flags']] = df.swifter.apply(lambda x: disambiguate_entity_in_sentence(x['corrected_sentence'],x['corrected_e2'],x['warning_flags']), axis=1)
    df['hypernym'] = df.swifter.apply(lambda x: get_hypernym(x['e1_synset'],x['e2_synset']),axis=1)
    df[['e1_pos', 'e1_dep_noun', 'e1_dep_adj', 'e1_dep_verb', 'e1_dep_prep', 'e1_dep_subj', 'e1_dep_obj', 'e1_dep_token', 'e1_prev_token', 'e1_post_token', 'e1_start', 'e1_end', 'e1_position', 'e2_pos', 'e2_dep_noun', 'e2_dep_adj', 'e2_dep_verb', 'e2_dep_prep', 'e2_dep_subj', 'e2_dep_obj', 'e2_dep_token', 'e2_prev_token', 'e2_post_token', 'e2_start', 'e2_end', 'e2_position', 'num_verbs', 'num_nouns', 'num_prep', 'num_adj', 'num_words_btwn']] = df.swifter.apply(lambda x: extract_features(x['corrected_e1'],x['corrected_e2'],x['corrected_sentence']), axis=1)
    df[['custom_sentence','custom_sentence_wdef']] = df.swifter.apply(lambda x: tune_sentence(x['trimmed_sentence'],x['corrected_e1'],x['corrected_e2'],x['e1_definition'],x['e2_definition']),axis=1)
    
    df['hypernym'] = df.swifter.apply(lambda x: text_to_vectors(x['hypernym']),axis=1)
    df['e1_prev_token'] = df.swifter.apply(lambda x: text_to_vectors(x['e1_prev_token']),axis=1)
    df['e2_prev_token'] = df.swifter.apply(lambda x: text_to_vectors(x['e2_prev_token']),axis=1)
    df['e1_post_token'] = df.swifter.apply(lambda x: text_to_vectors(x['e1_post_token']),axis=1)
    df['e2_post_token'] = df.swifter.apply(lambda x: text_to_vectors(x['e2_post_token']),axis=1)
    df['e1_dep_token'] = df.swifter.apply(lambda x: text_to_vectors(x['e1_dep_token']),axis=1)
    df['e2_dep_token'] = df.swifter.apply(lambda x: text_to_vectors(x['e2_dep_token']),axis=1)
    
    if train==True:
        df = df[df['warning_flags']==0]

    return df

In [14]:
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df, False)

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Word:"mesencephalon" NOT FOUND
Word:"azeotrope" NOT FOUND
Word:"tokotoko" NOT FOUND


Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Word:"mesophyll" NOT FOUND


Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Synset for "geoid" NOT FOUND
Synset for "mellitus" NOT FOUND
Synset for "nye" NOT FOUND
Synset for "helicobacter" NOT FOUND
Synset for "Pleiad" NOT FOUND
Synset for "chai" NOT FOUND
Synset for "Pleiad" NOT FOUND
Synset for "santur" NOT FOUND
Synset for "Cavvy" NOT FOUND
Synset for "configurator" NOT FOUND
Synset for "opioids" NOT FOUND
Synset for "Pleiad" NOT FOUND
Synset for "Pleiad" NOT FOUND
Synset for "Wunch" NOT FOUND
Synset for "Pleiad" NOT FOUND
Synset for "Sord" NOT FOUND
Synset for "vulvodynia" NOT FOUND
Synset for "app" NOT FOUND
Synset for "exe" NOT FOUND
Synset for "rar" NOT FOUND
Synset for "joey" NOT FOUND


Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Synset for "depo" NOT FOUND
Synset for "mai" NOT FOUND
Synset for "apps" NOT FOUND
Synset for "mesophyll" NOT FOUND


Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Word:"Ibsens" NOT FOUND
Word:"Cassiolis" NOT FOUND


Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Word:"Neuraths" NOT FOUND
Word:"oxalic" NOT FOUND
Word:"Middlesbroughs" NOT FOUND


Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/8000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Synset for "app" NOT FOUND
Synset for "cha" NOT FOUND
Synset for "sid" NOT FOUND
Synset for "Dule" NOT FOUND
Synset for "Sord" NOT FOUND
Synset for "Dule" NOT FOUND


Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Synset for "yourselfer" NOT FOUND
Synset for "wiki" NOT FOUND
Synset for "prequels" NOT FOUND


Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Word:"hammerbeam" NOT FOUND
Word:"Hackmans" NOT FOUND
Word:"Newsteds" NOT FOUND


Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Word:"abyssinica" NOT FOUND


Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Word:"ploughed" NOT FOUND


Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

Word:"ploughed" NOT FOUND
Word:"manoeuvred" NOT FOUND


Pandas Apply:   0%|          | 0/2717 [00:00<?, ?it/s]

# Modelling

### 1. Model Training

In [15]:
def get_X(df,bow=False, *args):

    train_data = None
    
    for feature in args:
        if isinstance(df[feature].iloc[0], np.ndarray):
            if train_data is None:  
                train_data = np.vstack(df[feature])
            else:
                train_data = np.concatenate((train_data, np.vstack(df[feature])),axis=1)

        elif isinstance(df[feature].iloc[0], np.int64):
            if train_data is None:  
                train_data = np.array(df[feature]).reshape(-1, 1)
            else:
                train_data = np.concatenate((train_data, np.array(df[feature]).reshape(-1, 1)),axis=1)
                           
        elif isinstance(df[feature].iloc[0], str):

            if train_data is None:  
                train_data = np.vstack(sent2vec_model.encode(df[feature].tolist()))
            else:
                train_data = np.concatenate((train_data, np.vstack(sent2vec_model.encode(df[feature].tolist()))),axis=1) 
                           
        else:
            pass
    
    if bow is not None and len(bow)>0:
        if train_data is None:  
            train_data = np.vstack(bow)
        else:
            train_data = np.concatenate((train_data, np.vstack(bow)),axis=1)
    

    return train_data

In [16]:
def get_y(df,feature):
    return np.array(df[feature].tolist())

### 2. Model Implementation

### Experiment 1: Custom Sentence and Entity Vectors

In [17]:
X = train_df.drop(columns=['relation']) 
y = train_df['relation']  

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

# add bag of words feature
# add features for words occuring frequently: min 25 times
vectorizer = CountVectorizer(min_df=25)
train_bow = vectorizer.fit_transform(X_train['custom_sentence']).toarray()
val_bow = vectorizer.transform(X_val['custom_sentence']).toarray()


X_train = get_X(X_train, train_bow, 'custom_sentence_wdef','e1_vector','e2_vector')
X_val = get_X(X_val, val_bow, 'custom_sentence_wdef','e1_vector','e2_vector')

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.73      0.70        55
           1       0.76      0.86      0.81        91
           2       0.74      0.73      0.73        70
           3       0.49      0.66      0.56        61
           4       0.74      0.78      0.76        65
           5       0.90      0.72      0.80        25
           6       0.72      0.70      0.71       119
           8       0.67      0.70      0.68        77
           9       0.93      0.81      0.86        31
          10       0.56      0.69      0.62        13
          11       0.59      0.67      0.63        61
          12       0.44      0.50      0.47         8
          13       0.77      0.87      0.81        76
          14       0.71      0.74      0.72        73
          15       0.86      0.63      0.73        19
          16       0.84      0.69      0.76        52
          17       0.74      0.60      0.66        67
          18       0.52    

### Experiment 2: Custom Sentence, Entity Vectors and Dependencies

In [18]:
X = train_df.drop(columns=['relation']) 
y = train_df['relation']  

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

# add bag of words feature
# add features for words occuring frequently: min 25 times
vectorizer = CountVectorizer(min_df=25)
train_bow = vectorizer.fit_transform(X_train['custom_sentence']).toarray()
val_bow = vectorizer.transform(X_val['custom_sentence']).toarray()


X_train = get_X(X_train, train_bow, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token')
X_val = get_X(X_val, val_bow, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token')

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.84      0.80        55
           1       0.84      0.87      0.85        91
           2       0.62      0.79      0.69        70
           3       0.64      0.70      0.67        61
           4       0.82      0.75      0.78        65
           5       0.83      0.76      0.79        25
           6       0.84      0.85      0.85       119
           8       0.74      0.78      0.76        77
           9       0.93      0.84      0.88        31
          10       0.60      0.46      0.52        13
          11       0.68      0.80      0.74        61
          12       0.86      0.75      0.80         8
          13       0.78      0.88      0.83        76
          14       0.74      0.85      0.79        73
          15       0.75      0.63      0.69        19
          16       0.89      0.75      0.81        52
          17       0.74      0.67      0.70        67
          18       0.56    

### Experiment 3: Custom Sentence, Entity Vectors and Dependencies + Additional Definition & Hypernym

In [19]:
X = train_df.drop(columns=['relation']) 
y = train_df['relation']  

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)


# add bag of words feature
# add features for words occuring frequently: min 25 times
vectorizer = CountVectorizer(min_df=25)
train_bow = vectorizer.fit_transform(X_train['custom_sentence']).toarray()
val_bow = vectorizer.transform(X_val['custom_sentence']).toarray()


X_train = get_X(X_train, train_bow, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token','e1_definition','e2_definition','hypernym')
X_val = get_X(X_val, val_bow, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token','e1_definition','e2_definition','hypernym')

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.76      0.85      0.80        55
           1       0.84      0.86      0.85        91
           2       0.65      0.79      0.71        70
           3       0.69      0.67      0.68        61
           4       0.80      0.80      0.80        65
           5       0.87      0.80      0.83        25
           6       0.84      0.87      0.85       119
           8       0.72      0.74      0.73        77
           9       0.90      0.87      0.89        31
          10       0.73      0.62      0.67        13
          11       0.70      0.82      0.76        61
          12       0.86      0.75      0.80         8
          13       0.78      0.88      0.83        76
          14       0.73      0.84      0.78        73
          15       0.74      0.74      0.74        19
          16       0.82      0.77      0.79        52
          17       0.70      0.70      0.70        67
          18       0.59    

### Experiment 4: All Extracted Features

In [20]:
X = train_df.drop(columns=['relation']) 
y = train_df['relation']  

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)


# add bag of words feature
# add features for words occuring frequently: min 25 times
vectorizer = CountVectorizer(min_df=25)
train_bow = vectorizer.fit_transform(X_train['custom_sentence']).toarray()
val_bow = vectorizer.transform(X_val['custom_sentence']).toarray()


X_train = get_X(X_train, train_bow, 'custom_sentence_wdef','e1_vector','e2_vector', 'e1_dep_noun', 'e1_dep_adj', 'e1_dep_verb', 'e1_dep_prep', 'e1_dep_subj', 'e1_dep_obj', 'e1_dep_token', 'e1_prev_token', 'e1_post_token', 'e1_start', 'e1_end', 'e1_position', 'e2_dep_noun', 'e2_dep_adj', 'e2_dep_verb', 'e2_dep_prep', 'e2_dep_subj', 'e2_dep_obj', 'e2_dep_token', 'e2_prev_token', 'e2_post_token', 'e2_start', 'e2_end', 'e2_position', 'num_verbs', 'num_nouns', 'num_prep', 'num_adj', 'num_words_btwn', 'e1_definition', 'e2_definition','hypernym')
X_val = get_X(X_val, val_bow, 'custom_sentence_wdef','e1_vector','e2_vector', 'e1_dep_noun', 'e1_dep_adj', 'e1_dep_verb', 'e1_dep_prep', 'e1_dep_subj', 'e1_dep_obj', 'e1_dep_token', 'e1_prev_token', 'e1_post_token', 'e1_start', 'e1_end', 'e1_position', 'e2_dep_noun', 'e2_dep_adj', 'e2_dep_verb', 'e2_dep_prep', 'e2_dep_subj', 'e2_dep_obj', 'e2_dep_token', 'e2_prev_token', 'e2_post_token', 'e2_start', 'e2_end', 'e2_position', 'num_verbs', 'num_nouns', 'num_prep', 'num_adj', 'num_words_btwn', 'e1_definition', 'e2_definition','hypernym')

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.82        55
           1       0.80      0.88      0.84        91
           2       0.67      0.76      0.71        70
           3       0.68      0.69      0.68        61
           4       0.83      0.75      0.79        65
           5       0.83      0.76      0.79        25
           6       0.88      0.88      0.88       119
           8       0.75      0.78      0.76        77
           9       0.89      0.81      0.85        31
          10       0.67      0.62      0.64        13
          11       0.74      0.82      0.78        61
          12       0.75      0.75      0.75         8
          13       0.80      0.91      0.85        76
          14       0.72      0.85      0.78        73
          15       0.78      0.74      0.76        19
          16       0.91      0.77      0.83        52
          17       0.69      0.75      0.72        67
          18       0.58    

### Experiment 5: Best features + Excluding 'Other' type relation

In [21]:
X = train_df[train_df['relation']!=18].drop(columns=['relation'])

y = train_df[train_df['relation']!=18]['relation']  

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)


# add bag of words feature
# add features for words occuring frequently: min 25 times
vectorizer = CountVectorizer(min_df=25)
train_bow = vectorizer.fit_transform(X_train['custom_sentence']).toarray()
val_bow = vectorizer.transform(X_val['custom_sentence']).toarray()


X_train = get_X(X_train, train_bow, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token','e1_definition','e2_definition','hypernym')
X_val = get_X(X_val, val_bow, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token','e1_definition','e2_definition','hypernym')

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_val)
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.87      0.86      0.86        63
           1       0.94      0.85      0.89        92
           2       0.77      0.88      0.82        64
           3       0.81      0.72      0.76        82
           4       0.86      0.86      0.86        63
           5       1.00      1.00      1.00        22
           6       0.88      0.94      0.91       119
           8       0.82      0.82      0.82        76
           9       0.90      0.90      0.90        20
          10       1.00      0.67      0.80        18
          11       0.82      0.82      0.82        57
          12       1.00      0.73      0.84        11
          13       0.89      0.97      0.93        91
          14       0.87      0.90      0.89        84
          15       0.73      0.69      0.71        16
          16       0.85      0.92      0.89        51
          17       0.87      0.81      0.84        57

    accuracy              

In [22]:
print (len(train_df[train_df['relation']==7]))

1


### Analysis

### Final Model

In [23]:
global vectorizer
vectorizer = CountVectorizer(min_df=25)
vectorizer = vectorizer.fit(train_df['custom_sentence'])
train_bow = vectorizer.transform(train_df['custom_sentence']).toarray()
test_bow = vectorizer.transform(test_df['custom_sentence']).toarray()



X_train = get_X(train_df, train_bow, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token','e1_definition','e2_definition','hypernym')
y_train = get_y(train_df, 'relation')
X_test = get_X(test_df, test_bow, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token','e1_definition','e2_definition','hypernym')
y_test = get_y(test_df, 'relation')

svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# # Predictions
y_pred = svm_model.predict(X_test)

# # Model evaluation
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.89      0.91      0.90       134
           1       0.88      0.90      0.89       194
           2       0.70      0.77      0.73       162
           3       0.66      0.71      0.68       150
           4       0.81      0.83      0.82       153
           5       0.90      0.69      0.78        39
           6       0.84      0.88      0.86       291
           7       0.00      0.00      0.00         1
           8       0.81      0.82      0.81       211
           9       0.84      0.79      0.81        47
          10       0.53      0.41      0.46        22
          11       0.80      0.78      0.79       134
          12       0.77      0.62      0.69        32
          13       0.83      0.87      0.85       201
          14       0.79      0.82      0.80       210
          15       0.73      0.65      0.69        51
          16       0.81      0.72      0.76       108
          17       0.83    

# Inference Mode

In [28]:
relation_name = {
0: 'Cause-Effect(e1,e2)',
1: 'Cause-Effect(e2,e1)',
2: 'Component-Whole(e1,e2)',
3: 'Component-Whole(e2,e1)',
4: 'Content-Container(e1,e2)',
5: 'Content-Container(e2,e1)',
6: 'Entity-Destination(e1,e2)',
7: 'Entity-Destination(e2,e1)',
8: 'Entity-Origin(e1,e2)',
9: 'Entity-Origin(e2,e1)',
10: 'Instrument-Agency(e1,e2)', 
11: 'Instrument-Agency(e2,e1)',
12: 'Member-Collection(e1,e2)',
13: 'Member-Collection(e2,e1)',
14: 'Message-Topic(e1,e2)',
15: 'Message-Topic(e2,e1)',
16: 'Product-Producer(e1,e2)',
17: 'Product-Producer(e2,e1)',
18: 'Other'
}

display(pd.DataFrame([(key, value) for key, value in relation_name.items()], columns=["relation", "relation type"]))

def inference_mode(sentence):
    if type(sentence)==str:
        sentence = [sentence]
    df = pd.DataFrame(sentence, columns = ['sentence'])
    df = preprocess_data(df, False)

    bow_d = vectorizer.transform(df['custom_sentence']).toarray()

    X_test = get_X(df, bow_d, 'custom_sentence_wdef','e1_vector','e2_vector','e1_dep_token','e2_dep_token','e1_prev_token','e2_prev_token','e1_post_token','e2_post_token','e1_definition','e2_definition','hypernym')

    y_pred = svm_model.predict(X_test)
    df['predicted_relation'] = y_pred
    df['predicted_relation_type'] = df['predicted_relation'].map(relation_name)
    return df[['sentence','predicted_relation_type']]

Unnamed: 0,relation,relation type
0,0,"Cause-Effect(e1,e2)"
1,1,"Cause-Effect(e2,e1)"
2,2,"Component-Whole(e1,e2)"
3,3,"Component-Whole(e2,e1)"
4,4,"Content-Container(e1,e2)"
5,5,"Content-Container(e2,e1)"
6,6,"Entity-Destination(e1,e2)"
7,7,"Entity-Destination(e2,e1)"
8,8,"Entity-Origin(e1,e2)"
9,9,"Entity-Origin(e2,e1)"


### Try Your Sentence

In [27]:
inference_mode('<e1>Global warming</e1> is a result of <e2>deforestation</e2>.')


Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,sentence,predicted_relation_type
0,<e1>Global warming</e1> is a result of <e2>def...,"Cause-Effect(e1,e2)"
