In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training pairs from the relative path data/train.csv.
train_df = pd.read_csv('data/train.csv')

In [3]:
# Preview the first rows to see which columns were loaded.
train_df.head()

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [6]:
import nltk
# Fetch the WordNet corpus, which the WordNetLemmatizer used further down reads from.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/emily/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance; the get_lemmas helpers defined later in this
# notebook reference this global by name.
lemmatizer = WordNetLemmatizer()

In [8]:
# Sanity check: lemmatize a word without passing any part-of-speech argument.
print('abatement :', lemmatizer.lemmatize('abatement'))

abatement : abatement


In [9]:
# Same call on an "-ing" form: with no part-of-speech argument the recorded
# result is the word itself, unchanged.
print('abating :', lemmatizer.lemmatize('abating'))

abating : abating


In [11]:
# Fetch the perceptron tagger model so that nltk.pos_tag can be used below.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/emily/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [18]:
# Tag the word on its own (a one-token list) to see which POS tag it receives.
print(nltk.pos_tag(['abating']))

[('abating', 'VBG')]


In [20]:
# Same check for the noun form.
print(nltk.pos_tag(['abatement']))

[('abatement', 'NN')]


In [22]:
# Map the first two characters of a POS tag to the single-letter POS code that
# is passed to lemmatizer.lemmatize in the cells below.
wordnet_pos = {'NN': 'n','JJ': 'a', 'RB': 'r', 'VB': 'v'}

In [25]:
# Tag 'abating', keep the first two characters of its tag, and look up the
# corresponding WordNet POS code.
wordnet_pos[nltk.pos_tag(['abating'])[0][1][:2]]

'v'

In [26]:
# Lemmatize again, this time passing the POS code derived from the tagger.
print('abating :', lemmatizer.lemmatize('abating', wordnet_pos[nltk.pos_tag(['abating'])[0][1][:2]]))

abating : abate


In [27]:
# Same POS-aware lemmatization for the noun form.
print('abatement :', lemmatizer.lemmatize('abatement', wordnet_pos[nltk.pos_tag(['abatement'])[0][1][:2]]))

abatement : abatement


In [30]:
def get_lemmas(word):
    """Return the lemma of a single word, using its POS tag as a hint.

    The word is tagged on its own with ``nltk.pos_tag``; the first two
    characters of the resulting tag select the WordNet POS code handed to
    the module-level ``lemmatizer``.

    Args:
        word: A single token. No splitting is performed here.

    Returns:
        The lemma string returned by ``lemmatizer.lemmatize``.
    """
    wordnet_pos = {'NN': 'n','JJ': 'a', 'RB': 'r', 'VB': 'v'}
    tag_prefix = nltk.pos_tag([word])[0][1][:2]
    # A tag prefix outside the four mapped ones used to raise KeyError here.
    # Fall back to 'n', which is the lemmatizer's own default POS, so every
    # token still yields a lemma.
    return lemmatizer.lemmatize(word, wordnet_pos.get(tag_prefix, 'n'))

def compare_lemmas(word1, word2):
    """Score two lemmas: 1.0 when they compare equal, 0.0 otherwise."""
    return 1.0 if word1 == word2 else 0.0

In [31]:
# Two forms of the same verb: compare their lemmas.
compare_lemmas(get_lemmas('abating'), get_lemmas('abate'))

1.0

In [33]:
# Inspect pairs whose labelled score is exactly 1.0.
train_df[train_df['score'].eq(1.0)].head()

Unnamed: 0,id,anchor,target,context,score
28,473137168ebf7484,abatement,abating,F24,1.0
158,621b048d70aa8867,absorbent properties,absorbent characteristics,D01,1.0
161,bc20a1c961cb073a,absorbent properties,absorption properties,D01,1.0
311,e955700dffd68624,acid absorption,absorption of acid,B08,1.0
315,3a09aba546aac675,acid absorption,acid absorption,B08,1.0


In [34]:
# Verb form vs. noun form of the same root: this pair is labelled 1.0 in the
# data, so check whether the lemma comparison agrees.
compare_lemmas(get_lemmas('abating'), get_lemmas('abatement'))

0.0

In [35]:
# Another pair of related words taken from the score == 1.0 preview.
compare_lemmas(get_lemmas('absorption'), get_lemmas('absorbent'))

0.0

In [40]:
def get_lemmas(phrase):
    """Lemmatize the words of a phrase, keeping only those with a mapped POS.

    The phrase is split on single spaces and each word is tagged on its own.
    Words whose tag prefix is not one of NN/JJ/RB/VB are dropped, so the
    result can be shorter than the number of words in the phrase.

    Note: this shares its name with the single-word helper defined earlier in
    the notebook and replaces it once this cell has run.

    Args:
        phrase: Space-separated words.

    Returns:
        List of lemma strings, in phrase order.
    """
    pos_by_tag_prefix = {'NN': 'n', 'JJ': 'a', 'RB': 'r', 'VB': 'v'}
    # Lazily pair each token with the first two characters of its POS tag.
    tagged = (
        (token, nltk.pos_tag([token])[0][1][:2])
        for token in phrase.split(' ')
    )
    return [
        lemmatizer.lemmatize(token, pos_by_tag_prefix[prefix])
        for token, prefix in tagged
        if prefix in pos_by_tag_prefix
    ]

def compare_lemmas(phrase1, phrase2):
    """Return 1.0 when both phrases yield the same set of lemmas, else 0.0.

    Word order and repeated lemmas are ignored because the lemma lists are
    compared as sets.

    Note: this shares its name with the word-level comparison defined earlier
    in the notebook and replaces it once this cell has run.
    """
    first = set(get_lemmas(phrase1))
    second = set(get_lemmas(phrase2))
    return 1.0 if first == second else 0.0

In [41]:
# Identical phrases must score 1.0.
compare_lemmas('acid absorption', 'acid absorption')

1.0

In [42]:
# Same content words in a different order, plus an extra 'of'.
compare_lemmas('acid absorption', 'absorption of acid')

1.0

In [43]:
# Score every anchor/target pair row by row with the phrase-level compare_lemmas.
train_df['lemma_score'] = train_df.apply(lambda row: compare_lemmas(row['anchor'], row['target']), axis=1)

In [44]:
# lemma_score is either 0.0 or 1.0, so the sum is the number of pairs flagged as matching.
train_df['lemma_score'].sum()

1080.0

In [45]:
# Pairs where the heuristic says 1.0 and the label is also 1.0.
train_df[train_df['lemma_score'].eq(1.0) & train_df['score'].eq(1.0)].shape

(799, 6)

In [46]:
# Pairs where the heuristic says 1.0 but the label is something else.
train_df[train_df['lemma_score'].eq(1.0) & train_df['score'].ne(1.0)].shape

(281, 6)

In [47]:
# Look at the pairs the heuristic over-scores to see what they have in common.
train_df[train_df['lemma_score'].eq(1.0) & train_df['score'].ne(1.0)].head(25)

Unnamed: 0,id,anchor,target,context,score,lemma_score
214,1b9e8b4b43d11ca6,accept information,accept this information,A45,0.75,1.0
216,c94e7b08d9402064,accept information,accepting information from,A45,0.75,1.0
625,401fdb79d3fbcea9,adaptive linear,linear adaptive,G01,0.75,1.0
648,bb4fe1d01682b78b,adaptive linear,linear adaptive,H04,0.75,1.0
1799,76d4070823a41a20,application messaging,messaging application,G06,0.5,1.0
1828,141a9c7a7bb0d6b3,apply on tube,apply to tube,F16,0.75,1.0
1850,bc9d8f58f814e0cb,apply to anode electrode,apply between anode electrode,H01,0.75,1.0
1899,99d105ef372366ae,apply to requests,apply to both requests,H04,0.75,1.0
2209,12b3c70c1c49d97c,arm roller,roller arm,B27,0.75,1.0
2247,1bb5286edaca253a,arm roller,roller arm,E04,0.5,1.0


In [48]:
def compare_lemmas2(phrase1, phrase2):
    """Compare two phrases by lemma, distinguishing ordered from unordered matches.

    Returns:
        1.0 when both phrases yield the same lemma list (same order),
        0.75 when they yield the same set of lemmas but different lists,
        0.0 when the lemma sets differ.
    """
    first, second = get_lemmas(phrase1), get_lemmas(phrase2)
    if set(first) != set(second):
        return 0.0
    return 1.0 if first == second else 0.75

In [49]:
# Score every anchor/target pair row by row with compare_lemmas2.
train_df['lemma_score2'] = train_df.apply(lambda row: compare_lemmas2(row['anchor'], row['target']), axis=1)

In [50]:
# Pairs where compare_lemmas2 says 1.0 and the label is also 1.0.
train_df[train_df['lemma_score2'].eq(1.0) & train_df['score'].eq(1.0)].shape

(706, 7)

In [51]:
# Pairs where compare_lemmas2 says 1.0 but the label is something else.
train_df[train_df['lemma_score2'].eq(1.0) & train_df['score'].ne(1.0)].shape

(177, 7)

In [52]:
# Inspect the remaining over-scored pairs after accounting for word order.
train_df[train_df['lemma_score2'].eq(1.0) & train_df['score'].ne(1.0)].head(25)

Unnamed: 0,id,anchor,target,context,score,lemma_score,lemma_score2
214,1b9e8b4b43d11ca6,accept information,accept this information,A45,0.75,1.0,1.0
216,c94e7b08d9402064,accept information,accepting information from,A45,0.75,1.0,1.0
1828,141a9c7a7bb0d6b3,apply on tube,apply to tube,F16,0.75,1.0,1.0
1850,bc9d8f58f814e0cb,apply to anode electrode,apply between anode electrode,H01,0.75,1.0,1.0
1899,99d105ef372366ae,apply to requests,apply to both requests,H04,0.75,1.0,1.0
2797,de263d3633fb36e9,azabicyclo,1 azabicyclo,G01,0.5,1.0,1.0
2870,7d7ce6f2840f62df,base fuels,based fuels,C10,0.5,1.0,1.0
3666,f297ea79eab86071,boom hydraulic cylinder,boom a hydraulic cylinder,E02,0.0,1.0,1.0
4369,9bae93608b5c4e52,carpet tiles,carpet in tile,B41,0.75,1.0,1.0
4401,47426c640707f475,carpet tiles,carpet in tile,B44,0.75,1.0,1.0


In [72]:
def compare_lemmas3(phrase1, phrase2):
    """Compare two phrases by lemma, penalising dropped words and reordering.

    Returns:
        1.0 when both phrases yield the same lemma list in the same order
            and no word of either phrase was dropped by ``get_lemmas``,
        0.75 when the lemma sets match but the condition above does not hold,
        0.0 when the lemma sets differ.
    """
    lemmas1 = get_lemmas(phrase1)
    lemmas2 = get_lemmas(phrase2)
    if set(lemmas1) != set(lemmas2):
        return 0.0
    # get_lemmas drops words whose POS is not mapped; equal lengths mean every
    # space-separated word of the phrase produced a lemma.
    every_word_kept = (
        len(phrase1.split(' ')) == len(lemmas1)
        and len(phrase2.split(' ')) == len(lemmas2)
    )
    if lemmas1 == lemmas2 and every_word_kept:
        return 1.0
    return 0.75

In [60]:
# Score every anchor/target pair row by row with compare_lemmas3.
# NOTE(review): this cell is numbered In [60] while the compare_lemmas3
# definition above is In [72], so the stored results may predate the current
# definition — re-run the notebook top to bottom to confirm.
train_df['lemma_score3'] = train_df.apply(lambda row: compare_lemmas3(row['anchor'], row['target']), axis=1)

In [61]:
# Pairs where compare_lemmas3 says 1.0 and the label is also 1.0.
train_df[train_df['lemma_score3'].eq(1.0) & train_df['score'].eq(1.0)].shape

(620, 8)

In [62]:
# Pairs where compare_lemmas3 says 1.0 but the label is something else.
train_df[train_df['lemma_score3'].eq(1.0) & train_df['score'].ne(1.0)].shape

(19, 8)

In [63]:
# Inspect the pairs compare_lemmas3 still over-scores.
train_df[train_df['lemma_score3'].eq(1.0) & train_df['score'].ne(1.0)].head(25)

Unnamed: 0,id,anchor,target,context,score,lemma_score,lemma_score2,lemma_score3
2870,7d7ce6f2840f62df,base fuels,based fuels,C10,0.5,1.0,1.0,1.0
5006,3dc843d0c815f298,central nucleus,central nuclei,C09,0.75,1.0,1.0,1.0
9728,ebc1afcd555078c4,display object,display objects,G04,0.75,1.0,1.0,1.0
10222,7bd34470550626b9,drive balls,driving balls,B05,0.75,1.0,1.0,1.0
11257,1fc8b3f3535d5695,embedding groove,embed groove,B25,0.75,1.0,1.0,1.0
12201,1079dfc256101746,extend areas,extended area,A24,0.75,1.0,1.0,1.0
18739,213a922e11f3957c,lower stretches,low stretches,F03,0.5,1.0,1.0,1.0
24646,14d11fa15acb761b,polls,poll,B21,0.75,1.0,1.0,1.0
26648,1bb3f7eb752be6c4,pulsed plasma,pulse plasma,H05,0.75,1.0,1.0,1.0
26756,87c9433dd134836a,punch face,punch faces,B21,0.75,1.0,1.0,1.0


In [64]:
# Opposite error: pairs labelled 1.0 that compare_lemmas3 only scores 0.75.
train_df[train_df['lemma_score3'].eq(0.75) & train_df['score'].eq(1.0)].shape

(179, 8)

In [65]:
# Preview the under-scored pairs.
train_df[train_df['lemma_score3'].eq(0.75) & train_df['score'].eq(1.0)].head()

Unnamed: 0,id,anchor,target,context,score,lemma_score,lemma_score2,lemma_score3
311,e955700dffd68624,acid absorption,absorption of acid,B08,1.0,1.0,0.75,0.75
713,f3d0de522d259188,adjacent laterally,laterally adjacent,A41,1.0,1.0,0.75,0.75
744,6eb16b4b2255a4eb,adjacent laterally,laterally adjacent,B23,1.0,1.0,0.75,0.75
826,1e59a41f6ea51c98,adjustable multiple,multiple adjustable,B23,1.0,1.0,0.75,0.75
913,db9cdd7e95b800ad,agitate means,agitate by means,B01,1.0,1.0,1.0,0.75


In [69]:
# Pull the lemma lists for the row at position 744 (presumably the row shown
# with index 744 in the preview above) to see why it was scored 0.75.
lemmas1 = get_lemmas(train_df.iloc[744]['anchor'])
lemmas2 = get_lemmas(train_df.iloc[744]['target'])

In [70]:
# Lemmas extracted from the anchor phrase.
lemmas1

['adjacent', 'laterally']

In [71]:
# Lemmas extracted from the target phrase.
lemmas2

['laterally', 'adjacent']

In [75]:
def compare_lemmas4(phrase1, phrase2):
    """Compare two phrases, first on raw words and then on lemmas.

    Returns:
        1.0 when the phrases are equal, or when they contain the same set of
            space-separated words (order and repetition ignored), or when
            both yield the same lemma list in the same order with no word
            dropped by ``get_lemmas``,
        0.75 when only the lemma sets match,
        0.0 when the lemma sets differ.
    """
    if phrase1 == phrase2:
        return 1.0
    words1 = phrase1.split(' ')
    words2 = phrase2.split(' ')
    if set(words1) == set(words2):
        return 1.0
    lemmas1 = get_lemmas(phrase1)
    lemmas2 = get_lemmas(phrase2)
    if set(lemmas1) != set(lemmas2):
        return 0.0
    # get_lemmas drops words whose POS is not mapped; equal lengths mean every
    # word of the phrase produced a lemma.
    every_word_kept = len(words1) == len(lemmas1) and len(words2) == len(lemmas2)
    if lemmas1 == lemmas2 and every_word_kept:
        return 1.0
    return 0.75

In [76]:
# Score every anchor/target pair row by row with compare_lemmas4.
train_df['lemma_score4'] = train_df.apply(lambda row: compare_lemmas4(row['anchor'], row['target']), axis=1)

In [77]:
# Pairs where compare_lemmas4 says 1.0 and the label is also 1.0.
train_df[train_df['lemma_score4'].eq(1.0) & train_df['score'].eq(1.0)].shape

(703, 9)

In [78]:
# Pairs where compare_lemmas4 says 1.0 but the label is something else.
train_df[train_df['lemma_score4'].eq(1.0) & train_df['score'].ne(1.0)].shape

(73, 9)

In [79]:
# Pairs labelled 1.0 that compare_lemmas4 does not score 1.0.
train_df[train_df['lemma_score4'].ne(1.0) & train_df['score'].eq(1.0)].shape

(451, 9)

In [80]:
# Of those, the pairs that compare_lemmas4 scores 0.75.
train_df[train_df['lemma_score4'].eq(0.75) & train_df['score'].eq(1.0)].shape

(96, 9)

In [81]:
# Pairs where compare_lemmas4 says 0.75 and the label is also 0.75.
train_df[train_df['lemma_score4'].eq(0.75) & train_df['score'].eq(0.75)].shape

(116, 9)

In [82]:
# Correlation between the labelled score and the compare_lemmas4 heuristic,
# using DataFrame.corr with its default method.
train_df[['score', 'lemma_score4']].corr()

Unnamed: 0,score,lemma_score4
score,1.0,0.368831
lemma_score4,0.368831,1.0


In [83]:
# Preview the pairs compare_lemmas4 over-scores.
train_df[train_df['lemma_score4'].eq(1.0) & train_df['score'].ne(1.0)].head()

Unnamed: 0,id,anchor,target,context,score,lemma_score,lemma_score2,lemma_score3,lemma_score4
625,401fdb79d3fbcea9,adaptive linear,linear adaptive,G01,0.75,1.0,0.75,0.75,1.0
648,bb4fe1d01682b78b,adaptive linear,linear adaptive,H04,0.75,1.0,0.75,0.75,1.0
1799,76d4070823a41a20,application messaging,messaging application,G06,0.5,1.0,0.75,0.75,1.0
2209,12b3c70c1c49d97c,arm roller,roller arm,B27,0.75,1.0,0.75,0.75,1.0
2247,1bb5286edaca253a,arm roller,roller arm,E04,0.5,1.0,0.75,0.75,1.0
