In [None]:
from nltk.corpus import wordnet

# Show which WordNet version the NLTK corpus reader reports.
wordnet_version = wordnet.get_version()
print(wordnet_version)

3.0


In [None]:
import pandas as pd

# Keep only the word-pair columns and the similarity score column.
MTURK771_COLUMNS = ["word1", "word2", "similarity"]
mturk771 = pd.read_csv("data/mturk-771.csv").loc[:, MTURK771_COLUMNS]
mturk771

Unnamed: 0,word1,word2,similarity
0,access,gateway,3.791667
1,account,explanation,2.000000
2,account,invoice,3.750000
3,account,statement,3.681818
4,acoustic,remedy,1.227273
...,...,...,...
766,ticket,writing,2.375000
767,victory,watch,1.553191
768,washer,worker,2.909091
769,wife,woman,3.884615


In [None]:
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
import numpy as np

# Information-content corpus files, keyed by the label used in result names.
_IC_FILES = {
    "brown": "ic-brown.dat",
    "semcor": "ic-semcor.dat",
}
# Filled on first use so the corpus files are parsed once, not on every call.
_IC_CACHE = {}


def _load_information_content():
    """Return the {label: information-content} mapping, loading it on first use."""
    if not _IC_CACHE:
        # Build the complete mapping first so a failed load never leaves a
        # half-populated cache behind.
        loaded = {
            label: wordnet_ic.ic(filename) for label, filename in _IC_FILES.items()
        }
        _IC_CACHE.update(loaded)
    return _IC_CACHE


def _aggregate(scores, reducer):
    """Reduce a list of scores with `reducer`, skipping None; NaN if none remain."""
    usable = [score for score in scores if score is not None]
    return reducer(usable) if usable else np.nan


def wordnet_similarity(word1, word2):
    """Score two words with several WordNet similarity measures.

    Every noun synset of `word1` is paired with every noun synset of `word2`.
    Each pair is scored with the path-based measures (path, lch, wup) and with
    the information-content measures (res, jcn, lin) once per corpus (brown,
    semcor). The per-pair scores of each measure are then reduced with both
    `np.max` and `np.mean`.

    Args:
        word1: First word; looked up as a noun.
        word2: Second word; looked up as a noun.

    Returns:
        dict mapping "<measure>_max" / "<measure>_ave" (information-content
        measures are named "<measure>_<corpus>") to the aggregated score.
        All "_max" keys come first, then all "_ave" keys. The dict is empty
        when either word has no noun synsets. A measure whose scores are all
        None aggregates to NaN.

    Note:
        The recorded run of this notebook shows jcn scores of 1e+300 for some
        pairs; such values dominate the "_ave" aggregate. Presumably this is a
        sentinel for identical synsets -- confirm against the NLTK docs before
        interpreting jcn averages.
    """
    ic = _load_information_content()

    method_nic = {  # non-information content
        "path": wn.path_similarity,
        "lch": wn.lch_similarity,
        "wup": wn.wup_similarity,
    }
    method_ic = {  # information content
        "res": wn.res_similarity,
        "jcn": wn.jcn_similarity,
        "lin": wn.lin_similarity,
    }
    method_cal = {
        "max": np.max,
        "ave": np.mean,
    }

    synsets_word1 = wn.synsets(word1, pos=wn.NOUN)  # only nouns occur in the dataset
    synsets_word2 = wn.synsets(word2, pos=wn.NOUN)

    # measure name -> list of scores, one per synset pair
    res = dict()
    for s_w1 in synsets_word1:
        for s_w2 in synsets_word2:
            for k_m, v_m in method_nic.items():
                res.setdefault(k_m, []).append(v_m(s_w1, s_w2))
            for k_ic, v_ic in ic.items():
                for k_m, v_m in method_ic.items():
                    res.setdefault(f"{k_m}_{k_ic}", []).append(v_m(s_w1, s_w2, v_ic))

    # Aggregator is the outer loop so that all "_max" keys precede all "_ave" keys.
    return {
        f"{k}_{k_m}": _aggregate(v, v_m)
        for k_m, v_m in method_cal.items()
        for k, v in res.items()
    }

# Smoke test on a single word pair.
wordnet_similarity(word1="access", word2="gateway")

{'path_max': 0.3333333333333333,
 'lch_max': 2.538973871058276,
 'wup_max': 0.875,
 'res_brown_max': 7.827032123895868,
 'jcn_brown_max': 0.19039374352829713,
 'lin_brown_max': 0.7487712334180497,
 'res_semcor_max': 7.288423034166368,
 'jcn_semcor_max': 0.22334705684109704,
 'lin_semcor_max': 0.7650207913918325,
 'path_ave': 0.10657051282051282,
 'lch_ave': 1.1197085527167323,
 'wup_ave': 0.25973942208462336,
 'res_brown_ave': 1.438131873141628,
 'jcn_brown_ave': 0.0674445056882194,
 'lin_brown_ave': 0.13578150470157327,
 'res_semcor_ave': 1.317131163925546,
 'jcn_semcor_ave': 0.05932606124215329,
 'lin_semcor_ave': 0.1275034652319721}

In [68]:
# Compute the WordNet similarity scores on mturk771
import pandas as pd
from tqdm import tqdm

def get_wordnet_similarity_on_mturk771(dataset=None):
    """Attach WordNet similarity scores to every word pair of a dataset.

    Args:
        dataset: DataFrame with "word1" and "word2" columns. Defaults to the
            module-level `mturk771` frame when omitted.

    Returns:
        list of dicts, one per row of `dataset`: the row's own columns plus
        every key returned by `wordnet_similarity` for that row's word pair.
        `dataset` itself is not modified (the records come from `to_dict`).
    """
    if dataset is None:
        dataset = mturk771
    records = dataset.to_dict(orient="records")
    for record in tqdm(records):
        # dict.update mutates in place and returns None, so its result is
        # deliberately not assigned.
        record.update(wordnet_similarity(record["word1"], record["word2"]))
    return records

# Score every word pair, then collect the per-pair records into one table.
similarity_records = get_wordnet_similarity_on_mturk771()
result = pd.DataFrame(similarity_records)

100%|██████████| 771/771 [24:45<00:00,  1.93s/it]


In [77]:
import os

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs("result", exist_ok=True)
result.to_csv("result/wordnet_mturk771.csv")
result

Unnamed: 0,word1,word2,similarity,path_max,lch_max,wup_max,res_brown_max,jcn_brown_max,lin_brown_max,res_semcor_max,...,lin_semcor_max,path_ave,lch_ave,wup_ave,res_brown_ave,jcn_brown_ave,lin_brown_ave,res_semcor_ave,jcn_semcor_ave,lin_semcor_ave
0,access,gateway,3.791667,0.333333,2.538974,0.875000,7.827032,1.903937e-01,0.748771,7.288423,...,7.650208e-01,0.106571,1.119709,0.259739,1.438132,6.744451e-02,0.135782,1.317131,5.932606e-02,1.275035e-01
1,account,explanation,2.000000,1.000000,3.637586,1.000000,6.809736,1.000000e+300,1.000000,6.945478,...,1.000000e+00,0.129168,1.366648,0.355805,1.359912,3.333333e+298,0.170544,1.580095,3.333333e+298,1.362794e-01
2,account,invoice,3.750000,1.000000,3.637586,1.000000,8.434915,1.000000e+300,1.000000,9.686318,...,1.000000e+00,0.198969,1.521185,0.426144,2.710859,1.000000e+299,0.320952,3.226186,1.000000e+299,3.284880e-01
3,account,statement,3.681818,0.500000,2.944439,0.941176,7.833598,1.663018e+00,0.963038,9.463175,...,9.883473e-01,0.124751,1.434103,0.393200,1.860243,1.197101e-01,0.234428,2.183910,1.360628e-01,2.004264e-01
4,acoustic,remedy,1.227273,0.500000,2.944439,0.941176,9.192600,2.183476e-01,0.800573,9.200810,...,1.840162e-299,0.277778,1.845827,0.523220,4.596300,1.291585e-01,0.400286,4.600405,1.000000e-300,9.200810e-300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
766,ticket,writing,2.375000,0.250000,2.251292,0.769231,4.262897,1.984947e-01,0.628574,4.693338,...,6.110924e-01,0.113037,1.394511,0.397021,1.920487,8.012589e-02,0.225922,2.254029,3.360800e-02,1.163380e-01
767,victory,watch,1.553191,0.100000,1.335001,0.470588,2.036328,7.047236e-02,0.223005,2.254130,...,2.444307e-01,0.082220,1.119373,0.284806,0.877519,6.074299e-02,0.095189,1.011182,5.060190e-02,6.883112e-02
768,washer,worker,2.909091,0.500000,2.944439,0.705882,4.517904,1.005258e-01,0.475982,4.483686,...,8.967372e-300,0.138262,1.415048,0.484413,1.971985,5.753819e-02,0.182652,1.755839,8.750000e-301,3.124804e-300
769,wife,woman,3.884615,0.500000,2.944439,0.631579,6.257381,4.536521e-01,0.850240,5.636710,...,8.976880e-01,0.233333,1.920853,0.484686,3.635083,1.987735e-01,0.457790,3.246251,2.266354e-01,3.701087e-01


In [79]:
# Correlate only the numeric columns: word1/word2 hold strings, and
# pandas >= 2.0 raises on them instead of silently dropping them.
# The matrix is computed once and reused for both saving and display.
spearman_corr = result.select_dtypes(include="number").corr(method="spearman")
spearman_corr.to_csv("result/wordnet_spearman.csv")
spearman_corr

Unnamed: 0,similarity,path_max,lch_max,wup_max,res_brown_max,jcn_brown_max,lin_brown_max,res_semcor_max,jcn_semcor_max,lin_semcor_max,path_ave,lch_ave,wup_ave,res_brown_ave,jcn_brown_ave,lin_brown_ave,res_semcor_ave,jcn_semcor_ave,lin_semcor_ave
similarity,1.0,0.490132,0.490132,0.450494,0.404394,0.47748,0.487702,0.346535,0.48596,0.49568,0.392636,0.289612,0.254518,0.343811,0.48934,0.370809,0.333146,0.434607,0.399058
path_max,0.490132,1.0,1.0,0.895215,0.565698,0.78487,0.789916,0.566263,0.689764,0.698269,0.385542,0.20651,0.110418,0.147716,0.662708,0.17871,0.17308,0.571707,0.229306
lch_max,0.490132,1.0,1.0,0.895215,0.565698,0.78487,0.789916,0.566263,0.689764,0.698269,0.385542,0.20651,0.110418,0.147716,0.662708,0.17871,0.17308,0.571707,0.229306
wup_max,0.450494,0.895215,0.895215,1.0,0.690609,0.744616,0.788281,0.68028,0.630978,0.663203,0.293819,0.095825,0.082628,0.184779,0.601213,0.168942,0.212742,0.503496,0.170101
res_brown_max,0.404394,0.565698,0.565698,0.690609,1.0,0.625952,0.737048,0.933879,0.435366,0.506506,0.206661,0.034926,0.113861,0.367208,0.455441,0.225856,0.391582,0.312641,0.073266
jcn_brown_max,0.47748,0.78487,0.78487,0.744616,0.625952,1.0,0.978956,0.632233,0.832455,0.837187,0.242308,0.106753,0.037334,0.105929,0.810975,0.197126,0.14507,0.674054,0.295743
lin_brown_max,0.487702,0.789916,0.789916,0.788281,0.737048,0.978956,1.0,0.736221,0.800202,0.826843,0.237569,0.082625,0.045536,0.165856,0.778421,0.215572,0.202168,0.636488,0.258438
res_semcor_max,0.346535,0.566263,0.566263,0.68028,0.933879,0.632233,0.736221,1.0,0.448853,0.522655,0.146647,-0.016933,0.039111,0.2775,0.43582,0.155593,0.337581,0.304724,0.026034
jcn_semcor_max,0.48596,0.689764,0.689764,0.630978,0.435366,0.832455,0.800202,0.448853,1.0,0.989618,0.260206,0.151915,0.051861,0.070153,0.715753,0.178746,0.123095,0.852783,0.482726
lin_semcor_max,0.49568,0.698269,0.698269,0.663203,0.506506,0.837187,0.826843,0.522655,0.989618,1.0,0.254257,0.136897,0.053842,0.102684,0.706153,0.188294,0.154829,0.835994,0.478703
