# Estonian Wordnet version 2.2

Three path-based similarity measures (path similarity, Leacock & Chodorow, and Wu & Palmer) are implemented on top of Estonian Wordnet version 2.2.

## Imports

In [None]:
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from scipy.stats import kendalltau
import matplotlib.pyplot as plt
from xml.dom import minidom
from collections import defaultdict
import math

## Defining the methods

In [None]:
def getAllPaths(word, dic, pos, synsets):
    """
    Collect one hypernym chain per sense of ``word`` for a part of speech.

    ``dic`` is indexed as ``dic[word][pos][sense] -> synset``. Every synset
    found that way is expanded with ``getHypernyms``. An empty list is
    returned when the word is unknown or has no entry for ``pos``.
    """
    if word not in dic:
        return []

    by_pos = dic[word]
    if pos not in by_pos:
        return []

    return [getHypernyms(synset, synsets) for synset in by_pos[pos].values()]

def getHypernyms(val, synsets):
    """
    Return the hypernym chain that starts at synset ``val``.

    The chain is built by repeatedly following the ``'hypernym'`` entry of
    ``synsets[current]``. Its first element is ``val`` itself; its last
    element is the first synset reached that either has no ``'hypernym'``
    entry or is not a key of ``synsets``.

    Edge cases:
    * if ``val`` is not a key of ``synsets`` the result is ``[val]``;
    * if the hypernym relation is cyclic, the walk stops as soon as it would
      revisit a synset that is already in the chain.
    """
    path = [val]
    seen = {val}
    # .get() is used on purpose: it never inserts a key, even in a defaultdict.
    relations = synsets.get(val)

    while relations is not None and 'hypernym' in relations:
        hyp = relations['hypernym']
        if hyp in seen:
            # Cycle detected; every synset on it is already part of the path.
            break
        path.append(hyp)
        seen.add(hyp)
        relations = synsets.get(hyp)
    return path


def shorthestPath(word1, word2, pos, dic, synsets):
    """
    Find the closest pair of senses of two words.

    Every hypernym chain of ``word1`` is compared with every chain of
    ``word2``. Only chains ending in the same synset are considered, and the
    pair with the smallest count reported by ``getEdgeCount`` wins (the first
    such pair on ties).

    Returns a 4-tuple:
    * the ``[part1, lcs, part2]`` triple from ``getEdgeCount`` (three empty
      lists when nothing was found),
    * the shared chain from ``getEdgeCount`` (``[]`` when nothing was found),
    * ``[chain1, chain2]``, the two winning chains (two empty lists when
      nothing was found),
    * the winning count; ``-1`` when either word has no chains, and the
      initial bound ``1000`` when no two chains end in the same synset.
    """
    chains1 = getAllPaths(word1, dic, pos, synsets)
    chains2 = getAllPaths(word2, dic, pos, synsets)
    if not chains1 or not chains2:
        return [[], [], []], [], [[], []], -1

    best_len = 1000  # starting bound; returned unchanged if no pair matches
    best_path = [[], [], []]
    best_toroot = []
    best_pair = [[], []]

    for chain1 in chains1:
        for chain2 in chains2:
            # Chains that end in different synsets have nothing in common.
            if chain1[-1] != chain2[-1]:
                continue
            path, to_root, length = getEdgeCount(chain1, chain2)
            if length < best_len:
                best_len = length
                best_path = path
                best_toroot = to_root
                best_pair = [chain1, chain2]

    return best_path, best_toroot, best_pair, best_len

def getEdgeCount(p1, p2):
    """
    Split two hypernym chains into their private parts and their shared part.

    Returns ``([only_in_p1, lcs, only_in_p2], shared, count)`` where
    * ``only_in_p1`` / ``only_in_p2`` hold the synsets of one chain that do
      not occur in the other, in chain order,
    * ``shared`` holds the synsets that occur in both chains,
    * ``lcs`` is a list with the first element of ``shared`` (empty when
      nothing is shared),
    * ``count`` is ``len(only_in_p1) + len(only_in_p2)``.

    Identical chains short-circuit to ``([[], [p1[0]], []], p1, 0)``.
    """
    if p1 == p2:
        return [[], [p1[0]], []], p1, 0

    members1 = set(p1)
    members2 = set(p2)

    only_in_p1 = [node for node in p1 if node not in members2]
    only_in_p2 = [node for node in p2 if node not in members1]

    # The shared part is read in p1 order only when p2 is a proper subset of
    # p1; in every other case it is read in p2 order.
    if members2 < members1:
        shared = [node for node in p1 if node in members2]
    else:
        shared = [node for node in p2 if node in members1]

    lcs = shared[:1]
    return [only_in_p1, lcs, only_in_p2], shared, len(only_in_p1) + len(only_in_p2)
        
    
def findDepth(dic, synsets):
    """
    Map every chain-ending synset to the length of the longest chain reaching it.

    Each synset referenced from ``dic`` (``dic[word][pos][sense]``) is expanded
    with ``getHypernyms``. The last element of the chain is used as the key,
    and the stored value is the largest chain length (number of synsets in
    the chain) seen for that key.
    """
    deepest = {}
    for by_pos in dic.values():
        for senses in by_pos.values():
            for synset in senses.values():
                chain = getHypernyms(synset, synsets)
                if not chain:
                    continue
                root = chain[-1]
                if len(chain) > deepest.get(root, 0):
                    deepest[root] = len(chain)
    return deepest
    
def path_similarity(paths):
    """
    Path similarity: the reciprocal of the number of synsets on the path.

    ``paths`` is a ``[part1, lcs, part2]`` triple; the path size is the total
    number of entries in the three lists. Returns ``-1`` when all three lists
    are empty (no path).
    """
    first, middle, last = paths[0], paths[1], paths[2]
    size = len(first) + len(middle) + len(last)
    return 1 / size if size else -1

def lc(paths, depths, root):
    """
    Leacock & Chodorow similarity: ``-log(path_size / (2 * depth))``.

    ``paths`` is a ``[part1, lcs, part2]`` triple whose total number of entries
    is the path size. ``root`` is a chain whose last element is looked up in
    ``depths`` to obtain the depth. Returns ``-1`` when ``root`` is empty or
    the path size is zero.
    """
    size = len(paths[0]) + len(paths[1]) + len(paths[2])
    if len(root) == 0 or size == 0:
        return -1

    depth = depths.get(root[-1])
    ratio = size / (2 * depth)
    return -math.log(ratio)

def wup(p, toroot):
    """
    Wu & Palmer similarity: ``2 * depth_lcs / (depth1 + depth2)``.

    ``p[0]`` and ``p[1]`` are the two chains whose lengths serve as the depths
    of the two synsets; ``len(toroot)`` serves as the depth of their common
    subsumer. Returns ``-1`` when both chains are empty.
    """
    lcs_depth = len(toroot)
    depth_sum = len(p[0]) + len(p[1])
    if depth_sum == 0:
        return -1
    return (2 * lcs_depth) / depth_sum

def calc_results(df):
    """
    Build the table of correlation scores.

    ``evaluate`` is called once for every combination of a rating column
    ("ESL", "SL") and a measure column ("PS", "LC", "WUP") of ``df``; the rows
    it produces are accumulated into one dataframe, which is returned.
    """
    results = pd.DataFrame(columns=["measure", "sim_set", "pearson", "spearman", "kendall"])
    for sim_set in ("ESL", "SL"):
        for measure in ("PS", "LC", "WUP"):
            results = evaluate(df, results, sim_set, measure)
    return results
    
    
    
def evaluate(df, results, sim_set, measure):
    """
    Correlate one measure column with one rating column and record the result.

    Pearson, Spearman and Kendall coefficients between ``df[sim_set]`` and
    ``df[measure]`` are computed and rounded to three decimals.

    Returns a new dataframe consisting of the rows of ``results`` followed by
    one row with the keys "measure", "sim_set", "pearson", "spearman" and
    "kendall". ``results`` itself is not modified.
    """
    pearson = round(pearsonr(df[sim_set], df[measure])[0], 3)
    spearman = round(spearmanr(df[sim_set], df[measure])[0], 3)
    kendall = round(kendalltau(df[sim_set], df[measure])[0], 3)

    row = pd.DataFrame([{"measure": measure, "sim_set": sim_set, "pearson": pearson,
                         "spearman": spearman, "kendall": kendall}])

    # DataFrame.append no longer exists in pandas 2.x, so the row is attached
    # with pd.concat. An empty accumulator is handled separately because
    # concatenating empty frames is deprecated; its column layout is kept.
    if len(results.index) == 0:
        extra = [col for col in row.columns if col not in results.columns]
        return row.reindex(columns=list(results.columns) + extra)
    return pd.concat([results, row], ignore_index=True)


## Parsing the XML file

The wordnet is distributed as an XML file that contains `LexicalEntry` and `Synset` elements. From the `LexicalEntry` elements, a dictionary (`dic`) that maps each word to the synset ids of its senses is created, together with a reverse dictionary (`revers`) that maps each synset id back to its sense ids; `revers` is not used by any method and only helps to read the paths. The relations between synsets are taken from the `Synset` elements.
<br>
The XML file can be downloaded from https://gitlab.keeleressursid.ee/avalik/data/blob/master/estwn/estwn-et-2.2.0.xml


In [None]:
# Parse the whole wordnet file into a DOM tree.
mydoc = minidom.parse('estwn-et-2.2.0.xml')
items = mydoc.getElementsByTagName("LexicalEntry")

# dic:    written form -> part of speech -> sense id -> synset id
# revers: synset id -> list of sense ids (lookup aid only)
dic = defaultdict(dict)
revers = defaultdict(list)

for item in items:
    lemma = item.getElementsByTagName('Lemma')[0]
    writtenForm = lemma.attributes['writtenForm'].value
    POS = lemma.attributes['partOfSpeech'].value
    senses = {}
    for s in item.getElementsByTagName('Sense'):
        sid = s.attributes['id'].value
        synset = s.attributes['synset'].value

        senses[sid] = synset
        revers[synset].append(sid)
    dic[writtenForm][POS] = senses


# synsets: synset id -> {relation type -> target synset id}
# NOTE: only one target is kept per relation type; a later relation of the
# same type on the same synset overwrites the earlier one.
synsets = defaultdict(dict)
syns = mydoc.getElementsByTagName("Synset")
for item in syns:
    sid = item.attributes['id'].value
    rels = {}
    for s in item.getElementsByTagName('SynsetRelation'):
        rel = s.attributes['relType'].value
        target = s.attributes['target'].value
        rels[rel] = target
    synsets[sid] = rels

## Calculating similarities and correlations
First, the necessary variables are initialised.

In [None]:
# Word pairs with their ratings; the columns read later in this notebook are
# "sõna 1", "sõna 2", "POS", "SimLex999" and "Average".
data = pd.read_excel("Ratings.xlsx")
# Longest hypernym-chain length per chain-ending synset; passed to lc() below.
depths = findDepth(dic, synsets)

The similarity between the word pairs from EstSimLex-999 is calculated, together with its correlation with the human scores.

In [None]:
# Rows are collected in a plain list and turned into a dataframe once at the
# end: DataFrame.append no longer exists in pandas 2.x, and appending row by
# row copies the whole frame on every iteration.
score_columns = ["sõna1", "sõna2", "PS", "LC", "WUP", "ESL", "SL"]
score_rows = []

for i, row in data.iterrows():
    s1 = row["sõna 1"]
    s2 = row["sõna 2"]
    pos = row["POS"].lower()

    # shorthestPath returns: the [part1, lcs, part2] triple, the shared chain,
    # the two full chains, and the path length (unused here).
    paths, toroot, s, e = shorthestPath(s1, s2, pos, dic, synsets)
    path_sim = path_similarity(paths)
    l = lc(paths, depths, toroot)
    w = wup(s, toroot)
    sl = row["SimLex999"]
    esl = row["Average"]

    # path_similarity returns -1 when no path was found; such pairs are skipped.
    if path_sim != -1:
        score_rows.append({"sõna1": s1, "sõna2": s2, "PS": path_sim, "LC": l, "WUP": w,
                           "ESL": esl, "SL": sl})

similarity_scores = pd.DataFrame(score_rows, columns=score_columns)

results = calc_results(similarity_scores)

In [None]:
# Write the correlation table to an Excel workbook.
results.to_excel("wordnet_results.xlsx")