In [1]:
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk as lesk
import pandas as pd
import numpy as np

In [2]:
# Word pairs to analyse; the cells below read its `w1` and `w2` columns.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
words_csv_path = '/Users/macbookpro/OneDrive/PHD/ArtículoJCR/codeIEEE/words_paper.csv'
df = pd.read_csv(words_csv_path)

GET SYNSETS: Synset instances are the groupings of synonymous words that express the same concept. Some of the words have only one Synset and some have several.

In [3]:
# Attach, for each word column, the list returned by wn.synsets() for every word in it.
for word_col, syns_col in (('w1', 'w1syns'), ('w2', 'w2syns')):
    df[syns_col] = [wn.synsets(word) for word in df[word_col]]

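Se obtienen los hiperónimos en común únicamente cuando ambos synsets tienen el mismo PoS (a o n). La comparación se hace entre todas las combinaciones de synsets, debido a que cada palabra puede tener uno o más synsets.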

In [4]:

# For every word pair (one row of `df`), compare each synset of w1 against
# every synset of w2 and keep, per w1 synset, the pairing whose lowest common
# hypernym lies closest to that w1 synset.
final_res = []
for index, (w1_synsets, w2_synsets) in enumerate(zip(df['w1syns'], df['w2syns'])):
    for syn1 in w1_synsets:
        # Only synsets whose PoS is 'a' or 'n' take part in the comparison.
        if syn1.pos() not in ('a', 'n'):
            continue

        lch_lst = []
        for syn2 in w2_synsets:
            if syn2.pos() != syn1.pos():
                continue

            # The list of lowest common hypernyms may be empty; indexing it
            # blindly with [0] would raise IndexError, so skip such pairs.
            common_hypernyms = syn1.lowest_common_hypernyms(syn2)
            if not common_hypernyms:
                continue
            lch = common_hypernyms[0]  # closest hypernym shared by both synsets

            # NOTE(review): `_shortest_hypernym_paths` is a private NLTK method;
            # its argument looks like a `simulate_root` flag rather than a
            # synset - confirm. The call is kept unchanged to preserve results.
            sch = syn1._shortest_hypernym_paths(syn2)
            lch_lst.append({'w1': syn1, 'w2': syn2, 'lch': lch, 'distance': sch[lch]})

        # Only record a candidate when at least one common hypernym was found.
        if lch_lst:
            # min() returns the first minimum, so ties keep the earliest w2 synset.
            closest = min(lch_lst, key=lambda candidate: candidate['distance'])
            final_res.append({
                'w1': closest['w1'],
                'w2': closest['w2'],
                'lch': closest['lch'],
                'distance': closest['distance'],
                'index': index,
                'w1_def': closest['w1'].definition(),
                'w2_def': closest['w2'].definition(),
            })
            
        

                

In [5]:
# Tabulate the candidate dicts collected in `final_res` (one row per dict).
words_lch = pd.DataFrame(final_res)

El WSD se decide a partir de aquel par de synsets que tenga la menor distancia a su hiperónimo en común. Como segundo criterio de selección, se elige aquel que aparezca primero en la lista.

In [6]:
# Render the candidate table (synset pairs, common hypernym, distance, definitions).
display(words_lch)

Unnamed: 0,w1,w2,lch,distance,index,w1_def,w2_def
0,Synset('gray.n.01'),Synset('blue.n.01'),Synset('color.n.01'),2,0,a neutral achromatic color midway between whit...,blue color or pigment; resembling the color of...
1,Synset('grey.n.06'),Synset('blue.n.02'),Synset('clothing.n.01'),1,0,clothing that is a grey color,blue clothing
2,Synset('grey.n.04'),Synset('blue.n.03'),Synset('organization.n.01'),1,0,any organization or party whose uniforms or ba...,any organization or party whose uniforms or ba...
3,Synset('grey.n.07'),Synset('blue.n.07'),Synset('animal.n.01'),10,0,horse of a light gray or whitish color,any of numerous small butterflies of the famil...
4,Synset('gray.n.05'),Synset('blue.n.01'),Synset('abstraction.n.06'),6,0,the SI unit of energy absorbed from ionizing r...,blue color or pigment; resembling the color of...
5,Synset('gray.n.06'),Synset('amobarbital_sodium.n.01'),Synset('causal_agent.n.01'),5,0,English radiobiologist in whose honor the gray...,the sodium salt of amobarbital that is used as...
6,Synset('gray.n.07'),Synset('amobarbital_sodium.n.01'),Synset('causal_agent.n.01'),5,0,English poet best known for his elegy written ...,the sodium salt of amobarbital that is used as...
7,Synset('gray.n.08'),Synset('amobarbital_sodium.n.01'),Synset('causal_agent.n.01'),4,0,American navigator who twice circumnavigated t...,the sodium salt of amobarbital that is used as...
8,Synset('gray.n.09'),Synset('amobarbital_sodium.n.01'),Synset('causal_agent.n.01'),5,0,United States botanist who specialized in Nort...,the sodium salt of amobarbital that is used as...
9,Synset('coffee.n.01'),Synset('cup.n.06'),Synset('beverage.n.01'),1,1,a beverage consisting of an infusion of ground...,a punch served in a pitcher instead of a punch...


Se elige automáticamente aquel par de palabras cuyo hiperónimo tenga la menor distancia hacia alguna de las palabras del par comparadas. Como segundo criterio de selección se utiliza el primer par encontrado, dado que la organización de WordNet se presta para que los primeros synsets contengan los términos más comúnmente utilizados.

In [7]:
# Keep exactly one row per word pair (column `index`): the candidate with the
# smallest `distance`. The comparison is strict, so on ties the first row seen
# for that pair wins. Keying by `index` (instead of tracking the "current"
# index) means the result no longer depends on rows of a pair being adjacent.
best_by_index = {}
for _, row in words_lch.iterrows():
    ind = row['index']
    if ind not in best_by_index or row['distance'] < best_by_index[ind]['distance']:
        # Re-assigning an existing key keeps its position, so output order
        # follows the first appearance of each pair.
        best_by_index[ind] = row

final_lst = list(best_by_index.values())
semantics = pd.DataFrame(final_lst)

In [8]:
# Render the selected synset pair for each word pair.
display(semantics)

Unnamed: 0,w1,w2,lch,distance,index,w1_def,w2_def
1,Synset('grey.n.06'),Synset('blue.n.02'),Synset('clothing.n.01'),1,0,clothing that is a grey color,blue clothing
9,Synset('coffee.n.01'),Synset('cup.n.06'),Synset('beverage.n.01'),1,1,a beverage consisting of an infusion of ground...,a punch served in a pitcher instead of a punch...
13,Synset('coffee.n.01'),Synset('tea.n.01'),Synset('beverage.n.01'),1,2,a beverage consisting of an infusion of ground...,a beverage made by steeping tea leaves in water


In [9]:
def getTree(path, lch):
    """Return the leading elements of ``path`` up to and including ``lch``.

    Args:
        path: Any iterable of nodes; iterating a dict walks its keys in
            insertion order.
        lch: The node at which collection stops.

    Returns:
        A new list with every element seen before ``lch`` followed by ``lch``
        itself. If ``lch`` never appears, all elements of ``path`` are returned.
    """
    tree = []
    for node in path:
        tree.append(node)
        reached_lch = not (node != lch)
        if reached_lch:
            break
    return tree
    

Se calcula la distancia de Levenshtein de acuerdo con los árboles previamente formados

In [10]:
def getDistance(tree1, tree2):
    """Build the full edit-distance table between two sequences.

    Every step (moving one cell right, one cell down, or diagonally over two
    unequal elements) costs 1; a diagonal step over equal elements costs 0.

    Args:
        tree1: First sequence (must support ``len`` and integer indexing).
        tree2: Second sequence (same requirements).

    Returns:
        A dict of dicts ``d`` where ``d[i][j]`` is the distance between the
        first ``i`` elements of ``tree1`` and the first ``j`` elements of
        ``tree2``; the overall distance is ``d[len(tree1)][len(tree2)]``.
    """
    n_rows = len(tree1)
    n_cols = len(tree2)

    # First column: i steps to reach row i from the empty prefix.
    d = {i: {0: i} for i in range(n_rows + 1)}
    # First row: j steps to reach column j from the empty prefix.
    for j in range(n_cols + 1):
        d[0][j] = j

    for i in range(1, n_rows + 1):
        current = d[i]
        previous = d[i - 1]
        for j in range(1, n_cols + 1):
            from_left = current[j - 1] + 1
            from_above = previous[j] + 1
            # The boolean adds 1 only when the two elements differ.
            from_diagonal = previous[j - 1] + (not tree1[i - 1] == tree2[j - 1])
            current[j] = min(from_left, from_above, from_diagonal)
    return d

In [11]:
def getGEDNormalized(ged, path1, path2):
    """Turn an edit distance into a similarity score relative to the larger size.

    Args:
        ged: The edit distance to normalise.
        path1: First size (a number; the caller in this notebook passes a ``len``).
        path2: Second size (same kind of value).

    Returns:
        ``1 - ged / max(path1, path2)``.

    Raises:
        ZeroDivisionError: If both sizes are 0.
    """
    largest = max(path1, path2)
    return 1 - (ged / largest)

Aquí se hacen todos los pasos para obtener la distancia

In [12]:
def getLevenstein(w1, w2, lch):
    """Compare two synsets through their hypernym listings up to ``lch``.

    Steps:
      1. Get the hypernym mapping of each synset (towards the root, where the
         common hypernym chosen in the earlier steps must be found).
      2. Cut each mapping at ``lch`` with ``getTree``.
      3. Compute the edit distance between both cuts with ``getDistance``.
      4. Normalise it with ``getGEDNormalized`` using the size of each full
         mapping (``len`` counts every entry of the mapping, not only the
         entries up to ``lch``).
      5. Add NLTK's own similarity scores for comparison.

    NOTE(review): ``_shortest_hypernym_paths`` is a private NLTK method and
    its argument looks like a ``simulate_root`` flag rather than a synset -
    confirm. The calls are kept as they were so the reported numbers do not
    change.

    Args:
        w1: First synset.
        w2: Second synset.
        lch: Common hypernym of ``w1`` and ``w2`` selected beforehand.

    Returns:
        dict with keys ``path1``, ``path2``, ``lch``, ``tree1``, ``tree2``,
        ``matriz`` (full distance table), ``path_sim``, ``LCh``, ``wup``,
        ``normalizada`` (normalised score) and ``levenstein`` (edit distance).
    """
    hypernyms1 = w1._shortest_hypernym_paths(w2)
    hypernyms2 = w2._shortest_hypernym_paths(w1)

    cut1 = getTree(hypernyms1, lch)
    cut2 = getTree(hypernyms2, lch)

    table = getDistance(cut1, cut2)
    edit_distance = table[len(cut1)][len(cut2)]
    normalized = getGEDNormalized(edit_distance, len(hypernyms1), len(hypernyms2))

    # Reference similarity measures provided by NLTK.
    lch_sim = w1.lch_similarity(w2)
    wup_sim = w1.wup_similarity(w2)
    path_sim = w1.path_similarity(w2)

    return {
        'path1': hypernyms1,
        'path2': hypernyms2,
        'lch': lch,
        'tree1': cut1,
        'tree2': cut2,
        'matriz': table,
        'path_sim': path_sim,
        'LCh': lch_sim,
        'wup': wup_sim,
        'normalizada': normalized,
        'levenstein': edit_distance,
    }

Iterar sobre la lista final

In [13]:
# Run the full comparison for every selected synset pair; each call yields one result dict.
distances = [
    getLevenstein(pair['w1'], pair['w2'], pair['lch'])
    for _, pair in semantics.iterrows()
]
    

In [14]:
# Gather the per-pair result dicts into one table (one row per pair) and show it.
final_results = pd.DataFrame(distances)
display(final_results)

Unnamed: 0,path1,path2,lch,tree1,tree2,matriz,path_sim,LCh,wup,normalizada,levenstein
0,"{Synset('grey.n.06'): 0, Synset('clothing.n.01...","{Synset('blue.n.02'): 0, Synset('clothing.n.01...",Synset('clothing.n.01'),"[Synset('grey.n.06'), Synset('clothing.n.01')]","[Synset('blue.n.02'), Synset('clothing.n.01')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 2},...",0.333333,2.538974,0.888889,0.909091,1
1,"{Synset('coffee.n.01'): 0, Synset('beverage.n....","{Synset('cup.n.06'): 0, Synset('punch.n.02'): ...",Synset('beverage.n.01'),"[Synset('coffee.n.01'), Synset('beverage.n.01')]","[Synset('cup.n.06'), Synset('punch.n.02'), Syn...","{0: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}, 1: {...",0.166667,1.845827,0.761905,0.809524,4
2,"{Synset('coffee.n.01'): 0, Synset('beverage.n....","{Synset('tea.n.01'): 0, Synset('beverage.n.01'...",Synset('beverage.n.01'),"[Synset('coffee.n.01'), Synset('beverage.n.01')]","[Synset('tea.n.01'), Synset('beverage.n.01')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 2},...",0.333333,2.538974,0.888889,0.928571,1


# Aquí empieza el análisis de la Distancia de Levenshtein en Género

In [45]:
# Gendered word pairs; the cells below read its `w1` and `w2` columns.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
gender_csv_path = '/Users/macbookpro/OneDrive/PHD/ArtículoJCR/codeIEEE/lista_gender.csv'
df_gender = pd.read_csv(gender_csv_path)

In [46]:
# Attach, for each word column, the list returned by wn.synsets() for every word in it.
for word_col, syns_col in (('w1', 'w1syns'), ('w2', 'w2syns')):
    df_gender[syns_col] = [wn.synsets(word) for word in df_gender[word_col]]

In [47]:
# For every gendered word pair (one row of `df_gender`), compare each synset of
# w1 against every synset of w2 and keep, per w1 synset, the pairing whose
# lowest common hypernym lies closest to that w1 synset.
final_res_gender = []
for index, (w1_synsets, w2_synsets) in enumerate(zip(df_gender['w1syns'], df_gender['w2syns'])):
    for syn1 in w1_synsets:
        # Only synsets whose PoS is 'a' or 'n' take part in the comparison.
        if syn1.pos() not in ('a', 'n'):
            continue

        lch_lst = []
        for syn2 in w2_synsets:
            if syn2.pos() != syn1.pos():
                continue

            # The list of lowest common hypernyms may be empty; indexing it
            # blindly with [0] would raise IndexError, so skip such pairs.
            common_hypernyms = syn1.lowest_common_hypernyms(syn2)
            if not common_hypernyms:
                continue
            lch = common_hypernyms[0]  # closest hypernym shared by both synsets

            # NOTE(review): `_shortest_hypernym_paths` is a private NLTK method;
            # its argument looks like a `simulate_root` flag rather than a
            # synset - confirm. The call is kept unchanged to preserve results.
            sch = syn1._shortest_hypernym_paths(syn2)
            lch_lst.append({'w1': syn1, 'w2': syn2, 'lch': lch, 'distance': sch[lch]})

        # Only record a candidate when at least one common hypernym was found.
        if lch_lst:
            # min() returns the first minimum, so ties keep the earliest w2 synset.
            closest = min(lch_lst, key=lambda candidate: candidate['distance'])
            final_res_gender.append({
                'w1': closest['w1'],
                'w2': closest['w2'],
                'lch': closest['lch'],
                'distance': closest['distance'],
                'index': index,
                'w1_def': closest['w1'].definition(),
                'w2_def': closest['w2'].definition(),
            })

In [48]:
# Tabulate the candidate dicts collected in `final_res_gender` (one row per dict).
words_lch_gender = pd.DataFrame(final_res_gender)

In [49]:
# Render the candidate table for the gendered word pairs.
display(words_lch_gender)

Unnamed: 0,w1,w2,lch,distance,index,w1_def,w2_def
0,Synset('man.n.01'),Synset('woman.n.01'),Synset('adult.n.01'),1,0,an adult person who is male (as opposed to a w...,an adult female person (as opposed to a man)
1,Synset('serviceman.n.01'),Synset('charwoman.n.01'),Synset('worker.n.01'),2,0,someone who serves in the armed forces; a memb...,a human female employed to do housework
2,Synset('man.n.03'),Synset('woman.n.01'),Synset('person.n.01'),1,0,the generic use of the word to refer to any hu...,an adult female person (as opposed to a man)
3,Synset('homo.n.02'),Synset('woman.n.01'),Synset('organism.n.01'),8,0,any living or extinct member of the family Hom...,an adult female person (as opposed to a man)
4,Synset('man.n.05'),Synset('charwoman.n.01'),Synset('worker.n.01'),3,0,a male subordinate,a human female employed to do housework
...,...,...,...,...,...,...,...
159,Synset('steward.n.02'),Synset('stewardess.n.01'),Synset('worker.n.01'),5,48,the ship's officer who is in charge of provisi...,a woman steward on an airplane
160,Synset('steward.n.03'),Synset('stewardess.n.01'),Synset('steward.n.03'),0,48,an attendant on an airplane,a woman steward on an airplane
161,Synset('shop_steward.n.01'),Synset('stewardess.n.01'),Synset('person.n.01'),5,48,a union member who is elected to represent fel...,a woman steward on an airplane
162,Synset('custodian.n.01'),Synset('stewardess.n.01'),Synset('person.n.01'),3,48,one having charge of buildings or grounds or a...,a woman steward on an airplane


In [50]:
# Keep exactly one row per word pair (column `index`): the candidate with the
# smallest `distance`. The comparison is strict, so on ties the first row seen
# for that pair wins. Keying by `index` (instead of tracking the "current"
# index) means the result no longer depends on rows of a pair being adjacent.
best_by_index_gender = {}
for _, row in words_lch_gender.iterrows():
    ind = row['index']
    if (ind not in best_by_index_gender
            or row['distance'] < best_by_index_gender[ind]['distance']):
        # Re-assigning an existing key keeps its position, so output order
        # follows the first appearance of each pair.
        best_by_index_gender[ind] = row

final_lst_gender = list(best_by_index_gender.values())
semantics_gender = pd.DataFrame(final_lst_gender)

In [51]:
# Run the full comparison for every selected gendered synset pair; each call yields one result dict.
distances_gender = [
    getLevenstein(pair['w1'], pair['w2'], pair['lch'])
    for _, pair in semantics_gender.iterrows()
]

In [52]:
# Gather the per-pair result dicts for the gendered pairs into one table.
res_gender = pd.DataFrame(distances_gender)

In [53]:
# Render the metrics table for the gendered word pairs.
display(res_gender)

Unnamed: 0,path1,path2,lch,tree1,tree2,matriz,path_sim,LCh,wup,normalizada,levenstein
0,"{Synset('man.n.01'): 0, Synset('adult.n.01'): ...","{Synset('woman.n.01'): 0, Synset('adult.n.01')...",Synset('adult.n.01'),"[Synset('man.n.01'), Synset('adult.n.01')]","[Synset('woman.n.01'), Synset('adult.n.01')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 2},...",0.333333,2.538974,0.666667,0.916667,1
1,"{Synset('father.n.01'): 0, Synset('parent.n.01...","{Synset('mother.n.01'): 0, Synset('parent.n.01...",Synset('parent.n.01'),"[Synset('father.n.01'), Synset('parent.n.01')]","[Synset('mother.n.01'), Synset('parent.n.01')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 2},...",0.333333,2.538974,0.923077,0.933333,1
2,"{Synset('male_child.n.01'): 0, Synset('male.n....","{Synset('girl.n.01'): 0, Synset('woman.n.01'):...",Synset('person.n.01'),"[Synset('male_child.n.01'), Synset('male.n.02'...","[Synset('girl.n.01'), Synset('woman.n.01'), Sy...","{0: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}, 1: {...",0.166667,1.845827,0.631579,0.692308,4
3,"{Synset('uncle.n.01'): 0, Synset('kinsman.n.01...","{Synset('aunt.n.01'): 0, Synset('kinswoman.n.0...",Synset('relative.n.01'),"[Synset('uncle.n.01'), Synset('kinsman.n.01'),...","[Synset('aunt.n.01'), Synset('kinswoman.n.01')...","{0: {0: 0, 1: 1, 2: 2, 3: 3}, 1: {0: 1, 1: 1, ...",0.2,2.028148,0.6,0.833333,2
4,"{Synset('husband.n.01'): 0, Synset('spouse.n.0...","{Synset('wife.n.01'): 0, Synset('spouse.n.01')...",Synset('spouse.n.01'),"[Synset('husband.n.01'), Synset('spouse.n.01')]","[Synset('wife.n.01'), Synset('spouse.n.01')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 2},...",0.333333,2.538974,0.6,0.9375,1
5,"{Synset('actor.n.01'): 0, Synset('performer.n....","{Synset('actress.n.01'): 0, Synset('actor.n.01...",Synset('actor.n.01'),[Synset('actor.n.01')],"[Synset('actress.n.01'), Synset('actor.n.01')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 1}}",0.5,2.944439,0.952381,0.923077,1
6,"{Synset('prince.n.01'): 0, Synset('aristocrat....","{Synset('princess.n.01'): 0, Synset('aristocra...",Synset('aristocrat.n.01'),"[Synset('prince.n.01'), Synset('aristocrat.n.0...","[Synset('princess.n.01'), Synset('aristocrat.n...","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 2},...",0.333333,2.538974,0.9,0.916667,1
7,"{Synset('waiter.n.01'): 0, Synset('dining-room...","{Synset('waitress.n.01'): 0, Synset('waiter.n....",Synset('waiter.n.01'),[Synset('waiter.n.01')],"[Synset('waitress.n.01'), Synset('waiter.n.01')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 1}}",0.5,2.944439,0.956522,0.928571,1
8,"{Synset('cock.n.04'): 0, Synset('chicken.n.02'...","{Synset('hen.n.01'): 0, Synset('chicken.n.02')...",Synset('chicken.n.02'),"[Synset('cock.n.04'), Synset('chicken.n.02')]","[Synset('hen.n.01'), Synset('chicken.n.02')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 2},...",0.333333,2.538974,0.928571,0.933333,1
9,"{Synset('stallion.n.01'): 0, Synset('male_hors...","{Synset('mare.n.01'): 0, Synset('horse.n.01'):...",Synset('horse.n.01'),"[Synset('stallion.n.01'), Synset('male_horse.n...","[Synset('mare.n.01'), Synset('horse.n.01')]","{0: {0: 0, 1: 1, 2: 2}, 1: {0: 1, 1: 1, 2: 2},...",0.25,2.251292,0.909091,0.894737,2


In [54]:
# Persist the gendered-pair metrics as CSV.
# NOTE(review): absolute, machine-specific output path - adjust before running elsewhere.
gender_results_path = '/Users/macbookpro/OneDrive/PHD/ArtículoJCR/codeIEEE/gender_results.csv'
res_gender.to_csv(gender_results_path)

# Comparación de métricas