# Synonym detection
This script output a list of candidates for sections 'synonyms' 
Potential synonyms must:
    * Co-occur with similar sections (measured with tfidf metric, threshold fixed in minSimilarity parameter)
    * Don't co-ocurr with between them more than a certain treshold (maxCooccur parameter)
Additioanlly, other features are added for later evaluatio
    * editdistance
    * fasttext distance

Inputs: 
    * Sections per article contained in ../gap/multiLanguageFromDumpsSec/sections-articles_lang.json, in format {articleId_1:[sec_a,sec_b...], articleId_2:[sec_x,sec_y], ..., article_n:[sec_i...]}
    
(The actual values uploaded to gdocs are generated with the .py version in this same folder)

In [1]:
import pandas as pd
import json
from collections import Counter
import gzip
import json
import itertools
import networkx as nx
from functools import reduce
from itertools import combinations
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import numpy as np
import editdistance
from fastText_multilingual.fasttext import FastVector
import re

def fasttextDistance(sec1,sec2,vectors):
    '''
    Take two sections, create a vector for each of them summing all the words
    return cosine similarity
    '''
    sec1 = sec1.lower().split()
    sec2 = sec2.lower().split()
    sec1Vector  = np.sum([vectors[word] for word in sec1 if word in vectors],axis=0)/len(sec1)
    sec2Vector  = np.sum([vectors[word] for word in sec2 if word in vectors],axis=0)/len(sec2)
    distance  = vectors.cosine_similarity(sec1Vector,sec2Vector)
    if not isinstance(distance,float): #when at least one of the sections is not the vectorial space, the result is 'nan'
        return 0
    else:
        return vectors.cosine_similarity(sec1Vector,sec2Vector)


## Parameters

In [2]:
#Parameters 

#langs=['es','en','ar','ja','ru','fr']##define languages
langs = ['ru']
p = 0.75 #percentage of sections occurrences to be corevered 
maxCooccur = 3 #Maximum of coocurrences between pair of sections to be considered synonyms
minSimilarity = .6# Miminum cosine similarity to be consider synonyms
bucketSize = 50 #for stratified sample


## Find and save candidates

In [3]:
dfs = {}
for lang in langs:
    print(lang)
    output = []
    coOccur = {}
    sectionsAll = []
    #Load Sections
    with open('../gap/multiLanguageFromDumpsSec/sections-articles_%s.json' % lang) as f: 
        sections = json.load(f)
    ##get most frequent sections
    for secs in sections.values():
        for secName in secs:
                cleanSection = re.sub('[=\]\[]','',secName).strip()
                if cleanSection: #check that string is not empty
                    sectionsAll.append(cleanSection.strip())
    sectionsFreq = Counter(sectionsAll)
    total = sum(sectionsFreq.values())
    acc =0
    secsToEval = []
    for n,(sec,freq) in enumerate(sectionsFreq.most_common()):
        acc+= freq
        secsToEval.append(sec)

        if acc/total > p: #using sections that cover 80% of total
                break
    ## Get fasttext vectors for lang
    wordVectors = FastVector(vector_file='fastText_multilingual/vectors/wiki.%s.vec' % lang)
    ## Count Coocurrences of sections
    for page,secs in sections.items():
        for sec1,sec2 in combinations(secs,2):
                coOccur[sec1] = coOccur.get(sec1,{})
                coOccur[sec2] = coOccur.get(sec2,{})
                coOccur[sec1][sec2] = coOccur[sec1].get(sec2,0)
                coOccur[sec2][sec1] = coOccur[sec2].get(sec1,0)
                coOccur[sec1][sec2] += 1
                coOccur[sec2][sec1] += 1
    
    #Compute the IDF, different from working with words, sections names can just occur ones per doc
    idf = {}
    for sec in coOccur.keys():
        idf[sec] = math.log(len(sectionsFreq) / (1 + sectionsFreq[sec]))
    #compute TFIDF
    tfidf = {}
    for sec1,secs in coOccur.items():
        if (sec1 in secsToEval):
            tfidf[sec1] = {}
            for sec2,tf in secs.items():
                tfidf[sec1][sec2] = tf * idf[sec2]

    #Transform dictionary to sparse matrix
    v = DictVectorizer()
    tfidfVectors = v.fit_transform(tfidf.values())
    tfidfKeys = tfidf.keys()
    
    #Compute pairwise cosine similariry
    S = cosine_similarity(tfidfVectors)
    
    #Find most similar pairs
    np.fill_diagonal(S, -1) #'remove' diagional 
    tri_upper_diag = np.triu(S, k=0) #given that the matrix is symetric I take just thre upper triangle
    mostSimilar = np.where( tri_upper_diag > minSimilarity)
    

    indexes = {n:k for n,k in enumerate(tfidfKeys)}
    for sec1,sec2 in zip(mostSimilar[0],mostSimilar[1]):
        if coOccur[indexes[sec2]].get(indexes[sec1],0) <= maxCooccur:
            sec1Name = indexes[sec1]
            sec2Name = indexes[sec2]
            tfIdfsimilarity = tri_upper_diag[sec1][sec2]
            editDistance = editdistance.eval(sec1Name, sec2Name)
            isSubSet = (sec1Name.lower() in sec2Name.lower()) or (sec2Name.lower() in sec1Name.lower()) 
            vectorDistance = fasttextDistance(sec1Name,sec2Name,wordVectors)
            output.append({'Sec_A':indexes[sec1],'Sec_B':indexes[sec2],
                           'coOccurs':coOccur[indexes[sec2]].get(indexes[sec1],0),
                           'tfIdfSimilarity':round(tri_upper_diag[sec1][sec2],2),
                           'editDistance': editDistance,
                           'isSubSet': isSubSet,
                           'vectorDistance':vectorDistance,                           
                          })
    #save results in xls
    df = pd.DataFrame(output)
    df = df.sort_values(['tfIdfSimilarity','vectorDistance','editDistance','isSubSet'],ascending=False)
    print(df)
    df.to_excel('%sSynonyms.xls' % lang,index=False)
    print(df.corr())
    dfs[lang] = df


ru
reading word vectors from fastText_multilingual/vectors/wiki.ru.vec


  (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))


                                       Sec_A  \
78                Состав сельского поселения   
154                    Международная карьера   
72                                    Подвиг   
138                                  В ролях   
73                                       СМИ   
245                          Основные работы   
176                            Роли в театре   
267                        Ординарии епархии   
67                          Интересные факты   
74                                   Фамилия   
112                    Адрес местного совета   
282                           Основные труды   
218                         Известные жители   
253                                Видеоклип   
283                           Основные труды   
238                          Жизнь и карьера   
44               Известные жители и уроженцы   
205                          Избранные труды   
203                          Избранные труды   
192                         Награды и зв

                 coOccurs  editDistance  isSubSet  tfIdfSimilarity  \
coOccurs         1.000000      0.137955 -0.076813        -0.080079   
editDistance     0.137955      1.000000 -0.247429        -0.250520   
isSubSet        -0.076813     -0.247429  1.000000         0.082985   
tfIdfSimilarity -0.080079     -0.250520  0.082985         1.000000   
vectorDistance  -0.174302     -0.489959  0.478896         0.415833   

                 vectorDistance  
coOccurs              -0.174302  
editDistance          -0.489959  
isSubSet               0.478896  
tfIdfSimilarity        0.415833  
vectorDistance         1.000000  


In [16]:
df.to_csv('%sSynonyms.csv' % lang)


## Statrified sample

* Here, we repeat the same procedure, but generating an stratified sample considering tfidf and fasttext similarity.
* Buckets are defined by rounding to the first decimal of those metrics (ex. tfidfSimilariry = 0.11231, is in the bucket tfidfSimilarity 0.1, for each metric whe consider ten buckets 0.1, 0.2 ..., 1 and the size of each bucket is defined by the bucketSize parameter, in this example we use bucketSize=50

In [13]:
dfs = {}
for lang in langs:
    print(lang)
    toDF = []
    coOccur = {}
    sectionsAll = []
    #Load Sections
    with open('../gap/multiLanguageFromDumpsSec/sections-articles_%s.json' % lang) as f: 
        sections = json.load(f)
    ##get most frequent sections
    for secs in sections.values():
        for secName in secs:
                cleanSection = re.sub('[=\]\[]','',secName).strip()
                if cleanSection:#check string is not empty
                    sectionsAll.append(cleanSection.strip())
    sectionsFreq = Counter(sectionsAll)
    total = sum(sectionsFreq.values())
    acc =0
    secsToEval = []
    for n,(sec,freq) in enumerate(sectionsFreq.most_common()):
        acc+= freq
        secsToEval.append(sec)

        if acc/total > p: #using sections that cover 80% of total
                break
    ## Get fasttext vectors for lang
    wordVectors = FastVector(vector_file='fastText_multilingual/vectors/wiki.%s.vec' % lang)
    ## Count Coocurrences of sections
    for page,secs in sections.items():
        for sec1,sec2 in combinations(secs,2):
                coOccur[sec1] = coOccur.get(sec1,{})
                coOccur[sec2] = coOccur.get(sec2,{})
                coOccur[sec1][sec2] = coOccur[sec1].get(sec2,0)
                coOccur[sec2][sec1] = coOccur[sec2].get(sec1,0)
                coOccur[sec1][sec2] += 1
                coOccur[sec2][sec1] += 1
    
    #Compute the IDF, different from working with words, sections names can just occur ones per doc
    idf = {}
    for sec in coOccur.keys():
        idf[sec] = math.log(len(sectionsFreq) / (1 + sectionsFreq[sec]))
    #compute TFIDF
    tfidf = {}
    for sec1,secs in coOccur.items():
        if (sec1 in secsToEval):
            tfidf[sec1] = {}
            for sec2,tf in secs.items():
                tfidf[sec1][sec2] = tf * idf[sec2]

    #Transform dictionary to sparse matrix
    v = DictVectorizer()
    tfidfVectors = v.fit_transform(tfidf.values())
    tfidfKeys = tfidf.keys()
    
    #Compute pairwise cosine similariry
    S = cosine_similarity(tfidfVectors)
    
    #Get the upper matrix, and remove diagonal
    np.fill_diagonal(S, -2) #'remove' diagional 
    tri_upper_diag = np.triu(S, k=0) #given that the matrix is symetric I take just thre upper triangle
    
    indexes = {n:k for n,k in enumerate(tfidfKeys)}
    for x in range(tri_upper_diag.shape[0]):
        for y in range(tri_upper_diag.shape[1]):
            if coOccur[indexes[x]].get(indexes[y],0) <= maxCooccur:
                if tri_upper_diag[x][y] > 0:
                    sec1Name = indexes[x]
                    sec2Name = indexes[y]
                    toDF.append({'Sec_A':sec1Name,'Sec_B':sec2Name,'tfIdfSimilarity':round(tri_upper_diag[x][y],1) } )
    df = pd.DataFrame(toDF)
    df['vectorDistance'] = df.apply(lambda row: round(fasttextDistance(row['Sec_A'],row['Sec_B'],wordVectors),1), axis=1)

    dfStratifiedTF = df.groupby('tfIdfSimilarity', group_keys=False).apply(lambda x: x.sample(min(len(x),bucketSize)))
    dfStratifiedVec =  df.groupby('tfIdfSimilarity', group_keys=False).apply(lambda x: x.sample(min(len(x),bucketSize)))
    dfStratified = pd.concat([dfStratifiedTF,dfStratifiedVec]).drop_duplicates()
    dfStratified['editDistance'] = dfStratified.apply(lambda row: editdistance.eval(row['Sec_A'],row['Sec_B']), axis=1)
    dfStratified['coOccurs'] = dfStratified.apply(lambda row: coOccur[row['Sec_A']].get(row['Sec_B'],0), axis=1)
    dfStratified['isSubSet'] = dfStratified.apply(lambda row:(row['Sec_A'].lower() in row['Sec_B'].lower()) or (row['Sec_B'].lower() in row['Sec_A'].lower()),axis=1)
    dfStratified.to_excel('%sSynonyms_Stratified.xls' % lang,index=False)

es
reading word vectors from fastText_multilingual/vectors/wiki.es.vec
en
reading word vectors from fastText_multilingual/vectors/wiki.en.vec


  (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
  (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))


ar
reading word vectors from fastText_multilingual/vectors/wiki.ar.vec
ja
reading word vectors from fastText_multilingual/vectors/wiki.ja.vec
ru
reading word vectors from fastText_multilingual/vectors/wiki.ru.vec
fr
reading word vectors from fastText_multilingual/vectors/wiki.fr.vec


# Classification task

Using the manually labeled pairs, we train a set of classifiers

### Example with russian

#### Join the data

In [4]:
import pandas as pd 

features = pd.read_excel('ruSynonyms_Stratified.xls')
labels = pd.read_csv('Synonym mapping - ru-r1.csv')

In [6]:
ru = pd.merge(labels, features, how='left', left_on=['Section title 1', 'Section title 2'],right_on=['Sec_A', 'Sec_B'])
ru = ru[['Section title 1', 'Section title 2','Relation - Assessment 1', 'coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]

#### Machine Learning model

In [55]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [56]:
Y = ru['Relation - Assessment 1']
X = ru[['coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]

In [57]:
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)


#### XGBClassifier

In [58]:
# fit model no training data
ruModel = XGBClassifier()
ruModel.fit(X_train, y_train)
# make predictions for test data
y_pred = ruModel.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
confusion_matrix(y_test, y_pred)

Accuracy: 87.41%


array([[212,   2,   2],
       [  7,   1,   7],
       [ 13,   4,  30]])

In [12]:
list(zip(X_test.columns,ruModel.feature_importances_))

[('coOccurs', 0.087499999),
 ('editDistance', 0.40000001),
 ('isSubSet', 0.00125),
 ('tfIdfSimilarity', 0.23875),
 ('vectorDistance', 0.27250001)]

#### Random Forest

TODO: gridSearch for set find parameters

In [67]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

ruClf = RandomForestClassifier(n_estimators=40)
ruClf.fit(X_train, y_train)
y_pred = ruClf.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
confusion_matrix(y_test, y_pred)

Accuracy: 87.05%


array([[210,   4,   2],
       [  7,   2,   6],
       [ 10,   7,  30]])

In [68]:
list(zip(X_test.columns,ruClf.feature_importances_))

[('coOccurs', 0.074764465232192212),
 ('editDistance', 0.19436460115881357),
 ('isSubSet', 0.055364454750572459),
 ('tfIdfSimilarity', 0.2008658834259458),
 ('vectorDistance', 0.474640595432476)]

### Example with French

In [79]:
def translateLabelsFrToEn(label):
    if label =='différent':
        return 'not related'
    if label == 'liés':
        return 'related'
    if label =='identique':
        return 'synonym'
    
features = pd.read_excel('frSynonyms_Stratified.xls')
labels = pd.read_csv('Synonym mapping - fr-r1.csv')

fr = pd.merge(labels, features, how='left', left_on=['Titre de section 1', 'Titre de section 2'],right_on=['Sec_A', 'Sec_B'])
fr['Relation - Assessment 1'] = fr['Relation - Estimation 1'].map(translateLabelsFrToEn)
fr = fr[['Titre de section 1', 'Titre de section 2','Relation - Assessment 1', 'coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]



#### Use Russian Model in French

In [80]:
Y_fr = fr['Relation - Assessment 1']
X_fr = fr[['coOccurs','editDistance','isSubSet','tfIdfSimilarity','vectorDistance']]
y_pred_fr = ruClf.predict(X_fr)
# evaluate predictions
accuracy = accuracy_score(Y_fr, y_pred_fr )
print("Accuracy: %.2f%%" % (accuracy * 100.0))
confusion_matrix(Y_fr, y_pred_fr)

Accuracy: 85.19%


array([[626,  12,  19],
       [ 25,  15,  43],
       [  7,  10,  26]])

#### Train a model in Frenc and test in Russsian

In [81]:
frClf = RandomForestClassifier(n_estimators=40)
frClf.fit(X_fr, Y_fr)
y_pred = frClf.predict(X) # X is the russian data
# evaluate predictions
accuracy = accuracy_score(y_pred, Y)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
confusion_matrix(Y, y_pred)

Accuracy: 80.60%


array([[647,   9,   3],
       [ 42,  18,   5],
       [ 39,  65,  12]])

In [53]:
list(zip(X.columns,frClf.feature_importances_))

[('coOccurs', 0.14157460368812624),
 ('editDistance', 0.26964823551097561),
 ('isSubSet', 0.021122694673829621),
 ('tfIdfSimilarity', 0.33671485982281807),
 ('vectorDistance', 0.23093960630425051)]

In [54]:
labels.columns

Index(['Titre de section 1', 'Titre de section 2', 'Relation - Estimation 1'], dtype='object')

In [63]:
fr

Unnamed: 0,Titre de section 1,Titre de section 2,Relation - Estimation 1,coOccurs,editDistance,isSubSet,tfIdfSimilarity,vectorDistance
0,Carrière en club,Transferts,différent,0,14,False,0.2,0.3
1,Culture locale et patrimoine,Personnalités liées,différent,3,23,False,0.6,0.3
2,Nom,Structure,différent,3,9,False,0.3,0.1
3,Compléments,Distribution et habitat,différent,0,21,False,0.6,0.2
4,Équipes,Médaillés,différent,1,7,False,0.2,0.4
5,Théâtre,Apparitions,différent,1,10,False,0.1,0.2
6,Dans la culture populaire,Vie,différent,0,23,False,0.3,0.4
7,Pseudonyme,Géologie,différent,0,8,False,0.1,0.1
8,Note,Anecdotes,différent,2,6,False,0.2,0.3
9,Sélections,Tour préliminaire,liés,0,15,False,0.1,0.2


In [64]:
ru


Unnamed: 0,Section title 1,Section title 2,Relation - Assessment 1,coOccurs,editDistance,isSubSet,tfIdfSimilarity,vectorDistance
0,История изучения,Охранный статус,not related,1,16,False,0.2,0.2
1,Отзывы и критика,Рецензии,related,0,14,False,0.6,0.7
2,Результаты выступлений,Возможности,not related,0,18,False,0.2,0.3
3,Отзывы,Оценки,related,0,5,False,0.7,0.5
4,Краткая биография,Звания,not related,0,14,False,0.6,0.3
5,Состав сельского поселения,Коммуны кантона,related,0,22,False,0.6,0.4
6,Флора и фауна,Биология,related,0,11,False,0.2,0.5
7,Строение,Внешний вид,not related,0,10,False,0.4,0.4
8,Некоторые научные работы,Международная карьера,not related,0,20,False,0.0,0.3
9,Сочинения,Родословная,not related,2,8,False,0.3,0.3
