# Synonym detection
This script output a list of candidates for sections 'synonyms' 
Potential synonyms must:
    * Co-occur with similar sections (measured with tfidf metric, threshold fixed in minSimilarity parameter)
    * Don't co-ocurr with between them more than a certain treshold (maxCooccur parameter)
Additioanlly, other features are added for later evaluatio
    * editdistance
    * fasttext distance

Inputs: 
    * Sections per article contained in ../gap/multiLanguageFromDumpsSec/sections-articles_lang.json, in format {articleId_1:[sec_a,sec_b...], articleId_2:[sec_x,sec_y], ..., article_n:[sec_i...]}
    
(The actual values uploaded to gdocs are generated with the .py version in this same folder)

In [8]:
import pandas as pd
import json
from collections import Counter
import gzip
import json
import itertools
import networkx as nx
from functools import reduce
from itertools import combinations
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import numpy as np
import editdistance
from fastText_multilingual.fasttext import FastVector
import re

def fasttextDistance(sec1,sec2,vectors):
    '''
    Take two sections, create a vector for each of them summing all the words
    return cosine similarity
    '''
    sec1 = sec1.lower().split()
    sec2 = sec2.lower().split()
    sec1Vector  = np.sum([vectors[word] for word in sec1 if word in vectors],axis=0)/len(sec1)
    sec2Vector  = np.sum([vectors[word] for word in sec2 if word in vectors],axis=0)/len(sec2)
    distance  = vectors.cosine_similarity(sec1Vector,sec2Vector)
    if not isinstance(distance,float): #when at least one of the sections is not the vectorial space, the result is 'nan'
        return 0
    else:
        return vectors.cosine_similarity(sec1Vector,sec2Vector)


## Parameters

In [12]:
#Parameters 

langs=['es','en','ar','ja','ru','fr']##define languages
#langs = ['en','es']
p = 0.75 #percentage of sections occurrences to be corevered 
maxCooccur = 3 #Maximum of coocurrences between pair of sections to be considered synonyms
minSimilarity = .6# Miminum cosine similarity to be consider synonyms
bucketSize = 50 #for stratified sample


## Find and save candidates

In [3]:
dfs = {}
for lang in langs:
    print(lang)
    output = []
    coOccur = {}
    sectionsAll = []
    #Load Sections
    with open('../gap/multiLanguageFromDumpsSec/sections-articles_%s.json' % lang) as f: 
        sections = json.load(f)
    ##get most frequent sections
    for secs in sections.values():
        for secName in secs:
                cleanSection = re.sub('[=\]\[]','',secName).strip()
                if cleanSection: #check that string is not empty
                    sectionsAll.append(cleanSection.strip())
    sectionsFreq = Counter(sectionsAll)
    total = sum(sectionsFreq.values())
    acc =0
    secsToEval = []
    for n,(sec,freq) in enumerate(sectionsFreq.most_common()):
        acc+= freq
        secsToEval.append(sec)

        if acc/total > p: #using sections that cover 80% of total
                break
    ## Get fasttext vectors for lang
    wordVectors = FastVector(vector_file='fastText_multilingual/vectors/wiki.%s.vec' % lang)
    ## Count Coocurrences of sections
    for page,secs in sections.items():
        for sec1,sec2 in combinations(secs,2):
                coOccur[sec1] = coOccur.get(sec1,{})
                coOccur[sec2] = coOccur.get(sec2,{})
                coOccur[sec1][sec2] = coOccur[sec1].get(sec2,0)
                coOccur[sec2][sec1] = coOccur[sec2].get(sec1,0)
                coOccur[sec1][sec2] += 1
                coOccur[sec2][sec1] += 1
    
    #Compute the IDF, different from working with words, sections names can just occur ones per doc
    idf = {}
    for sec in coOccur.keys():
        idf[sec] = math.log(len(sectionsFreq) / (1 + sectionsFreq[sec]))
    #compute TFIDF
    tfidf = {}
    for sec1,secs in coOccur.items():
        if (sec1 in secsToEval):
            tfidf[sec1] = {}
            for sec2,tf in secs.items():
                tfidf[sec1][sec2] = tf * idf[sec2]

    #Transform dictionary to sparse matrix
    v = DictVectorizer()
    tfidfVectors = v.fit_transform(tfidf.values())
    tfidfKeys = tfidf.keys()
    
    #Compute pairwise cosine similariry
    S = cosine_similarity(tfidfVectors)
    
    #Find most similar pairs
    np.fill_diagonal(S, -1) #'remove' diagional 
    tri_upper_diag = np.triu(S, k=0) #given that the matrix is symetric I take just thre upper triangle
    mostSimilar = np.where( tri_upper_diag > minSimilarity)
    

    indexes = {n:k for n,k in enumerate(tfidfKeys)}
    for sec1,sec2 in zip(mostSimilar[0],mostSimilar[1]):
        if coOccur[indexes[sec2]].get(indexes[sec1],0) <= maxCooccur:
            sec1Name = indexes[sec1]
            sec2Name = indexes[sec2]
            tfIdfsimilarity = tri_upper_diag[sec1][sec2]
            editDistance = editdistance.eval(sec1Name, sec2Name)
            isSubSet = (sec1Name.lower() in sec2Name.lower()) or (sec2Name.lower() in sec1Name.lower()) 
            vectorDistance = fasttextDistance(sec1Name,sec2Name,wordVectors)
            output.append({'Sec_A':indexes[sec1],'Sec_B':indexes[sec2],
                           'coOccurs':coOccur[indexes[sec2]].get(indexes[sec1],0),
                           'tfIdfSimilarity':round(tri_upper_diag[sec1][sec2],2),
                           'editDistance': editDistance,
                           'isSubSet': isSubSet,
                           'vectorDistance':vectorDistance,                           
                          })
    #save results in xls
    df = pd.DataFrame(output)
    df = df.sort_values(['tfIdfSimilarity','vectorDistance','editDistance','isSubSet'],ascending=False)
    print(df)
    df.to_excel('%sSynonyms.xls' % lang,index=False)
    print(df.corr())
    dfs[lang] = df


en
reading word vectors from fastText_multilingual/vectors/wiki.en.vec


  (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))


                           Sec_A                             Sec_B  coOccurs  \
1366                       Cause                            Causes         0   
2437                 Stud career                       Stud record         0   
1370             Adverse effects                      Side effects         0   
739                Junction list               Major intersections         1   
2283                Music career                    Musical career         0   
733   Formats and track listings        Track listings and formats         0   
1546                      Battle                        The battle         0   
465                       Sequel                           Sequels         1   
2625             Works published                         New books         0   
734   Formats and track listings                    Track listings         1   
1932              Certifications          Certifications and sales         0   
625            Managerial career        

                 coOccurs  editDistance  isSubSet  tfIdfSimilarity  \
coOccurs         1.000000      0.010577 -0.107733        -0.086038   
editDistance     0.010577      1.000000 -0.179947        -0.132228   
isSubSet        -0.107733     -0.179947  1.000000         0.261901   
tfIdfSimilarity -0.086038     -0.132228  0.261901         1.000000   
vectorDistance  -0.188042     -0.171692  0.533698         0.389103   

                 vectorDistance  
coOccurs              -0.188042  
editDistance          -0.171692  
isSubSet               0.533698  
tfIdfSimilarity        0.389103  
vectorDistance         1.000000  
es
reading word vectors from fastText_multilingual/vectors/wiki.es.vec
                                           Sec_A  \
66                                      Miembros   
234                                  Comentarios   
139                              Cabeza de serie   
62                             Ciudades hermanas   
126                       Premios y nominaci

## Statrified sample

* Here, we repeat the same procedure, but generating an stratified sample considering tfidf and fasttext similarity.
* Buckets are defined by rounding to the first decimal of those metrics (ex. tfidfSimilariry = 0.11231, is in the bucket tfidfSimilarity 0.1, for each metric whe consider ten buckets 0.1, 0.2 ..., 1 and the size of each bucket is defined by the bucketSize parameter, in this example we use bucketSize=50

In [13]:
dfs = {}
for lang in langs:
    print(lang)
    toDF = []
    coOccur = {}
    sectionsAll = []
    #Load Sections
    with open('../gap/multiLanguageFromDumpsSec/sections-articles_%s.json' % lang) as f: 
        sections = json.load(f)
    ##get most frequent sections
    for secs in sections.values():
        for secName in secs:
                cleanSection = re.sub('[=\]\[]','',secName).strip()
                if cleanSection:#check string is not empty
                    sectionsAll.append(cleanSection.strip())
    sectionsFreq = Counter(sectionsAll)
    total = sum(sectionsFreq.values())
    acc =0
    secsToEval = []
    for n,(sec,freq) in enumerate(sectionsFreq.most_common()):
        acc+= freq
        secsToEval.append(sec)

        if acc/total > p: #using sections that cover 80% of total
                break
    ## Get fasttext vectors for lang
    wordVectors = FastVector(vector_file='fastText_multilingual/vectors/wiki.%s.vec' % lang)
    ## Count Coocurrences of sections
    for page,secs in sections.items():
        for sec1,sec2 in combinations(secs,2):
                coOccur[sec1] = coOccur.get(sec1,{})
                coOccur[sec2] = coOccur.get(sec2,{})
                coOccur[sec1][sec2] = coOccur[sec1].get(sec2,0)
                coOccur[sec2][sec1] = coOccur[sec2].get(sec1,0)
                coOccur[sec1][sec2] += 1
                coOccur[sec2][sec1] += 1
    
    #Compute the IDF, different from working with words, sections names can just occur ones per doc
    idf = {}
    for sec in coOccur.keys():
        idf[sec] = math.log(len(sectionsFreq) / (1 + sectionsFreq[sec]))
    #compute TFIDF
    tfidf = {}
    for sec1,secs in coOccur.items():
        if (sec1 in secsToEval):
            tfidf[sec1] = {}
            for sec2,tf in secs.items():
                tfidf[sec1][sec2] = tf * idf[sec2]

    #Transform dictionary to sparse matrix
    v = DictVectorizer()
    tfidfVectors = v.fit_transform(tfidf.values())
    tfidfKeys = tfidf.keys()
    
    #Compute pairwise cosine similariry
    S = cosine_similarity(tfidfVectors)
    
    #Get the upper matrix, and remove diagonal
    np.fill_diagonal(S, -2) #'remove' diagional 
    tri_upper_diag = np.triu(S, k=0) #given that the matrix is symetric I take just thre upper triangle
    
    indexes = {n:k for n,k in enumerate(tfidfKeys)}
    for x in range(tri_upper_diag.shape[0]):
        for y in range(tri_upper_diag.shape[1]):
            if coOccur[indexes[x]].get(indexes[y],0) <= maxCooccur:
                if tri_upper_diag[x][y] > 0:
                    sec1Name = indexes[x]
                    sec2Name = indexes[y]
                    toDF.append({'Sec_A':sec1Name,'Sec_B':sec2Name,'tfIdfSimilarity':round(tri_upper_diag[x][y],1) } )
    df = pd.DataFrame(toDF)
    df['vectorDistance'] = df.apply(lambda row: round(fasttextDistance(row['Sec_A'],row['Sec_B'],wordVectors),1), axis=1)

    dfStratifiedTF = df.groupby('tfIdfSimilarity', group_keys=False).apply(lambda x: x.sample(min(len(x),bucketSize)))
    dfStratifiedVec =  df.groupby('tfIdfSimilarity', group_keys=False).apply(lambda x: x.sample(min(len(x),bucketSize)))
    dfStratified = pd.concat([dfStratifiedTF,dfStratifiedVec]).drop_duplicates()
    dfStratified['editDistance'] = dfStratified.apply(lambda row: editdistance.eval(row['Sec_A'],row['Sec_B']), axis=1)
    dfStratified['coOccurs'] = dfStratified.apply(lambda row: coOccur[row['Sec_A']].get(row['Sec_B'],0), axis=1)
    dfStratified['isSubSet'] = dfStratified.apply(lambda row:(row['Sec_A'].lower() in row['Sec_B'].lower()) or (row['Sec_B'].lower() in row['Sec_A'].lower()),axis=1)
    dfStratified.to_excel('%sSynonyms_Stratified.xls' % lang,index=False)

es
reading word vectors from fastText_multilingual/vectors/wiki.es.vec
en
reading word vectors from fastText_multilingual/vectors/wiki.en.vec


  (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
  (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))


ar
reading word vectors from fastText_multilingual/vectors/wiki.ar.vec
ja
reading word vectors from fastText_multilingual/vectors/wiki.ja.vec
ru
reading word vectors from fastText_multilingual/vectors/wiki.ru.vec
fr
reading word vectors from fastText_multilingual/vectors/wiki.fr.vec
