# Synonym detection
This script outputs a list of candidate section 'synonyms'.
Potential synonyms must:
    * Co-occur with similar sections (measured with tfidf metric, threshold fixed in minSimilarity parameter)
    * Not co-occur with each other more than a certain threshold (maxCooccur parameter)
Additionally, other features are added for later evaluation:
    * editdistance
    * fasttext distance

Inputs: 
    * Sections per article contained in ../gap/multiLanguageFromDumpsSec/sections-articles_lang.json, in format {articleId_1:[sec_a,sec_b...], articleId_2:[sec_x,sec_y], ..., article_n:[sec_i...]}
    
(The actual values uploaded to gdocs are generated with the .py version in this same folder)

In [1]:
import pandas as pd
import json
from collections import Counter
import gzip
import json
import itertools
import networkx as nx
from functools import reduce
from itertools import combinations
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
import numpy as np
import editdistance
from fastText_multilingual.fasttext import FastVector
import re

def fasttextDistance(sec1,sec2,vectors):
    '''
    Cosine similarity between two section names in a word-vector space.

    Each section name is lower-cased and split on whitespace; the vectors of
    the words found in `vectors` are summed into one vector per section.

    sec1, sec2: section names (strings).
    vectors: object supporting `word in vectors`, `vectors[word]` and
        `vectors.cosine_similarity(a, b)`.

    Returns the cosine similarity, or 0.0 when it is undefined (a section has
    no word in the vector space, or a summed vector has zero norm).
    '''
    words1 = [word for word in sec1.lower().split() if word in vectors]
    words2 = [word for word in sec2.lower().split() if word in vectors]
    if not words1 or not words2:
        # No known word on at least one side: the similarity is undefined.
        return 0.0
    # Cosine similarity is scale-invariant, so the plain sum is enough; no
    # division by the word count is needed (it raised ZeroDivisionError on
    # empty section names).
    sec1Vector = sum(vectors[word] for word in words1)
    sec2Vector = sum(vectors[word] for word in words2)
    similarity = vectors.cosine_similarity(sec1Vector,sec2Vector)
    # NaN is itself a float, so the type check alone does not catch it.
    if not isinstance(similarity,float) or math.isnan(similarity):
        return 0.0
    return similarity


## Parameters

In [2]:
# Parameters

# Languages to process. Full list kept for reference:
# langs=['es','en','ar','ja','ru','fr']
langs = ["en", "es"]
# Fraction of all section occurrences that the evaluated sections must cover.
p = 0.75
# Maximum number of co-occurrences a pair of sections may have and still be
# considered synonyms.
maxCooccur = 3
# Minimum cosine similarity for a pair of sections to be considered synonyms.
minSimilarity = 0.6

## Find and save candidates

In [None]:
# Per-language candidate tables, keyed by language code.
dfs = {}
# Characters stripped from raw section titles ('=', '[' and ']').
sectionMarkup = re.compile(r'[=\]\[]')
# Column order of the result table; declared up front so that an empty
# result still yields a frame that can be sorted and saved.
outputColumns = ['Sec_A','Sec_B','coOccurs','tfIdfSimilarity',
                 'editDistance','isSubSet','vectorDistance']
for lang in langs:
    print(lang)
    output = []
    #Load Sections
    with open('../gap/multiLanguageFromDumpsSec/sections-articles_%s.json' % lang,
              encoding='utf-8') as f:
        sections = json.load(f)
    # Clean every title once, so that frequencies, co-occurrences, IDF and the
    # final output all use the same keys.
    cleanSections = {page: [sectionMarkup.sub('',secName).strip() for secName in secs]
                     for page,secs in sections.items()}

    ##get most frequent sections
    sectionsFreq = Counter(sec for secs in cleanSections.values() for sec in secs)
    total = sum(sectionsFreq.values())
    acc = 0
    secsToEval = set() #set: it is only used for membership tests below
    for sec,freq in sectionsFreq.most_common():
        acc += freq
        secsToEval.add(sec)
        if acc/total > p: #stop once the selected sections cover a fraction p of all occurrences
            break

    ## Get fasttext vectors for lang
    wordVectors = FastVector(vector_file='fastText_multilingual/vectors/wiki.%s.vec' % lang)

    ## Count Coocurrences of sections (symmetric: both directions are stored)
    coOccur = {}
    for secs in cleanSections.values():
        for sec1,sec2 in combinations(secs,2):
            row1 = coOccur.setdefault(sec1,{})
            row2 = coOccur.setdefault(sec2,{})
            row1[sec2] = row1.get(sec2,0) + 1
            row2[sec1] = row2.get(sec1,0) + 1

    #Compute the IDF, different from working with words, sections names can just occur once per doc
    # NOTE(review): the numerator is the number of distinct section titles,
    # not the number of articles -- confirm this is the intended IDF.
    nSections = len(sectionsFreq)
    idf = {sec: math.log(nSections / (1 + sectionsFreq[sec])) for sec in coOccur}

    #compute TFIDF, only for the sections selected for evaluation
    tfidf = {sec1: {sec2: tf * idf[sec2] for sec2,tf in secs.items()}
             for sec1,secs in coOccur.items() if sec1 in secsToEval}

    #Transform dictionary to sparse matrix; row i corresponds to names[i]
    names = list(tfidf)
    tfidfVectors = DictVectorizer().fit_transform(tfidf.values())

    #Compute pairwise cosine similarity
    S = cosine_similarity(tfidfVectors)

    #Find most similar pairs
    np.fill_diagonal(S, -1) #'remove' diagonal (self-similarity)
    tri_upper_diag = np.triu(S, k=0) #the matrix is symmetric, so only the upper triangle is needed
    mostSimilar = np.where(tri_upper_diag > minSimilarity)

    for i,j in zip(mostSimilar[0],mostSimilar[1]):
        sec1Name = names[i]
        sec2Name = names[j]
        pairCoOccurs = coOccur[sec2Name].get(sec1Name,0)
        if pairCoOccurs > maxCooccur:
            # Sections that often appear together are not interchangeable.
            continue
        lower1 = sec1Name.lower()
        lower2 = sec2Name.lower()
        output.append({'Sec_A':sec1Name,'Sec_B':sec2Name,
                       'coOccurs':pairCoOccurs,
                       'tfIdfSimilarity':round(tri_upper_diag[i][j],2),
                       'editDistance': editdistance.eval(sec1Name, sec2Name),
                       'isSubSet': (lower1 in lower2) or (lower2 in lower1),
                       'vectorDistance':fasttextDistance(sec1Name,sec2Name,wordVectors),
                      })

    #save results in xls
    df = pd.DataFrame(output, columns=outputColumns)
    df = df.sort_values(['tfIdfSimilarity','vectorDistance','editDistance','isSubSet'],ascending=False)
    print(df)
    # NOTE(review): recent pandas releases can no longer write the legacy
    # '.xls' format; the file name is kept as-is because downstream steps may
    # depend on it -- confirm before switching to '.xlsx'.
    df.to_excel('%sSynonyms.xls' % lang,index=False)
    # Correlate only the numeric/boolean features; the section-name columns
    # are strings and make corr() fail on recent pandas versions.
    print(df.select_dtypes(include=['number','bool']).corr())
    dfs[lang] = df


en
