# Section Recommendation / Inter-lingual approach

This notebook implements section recommendation based on the Section Alignments. 
Given an article in a target language T, it retrieves through Wikidata the sections of the same article in other languages and provides recommendations.

In [1]:
import pandas as pd
import requests
import re

# Config: Wikipedia language codes looked up for each article
# (used as the default `target` list of getPages).
suportedLangs = ['fr','en','es','ja','ar','ru']

In [470]:
# Load the section-alignment sheets into a nested dict:
#   rec[source_language][target_language] -> DataFrame indexed by 'secFrom'
# The two language codes are read from fixed positions in each file name
# (characters 0-1 and 6-7).

import os
rec = {}
for f in os.listdir('recSheetsTSV/'):
    lang1, lang2 = f[0:2], f[6:8]
    # make sure the per-source-language dict exists before reading the sheet
    perSource = rec.setdefault(lang1, {})
    sheet = pd.read_csv('recSheetsTSV/'+f,sep='\t')
    sheet.set_index('secFrom',inplace=True)
    perSource[lang2] = sheet




In [473]:
## Section parser
# Matches level-2 headings only ("== Title =="). The surrounding characters are
# checked with lookarounds instead of being consumed, so a heading that starts
# on the line right after another heading is still found.
sections_RE = re.compile(r'(?<!=)==(?P<title>[^=\n\r]+)==(?!=)')
def extract_sections(text):
    """
    text: wikitext of a page
    yields the title of every level-2 section, stripped of surrounding whitespace,
    in order of appearance; deeper headings (===, ====, ...) are ignored
    """
    for m in sections_RE.finditer(text):
        yield m.group('title').strip()

#Get articles
def getContent(title,lang):
    """
    Fetch the wikitext of a Wikipedia page through the MediaWiki API.

    title: page title in the `lang` Wikipedia
    lang: language code, used as the subdomain of wikipedia.org
    returns the content of the first revision returned by the API, as a string
    raises requests.HTTPError on an HTTP error status, and KeyError when the API
    returns no revision for the title (e.g. the page does not exist)
    """
    # Pass the query as `params` so requests URL-encodes the title; formatting it
    # straight into the URL breaks on titles containing '&', '#', '+', ...
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'formatversion': 2,
        'titles': title,
    }
    response = requests.get("https://%s.wikipedia.org/w/api.php" % lang, params=params, timeout=30)
    response.raise_for_status()
    page = response.json()['query']['pages'][0]
    if 'revisions' not in page:
        raise KeyError("no revisions returned for %r on %s.wikipedia.org" % (title, lang))
    return page['revisions'][0]['content']

def getPages(title,lang,target=suportedLangs):
    """
    Look up, through Wikidata sitelinks, the titles of the same article in other wikis.

    title: page title in target language
    lang: target language
    target: list of language codes to look for
    returns a dictionary language_code:page_title holding only the languages
    of `target` in which the article exists
    raises requests.HTTPError on an HTTP error status, and KeyError when Wikidata
    returns no sitelinks for the title
    """
    # Pass the query as `params` so requests URL-encodes the title.
    params = {
        'action': 'wbgetentities',
        'sites': lang+'wiki',
        'titles': title,
        'props': 'sitelinks',
        'format': 'json',
    }
    response = requests.get("https://www.wikidata.org/w/api.php", params=params, timeout=30)
    response.raise_for_status()
    # Only the first entity of the response is used.
    entity = list(response.json()['entities'].values())[0]
    links = entity.get('sitelinks')
    if not links:
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        raise KeyError("no Wikidata sitelinks found for %r on %swiki" % (title, lang))
    output = {}
    for t in target:
        if t+'wiki' in links:
            output[t] = links[t+'wiki']['title']
    return output

def getAllLangs(title,lang):
    """
    Collect the section titles of an article in every language version found.

    title: page title in target language
    lang: target language
    returns a dictionary language_code:list_of_sections_in_that_language
    (each language and page title is printed as it is processed)
    """
    pagesByLang = getPages(title,lang)
    sectionsByLang = {}
    for code in pagesByLang:
        pageTitle = pagesByLang[code]
        print(code,pageTitle)
        wikitext = getContent(pageTitle,code)
        sectionsByLang[code] = [heading for heading in extract_sections(wikitext)]
    return sectionsByLang


In [442]:
import networkx as nx
import json

#For each section in language X, we have N possible mappings, each with a certain probability.
#Ex: 'Vida temprana' in Spanish maps to 'Early Life,p=0.9','Early Years,p=.8','References,p=0.3' in English
#Given two languages X and Y, we want to find the most similar clusters considering the mapped sections
def getMostSimilarClusters(c1,c2): 
    """
    Greedily pair the clusters of c1 with the clusters of c2 by decreasing similarity.

    c1: list of dictionaries, one per section in language 1 (already mapped to the
        target language), each of the form mapped_section:probability
    c2: same structure for the sections in language 2
    returns a list of (position_in_c1, position_in_c2) tuples; every position appears
    at most once, so the list has min(len(c1), len(c2)) pairs

    Similarity is the dot product of the two dictionaries over their keys. Pairs are
    taken from the highest score down, skipping any pair with an already-used side;
    ties are broken by (position_in_c1, position_in_c2) order.
    """
    scored = []
    for pos1,dict_1 in enumerate(c1):
        for pos2,dict_2 in enumerate(c2):
            dot_product = sum(dict_1[key]*dict_2.get(key, 0) for key in dict_1)
            scored.append((dot_product,pos1,pos2))
    # list.sort is stable, so equal scores keep their (pos1,pos2) order
    scored.sort(key=lambda x: x[0],reverse=True)
    used1 = set()
    used2 = set()
    mostSimilar = []
    for _,pos1,pos2 in scored:
        # Check both sides BEFORE marking either as used, so a still-free cluster
        # is not discarded just because its best partner was already paired.
        if pos1 in used1 or pos2 in used2:
            continue
        used1.add(pos1)
        used2.add(pos2)
        mostSimilar.append((pos1,pos2))
    return mostSimilar

#Merge the evidence of a second language into a template:
#each cluster of c1 is paired with its most similar cluster of c2,
#and the probabilities of the sections they share are added up in c1
def updateClusterWeights(c1,c2): #c1 is the template to be updated
    """
    c1: list of section dictionaries for language 1 (already mapped to the target language),
        each mapped_section:probability; modified in place
    c2: list of section dictionaries for language 2 (already mapped to the target language)
    returns c1, where every section shared by a matched pair of clusters had the
    probability from c2 added to it; sections present only in c2 are not added
    """
    for idx1,idx2 in getMostSimilarClusters(c1,c2):
        template = c1[idx1]
        other = c2[idx2]
        shared = template.keys() & other.keys()
        for name in shared:
            template[name] = template[name] + other[name]
    return c1
        

In [510]:
def getRecs(title,TargetLang,verbose=False):
    '''
    Recommend sections for an article based on its versions in other languages.

    title: Article to get recommendations (ex:'Quilombo')
    TargetLang: Language of the article (ex:'en')
    verbose: return explanations
    returns a JSON string with the list of recommended section titles; with
    verbose=True, a dict holding that JSON string under 'Recommendations' and a
    'context' dict with 'CurrentSections' and 'SectionsInOtherLanguages'.
    The list is empty when the article exists in no other language.
    raises KeyError when TargetLang is not among the languages returned by
    getAllLangs, or when `rec` has no alignment from a source language to TargetLang
    '''
    # get sections in all Languages
    secs = getAllLangs(title,TargetLang)
    currentSections = secs[TargetLang]
    # source languages, kept in the order getAllLangs returned them so that the
    # merge order (and therefore the result) is the same on every run
    sourceLangs = [lang for lang in secs if lang != TargetLang]
    finalRecs = []
    if sourceLangs: # nothing to recommend from when there is no other language version
        #use the language with more sections as template (ties: highest language code)
        templateLang = max(sourceLangs, key=lambda lang: (len(secs[lang]),lang))
        secsMapped = {}
        #For all source languages S, take all sections in S and map to the target language T, with its probability
        for lang in sourceLangs:
            df = rec[lang][TargetLang] # alignment table loaded at module level
            secsMapped[lang] = []
            for sec in secs[lang]:
                tmp = df[df.index == sec][['langTo','prob']]
                secsMapped[lang].append(dict(zip(tmp.langTo,tmp.prob)))
        templateRec = secsMapped[templateLang]
        #Update the template using the remaining languages
        for lang in sourceLangs:
            if lang != templateLang:
                templateRec = updateClusterWeights(templateRec,secsMapped[lang])
        for cluster in templateRec:
            if cluster: #check cluster is not empty
                # three most probable target-language titles for this cluster
                candidates = [recTuple[0] for recTuple in sorted(cluster.items(), key=lambda x: x[1],reverse=True)][:3]
                # recommend the best one only if none of them is already in the article
                if not (set(candidates) & set(currentSections)):
                    finalRecs.append(candidates[0])
    # drop duplicates while keeping the order in which they were found
    recommendations = json.dumps(list(dict.fromkeys(finalRecs)))
    if verbose:
        output = {}
        output['context'] = {}
        output['Recommendations'] = recommendations
        output['context']['CurrentSections'] = currentSections
        output['context']['SectionsInOtherLanguages'] = {lang: s for lang,s in secs.items() if lang != TargetLang}
    else:
        output = recommendations
    return output

In [507]:
 getRecs('Gabriel_García_Márquez','en')

ar غابرييل غارثيا ماركيث
fr Gabriel García Márquez
ru Гарсиа Маркес, Габриэль
en Gabriel García Márquez
ja ガブリエル・ガルシア＝マルケス
es Gabriel García Márquez


'["Works"]'

In [508]:
 getRecs('Quilombo','en')

ru Киломбу
en Quilombo
fr Quilombo (esclavage)
es Quilombo


'["Economy", "Infrastructure", "Organization"]'

In [511]:
getRecs('Axioma','es',verbose=True)

ar بديهية (فلسفة)
fr Axiome
ru Аксиома
en Axiom
ja 公理
es Axioma


{'Recommendations': '["Historia"]',
 'context': {'CurrentSections': ['Etimología',
   'Legado helénico',
   'Lógica',
   'Matemáticas',
   'Limitaciones de los sistemas axiomáticos',
   'Véase también',
   'Referencias',
   'Bibliografía',
   'Enlaces externos'],
  'SectionsInOtherLanguages': {'ar': ['بديهيات', 'انظر أيضا', 'مراجع'],
   'en': ['Etymology',
    'Historical development',
    'Mathematical logic',
    'See also',
    'References',
    'Further reading',
    'External links'],
   'fr': ['Histoire', 'Description', 'Références', 'Voir aussi'],
   'ja': ['公理の例', '公理の必要性', '歴史', '公理の形式性', '公理の直観的・歴史的な妥当性', '脚注', '関連項目'],
   'ru': ['Назначение',
    'История',
    'Примеры',
    'См. также',
    'Литература',
    'Ссылки',
    'Примечания']}}}