In [193]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

In [194]:
PATH_FOLDER = 'wikispeedia_paths-and-graph/'
LINKS_DATA = PATH_FOLDER + "links.tsv"
PATH_FINISHED_DATA = PATH_FOLDER + "paths_finished.tsv"

In [195]:
links = pd.read_csv(LINKS_DATA, sep='\t', header=None, names=["linkSource", 'linkTarget'], comment='#')
path_finished = pd.read_csv(PATH_FINISHED_DATA, sep='\t', header=None, names=['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating'], comment='#')

In [196]:
from urllib.parse import unquote

def unquote_df(df, columns):
    '''Inputs:
            df: panda dataframe
            columns: string or string array containing column names to be url-decoded
        Return:
            panda dataframe with url-decoded column names
    '''
    for column in columns:
        N = len(df[column])
        for i in range(N):
            df.loc[i,column] = unquote(df.loc[i,column])
            
    return df

In [197]:
def check_character(headline, character):
    '''Inputs:
            headline: string
            list_words: string
        Return:
            1 if the character is in the headline
            0 otherwise
    '''
    if character in headline: #
        return 1 
    else:
         return 0

In [198]:
links = unquote_df(links, ['linkSource','linkTarget'])
path_finished = unquote_df(path_finished, ['path'])
#path_finished['back'] = path_finished['path'].apply(lambda x : check_character(x, '<'))
#path_finished = path_finished[path_finished['back']==0]


A partir de maintenant, uniquement mon code (avant: recopie pour run notebook proprement)

We build a directed graph with the wikipedia articles to perform a fictional random walk on it using the pagerank alorithm, which will be our normalization for the distance metric.

In [199]:
Wikigraph=nx.DiGraph()
e=zip(links['linkSource'], links['linkTarget'])
Wikigraph.add_edges_from(e)


In [200]:
#nx.draw(Wikigraph)
#plt.show()
print(list(Wikigraph.successors('Bede')))


['Abbot', 'Dante_Alighieri', 'Durham_Cathedral', 'England', 'Great_Britain', 'Hebrew_language', 'Julius_Caesar', 'Middle_Ages', 'Music', 'Paul_of_Tarsus', 'Season', 'Virgil']


In [201]:
GooglePageRank=nx.pagerank(Wikigraph, alpha=0.85)
print(GooglePageRank.get('United_States')) 
print(max(GooglePageRank.values())) #we see that the maximum value is reached for the US, which means it will be reached more times than any other article on a RW

0.00956554538310564
0.00956554538310564


In [202]:
def convert(string):
    return list(string.split(';'))

path_finished['path']=path_finished['path'].apply(lambda x:convert(x))

In [203]:
def count_a(article, goal, paths):
    '''This function counts the number of times an article was encountered on all paths with a specified goal
    Input: article encountered, goal of the paths, object with all paths
    Output: number of times the article was encountered summed on all paths with same given goal'''
    count=0
    for path in paths:
        if path[-1]==goal: #checks that the last element is the correct goal
            for art in path:
                if art==article:
                    count+=1
    return count
#note that we decide to count all the times an article could appear in the same path, because it would mean that it has a more significant value than if it just appeared once

In [204]:
from itertools import cycle

In [223]:
def count_aprime(aprime, article, goal, paths):
    count=0
    if aprime=='<' or article=='<': #the comeback sign is not part of the count
        return 0

    for path in paths:
        if path[-1]==goal:
            pathcycle=cycle(path)
            next_art=next(pathcycle)
            for _ in range(len(path)-1):
                art, next_art=next_art, next(pathcycle)
                if art==article:
                    if next_art==aprime:
                            count+=1    
    return count


In [206]:
path_finished['path'][1000]

['AT&T', 'United_States', 'Agriculture', 'Vegetable']

In [207]:

count=count_a('United_States','South_America',path_finished['path'])
count #seems to work

4

In [208]:
count_successor=count_aprime('Nature','Science','Rainbow',path_finished['path'])
print(count_successor)
#note that '<' is not included in the Wikigraph
#ATTENTION: what to do when aprime then <? remove them?
#seems to work otherwise too


2


In [233]:
def posterior_click_probability(aprime, article, goal, paths, alpha=0):
    '''Calculates the posterior click probability to reach an article given the previous article and the goal, after seeing all the data
    Input: aprime: article on which the proba is done, article:previous article, goal:final article, paths:evaluated on all those paths,
    alpha:Dirichlet parameter representing initial confidence in uniform prior distribution
    Output: posterior click probability for aprime, given article and goal'''
    
    k_a=len(list(Wikigraph.successors(article)))#number of out-degree links for article
    if count_a(article, goal, paths)!=0 and count_aprime(aprime, article, goal, paths):
        proba=(count_aprime(aprime, article, goal, paths)+alpha)/(count_a(article, goal, paths)+alpha*k_a)
    else: proba=1 #because of the log

    return proba



In [221]:
#proba=posterior_click_probability('Time','14th_century','Rainbow',path_finished['path'])
proba=posterior_click_probability('Dutch_language','Darth_Vader','Roman_Catholic_Church',path_finished['path'])
proba
#seems to work as well


0.16666666666666666

How to determine alpha parameter???

In [211]:
def path_distance(a_i, goal, path):
    sum=0
    i=-1
    if a_i!=goal:
        for a in path:
            if a==a_i:
                i=path.index(a)
        if i==-1:
            print('Error: The article looked for is not in the path')
            return 0

        pathcycle=cycle(path[i:])
        next_a=next(pathcycle)
        for _ in range(len(path)-1):
            a_i, next_a=next_a, next(pathcycle)
            if a_i!='<':
                p=posterior_click_probability(next_a, a_i, goal, path_finished['path'])
                sum-=np.log(p)
    
    return sum/(-np.log(GooglePageRank.get(goal)))
                
    

In [212]:
path_distance('15th_century','African_slave_trade',path_finished['path'][1])

Error: The article looked for is not in the path


0

In [213]:
def semantic_distance(article, goal,paths):
    dist=0
    m=0
    pat=[]

    for path in paths:
        if path[-1]==goal:
            for a in path:
                if a==article:
                    dist+=path_distance(article, goal, path)
                    m+=1
                    pat.append(path)

    if m==0: 
        print('Error, no such path exists')
        return 0,0,0

    return dist/m, m, pat
    


In [238]:
sdist1, m1, pat1=semantic_distance('Noam_Chomsky','Linguistics',path_finished['path'])
#sdist2, m2, pat2=semantic_distance('Noam_Chomsky','Communication',path_finished['path'])
#sdist3, m3, pat3=semantic_distance('Noam_Chomsky','Language',path_finished['path'])
#sdist4, m4, pat4=semantic_distance('Noam_Chomsky','Rainbow',path_finished['path'])

print(sdist1, m1)

Error, no such path exists
Error, no such path exists
Error, no such path exists
0.0 1


In [215]:
Wikigraph.has_node('Minneapolis')
print(pat1,pat4)

0 0


In [216]:
Wikigraph.has_node('Language')

True

In [217]:
print(list(Wikigraph.successors('Noam_Chomsky')))

['Belarus', 'Bertrand_Russell', 'Cape_Town', 'Chinese_language', 'Computer_science', 'Dutch_language', 'English_language', 'Fascism', 'French_language', 'German_language', 'Hebrew_language', 'Human_rights', 'Jerry_Fodor', 'John_Locke', 'Language', 'Linguistics', 'Mass_media', 'Nazism', 'New_Delhi', 'Philosophy', 'Philosophy_of_mind', 'Propaganda', 'Psychology', 'Socialism', 'Spanish_language', 'Ukraine', 'United_States', 'University_of_Cambridge', 'Vietnam_War']


In [242]:
noam_data = path_finished['path'][path_finished['path'].isin(['Noam_Chomsky']).any()]

KeyError: False