In [1]:
import numpy as np

In [2]:
import pandas as pd
import os


data_path='/Users/ginevralarroux/Desktop/EPFL courses/Applied data analysis/ADA project/data/'
parent_folder_path = data_path + 'wikispeedia_paths-and-graph/'

paths_finished_df=(pd.read_csv(os.path.join(parent_folder_path, 'paths_finished.tsv'), 
                               sep='\t', skiprows=15, header=None)
                   .rename(columns={0:"ip",
                                    1:"timestamp",
                                    2:"duration",
                                    3:"path",
                                    4:"rating"}))

paths_unfinished_df=(pd.read_csv(os.path.join(parent_folder_path, 'paths_unfinished.tsv'), 
                               sep='\t', skiprows=16, header=None)
                     .rename(columns={0:"ip",
                                      1:"timestamp",
                                      2:"duration",
                                      3:"path",
                                      4:"target",
                                      5:"type"}))

In [3]:
# first read the shortest path matrix
shortest_path_matrix = []

with open(parent_folder_path +'shortest-path-distance-matrix.txt', 'r') as f:
    # the first 17 lines (indexed from 0) is the file description 
    for line in f.readlines()[17:]:
        shortest_path_matrix.append(line)
        
# the matrix now stores all shortest paths from all source articles
len(shortest_path_matrix)

4604

In [4]:
# next, we need a list of all the article names. The order of the articles 
# is the same as the shortest_path_matrix as per the file descriptions

import urllib.parse
def str_url_format(word):
    """
    Article name preprocessing.
    
    Apply this function any time a new dataframe is loaded.
    """
    return (urllib.parse.unquote(word)
            .replace("_", " ")
            .strip()
            .lower())

article_names_cleaned = (pd.read_csv(data_path + 'wikispeedia_paths-and-graph/articles.tsv', 
                                     sep='\t', 
                                     skiprows=11,
                                     header=None)[0]
                         .apply(str_url_format)
                         .values
                         .tolist()
                        )

article_names_cleaned[:3]

['áedán mac gabráin', 'åland', 'édouard manet']

In [5]:
# for each human path, perform the following steps:
#      1. extract the source and target article
#      2. find the *index* in the article names list that corresponds to the source and target article
#      3. the corresponding *index* row in the shortest path matrix corresponds to the source article. 
#         from this list of numbers, use the target article *index* to find the *shortest path length*

def augment_with_shortest_path(df: pd.DataFrame, successful: bool) -> pd.DataFrame:
    """
    this function takes a Series and returns a DataFrame with the following columns:
       1. path
       2. source article
       3. target article
       4. shortest path length
       
    input:
       df: the Pandas DataFrame containing all the human navigation paths
       successful: a boolean indicating whether the paths were successful or not
    """
    # remove all paths with back-tracks
    df = df[~df['path'].str.contains('<')]        
    paths, human_path_lengths, source_articles, target_articles, shortest_paths = [], [], [], [], []
    
    # all information required for successful paths is in the path itself
    if successful:
        for human_path in df['path']:

            split_path = str_url_format(human_path).split(";")
            
            paths.append(";".join(split_path))
            # subtract 1 because we do not count the source article
            human_path_lengths.append(len(split_path)-1)

            source = split_path[0]
            target = split_path[-1]
            source_articles.append(source)
            target_articles.append(target)
    
    # unsuccessful paths require extraction of the target article from a separate column
    else:
        for human_path, target in zip(df['path'], df['target']):
            
            split_path = str_url_format(human_path).split(";")
            
            paths.append(";".join(split_path))
            # subtract 1 because we do not count the source article
            human_path_lengths.append(len(split_path)-1)
            source = split_path[0]
            source_articles.append(source)
            target_articles.append(target)
        
    for source, target in zip(source_articles, target_articles):
        source_index = article_names_cleaned.index(source)
        # there are target articles that were not provided in the plain text files
        try:
            target_index = article_names_cleaned.index(target)
        except Exception:
            shortest_paths.append("N/A")
            continue
            
        # query the shortest path matrix to get the correct vector (corresponding to the source article)
        shortest_path_vector = shortest_path_matrix[source_index]
        # now find the target article indexed integer in the vector
        shortest = shortest_path_vector[target_index]
        # it's not always possible to get to the target article. Impossible navigation is denoted by "_"
        if shortest == "_":
            shortest_paths.append("Impossible")
        else:
            shortest_paths.append(int(shortest))
        
        
    # create the augmented DataFrame
    out = pd.DataFrame({
                      'path': paths,
                      'source_article': source_articles,
                      'target_article': target_articles,
                      'human_path_length': human_path_lengths,
                      'shortest_path_length': shortest_paths
                      })
    
    return out

In [6]:
successful_df = augment_with_shortest_path(df=paths_finished_df, successful=True)
# to investigate human behaviour, we remove all "Impossible paths" and also shortest_path_length = 0
successful_df = successful_df[(successful_df['shortest_path_length'].apply(lambda x: x != 'Impossible' and x != 0))]
# next, we will keep only shortest_path_lengths >= 3
successful_df = successful_df[(successful_df['shortest_path_length'].apply(lambda x: x >= 3))]

successful_df.head(3)

Unnamed: 0,path,source_article,target_article,human_path_length,shortest_path_length
0,14th century;15th century;16th century;pacific...,14th century,african slave trade,8,3
1,14th century;europe;africa;atlantic slave trad...,14th century,african slave trade,4,3
2,14th century;niger;nigeria;british empire;slav...,14th century,african slave trade,7,3


In [7]:
unsuccessful_df = augment_with_shortest_path(df=paths_unfinished_df, successful=False)

# some target articles for unsuccessful paths were not provided in the plain_text folder, denoted in
# the DataFrame as "N/A". Remove these
unsuccessful_df = unsuccessful_df[(unsuccessful_df['shortest_path_length'].apply(lambda x: x != 'N/A' and x != 'Impossible'))]
# some unsuccesful paths only contain 1 article because the user didn't click anything
# these are less meaningful paths to analyze as we are interested in users who tried but failed
# we will keep paths where the human clicked at least 3 articles
unsuccessful_df = unsuccessful_df[(unsuccessful_df['shortest_path_length'].apply(lambda x: x >= 3))]
unsuccessful_df = unsuccessful_df[(unsuccessful_df['human_path_length'].apply(lambda x: x >= 3))]
unsuccessful_df.head(3)

Unnamed: 0,path,source_article,target_article,human_path_length,shortest_path_length
1197,ozone;gas;plasma (physics);phase (matter);ther...,ozone,2-8-0,5,4
10039,jake gyllenhaal;mozambique;1st century;4th cen...,jake gyllenhaal,4-2-0,3,5
15522,rutherford b. hayes;american civil war;mississ...,rutherford b. hayes,2-6-0,5,5


In [8]:
#GINEVRA'S CODE FROM HERE 

In [9]:
most_freq_positioning_df=pd.read_csv('./most_freq_positioning_df.csv', index_col=0)
most_freq_positioning_df.head()

Unnamed: 0,link,most_freq_positioning
0,,bottom
1,"""6 villages for 2006",center-bottom
2,"""6 villages for 2006""",center-top
3,"""capitalist rule""",top
4,"""consumption"" (tuberculosis)",top


In [10]:
def find_features(human_path):
    '''this function determines the categorical positioning feature of each hyperlink in the human path'''
    return [most_freq_positioning_df.loc[link].values.item() 
                         if link in most_freq_positioning_df.index \
                         else np.random.choice(['top', 'center-top','center','center-bottom', 'bottom'])
                         for link in human_path] #certain hyperlinks were not classified 

successful_features=successful_df['path'].apply(lambda x: x.split(";")).apply(lambda x: find_features(x))
successful_features=unsuccessful_df['path'].apply(lambda x: x.split(";")).apply(lambda x: find_features(x))

In [11]:
def find_features_frequency(features):
    '''this function counts the frequency of the categorical positioning features of the hyperlinks in each human path'''
    t, ct, c, cb, b=[], [], [], [], []

    for path in features:
        t.append(path.count('top'))
        ct.append(path.count('center-top'))
        c.append(path.count('center'))
        cb.append(path.count('center-bottom'))
        b.append(path.count('bottom'))
    return pd.DataFrame({'top': t, 'center_top': ct, 'center': c, 'center_bottom': cb, 'bottom': b})

successful_features_freq=find_features_frequency(successful_features)
unsuccessful_features_freq=find_features_frequency(unsuccessful_features)

NameError: name 'unsuccessful_features' is not defined

In [12]:

#standardize features 

def standardize_features(df):
    df_std=pd.DataFrame()
    for c in df.columns:
        df_std[c]=(df[c]-df[c].mean())/df[c].std()
    return df_std

successful_features_std=standardize_features(successful_features_freq)
unsuccessful_features_std=standardize_features(unsuccessful_features_freq)

NameError: name 'unsuccessful_features_freq' is not defined

In [13]:
links_in_images_unique=pd.read_csv('./links_in_images_unique.csv', index_col=0)
links_in_images_unique.head()

Unnamed: 0,link
0,bangladesh
1,portsmouth
2,england
3,india
4,pompeii


In [None]:
successful_img_features=successful_df['path'].apply(lambda x: x.split(";")).apply(lambda x: np.isin(x, links_in_images_unique).sum() \
                                                          if np.isin(x, links_in_images_unique).sum() else 0)

In [None]:
len(successful_img_features)