In [1]:
#Import libraries
import re
import urllib
from urllib.parse import unquote

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

</div>
<div class="1. Header" style='background-color:#08135c; border-left: solid #darkblue 4px; border-radius: 4px; padding:0.7em;'>
    <span style="color:white;">
       <h2>1. Data</h2>
        <h4>In this first section we import and rearrange the data.</h4>
    </span>
</div>
</div>

</div>
<div class="1.1 Header" style='background-color:#08135c; border-left: solid #darkblue 4px; border-radius: 4px; padding:0.7em;'>
    <span style="color:white">
       <h3>1.1 Data acquisition</h3>
    </span>
</div>
</div>
<div class="1.1 explanation" style='background-color:#faefe1; border-left: solid #darkblue 4px; border-radius: 4px; padding:0.7em;'>
    <span style="color:black">
        Here we load the relevant data using the <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">pd.read_csv()</span> function. The data we are dealing with follows a relational model. The specific navigation paths can be represented as graphs in which the nodes are the Wikipedia pages and the directed edges are the hyperlinks connecting them.
    </span>
</div>

In [2]:
#Import relevant DataFrames
# Every file is read as tab-separated; the first `skiprows` lines of each file are skipped
# (presumably a descriptive header -- confirm against the data files when they change).


def _unquote_columns(frame, columns):
    """Decode URL-encoded (percent-escaped) strings in `columns` of `frame`, in place.

    Parameters
    ----------
    frame : pd.DataFrame
        DataFrame whose columns hold URL-encoded strings.
    columns : list of str
        Names of the columns to decode.
    """
    for column in columns:
        frame[column] = frame[column].apply(unquote) #Parsing URL encoding


#Articles data (explicit tab separator: with the default comma separator a name
#containing a literal comma would be split into several fields)
articles = pd.read_csv("data/articles.tsv", sep="\t", skiprows=11, names=["article"])
_unquote_columns(articles, ["article"])

#Category data
categories = pd.read_csv("data/categories.tsv", sep="\t", skiprows=12, names=["article", "category"])
_unquote_columns(categories, ["article"])

#Links data
links = pd.read_csv("data/links.tsv", sep="\t", skiprows=11, names=["article", "link"])
_unquote_columns(links, ["article", "link"])

#Finished paths
paths_finished = pd.read_csv(
    "data/paths_finished.tsv",
    sep="\t",
    skiprows=15,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
)
_unquote_columns(paths_finished, ["path"])

#Unfinished paths
paths_unfinished = pd.read_csv(
    "data/paths_unfinished.tsv",
    sep="\t",
    skiprows=16,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
)
_unquote_columns(paths_unfinished, ["path", "target"])

# Add the length of the paths (number of ';'-separated entries in the path string)
paths_unfinished['pathLength'] = paths_unfinished['path'].apply(lambda path: len(str(path).split(';')))
paths_finished['pathLength'] = paths_finished['path'].apply(lambda path: len(str(path).split(';')))

#shortest path matrix
with open('data/shortest-path-distance-matrix.txt', 'r') as file:

    # Keep only the lines whose first character is a digit or an underscore.
    # Each character of such a line is one matrix entry: '_' becomes NaN,
    # any other character is converted to an integer.
    shortest_path_distance = [
        [np.nan if entry == '_' else int(entry) for entry in row.strip()]
        for row in file
        if re.search("([0-9]|_)", row[0])
    ]

#Convert the list of rows to a numpy ndarray
shortest_path_distance = np.array(shortest_path_distance)

</div>
<div class="1.2 Header" style='background-color:#08135c; border-left: solid #darkblue 4px; border-radius: 4px; padding:0.7em;'>
    <span style="color:white">
       <h3>1.2 Data concatenation</h3>
    </span>
</div>
</div>
<div class="1.1 explanation" style='background-color:#faefe1; border-left: solid #darkblue 4px; border-radius: 4px; padding:0.7em;'>
    <span style="color:black">
        We want to merge the <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">paths_finished</span> and <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">paths_unfinished</span> DataFrames. This makes the analysis more convenient when comparing both types. More specifically, we add the <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">target</span> column to, and drop the <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">rating</span> column from, <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">paths_finished</span>. We can then concatenate the two DataFrames.
    </span>
</div>

In [3]:
#Merge all the paths
# The target of a finished path is taken to be the last ';'-separated entry of the path
paths_finished['target'] = paths_finished['path'].map(lambda visited: str(visited).rsplit(';', 1)[-1])

#Add and remove (ir)relevant columns: work on copies, tag the finished paths
#with their type and drop the rating so both frames share the same columns.
paths_finished_ = paths_finished.assign(type='finished').drop(columns='rating')
paths_unfinished_ = paths_unfinished.copy()

#Concatenate data
paths_all = pd.concat([paths_finished_, paths_unfinished_])

#Show the head of the resulting DataFrame
paths_all.head(4)

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,pathLength,target,type
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,9,African_slave_trade,finished
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,5,African_slave_trade,finished
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,8,African_slave_trade,finished
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,4,Greece,finished


</div>
<div class="4. Header" style='background-color:#08135c; border-left: solid #darkblue 4px; border-radius: 4px; padding:0.7em;'>
    <span style="color:white;">
       <h2>1. Between game analysis</h2>
        <h4>Here we will focus on the level of an individual player and the history of the different Wikispeedia games (i.e. between game) that have been played by that person consecutively.</h4>
    </span>
</div>
</div>

</div>
<div class="1.1 explanation" style='background-color:#08135c; border-left: solid #darkblue 4px; border-radius: 4px; padding:0.7em;'>
    <span style="color:white">
       <h3>1.1 Game distribution per player</h3>
    </span>
</div>
</div>
<div class="1.1 explanation" style='background-color:#faefe1; border-left: solid #darkblue 4px; border-radius: 4px; padding:0.7em;'>
    <span style="color:black">
        We can globally categorize the <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">between_game</span> game history (chronologically) in 3 groups:
        <ul>
            <li> <code class='python'>finished</code>: The player's last game was a <code class='python'>finished</code> path (e.g. <code class='python'>[timeout, restart, finished]</code>)</li>
            <li> <code class='python'>timeout</code>: The player's last game was a <code class='python'>timeout</code> (e.g. <code class='python'>[timeout, restart, timeout]</code>)</li>
            <li> <code class='python'>restart</code>: The player's last game was a <code class='python'>restart</code> (e.g. <code class='python'>[timeout, restart, restart]</code>)</li>
        </ul>
        Per category, we want to find and display the distribution of the number of successfully <code class='python'>finished</code> games. First, we need to extract the <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">between_game</span> history per player. From that, we can extract the <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">last_game</span> and <span style="font-family: monospace; background-color:#fadfb9; padding-left:4px; padding-right:4px; border-radius: 5px;">nb_finished_games</span> columns.
    </span>
</div>


In [13]:
#Create DataFrame with the chronological (between_game) history of game types per player:
#sort all games by timestamp, then collect each player's game types into a list.
players_record = (
    paths_all
    .sort_values('timestamp')
    .groupby('hashedIpAddress')['type']
    .apply(list)
    .rename('between_game')
    .to_frame()
)

#Display snippet of result
players_record.head(4)

Unnamed: 0_level_0,between_game
hashedIpAddress,Unnamed: 1_level_1
000386124f307de8,"[finished, timeout]"
0007183207b6008f,"[finished, finished, finished]"
000a09e202e88d10,[finished]
000a5bac5b7781ea,[finished]


In [28]:
#Now extract last_game column (the final entry of each player's chronological history)
players_record['last_game'] = [games[-1] for games in players_record['between_game']]

#Now extract the per-player counters: number of finished, restart and timeout games,
#and the total number of games in the history
players_record['nb_finished_games'] = [games.count('finished') for games in players_record['between_game']]
players_record['nb_restart_games'] = [games.count('restart') for games in players_record['between_game']]
players_record['nb_timeout_games'] = [games.count('timeout') for games in players_record['between_game']]
players_record['nb_total_games'] = players_record['between_game'].map(len)

#Display snippet of results
players_record.head(10)

Unnamed: 0_level_0,between_game,last_game,nb_games_finished,nb_finished_games,nb_restart_games,nb_timeout_games,nb_total_games
hashedIpAddress,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
000386124f307de8,"[finished, timeout]",timeout,1,1,0,1,2
0007183207b6008f,"[finished, finished, finished]",finished,3,3,0,0,3
000a09e202e88d10,[finished],finished,1,1,0,0,1
000a5bac5b7781ea,[finished],finished,1,1,0,0,1
000a6e585b776c96,[timeout],timeout,0,0,0,1,1
000bc9fe5640ed31,"[finished, finished]",finished,2,2,0,0,2
000d397013f09039,"[finished, finished]",finished,2,2,0,0,2
000e5d4c2c8e921f,"[finished, restart, restart, timeout]",timeout,1,1,2,1,4
000e954305ddb434,[finished],finished,1,1,0,0,1
0011acb93ed0090b,"[restart, timeout]",timeout,0,0,1,1,2


In [21]:
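#Distribution -- TODO: the code that produced the value shown below is missing from this cell; restore it so the output is reproducible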



0.317391095988882


In [26]:
#Total number of games played by the players who never finished a game.
#The finished-game counter is stored in 'nb_finished_games'; no cell creates an
#'nb_games_finished' column, so filtering on that name fails on a fresh kernel.
players_record.loc[players_record['nb_finished_games'] == 0, 'nb_total_games']

hashedIpAddress
000a6e585b776c96    1
0011acb93ed0090b    2
001556137480170f    1
0015865e516d4932    1
00210d6f590f218c    1
                   ..
7fee3c54038bf1b9    1
7feed00176af2c7a    2
7ff611c003666e0f    1
7ff6291c7269c4f4    1
7fff57a336c1f472    1
Name: nb_total_games, Length: 6623, dtype: int64