In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [16]:
file_finished = "data/wikispeedia_paths-and-graph/paths_finished.tsv"
column_names = ["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"]
finished_paths = pd.read_csv(file_finished, sep="\t", comment="#", names=column_names)

In [30]:
# Split the path into a list
finished_paths["path"] = finished_paths["path"].apply(lambda x: x.split(";"))

# Count the number of clicks in each path (including back clicks)
finished_paths = finished_paths.assign(nb_clicks=finished_paths["path"].apply(lambda x: len(x)-1)) # -1 because we don't count the source article

# Count the number of back clicks in each path
finished_paths = finished_paths.assign(nb_back_clicks=finished_paths["path"].apply(lambda x: x.count("<")))

# Count the path length (excluding back clicks)
finished_paths = finished_paths.assign(path_length=finished_paths["nb_clicks"] - 2 * finished_paths["nb_back_clicks"])

# Convert the rating to int
finished_paths["rating"] = finished_paths["rating"].fillna(-1).astype(int) #0 means no rating

finished_paths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,nb_clicks,nb_back_clicks,path_length
0,6a3701d319fc3754,1297740409,166,"[14th_century, 15th_century, 16th_century, Pac...",-1,8,0,8
1,3824310e536af032,1344753412,88,"[14th_century, Europe, Africa, Atlantic_slave_...",3,4,0,4
2,415612e93584d30e,1349298640,138,"[14th_century, Niger, Nigeria, British_Empire,...",-1,7,0,7
3,64dd5cd342e3780c,1265613925,37,"[14th_century, Renaissance, Ancient_Greece, Gr...",-1,3,0,3
4,015245d773376aab,1366730828,175,"[14th_century, Italy, Roman_Catholic_Church, H...",3,6,0,6
...,...,...,...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,"[Yagan, Ancient_Egypt, Civilization]",-1,2,0,2
51314,2ef7ac844cefda58,1300254138,165,"[Yagan, Folklore, Brothers_Grimm, <, 19th_cent...",3,8,1,6
51315,12863abb7887f890,1385095372,228,"[Yagan, Australia, England, France, United_Sta...",-1,6,0,6
51316,19f8284371753362,1298792567,56,"[Yarralumla%2C_Australian_Capital_Territory, A...",1,3,0,3


# TOP 10 PLAYERS

Look at evolution of the 10 players who have played the most. Do they improve over time ?

In [31]:
#group by player defined as the hashed IP address
grouped_players = finished_paths.groupby("hashedIpAddress")

#count the number of paths each player has taken and sort by number of paths in descending order
player_path_counts = grouped_players.size()
player_path_counts = player_path_counts.sort_values(ascending=False)

In [32]:
top10_players = player_path_counts.head(10)
top10_players

hashedIpAddress
0d57c8c57d75e2f5    4865
473d6ac602c2b198     545
7d5624e35c9523ef     292
5bb5836b425e6bfe     243
6b039e9953cf075e     231
321e4b101c5b58ff     196
0299542414c3f20a     172
6d136e371e42474f     170
66b196465d2b5d38     168
7d29e9ab07e833d4     153
dtype: int64

In [35]:
top10_players_plays = finished_paths[finished_paths["hashedIpAddress"].isin(top10_players.index)]

In [36]:
#group by player and rank the paths by the order they were taken
grouped_top10_paths = top10_players_plays.groupby("hashedIpAddress")

ordered_top10_paths = grouped_top10_paths.apply(lambda x: x.sort_values(by="timestamp"))

  ordered_top10_paths = grouped_top10_paths.apply(lambda x: x.sort_values(by="timestamp"))
