In [1]:
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Location of the finished-games table (tab-separated values).
file = "../wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/paths_finished.tsv"
"""
# Successful (i.e., finished) Wikispeedia paths.
# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
# Articles in a path are separated by ";".
# Back clicks are represented as "<".
# Ratings are optionally given by the user after finishing the game and range from 1 ("easy") to 5 ("brutal").
# Missing ratings are represented as "NULL".
# FORMAT:   hashedIpAddress   timestamp   durationInSec   path   rating
#
# When publishing on this data set, please cite:
# (1) Robert West and Jure Leskovec:
#     Human Wayfinding in Information Networks.
#     21st International World Wide Web Conference (WWW), 2012.
# (2) Robert West, Joelle Pineau, and Doina Precup:
#     Wikispeedia: An Online Game for Inferring Semantic Distances between Concepts.
#     21st International Joint Conference on Artificial Intelligence (IJCAI), 2009.
"""
# header=None: the first line of the file is read as data, so the column names are
# attached afterwards, in the order given by the FORMAT line quoted above.
finished_paths = pd.read_table(file, header=None).set_axis(
    ["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
    axis="columns",
)
finished_paths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0
...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,Yagan;Ancient_Egypt;Civilization,
51314,2ef7ac844cefda58,1300254138,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0
51315,12863abb7887f890,1385095372,228,Yagan;Australia;England;France;United_States;T...,
51316,19f8284371753362,1298792567,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0


In [3]:
# Missing ratings are replaced by the sentinel -1 first, so the dropna() that follows
# only discards rows that are missing some other field.
finished_paths = finished_paths.assign(
    rating=finished_paths["rating"].fillna(-1)
).dropna()
finished_paths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,-1.0
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,-1.0
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,-1.0
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0
...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,Yagan;Ancient_Egypt;Civilization,-1.0
51314,2ef7ac844cefda58,1300254138,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0
51315,12863abb7887f890,1385095372,228,Yagan;Australia;England;France;United_States;T...,-1.0
51316,19f8284371753362,1298792567,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0


In [4]:
# Preview the "timestamp" column (seconds since the Unix epoch) as datetimes.
# The conversion is vectorized: a single pd.to_datetime call over the whole column
# instead of one Python-level call per row.
# NOTE(review): the converted Series is only displayed; it is not written back to
# finished_paths, so the column itself keeps its integer values — confirm this is intended.
pd.to_datetime(finished_paths["timestamp"], unit="s")

0       2011-02-15 03:26:49
1       2012-08-12 06:36:52
2       2012-10-03 21:10:40
3       2010-02-08 07:25:25
4       2013-04-23 15:27:08
                ...        
51313   2012-10-03 02:23:35
51314   2011-03-16 05:42:18
51315   2013-11-22 04:42:52
51316   2011-02-27 07:42:47
51317   2011-10-09 01:11:41
Name: timestamp, Length: 51315, dtype: datetime64[ns]

In [5]:
# nb_steps = number of ";"-separated entries in the raw path string, plus one.
# NOTE(review): a path with 3 entries therefore gets nb_steps == 4 — confirm the "+ 1"
# is the intended definition of a step count.
finished_paths = finished_paths.assign(
    nb_steps=lambda paths: paths["path"].str.split(";").str.len() + 1
)

In [6]:
# Store the rating column as integers (it is shown as floats such as 3.0 / -1.0 before this cast).
finished_paths = finished_paths.astype({"rating": int})

In [7]:
# Replace each raw path string with the list of its ";"-separated entries.
finished_paths = finished_paths.assign(
    path=finished_paths["path"].map(lambda raw_path: raw_path.split(";"))
)
finished_paths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,nb_steps
0,6a3701d319fc3754,1297740409,166,"[14th_century, 15th_century, 16th_century, Pac...",-1,10
1,3824310e536af032,1344753412,88,"[14th_century, Europe, Africa, Atlantic_slave_...",3,6
2,415612e93584d30e,1349298640,138,"[14th_century, Niger, Nigeria, British_Empire,...",-1,9
3,64dd5cd342e3780c,1265613925,37,"[14th_century, Renaissance, Ancient_Greece, Gr...",-1,5
4,015245d773376aab,1366730828,175,"[14th_century, Italy, Roman_Catholic_Church, H...",3,8
...,...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,"[Yagan, Ancient_Egypt, Civilization]",-1,4
51314,2ef7ac844cefda58,1300254138,165,"[Yagan, Folklore, Brothers_Grimm, <, 19th_cent...",3,10
51315,12863abb7887f890,1385095372,228,"[Yagan, Australia, England, France, United_Sta...",-1,8
51316,19f8284371753362,1298792567,56,"[Yarralumla%2C_Australian_Capital_Territory, A...",1,5


In [None]:
# Read the unfinished-games table (tab-separated; header=None means the first line
# of the file is read as data).
unfinished_paths = pd.read_table(
    "../wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/paths_unfinished.tsv",
    header=None,
)

"""
# Unsuccessful (i.e., unfinished) Wikispeedia paths.
# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
# Articles in a path are separated by ";".
# Back clicks are represented as "<".
# There are two types of quitting:
# (1) "timeout" means that no click was made for 30 minutes;
# (2) "restart" means that the user started a new game without finishing the current one.
# FORMAT:   hashedIpAddress   timestamp   durationInSec   path   target   type
#
# When publishing on this data set, please cite:
# (1) Robert West and Jure Leskovec:
#     Human Wayfinding in Information Networks.
#     21st International World Wide Web Conference (WWW), 2012.
# (2) Robert West, Joelle Pineau, and Doina Precup:
#     Wikispeedia: An Online Game for Inferring Semantic Distances between Concepts.
#     21st International Joint Conference on Artificial Intelligence (IJCAI), 2009."""

# Attach the column names in the order given by the FORMAT line quoted above.
unfinished_paths = unfinished_paths.set_axis(
    ["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
    axis="columns",
)
unfinished_paths

In [None]:
# nb_steps = number of ";"-separated entries in the raw path string, plus one
# (computed before the path column is turned into lists below).
unfinished_paths["nb_steps"] = unfinished_paths["path"].apply(lambda x: len(x.split(";")) + 1)

# play_duration starts as the recorded duration. For games that ended with a "timeout",
# 1800 seconds are subtracted (floored at 0), as the original comment in this cell intends.
# The adjusted values are assigned back into the column: previously the adjustment was
# computed and then discarded, so play_duration was identical to durationInSec.
# Deriving the values from durationInSec keeps the cell safe to re-run.
is_timeout = unfinished_paths["type"] == "timeout"
unfinished_paths["play_duration"] = unfinished_paths["durationInSec"]
unfinished_paths.loc[is_timeout, "play_duration"] = (
    unfinished_paths.loc[is_timeout, "durationInSec"] - 1800
).clip(lower=0)

# Replace each raw path string with the list of its ";"-separated entries.
unfinished_paths["path"] = unfinished_paths["path"].apply(lambda x: x.split(";"))

# Histograms (log-scaled counts) of the two duration distributions.
# matplotlib.pyplot is already imported as plt in the notebook's first cell.
unfinished_paths.plot.hist(y="play_duration", bins=100, title="Distribution of play duration of unfinished paths", xlabel="Duration in seconds", ylabel="Number of paths", log=True)
plt.show()
finished_paths.plot.hist(y="durationInSec", bins=100, title="Distribution of duration of finished paths", xlabel="Duration in seconds", ylabel="Number of paths", log=True)
plt.show()


In [None]:
# Display the unfinished-paths DataFrame to inspect its current columns and values.
unfinished_paths

In [None]:


"""
# Hierarchical categories of all articles.
# Many articles have more than one category. Some articles have no category.
# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
# FORMAT:   article   category
#
# When publishing on this data set, please cite:
# (1) Robert West and Jure Leskovec:
#     Human Wayfinding in Information Networks.
#     21st International World Wide Web Conference (WWW), 2012.
# (2) Robert West, Joelle Pineau, and Doina Precup:
#     Wikispeedia: An Online Game for Inferring Semantic Distances between Concepts.
#     21st International Joint Conference on Artificial Intelligence (IJCAI), 2009."""

# Read the article/category table (tab-separated; header=None means the first line
# of the file is read as data) and name its two columns.
categories = pd.read_table(
    "../wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/categories.tsv",
    header=None,
).set_axis(["article", "category"], axis="columns")
# Split each "."-separated category string into its parts and drop the first part.
categories = categories.assign(
    category=categories["category"].map(lambda dotted: dotted.split(".")[1:])
)
categories

In [None]:
def compare_categories(category1, category2):
    """Count the distinct elements that appear in both category sequences.

    Args:
        category1: iterable of hashable items (e.g. the parts of one category).
        category2: iterable of hashable items to compare against.

    Returns:
        int: size of the set intersection; duplicates within either input are
        counted once.
    """
    shared = set(category1) & set(category2)
    return len(shared)

In [None]:
# source = first entry of each path list.
unfinished_paths = unfinished_paths.assign(
    source=unfinished_paths["path"].map(lambda visited: visited[0])
)
# For finished paths the target column is also derived from the path: its last entry.
finished_paths = finished_paths.assign(
    source=finished_paths["path"].map(lambda visited: visited[0]),
    target=finished_paths["path"].map(lambda visited: visited[-1]),
)

In [None]:
# Attach the category of the source article and of the target article to every path.
# Each merge brings in an "article" key column; after the second merge the two copies are
# suffixed "article_x"/"article_y" and are dropped at the end of the chain.
# NOTE(review): merge() defaults to an inner join, so paths whose source or target has no
# row in `categories` are dropped, and an article listed several times in `categories`
# produces several rows per path — confirm this is intended.
unfinished_paths = (
    unfinished_paths
    .merge(categories, left_on="source", right_on="article")
    .rename(columns={"category": "source_category"})
    .merge(categories, left_on="target", right_on="article")
    .rename(columns={"category": "target_category"})
    .drop(columns=["article_x", "article_y"])
)
finished_paths = (
    finished_paths
    .merge(categories, left_on="source", right_on="article")
    .rename(columns={"category": "source_category"})
    .merge(categories, left_on="target", right_on="article")
    .rename(columns={"category": "target_category"})
    .drop(columns=["article_x", "article_y"])
)

In [None]:
# Display the unfinished-paths DataFrame to inspect the result of the preceding cells.
unfinished_paths

In [None]:
# Display the finished-paths DataFrame to inspect the result of the preceding cells.
finished_paths

In [None]:
# Write the three cleaned tables to CSV files in the working directory, without the index.
for _csv_name, _frame in (
    ("clean_categories.csv", categories),
    ("clean_finished_paths.csv", finished_paths),
    ("clean_unfinished_paths.csv", unfinished_paths),
):
    _frame.to_csv(_csv_name, index=False)