In [2]:
import matplotlib.pyplot as plt
import pandas as pd

## Finished Paths

In [2]:
# Load the finished_paths from tsv file
file = "./wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/paths_finished.tsv"
"""
# Successful (i.e., finished) Wikispeedia paths.
# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
# Articles in a path are separated by ";".
# Back clicks are represented as "<".
# Ratings are optionally given by the user after finishing the game and range from 1 ("easy") to 5 ("brutal").
# Missing ratings are represented as "NULL".
# FORMAT:   hashedIpAddress   timestamp   durationInSec   path   rating
#
# When publishing on this data set, please cite:
# (1) Robert West and Jure Leskovec:
#     Human Wayfinding in Information Networks.
#     21st International World Wide Web Conference (WWW), 2012.
# (2) Robert West, Joelle Pineau, and Doina Precup:
#     Wikispeedia: An Online Game for Inferring Semantic Distances between Concepts.
#     21st International Joint Conference on Artificial Intelligence (IJCAI), 2009.
"""
finished_paths = pd.read_csv(file, sep='\t', header=None)
finished_paths.columns = ["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"]
finished_paths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0
...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,Yagan;Ancient_Egypt;Civilization,
51314,2ef7ac844cefda58,1300254138,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0
51315,12863abb7887f890,1385095372,228,Yagan;Australia;England;France;United_States;T...,
51316,19f8284371753362,1298792567,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0


In [3]:
# Load the finished_paths from tsv file
file = "wikispeedia_paths-and-graph/paths_finished.tsv"
"""
# Successful (i.e., finished) Wikispeedia paths.
# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
# Articles in a path are separated by ";".
# Back clicks are represented as "<".
# Ratings are optionally given by the user after finishing the game and range from 1 ("easy") to 5 ("brutal").
# Missing ratings are represented as "NULL".
# FORMAT:   hashedIpAddress   timestamp   durationInSec   path   rating
#
# When publishing on this data set, please cite:
# (1) Robert West and Jure Leskovec:
#     Human Wayfinding in Information Networks.
#     21st International World Wide Web Conference (WWW), 2012.
# (2) Robert West, Joelle Pineau, and Doina Precup:
#     Wikispeedia: An Online Game for Inferring Semantic Distances between Concepts.
#     21st International Joint Conference on Artificial Intelligence (IJCAI), 2009.
"""
finished_paths = pd.read_csv(file, sep='\t', header=None)
finished_paths.columns = ["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"]
finished_paths

FileNotFoundError: [Errno 2] No such file or directory: 'wikispeedia_paths-and-graph/paths_finished.tsv'

In [None]:
# Transform timestamp to datetime
finished_paths['timestamp'] =  finished_paths["timestamp"].apply(lambda x: pd.to_datetime(x, unit='s'))

# Split the path into a list
finished_paths["path"] = finished_paths["path"].apply(lambda x: x.split(";"))

# Count the number of clicks in each path (including back clicks)
finished_paths = finished_paths.assign(nb_clicks=finished_paths["path"].apply(lambda x: len(x)-1)) # -1 because we don't count the source article

# Count the number of back clicks in each path
finished_paths = finished_paths.assign(nb_back_clicks=finished_paths["path"].apply(lambda x: x.count("<")))

# Count the path length (excluding back clicks)
finished_paths = finished_paths.assign(path_length=finished_paths["nb_clicks"] - 2 * finished_paths["nb_back_clicks"])

# Convert the rating to int
finished_paths["rating"] = finished_paths["rating"].astype(int)

finished_paths

In [None]:
finished_paths.describe()

In [None]:
duration_mask = finished_paths["durationInSec"] == 0
finished_paths[duration_mask]

# Gives same result as above
length_mask = finished_paths["path_length"] == 0
finished_paths[length_mask]

In [None]:
# Remove the paths with a duration of 0 (target=source)
finished_paths = finished_paths[~duration_mask]
finished_paths

In [None]:
finished_paths[finished_paths["path_length"] == 419]

In [None]:
# Compare duration of paths with path length
plt.scatter(finished_paths["path_length"], finished_paths["durationInSec"])
plt.xlabel("Path length")
plt.ylabel("Duration (s)")
plt.title("Duration of paths compared to path length")
plt.show()

In [None]:
mask = finished_paths["path_length"] < 100
plt.scatter(finished_paths[mask]["path_length"], finished_paths[mask]["durationInSec"])
plt.xlabel("Path length")
plt.ylabel("Duration (s)")
plt.title("Duration of paths compared to path length (path_length < 100)")
plt.show()

## Unfinished Paths

In [None]:
unfinished_paths = pd.read_csv("wikispeedia_paths-and-graph/paths_unfinished.tsv", sep='\t', header=None)

"""
# Unsuccessful (i.e., unfinished) Wikispeedia paths.
# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
# Articles in a path are separated by ";".
# Back clicks are represented as "<".
# There are two types of quitting:
# (1) "timeout" means that no click was made for 30 minutes;
# (2) "restart" means that the user started a new game without finishing the current one.
# FORMAT:   hashedIpAddress   timestamp   durationInSec   path   target   type
#
# When publishing on this data set, please cite:
# (1) Robert West and Jure Leskovec:
#     Human Wayfinding in Information Networks.
#     21st International World Wide Web Conference (WWW), 2012.
# (2) Robert West, Joelle Pineau, and Doina Precup:
#     Wikispeedia: An Online Game for Inferring Semantic Distances between Concepts.
#     21st International Joint Conference on Artificial Intelligence (IJCAI), 2009."""

unfinished_paths.columns = ["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"]
unfinished_paths

In [None]:
# Transform timestamp to datetime
unfinished_paths['timestamp'] =  unfinished_paths["timestamp"].apply(lambda x: pd.to_datetime(x, unit='s'))

# Split the path into a list
unfinished_paths["path"] = unfinished_paths["path"].apply(lambda x: x.split(";"))

# Count the number of clicks in each path (including back clicks)
unfinished_paths["nb_clicks"] = unfinished_paths["path"].apply(lambda x: len(x) - 1) # -1 because we don't count the source article

# Count the number of back clicks in each path
unfinished_paths["nb_back_clicks"] = unfinished_paths["path"].apply(lambda x: x.count("<"))

# Count the path length (excluding back clicks)
unfinished_paths["path_length"] = unfinished_paths["nb_clicks"] - unfinished_paths["nb_back_clicks"]

# Create a column containing the play duration
unfinished_paths["play_duration"] = unfinished_paths["durationInSec"]

# Remove 1800 seconds from the play duration if the type is timeout and assign the result to the play duration
unfinished_paths.loc[unfinished_paths["type"] == "timeout", "play_duration"] = unfinished_paths.loc[unfinished_paths["type"] == "timeout", "play_duration"].apply(lambda x: max(x - 1800, 0)) # ensures that the play duration is not negative

In [None]:
unfinished_paths.plot.hist(y="play_duration", bins=100, title="Distribution of play duration of unfinished paths", xlabel="Duration in seconds", ylabel="Number of paths", log=True)
plt.show()
finished_paths.plot.hist(y="durationInSec", bins=100, title="Distribution of duration of finished paths", xlabel="Duration in seconds", ylabel="Number of paths", log=True)
plt.show()

## Categories

In [None]:
"""
# Hierarchical categories of all articles.
# Many articles have more than one category. Some articles have no category.
# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
# FORMAT:   article   category
#
# When publishing on this data set, please cite:
# (1) Robert West and Jure Leskovec:
#     Human Wayfinding in Information Networks.
#     21st International World Wide Web Conference (WWW), 2012.
# (2) Robert West, Joelle Pineau, and Doina Precup:
#     Wikispeedia: An Online Game for Inferring Semantic Distances between Concepts.
#     21st International Joint Conference on Artificial Intelligence (IJCAI), 2009."""

categories = pd.read_csv("wikispeedia_paths-and-graph/wikispeedia_paths-and-graph/categories.tsv", sep='\t', header=None)
categories.columns = ["article", "category"]
categories["category"] = categories["category"].apply(lambda x: x.split(".")[1:])
categories

In [None]:
def compare_categories(category1, category2):
    return len(set(category1).intersection(set(category2)))

In [None]:
# Get source and target articles for each path
unfinished_paths["source"] = unfinished_paths["path"].apply(lambda x: x[0])
finished_paths["source"] = finished_paths["path"].apply(lambda x: x[0])
finished_paths["target"] = finished_paths["path"].apply(lambda x: x[-1])

In [None]:
# Get the categories of the source and target articles for each path
unfinished_paths = unfinished_paths.merge(categories, left_on="source", right_on="article").rename(columns={"category": "source_category"})
unfinished_paths = unfinished_paths.merge(categories, left_on="target", right_on="article").rename(columns={"category": "target_category"})
finished_paths = finished_paths.merge(categories, left_on="source", right_on="article").rename(columns={"category": "source_category"})
finished_paths = finished_paths.merge(categories, left_on="target", right_on="article").rename(columns={"category": "target_category"})
unfinished_paths.drop(columns=["article_x", "article_y"], inplace=True)
finished_paths.drop(columns=["article_x", "article_y"], inplace=True)

In [None]:
unfinished_paths

In [None]:
finished_paths

In [None]:
unfinished_paths["source_general_category"] = unfinished_paths["source_category"].apply(lambda x: x[0])
unfinished_paths["target_general_category"] = unfinished_paths["target_category"].apply(lambda x: x[0])
finished_paths["source_general_category"] = finished_paths["source_category"].apply(lambda x: x[0])
finished_paths["target_general_category"] = finished_paths["target_category"].apply(lambda x: x[0])

In [None]:
categories.to_csv("clean_categories.csv", index=False)
finished_paths.to_csv("clean_finished_paths.csv", index=False)
unfinished_paths.to_csv("clean_unfinished_paths.csv", index=False)

In [None]:
finished_grouped_by_categories = finished_paths[["source_general_category", "target_general_category", "nb_clicks"]].groupby(["source_general_category", "target_general_category"]).agg({"nb_clicks": "mean"}).reset_index()
finished_grouped_by_categories

In [None]:
unfinished_grouped_by_categories = unfinished_paths[["source_general_category", "target_general_category", "nb_clicks"]].groupby(["source_general_category", "target_general_category"]).agg({"nb_clicks": "median"}).reset_index()
unfinished_grouped_by_categories

In [None]:
# Plot the average number of clicks from a source category to a target category
import seaborn as sns
finished_matrix = finished_grouped_by_categories.pivot(index="source_general_category", columns="target_general_category", values="nb_clicks")
finished_matrix = finished_matrix.reindex(finished_matrix.sum().sort_values(ascending=False).index, axis=1)
sns.heatmap(finished_matrix, cmap="YlGnBu")
plt.show()


In [None]:

unfinished_matrix = unfinished_grouped_by_categories.pivot(index="source_general_category", columns="target_general_category", values="nb_clicks")
#order by the categories with the most paths
unfinished_matrix = unfinished_matrix.reindex(unfinished_matrix.sum().sort_values(ascending=False).index, axis=1)
sns.heatmap(unfinished_matrix, cmap="YlGnBu")
plt.show()

In [None]:
target_categories_distribution = finished_paths["target_general_category"].value_counts()
target_categories_distribution.plot.bar()

In [None]:
unfinished_target_categories_distribution = unfinished_paths["target_general_category"].value_counts()
unfinished_target_categories_distribution.plot.bar()