In [None]:
import pandas as pd
import networkx as nx
import numpy as np
import os
import urllib
import datetime as datetime
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy import stats 

# load config and extract variables
import config
DATA_PATH = config.PATH_TO_DATA

### Loading and Preparing the Data
Load the raw files, clean up the paths, and build weighted graph structures from them.

#### Load Data

In [None]:
# Load every table of the dataset except the Wikipedia article texts.
# The first lines of each file are skipped via `skiprows` (presumably a descriptive
# header - verify against the files) and the column names are supplied by hand.
finished_paths = pd.read_csv(
    os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/paths_finished.tsv"),
    sep="\t",
    skiprows=15,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
)
unfinished_paths = pd.read_csv(
    os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/paths_unfinished.tsv"),
    sep="\t",
    skiprows=16,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
)
edges = pd.read_csv(
    os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/links.tsv"),
    sep="\t",
    skiprows=15,
    names=["start", "end"],
    encoding="utf-8",
)
articles = pd.read_csv(
    os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/articles.tsv"),
    sep="\t",
    skiprows=12,
    names=["article"],
    encoding="utf-8",
)
categories = pd.read_csv(
    os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/categories.tsv"),
    sep="\t",
    skiprows=13,
    names=["article", "category"],
    encoding="utf-8",
)
shortest_paths = np.genfromtxt(
    os.path.join(DATA_PATH, "wikispeedia_paths-and-graph/shortest-path-distance-matrix.txt"),
    delimiter=1,  # an integer delimiter means fixed-width fields: one character per value
    dtype=np.uint8,
)

#### Clean up

In [None]:
# Percent-decode the article names at both ends of every link,
# displaying the table before and after for comparison.
display(edges.head())
for endpoint in ("start", "end"):
    edges[endpoint] = edges[endpoint].apply(urllib.parse.unquote)
display(edges.head())

In [None]:
# Add a `datetime` column holding each timestamp as a datetime object.
# `fromtimestamp` without a tz argument converts to the machine's local time.
for paths_table in (finished_paths, unfinished_paths):
    paths_table["datetime"] = paths_table["timestamp"].apply(datetime.datetime.fromtimestamp)
display(unfinished_paths.head())

In [None]:
# Percent-decode the article names, displaying the table before and after.
display(articles.head())
decoded_article_names = articles["article"].apply(urllib.parse.unquote)
articles["article"] = decoded_article_names
display(articles.head())

In [None]:
# Percent-decode the article names of the category table,
# displaying the table before and after.
display(categories.head())
decoded_category_articles = categories["article"].apply(urllib.parse.unquote)
categories["article"] = decoded_category_articles
display(categories.head())

In [None]:
# Left-join the categories onto the article list so that every article is kept,
# including those without any category entry.
articles_categories = articles.merge(categories, on="article", how="left")
display(articles_categories.head())

# Rows with a missing category belong to articles that have no category at all
# (the original author noted 6 such articles).
category_missing = articles_categories["category"].isna()
print("Merge introduced {} NAs in category columns:".format(category_missing.sum()))
articles_categories[category_missing]

In [None]:
# Convert the raw path strings into lists of decoded article names.


def _decode_path(raw_path):
    """Split a ';'-separated path string and percent-decode every step."""
    return [urllib.parse.unquote(step) for step in raw_path.split(";")]


finished_paths_readable = finished_paths.copy()
finished_paths_readable["readable_path"] = finished_paths_readable["path"].apply(_decode_path)

unfinished_paths_readable = unfinished_paths.copy()
unfinished_paths_readable["readable_path"] = unfinished_paths_readable["path"].apply(_decode_path)
# the target article of an unfinished game is percent-encoded as well
unfinished_paths_readable["target"] = unfinished_paths_readable["target"].apply(urllib.parse.unquote)

In [None]:
# functions to get all links between articles
from itertools import tee


def pairwise(iterable):
    """Return an iterator over the overlapping consecutive pairs of `iterable`.

    pairwise('ABCDEFG') --> AB BC CD DE EF FG

    Local copy of the recipe from the Python docs; `itertools.pairwise`
    is only available from Python 3.10 on.
    """
    a, b = tee(iterable)
    next(b, None)  # advance the second copy by one so zip pairs x[i] with x[i + 1]
    return zip(a, b)


def get_all_links(df, path_colname="path"):
    """Count how often each consecutive (source, target) step occurs in the paths of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one ';'-separated path string per row.
    path_colname : str, default "path"
        Name of the column that contains the path strings.

    Returns
    -------
    pandas.DataFrame
        Columns "source", "target" and "weight", one row per distinct ordered
        pair in order of first appearance. The frame is empty (with the same
        columns) when the paths contain no pair at all.

    Notes
    -----
    Path tokens are used verbatim: they are not URL-decoded, and "<" tokens
    are counted like any other node.
    """
    edge_counter = Counter()
    # Iterate over the requested column directly (no per-row Series from iterrows);
    # rows with a missing path are skipped.
    for path in df[path_colname].dropna():
        edge_counter.update(pairwise(path.split(";")))

    # Building the frame from records also works when no edge was found.
    return pd.DataFrame(
        [(source, target, weight) for (source, target), weight in edge_counter.items()],
        columns=["source", "target", "weight"],
    )



In [None]:
# Count every link taken in the finished paths and show the most frequent ones first.
# The "<" entries are the back-click markers of the raw path strings; get_all_links
# does not treat them specially, so they show up as if they were articles.
finished_links = get_all_links(finished_paths)
finished_links.sort_values("weight", ascending=False)


In [None]:
# Count every link taken in the unfinished paths and show the most frequent ones first.
# The "<" entries are the back-click markers of the raw path strings; get_all_links
# does not treat them specially, so they show up as if they were articles.
unfinished_links = get_all_links(unfinished_paths)
unfinished_links.sort_values("weight", ascending=False)


In [None]:
# Build a networkx graph from the finished-path links and plot its degree histogram.
# NOTE(review): without `create_using`, from_pandas_edgelist builds an undirected
# nx.Graph, so A->B and B->A end up as a single edge - confirm this is intended.
finished_graph = nx.from_pandas_edgelist(
    finished_links,
    source="source",
    target="target",
    edge_attr="weight",
)
# hist[d] is the number of nodes that have degree d
hist = nx.degree_histogram(finished_graph)
degree_values = range(len(hist))
plt.bar(degree_values, hist)
pd.Series(hist).describe()

In [None]:
# Build a networkx graph from the unfinished-path links and plot its degree histogram.
# NOTE(review): without `create_using`, from_pandas_edgelist builds an undirected
# nx.Graph, so A->B and B->A end up as a single edge - confirm this is intended.
unfinished_graph = nx.from_pandas_edgelist(
    unfinished_links,
    source="source",
    target="target",
    edge_attr="weight",
)
# hist[d] is the number of nodes that have degree d
hist = nx.degree_histogram(unfinished_graph)
degree_values = range(len(hist))
plt.bar(degree_values, hist)
pd.Series(hist).describe()

In [None]:
# Find out which broad categories are most represented among the articles.
# The broad category is the second '.'-separated component of the full category name.
broad_categories = categories.copy()
broad_categories["broad_category"] = [
    full_name.split(".")[1] for full_name in broad_categories["category"]
]

# one row per (article, category) entry, so an article with several categories counts several times
count_articles = broad_categories.groupby("broad_category").size()

print("Below shows how many articles each of the broad categories are represented by")
display(count_articles)

In [None]:
# Create dictionaries for easy discovery of what categories an article belongs to.
# Each article maps to the list of all its (broad) categories, in table order.
article_to_category = {}
article_to_broad_category = {}
# Walk the three columns in lockstep instead of calling `.iloc[i][...]` repeatedly,
# which builds a full row Series on every single access.
for article, category, broad_category in zip(
    broad_categories["article"],
    broad_categories["category"],
    broad_categories["broad_category"],
):
    article_to_category.setdefault(article, []).append(category)
    article_to_broad_category.setdefault(article, []).append(broad_category)

# Backclick analysis

In [None]:
def backclick_lambda(l):
    """
    Returns a list of all the pages that were backclicked on in a path l.

    l is a sequence of path steps in which "<" marks a back-click. The pages are
    returned in the order in which they were left. If the path contains no "<"
    at all, pd.NA is returned instead of an empty list. A "<" with no page left
    to go back from (e.g. at the very start of the path) is ignored instead of
    raising an IndexError.
    """
    if "<" not in l:
        return pd.NA

    visited = []  # stack of the pages that have not been backed out of yet
    backclicked = []
    for step in l:
        if step != "<":
            visited.append(step)
        elif visited:
            # a back-click leaves the most recently visited page
            backclicked.append(visited.pop())
    return backclicked

#### We define the ***backclick rate*** of a player on a category as the number of times that player back-clicked a page of that category, divided by the total number of times they visited pages of that category. In other words, it is how likely that player is to back-click a page of that category.

In [None]:
# Restrict the finished paths to the period in which unfinished paths were recorded too
# (from 2011-02-07 on, according to the original note): keep only paths dated at or
# after the earliest unfinished path.
# `.min()` is used because `sort_values(...).datetime[0]` is a *label* lookup: it returns
# the row labelled 0, not the first row of the sorted series.
first_unfinished = unfinished_paths_readable["datetime"].min()
filtered_finished = finished_paths_readable[finished_paths_readable["datetime"] >= first_unfinished].copy()

filtered_finished["backclicked_pages"] = filtered_finished["readable_path"].apply(backclick_lambda)
# NOTE: using all available categories per each page, not only the first.
# Path steps without a category entry (including the "<" markers) contribute nothing.
filtered_finished["category_visits"] = filtered_finished["readable_path"].apply(
    lambda pages: [cat for page in pages for cat in article_to_broad_category.get(page, [])]
)
# paths without any back-click carry pd.NA here and stay pd.NA
filtered_finished["category_backclicks"] = filtered_finished["backclicked_pages"].apply(
    lambda pages: pd.NA
    if pages is pd.NA
    else [cat for page in pages for cat in article_to_broad_category.get(page, [])]
)
filtered_finished = filtered_finished[["hashedIpAddress", "category_visits", "category_backclicks"]]

# Count, per player and category, how often pages were visited / back-clicked
visited_counts = (
    filtered_finished[["hashedIpAddress", "category_visits"]]
    .explode("category_visits")
    .groupby(["hashedIpAddress", "category_visits"])
    .size()
    .reset_index(name="visited_count")
)
backclicked_counts = (
    filtered_finished[["hashedIpAddress", "category_backclicks"]]
    .explode("category_backclicks")
    .groupby(["hashedIpAddress", "category_backclicks"])
    .size()
    .reset_index(name="backclicked_counts")
)

# Player/category pairs without any back-click get a count of 0 after the merge
finished_result = pd.merge(
    visited_counts,
    backclicked_counts,
    how="outer",
    left_on=["hashedIpAddress", "category_visits"],
    right_on=["hashedIpAddress", "category_backclicks"],
)
finished_result = finished_result[["hashedIpAddress", "category_visits", "visited_count", "backclicked_counts"]].fillna(0)
finished_result["rate"] = finished_result["backclicked_counts"] / finished_result["visited_count"]

# Average the per-player rates within each category; `counts` is the number of players averaged
finished_result = finished_result[["category_visits", "rate"]].groupby(by="category_visits").aggregate(
    avg_rate=pd.NamedAgg(column="rate", aggfunc="mean"),
    counts=pd.NamedAgg(column="rate", aggfunc="count"),
).reset_index()
# NOTE(review): this is the binomial-proportion standard error applied to a mean of per-player rates
finished_result["SE"] = np.sqrt(finished_result["avg_rate"] * (1 - finished_result["avg_rate"]) / finished_result["counts"])

In [None]:
# Per-category back-click rate for the unfinished paths.
# No date filter is applied in this cell: every recorded unfinished path is used.
unfinished_cp = unfinished_paths_readable.copy()


def _unfinished_broad_categories(pages):
    """Return the broad categories (all of them, per page) of every page in `pages` that has a category entry."""
    return [
        broad
        for page in pages
        if page in article_to_broad_category
        for broad in article_to_broad_category[page]
    ]


unfinished_cp["backclicked_pages"] = unfinished_cp["readable_path"].apply(backclick_lambda)
unfinished_cp["category_visits"] = unfinished_cp["readable_path"].apply(_unfinished_broad_categories)
# paths without any back-click carry pd.NA here and stay pd.NA
unfinished_cp["category_backclicks"] = unfinished_cp["backclicked_pages"].apply(
    lambda pages: pd.NA if pages is pd.NA else _unfinished_broad_categories(pages)
)
unfinished_cp = unfinished_cp[["hashedIpAddress", "category_visits", "category_backclicks"]]

# Per player and category: number of visited pages and number of back-clicked pages
visited_counts = (
    unfinished_cp[["hashedIpAddress", "category_visits"]]
    .explode("category_visits")
    .groupby(["hashedIpAddress", "category_visits"])
    .size()
    .reset_index(name="visited_count")
)
backclicked_counts = (
    unfinished_cp[["hashedIpAddress", "category_backclicks"]]
    .explode("category_backclicks")
    .groupby(["hashedIpAddress", "category_backclicks"])
    .size()
    .reset_index(name="backclicked_counts")
)

# Outer join; player/category pairs without a back-click end up with a count of 0
unfinished_result = pd.merge(
    visited_counts,
    backclicked_counts,
    how="outer",
    left_on=["hashedIpAddress", "category_visits"],
    right_on=["hashedIpAddress", "category_backclicks"],
)
unfinished_result = unfinished_result[
    ["hashedIpAddress", "category_visits", "visited_count", "backclicked_counts"]
].fillna(0)
unfinished_result["rate"] = unfinished_result["backclicked_counts"] / unfinished_result["visited_count"]

# Mean of the per-player rates for each category, plus how many rates were averaged
unfinished_result = (
    unfinished_result[["category_visits", "rate"]]
    .groupby(by="category_visits")
    .agg(avg_rate=("rate", "mean"), counts=("rate", "count"))
    .reset_index()
)
unfinished_result["SE"] = np.sqrt(
    unfinished_result["avg_rate"] * (1 - unfinished_result["avg_rate"]) / unfinished_result["counts"]
)

In [None]:
# Put the finished and unfinished rates side by side, one row per category,
# ordered by decreasing back-click rate of the unfinished paths.
merged_result = pd.merge(
    unfinished_result,
    finished_result,
    on="category_visits",
    suffixes=("_unfinished", "_finished"),
)
merged_result = merged_result.sort_values(by="avg_rate_unfinished", ascending=False)

# Grouped bar chart: two bars per category, centred on the category's tick
bar_width = 0.4
index = np.arange(len(merged_result["category_visits"]))
bar_positions_df1 = index - bar_width / 2
bar_positions_df2 = index + bar_width / 2

fig, ax = plt.subplots(figsize=(10, 6))

# finished paths: default colour, error bars show the standard error
ax.bar(
    bar_positions_df1,
    merged_result["avg_rate_finished"],
    yerr=merged_result["SE_finished"],
    alpha=0.7,
    capsize=5,
    width=bar_width,
    label="Finished paths",
)
# unfinished paths: orange, error bars show the standard error
ax.bar(
    bar_positions_df2,
    merged_result["avg_rate_unfinished"],
    yerr=merged_result["SE_unfinished"],
    alpha=0.7,
    capsize=5,
    width=bar_width,
    label="Unfinished paths",
    color="darkorange",
)

ax.set_xlabel("Category")
ax.set_ylabel("Backclick Rate")
ax.set_xticks(index)
ax.set_xticklabels(merged_result["category_visits"], rotation=90)
ax.legend()
ax.set_title("Back-click rate per category with CI")
fig.savefig("img.png", bbox_inches="tight")
plt.show()


In [None]:
# Bar chart of the finished-path back-click rates; the black error bars show the computed standard errors.
finished_labels = finished_result["category_visits"]
finished_rates = finished_result["avg_rate"]

fig, ax = plt.subplots(figsize=(10, 5))
# one tab20 colour per bar (15 colours are requested)
ax.bar(finished_labels, finished_rates, color=plt.cm.tab20(range(15)))
ax.errorbar(finished_labels, finished_rates, yerr=finished_result["SE"], fmt="none", ecolor="black", capsize=3)
plt.xticks(rotation=90)
ax.set_title("Back-click rate per category for finished paths")
ax.set_ylabel("Ratio of backclicks to visits")
ax.set_xlabel("Category")
plt.show()


In [None]:
# Bar chart of the unfinished-path back-click rates; the black error bars show the computed standard errors.
unfinished_labels = unfinished_result["category_visits"]
unfinished_rates = unfinished_result["avg_rate"]

fig, ax = plt.subplots(figsize=(10, 5))
# one tab20 colour per bar (15 colours are requested)
ax.bar(unfinished_labels, unfinished_rates, color=plt.cm.tab20(range(15)))
ax.errorbar(unfinished_labels, unfinished_rates, yerr=unfinished_result["SE"], fmt="none", ecolor="black", capsize=3)
plt.xticks(rotation=90)
ax.set_title("Back-click rate per category for unfinished paths")
ax.set_ylabel("Ratio of backclicks to visits")
ax.set_xlabel("Category")
fig.savefig("img2.png", bbox_inches="tight")
plt.show()