In [None]:
import re
import os
import scipy
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import statsmodels.formula.api as smf

In [None]:
import warnings

# Suppress every FutureWarning so that cell outputs are not cluttered by them.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Load the article/continent table used throughout the notebook.
continents_csv = os.path.join("Data", "continents.csv")
df_continents = pd.read_csv(continents_csv)

display(df_continents)

### Colors for all plots

In [None]:
# One colour per continent, shared by every plot:
#   continents_colors     -> (r, g, b) float tuples, used by matplotlib
#   continents_colors_int -> "#rrggbb" hex strings, used by plotly
continents = df_continents['continent'].unique()
random_colors = sns.color_palette('husl', n_colors=len(continents))

continents_colors = dict(zip(continents, random_colors))
continents_colors_int = {
    continent_name: "#{0:02x}{1:02x}{2:02x}".format(*(int(255 * channel) for channel in rgb))
    for continent_name, rgb in zip(continents, random_colors)
}

print(continents_colors)
print(continents_colors_int)


### Articles per continent

In [None]:
# Number of rows (articles) per continent, largest group first.
continents_grouped = (
    df_continents
    .groupby(['continent'])
    .size()
    .sort_values(ascending=False)
)
continents_grouped

In [None]:
# Output directory for the exported Plotly figures. It has to exist before the
# first `write_html` call below; without this definition a fresh
# "Restart & Run All" stops here with a NameError on `plots_path`.
plots_path = "plots"
os.makedirs(plots_path, exist_ok=True)

num_categories = len(continents_grouped.keys())

# Static matplotlib version. An explicit Axes is created so that `ax` really is
# an Axes object (plt.bar returns a BarContainer, not an Axes).
mpl_fig, ax = plt.subplots()
ax.bar(
    continents_grouped.index,
    continents_grouped.values,
    color=[continents_colors[continent] for continent in continents_grouped.index],
)
plt.xticks(rotation=45)

# Interactive Plotly version (default Plotly colours, one per continent).
# The bars count rows of df_continents, i.e. articles, hence the title.
fig = px.bar(
    x=list(continents_grouped.keys()),
    y=list(continents_grouped.values),
    labels={"x": "Continent", "y": "Count"},
    title="Number of Articles by Continent",
    color=list(continents_grouped.keys()),
)

# fig.update_layout(xaxis=dict(tickangle=-45))
fig.update_layout(showlegend=False)

file_path = os.path.join(plots_path, "count_of_categories_by_continent.html")
fig.write_html(file_path)

fig.show()


In [None]:
# Directory that receives every exported Plotly figure.
plots_path = "plots"
os.makedirs(plots_path, exist_ok=True)

# Same bar chart as before, but coloured with the shared continent palette.
# color_discrete_map="identity" makes Plotly use the hex strings passed in
# `color` directly as colours.
# The bars count rows of df_continents, i.e. articles, hence the title.
fig = px.bar(
    x=list(continents_grouped.keys()),
    y=list(continents_grouped.values),
    labels={"x": "Continent", "y": "Count"},
    title="Number of Articles by Continent",
    color=[continents_colors_int[continent] for continent in continents_grouped.index],
    color_discrete_map="identity"
)

# fig.update_layout(xaxis=dict(tickangle=-45))
fig.update_layout(showlegend=False)

file_path = os.path.join(plots_path, "count_of_categories_by_continent.html")
fig.write_html(file_path)

fig.show()

## Number of articles by category

In [None]:
# Article/category assignments: tab-separated, no header row; anything after a
# "#" on a line is ignored and blank lines are skipped.
categories_tsv = os.path.join("Data", "wikispeedia_paths-and-graph", "categories.tsv")
df_categories = pd.read_csv(
    categories_tsv,
    sep="\t",
    names=["article", "category"],
    header=None,
    comment="#",
    skip_blank_lines=True,
    encoding="UTF-8",
)

display(df_categories.head())
print("Size:", df_categories.shape)

In [None]:
# Attach the continent to every (article, category) row. The left join keeps
# every category row, so articles without a continent end up with NaN.
df_articles = df_categories.merge(df_continents, how="left", on="article")

display(df_articles.head())
has_missing = df_articles.isna().any().any()
print("Size:", df_articles.shape, "Missing values:", has_missing)

In [None]:
# The main category is the second dot-separated component of the category string.
main_cats = [full_category.split('.')[1] for full_category in df_articles['category'].values]

df_articles['categoryMain'] = main_cats
display(df_articles)
print("Size:", df_articles.shape)

In [None]:
# Row count for every (main category, continent) pair, then one pair as a spot check.
spot_check_key = ('Geography', 'International')
articles_cat_continent = df_articles.groupby(by=['categoryMain', 'continent']).size()
display(articles_cat_continent[spot_check_key])

In [None]:
# Show the full (main category, continent) count table.
display(articles_cat_continent)

In [None]:
categories = df_articles['categoryMain'].unique()
# Articles without a continent carry NaN after the left merge; drop it because
# sorting an array that mixes strings and NaN raises a TypeError.
continents = df_articles['continent'].dropna().unique()
continents.sort()

# In case we do not want international
mask = (continents != 'International')
continents = continents[mask]

category_positions = np.arange(len(categories))
bar_width = 0.5

# continent -> list of counts, one entry per category (0 when the pair never occurs).
continents_values = {
    continent: [articles_cat_continent.get((category, continent), 0) for category in categories]
    for continent in continents
}

# Rows: continents, columns: categories.
df_continent_frequencies = pd.DataFrame(continents_values).T
df_continent_frequencies.columns = categories
display(df_continent_frequencies)
print("Size:", df_continent_frequencies.shape)

# Interactive stacked bar chart (one row per category, one segment per continent).
fig = px.bar(
    df_continent_frequencies.T,
    orientation='h',
    title='Frequency of Continents in Each Category',
    labels={"index": "Category", "value": "Frequency"},
    #category_orders={"index": categories},
)

file_path = os.path.join(plots_path, "frequency_continents_in_each_category.html")
fig.write_html(file_path)

# Static matplotlib version of the same stacked chart. A fresh Axes and a
# zero-initialised running offset are created here; previously `ax` was a stale
# BarContainer from an earlier cell and `bottom` was never defined.
mpl_fig, ax = plt.subplots()
bottom = np.zeros(len(categories))
for continent in continents:
    ax.barh(
        category_positions,
        continents_values[continent],
        label=continent,
        color=continents_colors[continent],
        edgecolor='w',
        height=bar_width,
        left=bottom,
    )
    # Shift the start of the next continent's segment to the end of this one.
    bottom += np.asarray(continents_values[continent])

ax.set_yticks(category_positions)
ax.set_yticklabels(categories)
ax.set_xlabel('Frequency')
ax.set_ylabel('Category')
plt.title('Frequency of Continents in Each Category')
plt.legend()

plt.show()


In [None]:
# Order continents by their total count (largest first) and categories by their
# total count (ascending) before drawing the stacked chart.
continent_sorted = df_continent_frequencies.sum(axis="columns").sort_values(ascending=False).index
category_sorted = df_continent_frequencies.sum(axis="index").sort_values().index

stacked_counts = df_continent_frequencies.loc[continent_sorted].T.loc[category_sorted]
segment_colors = [continents_colors_int[continent] for continent in continent_sorted]

fig = px.bar(
    stacked_counts,
    orientation='h',
    labels={"index": "Category", "value": "Frequency"},
    title='Frequency of Continents in Each Category',
    color_discrete_sequence=segment_colors,
)

file_path = os.path.join(plots_path, "frequency_continents_in_each_category.html")
fig.write_html(file_path)

fig.show()

In [None]:
# Number of rows per main category.
category_counts = df_articles["categoryMain"].value_counts()

display(category_counts)

# Look up each row's category size. `.map` is the direct key -> value lookup;
# `.replace` with a Series performs a value-by-value substitution and leaves the
# result dtype to implicit inference. Every categoryMain value is a key of
# `category_counts`, so no NaN is introduced.
df_articles["categoryCounts"] = df_articles["categoryMain"].map(category_counts)

# Keep one row per article: after sorting, the first row of each article is the
# one whose main category is the most populated.
df_articles = df_articles.sort_values("categoryCounts", ascending=False).drop_duplicates("article")

display(df_articles)


## Length of articles

In [None]:
# Read the complete list of article names and report those that never received
# a category (i.e. that are absent from df_articles).
articles_tsv = os.path.join("Data", "wikispeedia_paths-and-graph", "articles.tsv")
all_articles = pd.read_csv(
    articles_tsv,
    sep="\t",
    names=["article"],
    header=None,
    comment="#",
    skip_blank_lines=True,
)["article"].tolist()

print("Number of articles:", len(all_articles))
print("Articles without category:")

invalid_articles = set(all_articles) - set(df_articles.article)
for article in invalid_articles:
    print(article)

In [None]:
plaintext_path = os.path.join('Data', 'plaintext_articles')

# Text from this marker onwards is excluded from the word count.
footer_marker = "Retrieved from"

word_counts = []
for article_name in df_articles.article:
    article_file = os.path.join(plaintext_path, article_name + '.txt')

    with open(article_file, 'r', encoding='utf-8') as file:
        file.readline()  # Skip the first line because it contains the word #copyright
        content = file.read()

    # Keep only the text before the first occurrence of the marker. `partition`
    # returns the whole text when the marker is absent, whereas the previous
    # `re.search(...).start()` raised AttributeError on such a file.
    body = content.partition(footer_marker)[0]
    word_counts.append(len(body.split()))

# Word count per article, in the same row order as df_articles.
df_articles['length'] = word_counts

display(df_articles.head())
print("Size:", df_articles.shape)

In [None]:
# Geometric mean of the article length per continent, largest first.
continent_length = (
    df_articles
    .groupby(['continent'])['length']
    .agg(scipy.stats.gmean)
    .sort_values(ascending=False)
)

display(continent_length)
print("Size:", continent_length.shape)

In [None]:
# Static bar chart of the geometric-mean article length per continent.
bar_colors = [continents_colors[continent] for continent in continent_length.index]
ax = plt.bar(continent_length.index, continent_length.to_numpy(), color=bar_colors)

plt.title('Length of articles by continent (Geometric mean)')
plt.xlabel('Continent')
plt.ylabel('Length')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Interactive version of the article-length chart, exported as HTML.
bar_colors = [continents_colors_int[continent] for continent in continent_length.index]

fig = px.bar(
    x=continent_length.index,
    y=continent_length.to_numpy(),
    title="Length of articles by continent (Geometric mean)",
    labels={"x": "Continent", "y": "Length"},
    color=bar_colors,
    color_discrete_map="identity",
)
fig.update_layout(showlegend=False)

file_path = os.path.join(plots_path, "average_length_of_articles_by_continent.html")
fig.write_html(file_path)

fig.show()

## Pagerank

In [None]:
# Load the PageRank table from disk.
pagerank_csv = os.path.join("Data", "page_rank.csv")
pagerank_df = pd.read_csv(pagerank_csv)

In [None]:
# Preview the first rows of the PageRank table and report its (rows, columns) shape.
display(pagerank_df.head())
print("Size:", pagerank_df.shape)

In [None]:
# Attach the PageRank to every article (left join keeps articles without one).
df_articles = pd.merge(df_articles, pagerank_df, on="article", how="left")

# Give articles without a PageRank a small positive placeholder. Only the
# `pageRank` column is filled: a frame-wide fillna would also overwrite missing
# values in unrelated columns (e.g. turn a missing continent into 1e-6).
# NOTE(review): 1e-6 is presumably chosen to keep the geometric mean defined — confirm.
df_articles["pageRank"] = df_articles["pageRank"].fillna(1e-6)

display(df_articles.head())
print("Size:", df_articles.shape)

In [None]:
pagerank_by_continent = df_articles.groupby("continent")["pageRank"]

# Geometric mean of the PageRank per continent, largest first
pagerank_continent_mean = pagerank_by_continent.agg(scipy.stats.gmean).sort_values(ascending=False)

# Median of the PageRank per continent, largest first
pagerank_continent_median = pagerank_by_continent.median().sort_values(ascending=False)

In [None]:
# Static bar chart of the geometric-mean PageRank per continent.
bar_colors = [continents_colors[continent] for continent in pagerank_continent_mean.index]
ax = plt.bar(pagerank_continent_mean.index, pagerank_continent_mean, color=bar_colors)

plt.title('PageRank by continent (Geometric Mean)')
plt.xlabel('Continent')
plt.ylabel('Pagerank')
plt.xticks(rotation=45)

plt.show()

In [None]:
# Show the geometric-mean PageRank per continent as a table.
pagerank_continent_mean

In [None]:
# Interactive version of the geometric-mean PageRank chart, exported as HTML.
bar_colors = [continents_colors_int[continent] for continent in pagerank_continent_mean.index]

fig = px.bar(
    x=pagerank_continent_mean.index,
    y=pagerank_continent_mean,
    title="PageRank by continent (Geometric Mean)",
    labels={"x": "Continent", "y": "PageRank"},
    color=bar_colors,
    color_discrete_map="identity",
)
fig.update_layout(showlegend=False)
fig.update_yaxes(tickformat=".1e")  # scientific notation on the y axis

file_path = os.path.join(plots_path, "pagerank_mean_by_continent.html")
fig.write_html(file_path)

fig.show()

In [None]:
# Static bar chart of the median PageRank per continent.
bar_colors = [continents_colors[continent] for continent in pagerank_continent_median.index]
ax = plt.bar(pagerank_continent_median.index, pagerank_continent_median, color=bar_colors)

plt.title('PageRank by continent (Median)')
plt.xlabel('Continent')
plt.ylabel('Pagerank')
plt.xticks(rotation=45)

plt.show()

In [None]:
# Interactive version of the median PageRank chart, exported as HTML.
bar_colors = [continents_colors_int[continent] for continent in pagerank_continent_median.index]

fig = px.bar(
    x=pagerank_continent_median.index,
    y=pagerank_continent_median,
    title="PageRank by continent (Median)",
    labels={"x": "Continent", "y": "PageRank"},
    color=bar_colors,
    color_discrete_map="identity",
)
fig.update_layout(showlegend=False)
fig.update_yaxes(tickformat=".1e")  # scientific notation on the y axis

file_path = os.path.join(plots_path, "pagerank_median_by_continent.html")
fig.write_html(file_path)

fig.show()

## Analysis of paths

Group paths by continent: each path is assigned to the continent of its GOAL (target) article.

Compute the number of "backclicks" in each path

Compute the length of each path

In [None]:
def _add_path_features(df_raw_paths):
    """Add the per-path feature columns shared by finished and unfinished paths.

    Expects a `path` column holding the raw ";"-separated string. Adds, in place:
      - backclicks:     number of "<" characters in the raw path
      - pathSteps:      number of ";"-separated entries in the raw path
      - uniqueArticles: pathSteps minus backclicks
      - start:          first entry of the path
    and replaces `path` by the list of its entries. Returns the same frame.
    """
    raw_path = df_raw_paths["path"]
    df_raw_paths["backclicks"] = raw_path.str.count("<")
    df_raw_paths["pathSteps"] = raw_path.str.count(";") + 1
    # NOTE(review): this is the number of non-"<" entries, which may still contain
    # repeated articles despite the column name — confirm the intended meaning.
    df_raw_paths["uniqueArticles"] = df_raw_paths["pathSteps"] - df_raw_paths["backclicks"]
    df_raw_paths["path"] = raw_path.str.split(";")
    df_raw_paths["start"] = df_raw_paths["path"].str[0]
    return df_raw_paths


df_paths_finished = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "paths_finished.tsv"),
    sep="\t",
    header=None,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
    skip_blank_lines=True,
    comment="#"
)
df_paths_unfinished = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "paths_unfinished.tsv"),
    sep="\t",
    header=None,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "motif"],
    skip_blank_lines=True,
    comment="#"
)

df_paths_finished = _add_path_features(df_paths_finished)
# A finished path ends on its target, so the target is the LAST entry of the
# path. (It was previously read from index 0, which made it equal to `start`.)
df_paths_finished["target"] = df_paths_finished["path"].str[-1]
df_paths_finished["isFinished"] = True

# Unfinished paths already carry their target as a column of the input file.
df_paths_unfinished = _add_path_features(df_paths_unfinished)
df_paths_unfinished["isFinished"] = False

display(df_paths_finished.head())
display(df_paths_unfinished.head())

# All paths in one frame; columns that exist in only one of the inputs are NaN for the other.
df_paths = pd.concat([df_paths_finished, df_paths_unfinished])
display(df_paths.head())

In [None]:
# NOTE(review): leftover scratch expression (demonstrates str.title()); nothing
# else in the visible notebook uses its result, so this cell can likely be removed.
"sdasADfds".title()

In [None]:
def _prefixed_articles(prefix):
    """Return a copy of df_articles with every column renamed to prefix + Column.

    The first letter of each original column name is upper-cased, e.g.
    "continent" becomes "targetContinent" for prefix "target".
    """
    renamed = df_articles.copy()
    renamed.columns = [prefix + column[0].upper() + column[1:] for column in renamed.columns]
    return renamed


# Attach the article attributes of the target and then of the start article to
# every path. The prefixed column names cannot collide with the path columns,
# so no `suffixes` argument is needed (the former one-element list was not a
# valid value for it). The default inner join drops paths whose target/start
# article has no row in df_articles.
df_analysis = df_paths
for role in ("target", "start"):
    article_key = f"{role}Article"
    df_analysis = pd.merge(
        df_analysis,
        _prefixed_articles(role),
        left_on=role,
        right_on=article_key,
    ).drop(columns=article_key)

display(df_analysis.head())
print("Size:", df_analysis.shape)


In [None]:
# Build one summary row per continent, once keyed by the continent of the start
# article and once by the continent of the target article.
df_analysis["isFinishedInt"] = df_analysis["isFinished"].astype(int)
df_analysis_finished = df_analysis[df_analysis["isFinished"]]
df_analysis_unfinished = df_analysis[~df_analysis["isFinished"]]

# Columns are added one at a time; each assigned Series is aligned on the
# continent index established by the first assignment.
df_analysis_per_continent = pd.DataFrame()
for name in ["start", "target"]:
    # Number of paths whose start/target article belongs to each continent
    df_analysis_per_continent[f"{name}Articles"] = df_analysis.groupby(f"{name}Continent").size()

    # Geometric mean of the backclicks, for finished and unfinished paths.
    # NOTE(review): the geometric mean is 0 as soon as a group contains a path
    # with zero backclicks — confirm that gmean is the intended statistic here.
    # NOTE(review): "Backcliks" in the column names looks like a typo of
    # "Backclicks"; left unchanged because other code may rely on the name.
    df_analysis_per_continent[f"{name}BackcliksFinished"] =  df_analysis_finished.groupby(f"{name}Continent")["backclicks"].agg(scipy.stats.gmean)
    df_analysis_per_continent[f"{name}BackcliksUnfinished"] =  df_analysis_unfinished.groupby(f"{name}Continent")["backclicks"].agg(scipy.stats.gmean)

    # Geometric mean of the path steps, for finished and unfinished paths
    df_analysis_per_continent[f"{name}PathStepsFinished"] =  df_analysis_finished.groupby(f"{name}Continent")["pathSteps"].agg(scipy.stats.gmean)
    df_analysis_per_continent[f"{name}PathStepsUnfinished"] =  df_analysis_unfinished.groupby(f"{name}Continent")["pathSteps"].agg(scipy.stats.gmean)

    # Number of finished and unfinished paths for each continent, and the share
    # of finished paths in percent (mean of the 0/1 flag times 100)
    df_analysis_per_continent[f"{name}PathsFinished"] = df_analysis_finished[f"{name}Continent"].value_counts()
    df_analysis_per_continent[f"{name}PathsUnfinished"] = df_analysis_unfinished[f"{name}Continent"].value_counts()
    df_analysis_per_continent[f'{name}PathsFinishedPercentage'] = df_analysis.groupby(f"{name}Continent")["isFinishedInt"].mean() * 100

df_analysis_per_continent.index.name = "continent"

In [None]:
# Per-continent summary ordered by the share of finished paths (target continent), highest first.
continent_data = df_analysis_per_continent.sort_values(
    'targetPathsFinishedPercentage',
    ascending=False,
)
display(continent_data)
print("Size:", continent_data.shape)

In [None]:
# Per-continent summary ordered by the share of finished paths (start continent), highest first.
continent_data = df_analysis_per_continent.sort_values(
    'startPathsFinishedPercentage',
    ascending=False,
)
display(continent_data)
print("Size:", continent_data.shape)

In [None]:
# Sort by the column that is actually plotted. `continent_data` was last sorted
# by the *start* percentage, so reusing it drew the target-percentage bars in
# an unrelated order.
target_finished_sorted = df_analysis_per_continent.sort_values(
    by='targetPathsFinishedPercentage', ascending=False
)

ax = plt.bar(
    target_finished_sorted.index,
    target_finished_sorted['targetPathsFinishedPercentage'],
    color=[continents_colors[continent] for continent in target_finished_sorted.index],
)
plt.xticks(rotation=45)

plt.xlabel('Continent')
plt.ylabel('Percentage')
plt.title('Percentage of finished paths by continent')
plt.show()

In [None]:
# Sort by the column that is actually plotted. `continent_data` was last sorted
# by the *start* percentage, so reusing it drew the target-percentage bars in
# an unrelated order.
target_finished_sorted = df_analysis_per_continent.sort_values(
    by='targetPathsFinishedPercentage', ascending=False
)

fig = px.bar(
    x=target_finished_sorted.index,
    y=target_finished_sorted['targetPathsFinishedPercentage'],
    labels={"x": "Continent", "y": "Percentage"},
    title="Percentage of finished paths by continent",
    color=[continents_colors_int[continent] for continent in target_finished_sorted.index],
    color_discrete_map="identity"
)

fig.update_layout(showlegend=False)

file_path = os.path.join(plots_path, "percentage_finished_paths_by_continent.html")
fig.write_html(file_path)

fig.show()

In [None]:
# Static bar chart: number of paths per continent of the target article.
bar_colors = [continents_colors[continent] for continent in continent_data.index]
ax = plt.bar(continent_data.index, continent_data['targetArticles'], color=bar_colors)

plt.title('Target articles by continent')
plt.xlabel('Continent')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Interactive version of the target-article count chart, exported as HTML.
bar_colors = [continents_colors_int[continent] for continent in continent_data.index]

fig = px.bar(
    x=continent_data.index,
    y=continent_data['targetArticles'],
    title="Number of target articles by continent",
    labels={"x": "Continent", "y": "Count"},
    color=bar_colors,
    color_discrete_map="identity",
)
fig.update_layout(showlegend=False)

file_path = os.path.join(plots_path, "count_target_articles_by_continent.html")
fig.write_html(file_path)

fig.show()

In [None]:
# Static bar chart: number of paths per continent of the start article.
bar_colors = [continents_colors[continent] for continent in continent_data.index]
ax = plt.bar(continent_data.index, continent_data['startArticles'], color=bar_colors)

plt.title('Start articles by continent')
plt.xlabel('Continent')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Interactive version of the start-article count chart, exported as HTML.
bar_colors = [continents_colors_int[continent] for continent in continent_data.index]

fig = px.bar(
    x=continent_data.index,
    y=continent_data['startArticles'],
    title="Number of start articles by continent",
    labels={"x": "Continent", "y": "Count"},
    color=bar_colors,
    color_discrete_map="identity",
)
fig.update_layout(showlegend=False)

file_path = os.path.join(plots_path, "count_start_articles_by_continent.html")
fig.write_html(file_path)

fig.show()

In [None]:
# How often each article is the target / the start of a finished or unfinished
# path (0 when it never is).
for role in ("target", "start"):
    for status, df_status_paths in (("Finished", df_paths_finished), ("Unfinished", df_paths_unfinished)):
        role_counts = df_status_paths[role].value_counts()
        df_articles[role + status] = df_articles["article"].map(role_counts).fillna(0)

# Flatten the per-path entry lists into one long Series per path status.
paths_finished = pd.Series(np.concatenate(df_paths_finished.path.values))
paths_unfinished = pd.Series(np.concatenate(df_paths_unfinished.path.values))

# Create columns to count the number of times each article appears in general in finished and unfinished paths
finished_occurrences = paths_finished.value_counts()
unfinished_occurrences = paths_unfinished.value_counts()
df_articles["anyFinished"] = df_articles["article"].map(finished_occurrences).fillna(0)
df_articles["anyUnfinished"] = df_articles["article"].map(unfinished_occurrences).fillna(0)

# Let's calculate the probability of finding an article (we can compare this with the pagerank)
# The denominator is the total number of path entries over both path sets.
total_entries = len(paths_finished) + len(paths_unfinished)
df_articles["anyPercentage"] = (df_articles["anyFinished"] + df_articles["anyUnfinished"]) / total_entries

In [None]:
# Show the articles ordered by how often they occur in paths, most frequent first.
display(df_articles.sort_values("anyPercentage", ascending=False))
print("Size:", df_articles.shape)

In [None]:
# Median share of path entries per article, grouped by continent.
tmp = df_articles.groupby("continent")["anyPercentage"].median()

bar_colors = [continents_colors[continent] for continent in tmp.index]
ax = plt.bar(tmp.index, tmp, color=bar_colors)

plt.title('Percentage of visiting article per continent (Median)')
plt.xlabel('Continent')
plt.ylabel('Percentage')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Interactive version of the article-occurrence chart, exported as HTML.
bar_colors = [continents_colors_int[continent] for continent in tmp.index]

fig = px.bar(
    x=tmp.index,
    y=tmp,
    title="Occurrence of article in path",
    labels={"x": "Continent", "y": "Percentage"},
    color=bar_colors,
    color_discrete_map="identity",
)
fig.update_layout(showlegend=False)

file_path = os.path.join(plots_path, "article_occurrence_by_continent.html")
fig.write_html(file_path)

fig.show()

# Testing dependence of variables

In [None]:
# Encode the categorical columns as integer codes ("<column>Codes") so that they
# can take part in the correlation computation.
categorical_cols = [
    side + field
    for side in ("start", "target")
    for field in ("Continent", "Category", "CategoryMain")
]
for col in categorical_cols:
    df_analysis[f"{col}Codes"] = df_analysis[col].astype('category').cat.codes

In [None]:
# Correlation of every selected column with `isFinished`: Pearson first, then Spearman.
corr_cols = [
    "backclicks", "pathSteps", "uniqueArticles",
    "targetLength", "targetPageRank", "targetContinentCodes", "targetCategoryMainCodes",
    "startLength", "startPageRank", "startContinentCodes", "startCategoryMainCodes",
    "isFinished",
]
corr_input = df_analysis[corr_cols]
display(corr_input.corr()["isFinished"])
display(corr_input.corr(method="spearman")["isFinished"])


In [None]:
# Parse the shortest-path distance matrix: every non-empty, non-comment line is
# one row and every character of it is one cell. "_" is stored as -1
# (presumably "no path" — TODO confirm), any other character as its integer value.
matrix_path = os.path.join("Data", "wikispeedia_paths-and-graph", "shortest-path-distance-matrix.txt")

with open(matrix_path) as file:
    stripped_rows = (raw_line.strip() for raw_line in file)
    sp_lenght = [
        [-1 if cell == "_" else int(cell) for cell in row]
        for row in stripped_rows
        if row != "" and not row.startswith("#")
    ]

sp_lenght = np.array(sp_lenght)

print(sp_lenght[:10, :10])