### Imports

In [None]:
import re
import os

import seaborn as sns
import plotly.subplots
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import scipy
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf


### Defaults

In [None]:
# Suppress every FutureWarning from here on so cell outputs are not flooded with them.
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# When set, articles whose continent equals INTERNATIONAL_LABEL are dropped after loading.
REMOVE_INTERNATIONAL = True
INTERNATIONAL_LABEL = "International"

# Export folders: one sub-folder per output flavour below PLOTS_PATH.
PLOTS_PATH = "plots"
PLOTS_PATH_PLT, PLOTS_PATH_PX, PLOTS_PATH_HTML = (
    os.path.join(PLOTS_PATH, subfolder) for subfolder in ("plt", "px", "html")
)

# Width/height passed to the plotly figures.
FIGURE_WIDTH, FIGURE_HEIGHT = 800, 600

# Create the export folders up front so the save calls further down cannot fail on a missing directory.
for path in (PLOTS_PATH_PLT, PLOTS_PATH_PX, PLOTS_PATH_HTML):
    os.makedirs(path, exist_ok=True)

# Continent Labels

Each article is assigned exactly one continent label.

In [None]:
# Load the continent label of every article.
continents_file = os.path.join("Data", "continents.csv")
df_continents = pd.read_csv(continents_file)

if REMOVE_INTERNATIONAL:
    # Remember the size before and after filtering so the number of dropped rows can be reported.
    labeled_articles_all_count = len(df_continents)
    is_international = df_continents["continent"] == INTERNATIONAL_LABEL
    df_continents = df_continents[~is_international]
    labeled_articles_count = len(df_continents)
    removed_count = labeled_articles_all_count - labeled_articles_count
    print(f"Removing articles labeled as {INTERNATIONAL_LABEL}, Removed articles: {removed_count}")

display(df_continents.head())
print("Size:", df_continents.shape)

### Colors for all plots

In [None]:
# One colour per continent, reused by every plot below.
continents = df_continents["continent"].unique()
random_colors = sns.color_palette("husl", n_colors=len(continents))

# Float RGB triples (as returned by seaborn), used by the matplotlib plots.
continents_colors = dict(zip(continents, random_colors))

# The same colours rendered as "#rrggbb" strings, used by the plotly plots.
continents_colors_int = {
    continent: "#{0:02x}{1:02x}{2:02x}".format(*(int(255 * channel) for channel in rgb))
    for continent, rgb in continents_colors.items()
}

print(continents_colors)
print(continents_colors_int)

CONTINENTS_NUM = len(continents_colors)

In [None]:
# Number of labelled articles per continent, largest group first.
continents_grouped = (
    df_continents
    .groupby(["continent"])
    .size()
    .sort_values(ascending=False)
)

display(continents_grouped)

In [None]:
fig_name = "articles_count_per_continent"

num_categories = len(continents_grouped.keys())

# --- Static bar chart (matplotlib), saved as PDF ---
bar_colors_plt = [continents_colors[continent] for continent in continents_grouped.keys()]
ax = plt.bar(continents_grouped.keys(), continents_grouped.values, color=bar_colors_plt)
plt.xticks(rotation=45)
plt.xlabel("Continent")
plt.ylabel("Count")
plt.title("Number of articles per continent")
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()

# Colours and layout shared by the two plotly figures below.
colors_hex = [continents_colors_int[continent] for continent in continents_grouped.index]
shared_layout = dict(
    title_text="Number of articles per continent",
    title_x=0.5,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT,
)

# --- Interactive bar chart (plotly), saved as PDF and HTML ---
bar_trace = go.Bar(
    x=continents_grouped.index,
    y=continents_grouped.values,
    marker_color=colors_hex,
)
fig = go.Figure(data=[bar_trace])
fig.update_layout(**shared_layout)
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.show()

# --- Pie chart (plotly); the first slice (largest group) is pulled out of the pie ---
pie_trace = go.Pie(
    labels=continents_grouped.index,
    values=continents_grouped.values,
    pull=[0.2] + [0] * (CONTINENTS_NUM - 1),
    marker_colors=colors_hex,
)
fig = go.Figure(data=[pie_trace])
fig.update_layout(**shared_layout)
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_pie.pdf"))
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_pie.html"))
fig.show()


# Wikispeedia article categories

In [None]:
# Tab-separated file without a header row; lines starting with "#" are comments.
categories_file = os.path.join("Data", "wikispeedia_paths-and-graph", "categories.tsv")
categories_read_options = dict(
    delimiter="\t",
    header=None,
    names=["article", "category"],
    skip_blank_lines=True,
    comment="#",
    encoding="UTF-8",
)
df_categories = pd.read_csv(categories_file, **categories_read_options)

display(df_categories.head())
print("Size:", df_categories.shape)

In [None]:
# Inner join on the article name: only articles present in both frames are kept.
df_continents_categories = df_continents.merge(df_categories, on="article")

display(df_continents_categories.head())
print("Size:", df_continents_categories.shape)

In [None]:
# The main category is the second dot-separated component of the category string.
main_categories = [
    full_category.split(".")[1]
    for full_category in df_continents_categories["category"].values
]

df_continents_categories["categoryMain"] = main_categories

display(df_continents_categories)
print("Size:", df_continents_categories.shape)

In [None]:
# Number of (article, category) rows for every (main category, continent) pair.
continents_categories = df_continents_categories.groupby(["categoryMain", "continent"]).size()

if not REMOVE_INTERNATIONAL:
    # Use the configured label instead of repeating the literal, and fall back to 0
    # instead of raising KeyError when the pair does not occur in the data.
    display(continents_categories.get(("Geography", INTERNATIONAL_LABEL), 0))

display(continents_categories)

In [None]:
categories = df_continents_categories["categoryMain"].unique()
continents = np.sort(df_continents_categories["continent"].unique())

category_positions = np.arange(len(categories))
bar_width = 0.5

# For every continent, the count of each category (0 when the pair never occurs),
# listed in the same order as `categories`.
continents_values = {
    continent: [
        continents_categories[(category, continent)]
        if (category, continent) in continents_categories.index
        else 0
        for category in categories
    ]
    for continent in continents
}

# Rows are continents, columns are categories.
df_continent_frequencies = pd.DataFrame(continents_values).T
df_continent_frequencies.columns = categories
display(df_continent_frequencies)
print("Size:", df_continent_frequencies.shape)

In [None]:
fig_name = "articles_count_per_category"
fig_title = "Continent distribution per Category"
fig_xlabel = "Article Count"
fig_ylabel = "Category"


# Horizontal stacked bar chart: one segment per continent inside each category bar.
fig, ax = plt.subplots()  # was `ig, ax` (typo)
bottom = np.zeros(len(categories))  # running left edge of the next segment, per category

for continent in continents:
    ax.barh(
        categories,
        continents_values[continent],
        label=continent,
        color=continents_colors[continent],
        edgecolor="w",
        height=bar_width,  # constant defined together with `category_positions`
        left=bottom,
    )
    bottom += continents_values[continent]

ax.set_yticks(category_positions)
ax.set_yticklabels(categories)
ax.set_xlabel(fig_xlabel)
ax.set_ylabel(fig_ylabel)
ax.set_title(fig_title)
ax.legend()
# Same call as in the articles-per-continent cell; applied before saving so the layout is fitted first.
fig.tight_layout()
fig.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()


# Continents ordered by their total count, largest first.
continent_sorted = df_continent_frequencies.sum(axis="columns").sort_values(ascending=False).index

# Categories ordered by ascending total; the frame is transposed so rows are categories
# and columns are continents.
category_order = df_continent_frequencies.sum(axis="index").sort_values().index
stacked_counts = df_continent_frequencies.loc[continent_sorted].T.loc[category_order]
stacked_colors = [continents_colors_int[continent] for continent in continent_sorted]

fig = px.bar(
    stacked_counts,
    orientation ="h",
    title=fig_title,
    labels={"index": fig_ylabel, "value": fig_xlabel},
    color_discrete_sequence=stacked_colors,
)
fig.update_layout(
    legend_title_text="",
    title_x=0.5,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT,
)
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.show()


# Pie chart with a dropdown: one pie trace per category, switched by the dropdown buttons.
fig = go.Figure()

annotations = {}  # not used anywhere below in this cell
buttons = []
visible = True  # only the first trace added starts visible; reset to False at the end of the first iteration
mask = [False] * len(categories)  # per-trace visibility flags, toggled for one trace at a time
max_name_len = max(len(name) for name in continents)
for category_idx, category in enumerate(categories):
    category_data = df_continent_frequencies[category]
    # Drop continents with a zero count for this category.
    category_data = category_data[category_data > 0]

    category_name = category.replace("_", " ")
    # Left-align each label and pad it with spaces to the longest continent name.
    labels = [f"{name : <{max_name_len}}" for name in category_data.index]
    fig.add_trace(go.Pie(
        labels=labels,
        values=category_data.values,
        marker_colors=[continents_colors_int[continent] for continent in category_data.index],
        visible=visible,
        name=category_name
    ))

    annotation = dict(
        text=f"Category: {category_name}",
        x=0.5,
        y=1.1,
        showarrow=False
    )
    # Only the annotation of the initially visible trace is added to the figure up front.
    if visible:
        fig.add_annotation(annotation)

    # Flip the flag for the current trace, store a snapshot copy in the button, then flip it back.
    mask[category_idx] = True
    buttons.append(dict(
        label=category_name,
        method="update",
        args=[
            {"visible": list(mask)},
            {"title": fig_title, "annotations": [annotation]}
        ]
    ))
    mask[category_idx] = False
    visible=False


fig.update_layout(
    title_text=fig_title,
    title_x=0.7,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT,
    legend=dict(
        x=-0.3,
        y=0.1
    )
)


# Attach the dropdown; active=0 marks the first button as the selected one.
fig.update_layout(
    updatemenus=[
        dict(
            active=0,
            buttons=buttons
        )
    ]
)

fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_pie.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_pie.pdf"))
fig.show()

In [None]:
# One row per article: its continent plus the list of all its main categories.
article_categories = (
    df_continents_categories
    .groupby("article")["categoryMain"]
    .apply(list)
    .reset_index()
)
df_articles = df_continents_categories[["article", "continent"]].drop_duplicates()
df_articles = df_articles.merge(article_categories, on="article")

display(df_articles.head())
print("Size:", df_articles.shape)

## Length of articles

In [None]:
plaintext_path = os.path.join("Data", "plaintext_articles")

# Word count of every article's plain-text file, in the row order of df_articles.
word_counts = []
for article_name in df_articles.article:
    file_path = os.path.join(plaintext_path, article_name + ".txt")

    with open(file_path, "r", encoding="utf-8") as file:
        file.readline()  # Skip the first line because it contains the word #copyright
        content = file.read()

    # Keep only the text before the first "Retrieved from" marker. `partition` returns
    # the whole text when the marker is absent, whereas the previous
    # `re.search(...).start()` raised AttributeError in that case.
    content = content.partition("Retrieved from")[0]
    word_counts.append(len(content.split()))

df_articles["length"] = word_counts

display(df_articles.head())
print("Size:", df_articles.shape)

In [None]:
# Geometric mean of the article length per continent, longest first.
continent_length = (
    df_articles
    .groupby(["continent"])["length"]
    .agg(scipy.stats.gmean)
    .sort_values(ascending=False)
)

display(continent_length)
print("Size:", continent_length.shape)

In [None]:
fig_name = "gmean_length_of_articles_by_continent"
fig_title = "Length of articles by continent (Geometric mean)"
fig_xlabel = "Continent"
fig_ylabel = "Word count"  # was "World count"; the plotted values are word counts

# `continent_length` is a one-dimensional Series, so its values can be used directly
# (the previous reshape to its own length was a no-op).
length_values = continent_length.values

# Static bar chart (matplotlib).
ax = plt.bar(
    continent_length.index,
    length_values,
    color=[continents_colors[continent] for continent in continent_length.index]
)
plt.xticks(rotation=45)

plt.xlabel(fig_xlabel)
plt.ylabel(fig_ylabel)
plt.title(fig_title)
# Same call as in the articles-per-continent cell; applied before saving so the layout is fitted first.
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()

# Interactive bar chart (plotly); "identity" makes plotly use the hex strings as colours.
fig = px.bar(
    x=continent_length.index,
    y=length_values,
    labels={"x": fig_xlabel, "y": fig_ylabel},
    title=fig_title,  # reuse the variable instead of repeating the literal
    color=[continents_colors_int[continent] for continent in continent_length.index],
    color_discrete_map="identity",
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT
)

fig.update_layout(
    showlegend=False,
    title_x=0.5
)
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.show()

## Pagerank

In [None]:
# Load the PageRank table from the data folder.
pagerank_file = os.path.join("Data", "page_rank.csv")
df_pagerank = pd.read_csv(pagerank_file)

display(df_pagerank.head())
print("Size:", df_pagerank.shape)

In [None]:
# Left join keeps every article; values missing after the join are replaced with 1e-6.
# The fill is applied to the whole merged frame, not only to the columns coming from df_pagerank.
# NOTE(review): this cell overwrites df_articles, so running it twice merges the PageRank columns twice.
df_articles = df_articles.merge(df_pagerank, on="article", how="left").fillna(1e-6)

display(df_articles.head())
print("Size:", df_articles.shape)

In [None]:
pagerank_by_continent = df_articles.groupby("continent")["pageRank"]

# Geometric mean of PageRank per continent, highest first
pagerank_continent_mean = pagerank_by_continent.agg(scipy.stats.gmean).sort_values(ascending=False)

# Median of PageRank per continent, highest first
pagerank_continent_median = pagerank_by_continent.median().sort_values(ascending=False)

In [None]:
fig_name = "gmean_pagerank_by_continet"
fig_title = "PageRank by continent (Geometric Mean)"
fig_xlabel = "Continent"
fig_ylabel = "PageRank"

# Static bar chart (matplotlib), bars in the order of `pagerank_continent_mean`.
mean_colors_plt = [continents_colors[continent] for continent in pagerank_continent_mean.index]
ax = plt.bar(pagerank_continent_mean.index, pagerank_continent_mean, color=mean_colors_plt)
plt.xticks(rotation=45)

plt.title(fig_title)
plt.xlabel(fig_xlabel)
plt.ylabel(fig_ylabel)
plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()

# Interactive bar chart (plotly); "identity" makes plotly use the hex strings as colours.
mean_colors_hex = [continents_colors_int[continent] for continent in pagerank_continent_mean.index]
fig = px.bar(
    x=pagerank_continent_mean.index,
    y=pagerank_continent_mean,
    labels={"x": fig_xlabel, "y": fig_ylabel},
    title=fig_title,
    color=mean_colors_hex,
    color_discrete_map="identity",
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT,
)
fig.update_layout(showlegend=False, title_x=0.5)
# Scientific notation with one decimal on the y axis.
fig.update_yaxes(tickformat=".1e")
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.show()

In [None]:
fig_name = "median_pagerank_by_continet"
# `pagerank_continent_median` is a plain per-continent median, so the title no longer
# says "Geometric Median".
fig_title = "PageRank by continent (Median)"
fig_xlabel = "Continent"
fig_ylabel = "PageRank"

# Static bar chart (matplotlib), bars in the order of `pagerank_continent_median`.
ax = plt.bar(
    pagerank_continent_median.index,
    pagerank_continent_median,
    color=[continents_colors[continent] for continent in pagerank_continent_median.index]
)
plt.xticks(rotation=45)

plt.xlabel(fig_xlabel)
plt.ylabel(fig_ylabel)
plt.title(fig_title)
# Same call as in the articles-per-continent cell; applied before saving so the layout is fitted first.
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()

# Interactive bar chart (plotly); "identity" makes plotly use the hex strings as colours.
fig = px.bar(
    x=pagerank_continent_median.index,
    y=pagerank_continent_median,
    labels={"x": fig_xlabel, "y": fig_ylabel},
    title=fig_title,
    color=[continents_colors_int[continent] for continent in pagerank_continent_median.index],
    color_discrete_map="identity",
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT
)
fig.update_layout(
    showlegend=False,
    title_x=0.5
)
# Scientific notation with one decimal on the y axis.
fig.update_yaxes(tickformat=".1e")
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.show()

## Analysis of paths

Group paths by continent: each path is assigned to the continent of its GOAL article.

Compute the number of "backclicks" in each path

Compute the length of each path

In [None]:
def _read_paths(file_name, column_names):
    """Read one tab-separated, header-less paths file; lines starting with "#" are comments."""
    return pd.read_csv(
        os.path.join("Data", "wikispeedia_paths-and-graph", file_name),
        sep="\t",
        header=None,
        names=column_names,
        skip_blank_lines=True,
        comment="#",
    )


def _add_path_features(paths_frame):
    """Add the per-path columns shared by finished and unfinished paths (modifies the frame in place)."""
    raw_path = paths_frame["path"]  # still the ";"-separated string at this point
    paths_frame["backclicks"] = raw_path.apply(lambda x: x.count("<"))
    paths_frame["pathSteps"] = raw_path.apply(lambda x: x.count(";") + 1)
    paths_frame["uniqueArticles"] = paths_frame["pathSteps"] - paths_frame["backclicks"]
    # From here on "path" holds the list of steps instead of the raw string.
    paths_frame["path"] = raw_path.apply(lambda x: x.split(";"))
    paths_frame["start"] = paths_frame["path"].str[0]


df_paths_finished = _read_paths(
    "paths_finished.tsv",
    ["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
)
df_paths_unfinished = _read_paths(
    "paths_unfinished.tsv",
    ["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "motif"],
)

# Finished paths: the target is the last step of the path.
_add_path_features(df_paths_finished)
df_paths_finished["target"] = df_paths_finished["path"].str[-1]
df_paths_finished["isFinished"] = True

# Unfinished paths: the target comes from the file's own "target" column.
_add_path_features(df_paths_unfinished)
df_paths_unfinished["isFinished"] = False

display(df_paths_finished.head())
display(df_paths_unfinished.head())

df_paths = pd.concat([df_paths_finished, df_paths_unfinished])
display(df_paths.head())

In [None]:
# Attach the article attributes of both path end points to every path.
# Each column of df_articles is renamed to "<role><Column>" (e.g. "continent" ->
# "targetContinent") and joined on the article name. The joins are inner joins, so
# paths whose target or start article is not in df_articles are dropped.
# The previous `suffixes=["", ]` argument was removed: pandas expects a length-2
# sequence there, and the one-element list only worked because no column names
# overlap (any overlap would have raised while unpacking it).
df_analysis = df_paths
for role in ("target", "start"):
    role_articles = df_articles.rename(
        columns=lambda column: f"{role}{column[0].upper()}{column[1:]}"
    )
    role_key = f"{role}Article"
    df_analysis = pd.merge(
        df_analysis, role_articles, left_on=role, right_on=role_key
    ).drop(columns=role_key)

# Integer copy of the flag, used for averaging and as the regression response.
df_analysis["isFinishedInt"] = df_analysis["isFinished"].astype(int)

display(df_analysis.head())
print("Size:", df_analysis.shape)

In [None]:
df_analysis_finished = df_analysis[df_analysis["isFinished"]]
df_analysis_unfinished = df_analysis[~df_analysis["isFinished"]]


def _zero_safe_gmean(values):
    """Shifted geometric mean: exp(mean(log(1 + x))) - 1.

    A plain geometric mean is 0 as soon as a single value is 0, so for a count such as
    `backclicks` it collapses to 0 for every group that contains a path without
    backclicks. The shift keeps the statistic informative in the presence of zeros.
    """
    return np.expm1(np.log1p(values).mean())


df_analysis_per_continent = pd.DataFrame()
for name in ["start", "target"]:
    continent_column = f"{name}Continent"
    grouped_all = df_analysis.groupby(continent_column)
    grouped_finished = df_analysis_finished.groupby(continent_column)
    grouped_unfinished = df_analysis_unfinished.groupby(continent_column)

    # Number of paths (rows of df_analysis) per continent
    df_analysis_per_continent[f"{name}Articles"] = grouped_all.size()

    # Average backclicks (zero-safe geometric mean, see helper above)
    df_analysis_per_continent[f"{name}BackcliksFinished"] = grouped_finished["backclicks"].agg(_zero_safe_gmean)
    df_analysis_per_continent[f"{name}BackcliksUnfinished"] = grouped_unfinished["backclicks"].agg(_zero_safe_gmean)

    # Average path steps (geometric mean; pathSteps is computed as a count plus one, so it is never 0)
    df_analysis_per_continent[f"{name}PathStepsFinished"] = grouped_finished["pathSteps"].agg(scipy.stats.gmean)
    df_analysis_per_continent[f"{name}PathStepsUnfinished"] = grouped_unfinished["pathSteps"].agg(scipy.stats.gmean)

    # Number of finished and unfinished paths, and the share of finished ones in percent
    df_analysis_per_continent[f"{name}PathsFinished"] = df_analysis_finished[continent_column].value_counts()
    df_analysis_per_continent[f"{name}PathsUnfinished"] = df_analysis_unfinished[continent_column].value_counts()
    df_analysis_per_continent[f"{name}PathsFinishedPercentage"] = grouped_all["isFinishedInt"].mean() * 100

df_analysis_per_continent.index.name = "continent"

In [None]:
# Continents ordered by the share of finished paths that end in them, highest first.
sort_column = "targetPathsFinishedPercentage"
continent_data = df_analysis_per_continent.sort_values(sort_column, ascending=False)
display(continent_data)
print("Size:", continent_data.shape)

In [None]:
# Continents ordered by the share of finished paths that start in them, highest first.
# This rebinds `continent_data`; cells further down see this ordering.
sort_column = "startPathsFinishedPercentage"
continent_data = df_analysis_per_continent.sort_values(sort_column, ascending=False)
display(continent_data)
print("Size:", continent_data.shape)

In [None]:
fig_name = "count_target_by_continet"
fig_title = "Target articles by continent"
fig_xlabel = "Continent"
fig_ylabel = "Count"

# Bars follow the row order of `continent_data`.
target_counts = continent_data["targetArticles"]
target_colors_plt = [continents_colors[continent] for continent in continent_data.index]
target_colors_hex = [continents_colors_int[continent] for continent in continent_data.index]

# Static bar chart (matplotlib).
ax = plt.bar(continent_data.index, target_counts, color=target_colors_plt)
plt.xticks(rotation=45)

plt.title(fig_title)
plt.xlabel(fig_xlabel)
plt.ylabel(fig_ylabel)
plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()

# Interactive bar chart (plotly); "identity" makes plotly use the hex strings as colours.
fig = px.bar(
    x=continent_data.index,
    y=target_counts,
    labels={"x": fig_xlabel, "y": fig_ylabel},
    title=fig_title,
    color=target_colors_hex,
    color_discrete_map="identity",
)
fig.update_layout(showlegend=False, title_x=0.5, width=FIGURE_WIDTH, height=FIGURE_HEIGHT)
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.show()

In [None]:
fig_name = "count_start_by_continet"
fig_title = "Start articles by continent"
fig_xlabel = "Continent"
fig_ylabel = "Count"

# Bars follow the row order of `continent_data`.
start_counts = continent_data["startArticles"]
start_colors_plt = [continents_colors[continent] for continent in continent_data.index]
start_colors_hex = [continents_colors_int[continent] for continent in continent_data.index]

# Static bar chart (matplotlib).
ax = plt.bar(continent_data.index, start_counts, color=start_colors_plt)
plt.xticks(rotation=45)

plt.title(fig_title)
plt.xlabel(fig_xlabel)
plt.ylabel(fig_ylabel)
plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()

# Interactive bar chart (plotly); "identity" makes plotly use the hex strings as colours.
fig = px.bar(
    x=continent_data.index,
    y=start_counts,
    labels={"x": fig_xlabel, "y": fig_ylabel},
    title=fig_title,
    color=start_colors_hex,
    color_discrete_map="identity",
)
fig.update_layout(showlegend=False, title_x=0.5, width=FIGURE_WIDTH, height=FIGURE_HEIGHT)
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.show()

In [None]:
# Per-article path statistics, starting from the article/continent pairs of df_articles.
df_article_path_stats = df_articles[["article", "continent"]].copy()
article_names = df_articles["article"]

# How often each article is the target / the start of a finished or unfinished path
# (0 when it never is).
for role in ("target", "start"):
    finished_counts = df_paths_finished[role].value_counts()
    unfinished_counts = df_paths_unfinished[role].value_counts()
    df_article_path_stats[f"{role}Finished"] = article_names.map(finished_counts).fillna(0)
    df_article_path_stats[f"{role}Unfinished"] = article_names.map(unfinished_counts).fillna(0)

# Every step of every path, flattened into one long Series per path type.
paths_finished = pd.Series(np.concatenate(df_paths_finished.path.values))
paths_unfinished = pd.Series(np.concatenate(df_paths_unfinished.path.values))

# How often each article appears anywhere in finished and unfinished paths
df_article_path_stats["anyFinished"] = article_names.map(paths_finished.value_counts()).fillna(0)
df_article_path_stats["anyUnfinished"] = article_names.map(paths_unfinished.value_counts()).fillna(0)

# Share of all path steps taken by the article (can be compared with the pagerank)
total_steps = len(paths_finished) + len(paths_unfinished)
any_occurrences = df_article_path_stats["anyFinished"] + df_article_path_stats["anyUnfinished"]
df_article_path_stats["anyPercentage"] = any_occurrences / total_steps

In [None]:
# Show the articles that take the largest share of path steps.
most_frequent_articles = df_article_path_stats.sort_values("anyPercentage", ascending=False)
display(most_frequent_articles.head())
print("Size:", df_article_path_stats.shape)

In [None]:
fig_name = "article_occurrence_by_continet"
fig_title = "Occurrence of article in path per continent (Median)"
fig_xlabel = "Continent"
fig_ylabel = "Percentage"


# Median of the per-article `anyPercentage` within each continent.
# NOTE(review): `anyPercentage` is computed as a count divided by a total, i.e. a
# fraction in [0, 1], although the axis label says "Percentage" — confirm the intended unit.
occurrence_median = df_article_path_stats.groupby("continent")["anyPercentage"].median()

# Static bar chart (matplotlib).
ax = plt.bar(
    occurrence_median.index,
    occurrence_median,
    color=[continents_colors[continent] for continent in occurrence_median.index]
)
plt.xticks(rotation=45)

plt.xlabel(fig_xlabel)
plt.ylabel(fig_ylabel)
plt.title(fig_title)
plt.tight_layout()
# This figure was the only matplotlib plot that was never written to disk.
plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()

# Interactive bar chart (plotly); "identity" makes plotly use the hex strings as colours.
fig = px.bar(
    x=occurrence_median.index,
    y=occurrence_median,
    labels={"x": fig_xlabel, "y": fig_ylabel},
    title=fig_title,
    color=[continents_colors_int[continent] for continent in occurrence_median.index],
    color_discrete_map="identity"
)

fig.update_layout(
    showlegend=False,
    title_x=0.5,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT
)
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.show()

In [None]:
# The name used to be "median_pagerank_by_continet" (copied from the median PageRank
# cell), so saving this figure overwrote the median PageRank files.
fig_name = "finished_paths_percentage_by_continent"
fig_title = "Percentage of finished paths by continent"
fig_xlabel = "Continent"
fig_ylabel = "Percentage"

# Sort by the plotted column here: `continent_data` was last sorted by the
# start-continent percentage, which left these bars in an unrelated order.
finished_percentage = continent_data["targetPathsFinishedPercentage"].sort_values(ascending=False)

# Static bar chart (matplotlib).
ax = plt.bar(
    finished_percentage.index,
    finished_percentage,
    color=[continents_colors[continent] for continent in finished_percentage.index]
)
plt.xticks(rotation=45)
plt.xlabel(fig_xlabel)
plt.ylabel(fig_ylabel)
plt.title(fig_title)
# Same call as in the articles-per-continent cell; applied before saving so the layout is fitted first.
plt.tight_layout()
plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
plt.show()

# Interactive bar chart (plotly); "identity" makes plotly use the hex strings as colours.
fig = px.bar(
    x=finished_percentage.index,
    y=finished_percentage,
    labels={"x": fig_xlabel, "y": fig_ylabel},
    title=fig_title,
    color=[continents_colors_int[continent] for continent in finished_percentage.index],
    color_discrete_map="identity"
)
fig.update_layout(
    showlegend=False,
    title_x=0.5,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT
)
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
fig.show()

In [None]:
# Treatment group: paths whose target article is labelled "Europe".
df_analysis["treatment"] = df_analysis["targetContinent"].eq("Europe")

In [None]:
# Welch's t-test (unequal variances) on the finished flag, treatment vs. the rest.
is_treated = df_analysis["treatment"]
finished_treated = df_analysis.loc[is_treated, "isFinishedInt"]
finished_control = df_analysis.loc[~is_treated, "isFinishedInt"]
scipy.stats.ttest_ind(finished_treated, finished_control, equal_var=False)

In [None]:
# Contingency table treatment x isFinished with row/column totals in the "Total" margins.
cross_tab = pd.crosstab(df_analysis['treatment'], df_analysis['isFinished'], margins=True, margins_name='Total')

# Row-wise proportions, without the margin row and column.
group_sizes = cross_tab['Total'].iloc[:-1]
proportions = cross_tab.div(cross_tab['Total'], axis=0).iloc[:-1, :-1]

# Standard error of a proportion: sqrt(p * (1 - p) / n).
# The previous expression `x**0.5 * ((1 - x) / x / n)**0.5` simplifies to
# sqrt((1 - p) / n), i.e. the factor p was lost, and it divided by zero for p == 0.
standard_errors = (proportions * (1 - proportions)).div(group_sizes, axis=0) ** 0.5

fig = go.Figure()
fig.add_trace(go.Bar(
    x=[f'Treatment {group}' for group in proportions.index],
    y=proportions[True],
    error_y=dict(type='data', array=standard_errors[True]),
))

fig.update_layout(
    title='Proportion of Finished Samples',
    xaxis=dict(title='Treatment'),
    yaxis=dict(title='Proportion'),
    barmode='group',
)

fig.show()


# Testing dependence of variables

In [None]:
# Header-less single-column file; lines starting with "#" are comments.
articles_file = os.path.join("Data", "wikispeedia_paths-and-graph", "articles.tsv")
articles_read_options = dict(
    delimiter="\t",
    header=None,
    names=["name"],
    skip_blank_lines=True,
    comment="#",
    encoding="UTF-8",
)
df_articles_all = pd.read_csv(articles_file, **articles_read_options)

# Article name -> row position in the file.
name_to_index = dict(zip(df_articles_all["name"], df_articles_all.index))

display(df_articles_all.head())
print("Size:", df_articles_all.shape)

In [None]:
matrix_file = os.path.join("Data", "wikispeedia_paths-and-graph", "shortest-path-distance-matrix.txt")

# Every data line is one row of the matrix and every character one entry:
# a digit is parsed as an integer, "_" is stored as -1.
sp_lenght = []
with open(matrix_file) as file:
    for raw_line in file:
        row = raw_line.strip()
        # Skip blank lines and "#" comment lines.
        if not row or row.startswith("#"):
            continue

        sp_lenght.append([-1 if cell == "_" else int(cell) for cell in row])

sp_lenght = np.array(sp_lenght)

print(sp_lenght[:10, :10])

In [None]:
# Translate article names into row/column positions of the distance matrix.
# `Series.map` performs one dictionary lookup per value; `Series.replace` with a large
# dict is far slower and silently leaves unmatched names in place as strings.
start_positions = df_analysis["start"].map(name_to_index)
target_positions = df_analysis["target"].map(name_to_index)

# Fail early with a readable message instead of an obscure indexing error further down.
unmatched = start_positions.isna() | target_positions.isna()
if unmatched.any():
    raise KeyError(f"{int(unmatched.sum())} path(s) reference an article that is missing from name_to_index")

df_analysis["startIdx"] = start_positions.astype(int)
df_analysis["targetIdx"] = target_positions.astype(int)

# One vectorised lookup sp_lenght[start, target] for all rows instead of a row-wise apply.
df_analysis["shortestPath"] = sp_lenght[
    df_analysis["startIdx"].to_numpy(),
    df_analysis["targetIdx"].to_numpy(),
]

In [None]:
# Indicator columns for the continents and for the (possibly multiple) main categories
# of the start and target article.
one_hot_frames = []
for col in ["startContinent", "startCategoryMain", "targetContinent", "targetCategoryMain"]:
    # explode() puts every list element on its own row under the original index label;
    # summing per index label folds the indicators back into one row per path.
    exploded = df_analysis[col].explode()
    one_hot = pd.get_dummies(exploded).groupby(level=0).sum()
    one_hot_frames.append(one_hot.add_prefix(f"{col}OneHot"))

df_analysis_onehot = pd.concat([df_analysis.copy(), *one_hot_frames], axis="columns")

# Replace spaces in column names with underscores.
df_analysis_onehot.columns = [col.replace(" ", "_") for col in df_analysis_onehot.columns]

In [None]:
# Correlation of every path/article feature and one-hot indicator with `isFinished`.
base_cols = ["backclicks", "pathSteps", "uniqueArticles", "targetLength", "targetPageRank", "startLength", "startPageRank", "isFinished", "shortestPath"]
one_hot_cols = [col for col in df_analysis_onehot.columns if "OneHot" in col]
corr_cols = base_cols + one_hot_cols

corr_input = df_analysis_onehot[corr_cols]
display(corr_input.corr()["isFinished"])            # default method
display(corr_input.corr("spearman")["isFinished"])  # Spearman


In [None]:
# Logistic regression of the finished flag on the category indicators plus article features.
category_features = [col for col in df_analysis_onehot.columns if "CategoryMainOneHot" in col]
numeric_features = ["startLength", "startPageRank", "targetLength", "targetPageRank", "shortestPath"]
features = category_features + numeric_features

formula_rhs = " + ".join(features)
eq = "isFinishedInt ~ " + formula_rhs

model = smf.logit(eq, df_analysis_onehot).fit()
print(model.summary())

In [None]:
# Terms of the fitted model with a p-value below 0.05.
pvalues = model.pvalues
is_significant = pvalues < 0.05
print(pvalues[is_significant])

In [None]:
# Refit using only the terms with p < 0.05 in the previous model (the intercept is implicit in the formula).
significant_terms = [term for term in pvalues[pvalues < 0.05].index if term != "Intercept"]
eq = "isFinishedInt ~ " + " + ".join(significant_terms)

model = smf.logit(eq, df_analysis_onehot).fit()
print(model.summary())