### Imports

In [None]:
import re
import os

import seaborn as sns
import plotly.subplots
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import scipy
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

import networkx as nx

### Defaults

In [None]:
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
# When set, rows whose continent equals INTERNATIONAL_LABEL are dropped right after loading.
REMOVE_INTERNATIONAL = True
INTERNATIONAL_LABEL = "International"

# Output folders: one sub-folder of PLOTS_PATH per export target.
PLOTS_PATH = "plots"
PLOTS_PATH_PLT, PLOTS_PATH_PX, PLOTS_PATH_HTML = (
    os.path.join(PLOTS_PATH, subfolder) for subfolder in ("plt", "px", "html")
)

# Passed as width / height to the plotly figures.
FIGURE_WIDTH, FIGURE_HEIGHT = 800, 600

# Create the output folders up front so that the save calls never fail on a missing directory.
for path in (PLOTS_PATH_PLT, PLOTS_PATH_PX, PLOTS_PATH_HTML):
    os.makedirs(path, exist_ok=True)

# Continent Labels

Each article has been assigned exactly one continent label.

In [None]:
# One row per labelled article; the "continent" column holds its label.
df_continents = pd.read_csv(os.path.join("Data", "continents.csv"))

if REMOVE_INTERNATIONAL:
    # Remember the size before filtering so the number of dropped articles can be reported.
    labeled_articles_all_count = df_continents.shape[0]
    df_continents = df_continents.loc[df_continents["continent"].ne(INTERNATIONAL_LABEL)]
    labeled_articles_count = df_continents.shape[0]
    print(f"Removing articles labeled as {INTERNATIONAL_LABEL}, Removed articles: {labeled_articles_all_count - labeled_articles_count}")

display(df_continents.head())
print("Size:", df_continents.shape)

### Colors for all plots

For consistency, we need to assign some colors to the continents so the plots are clear

In [None]:
continents = df_continents["continent"].unique()
random_colors = sns.color_palette("husl", n_colors=len(continents))

# matplotlib colours: continent -> RGB tuple exactly as returned by seaborn.
continents_colors = dict(zip(continents, random_colors))

# plotly colours: continent -> "#rrggbb"; every channel is scaled by 255 and truncated to an int.
continents_colors_int = {
    continent: "#{0:02x}{1:02x}{2:02x}".format(*(int(255 * channel) for channel in rgb))
    for continent, rgb in continents_colors.items()
}
print(continents_colors)
print(continents_colors_int)

CONTINENTS_NUM = len(continents_colors)

In [None]:
# Number of labelled articles per continent, largest group first.
continents_grouped = (
    df_continents
    .groupby(["continent"])
    .size()
    .sort_values(ascending=False)
)

display(continents_grouped)

# Number of articles per continent

We are going to see how many articles are for each continent in our dataset

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:
    fig_name = "articles_count_per_continent"
    plot_title = "Number of articles per continent"

    num_categories = len(continents_grouped.keys())

    # --- matplotlib bar chart ---
    bar_colors = [continents_colors[continent] for continent in continents_grouped.keys()]
    ax = plt.bar(continents_grouped.keys(), continents_grouped.values, color=bar_colors)
    plt.title(plot_title)
    plt.xlabel("Continent")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
    plt.show()

    # Hex colours in the plotted continent order and the layout shared by both plotly figures.
    hex_colors = [continents_colors_int[continent] for continent in continents_grouped.index]
    shared_layout = dict(
        title_text=plot_title,
        title_x=0.5,
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT,
    )

    # --- plotly bar chart ---
    bar_trace = go.Bar(
        x=continents_grouped.index,
        y=continents_grouped.values,
        marker_color=hex_colors,
    )
    fig = go.Figure(data=[bar_trace])
    fig.update_layout(**shared_layout)
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.show()

    # --- plotly pie chart: the first (largest) slice is pulled out of the pie ---
    pie_trace = go.Pie(
        labels=continents_grouped.index,
        values=continents_grouped.values,
        pull=[0.2] + [0] * (CONTINENTS_NUM - 1),
        marker_colors=hex_colors,
    )
    fig = go.Figure(data=[pie_trace])
    fig.update_layout(**shared_layout)
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_pie.pdf"))
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_pie.html"))
    fig.show()


## Article wikispeedia category

In this section we are interested in the distribution of the continents across the categories, i.e., what percentage of the articles of a specific category belongs to each continent.

To achieve this, we first need to read the categories.tsv file, which gives us the category of each article.

In [None]:
# Tab-separated file without a header row; lines starting with "#" are comments.
df_categories = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "categories.tsv"),
    sep="\t",
    comment="#",
    skip_blank_lines=True,
    header=None,
    names=["article", "category"],
    encoding="UTF-8",
)

display(df_categories.head())
print("Size:", df_categories.shape)

Then we group in one dataframe about articles, the information about its continent and the category. 

We see that some articles appear in more than one category. This is okay because the analysis at this point is done per category.

In [None]:
# Inner join: keeps only articles that have both a continent label and at least one category.
df_continents_categories = df_continents.merge(df_categories, on="article")

display(df_continents_categories.head())
print("Size:", df_continents_categories.shape)

The category value for each article starts with 'subject', then main category and then some subcategories. As we are interested for the moment just on the main category, we create a new column for the main category.

In [None]:
# The category string is dot-separated; the element at index 1 (right after the leading
# "subject" token) is the main category.
main_categories = [
    full_category.split(".")[1]
    for full_category in df_continents_categories["category"].values
]

df_continents_categories["categoryMain"] = main_categories

display(df_continents_categories)
print("Size:", df_continents_categories.shape)

Then we divide the dataset by categories and compute how many articles from each continent are in each of the categories.

In [None]:
# Number of rows for every (main category, continent) pair that occurs in the data.
continents_categories = (
    df_continents_categories
    .groupby(["categoryMain", "continent"])
    .size()
)

# Only meaningful when the International articles were kept.
if not REMOVE_INTERNATIONAL:
    display(continents_categories[("Geography", "International")])

display(continents_categories)

In [None]:
categories = df_continents_categories["categoryMain"].unique()
continents = df_continents_categories["continent"].unique()
continents.sort()

category_positions = np.arange(len(categories))
bar_width = 0.5

# continent -> list of counts aligned with `categories`; pairs that never occur count as 0.
continents_values = {
    continent: [
        continents_categories.get((category, continent), 0)
        for category in categories
    ]
    for continent in continents
}

# Rows are continents, columns are main categories.
df_continent_frequencies = pd.DataFrame(continents_values).T
df_continent_frequencies.columns = categories
display(df_continent_frequencies)
print("Size:", df_continent_frequencies.shape)

In [None]:
# Disabled by default: change the condition to True to regenerate the per-category figures.
if False:

    fig_name = "articles_count_per_category"
    fig_title = "Continent distribution per Category"
    fig_xlabel = "Article Count"
    fig_ylabel = "Category"

    # --- matplotlib: horizontal stacked bars, one coloured segment per continent ---
    # Fix: the figure handle used to be bound to the misspelled name `ig`.
    fig, ax = plt.subplots()
    bottom = np.zeros(len(categories))

    for continent in continents:
        ax.barh(categories, continents_values[continent], label=continent, color=continents_colors[continent], edgecolor="w", height=0.5, left=bottom)
        # Move the left edge so that the next continent is stacked after this one.
        bottom += continents_values[continent]

    ax.set_yticks(category_positions)
    ax.set_yticklabels(categories)
    ax.set_xlabel(fig_xlabel)
    ax.set_ylabel(fig_ylabel)
    plt.title(fig_title)
    plt.legend()
    plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
    plt.show()

    # --- plotly: the same stacked bars ---
    # Continents ordered by their total count (largest first); categories by their total (ascending).
    continent_sorted = df_continent_frequencies.sum(axis="columns").sort_values(ascending=False).index
    category_sorted = df_continent_frequencies.sum(axis="index").sort_values().index

    fig = px.bar(
        df_continent_frequencies.loc[continent_sorted].T.loc[category_sorted],
        orientation="h",
        title=fig_title,
        labels={"index": fig_ylabel, "value": fig_xlabel},
        color_discrete_sequence=[continents_colors_int[continent] for continent in continent_sorted],
    )
    fig.update_layout(
        legend_title_text="",
        title_x=0.5,
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT
    )
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

    # --- plotly: one pie trace per category, selected through a dropdown menu ---
    fig = go.Figure()

    buttons = []
    visible = True                      # only the first trace starts visible
    mask = [False] * len(categories)    # visibility mask reused for every button
    max_name_len = max(len(name) for name in continents)
    for category_idx, category in enumerate(categories):
        # Continents without any article in this category are left out of the pie.
        category_data = df_continent_frequencies[category]
        category_data = category_data[category_data > 0]

        category_name = category.replace("_", " ")
        # Labels are padded to a common width.
        labels = [f"{name : <{max_name_len}}" for name in category_data.index]
        fig.add_trace(go.Pie(
            labels=labels,
            values=category_data.values,
            marker_colors=[continents_colors_int[continent] for continent in category_data.index],
            visible=visible,
            name=category_name
        ))

        annotation = dict(
            text=f"Category: {category_name}",
            x=0.5,
            y=1.1,
            showarrow=False
        )
        if visible:
            fig.add_annotation(annotation)

        # The button shows only this category's trace: set its flag, snapshot the mask, reset it.
        mask[category_idx] = True
        buttons.append(dict(
            label=category_name,
            method="update",
            args=[
                {"visible": list(mask)},
                {"title": fig_title, "annotations": [annotation]}
            ]
        ))
        mask[category_idx] = False
        visible = False

    fig.update_layout(
        title_text=fig_title,
        title_x=0.7,
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT,
        legend=dict(
            x=-0.3,
            y=0.1
        )
    )

    fig.update_layout(
        updatemenus=[
            dict(
                active=0,
                buttons=buttons
            )
        ]
    )

    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_pie.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_pie.pdf"))
    fig.show()

On the graphs, it can be seen that in most of the categories there is a bigger representation of european articles which makes sense as we saw before that Europe is the continent with the most articles on the game.

In the following code we show how articles can belong to more than one category

In [None]:
# One row per article: its continent plus the list of all main categories it belongs to.
df_articles = (
    df_continents_categories[["article", "continent"]]
    .drop_duplicates()
    .merge(
        df_continents_categories.groupby("article")["categoryMain"].apply(list).reset_index(),
        on="article",
    )
)

display(df_articles.head())
print("Size:", df_articles.shape)

## Length of articles

Now we are going to get how long are the articles and try to show if there is a significant difference depending on the continent 

In [None]:
plaintext_path = os.path.join("Data", "plaintext_articles")

# Word count of every article, in the row order of df_articles.
word_counts = []
for article_name in df_articles.article:
    file_path = os.path.join(plaintext_path, article_name + ".txt")

    with open(file_path, "r", encoding="utf-8") as file:
        file.readline()  # Skip the first line because it contains the word #copyright
        content = file.read()

    # Cut everything from the "Retrieved from" footer onwards. An article without that
    # marker is counted in full instead of raising AttributeError on a failed search.
    footer_start = content.find("Retrieved from")
    if footer_start != -1:
        content = content[:footer_start]

    # Words are whitespace-separated tokens.
    word_counts.append(len(content.split()))

df_articles["length"] = word_counts

display(df_articles.head())
print("Size:", df_articles.shape)

In [None]:
# Geometric mean of the article word count per continent, longest first.
continent_length = (
    df_articles
    .groupby(["continent"])["length"]
    .agg(scipy.stats.gmean)
    .sort_values(ascending=False)
)

display(continent_length)
print("Size:", continent_length.shape)

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:

    fig_name = "gmean_length_of_articles_by_continent"
    fig_title = "Length of articles by continent (Geometric mean)"
    fig_xlabel = "Continent"
    # Fix: the axis label used to read "World count".
    fig_ylabel = "Word count"

    # --- matplotlib bar chart ---
    ax = plt.bar(
        continent_length.index,
        continent_length.values,
        color=[continents_colors[continent] for continent in continent_length.index]
    )
    plt.xticks(rotation=45)

    plt.xlabel(fig_xlabel)
    plt.ylabel(fig_ylabel)
    plt.title(fig_title)
    plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
    plt.show()

    # --- plotly bar chart ---
    # color_discrete_map="identity" makes plotly use the hex strings in `color` as literal colours.
    fig = px.bar(
        x=continent_length.index,
        y=continent_length.values,
        labels={"x": fig_xlabel, "y": fig_ylabel},
        title=fig_title,  # reuse the title defined above instead of repeating the literal
        color=[continents_colors_int[continent] for continent in continent_length.index],
        color_discrete_map="identity",
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT
    )

    fig.update_layout(
        showlegend=False,
        title_x=0.5
    )
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

As it can be seen, articles about Europe, North America and Asia are considerably longer than from other continents

## Pagerank

PageRank is a centrality measure computed recursively: the value of an article is higher when the articles pointing to it have a higher value themselves. This means that articles that are more likely to be clicked on will have a higher PageRank value. In our case the most popular article, i.e. the one with the highest PageRank, is the United States.

Now we are going to analyse the PageRank of the articles and try to see if there is one continent with a considerably higher PageRank making it more popular in general.

In [None]:
# Read the PageRank table (later cells join it on "article" and read its "pageRank" column).
df_pagerank = pd.read_csv(filepath_or_buffer=os.path.join("Data", "page_rank.csv"))

display(df_pagerank.head(5))
print("Size:", df_pagerank.shape)

In [None]:
# Left join keeps every article. Articles absent from the PageRank table get a small
# positive placeholder (presumably so that the geometric mean computed later stays
# defined; confirm). Only the "pageRank" column is filled, so a missing value in any
# other column is never silently replaced by 1e-6.
# NOTE: this cell overwrites df_articles; re-running it without re-running the cells
# above merges the PageRank columns a second time.
df_articles = (
    df_articles
    .merge(df_pagerank, on="article", how="left")
    .fillna({"pageRank": 1e-6})
)

display(df_articles.head())
print("Size:", df_articles.shape)

Here we compute the geometric mean along the different articles of a continent and we plot it

In [None]:
# Geometric mean of the PageRank per continent, highest first.
pagerank_continent_mean = (
    df_articles
    .groupby("continent")["pageRank"]
    .agg(scipy.stats.gmean)
    .sort_values(ascending=False)
)

# Median of the PageRank per continent, highest first.
pagerank_continent_median = (
    df_articles
    .groupby("continent")["pageRank"]
    .median()
    .sort_values(ascending=False)
)

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:
    fig_name = "gmean_pagerank_by_continet"
    fig_title = "PageRank by continent (Geometric Mean)"
    fig_xlabel = "Continent"
    fig_ylabel = "PageRank"

    continent_order = pagerank_continent_mean.index

    # --- matplotlib bar chart ---
    ax = plt.bar(
        continent_order,
        pagerank_continent_mean,
        color=[continents_colors[name] for name in continent_order],
    )
    plt.title(fig_title)
    plt.xlabel(fig_xlabel)
    plt.ylabel(fig_ylabel)
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
    plt.show()

    # --- plotly bar chart; the hex strings in `color` are used as literal colours ---
    fig = px.bar(
        x=continent_order,
        y=pagerank_continent_mean,
        title=fig_title,
        labels=dict(x=fig_xlabel, y=fig_ylabel),
        color=[continents_colors_int[name] for name in continent_order],
        color_discrete_map="identity",
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT,
    )
    fig.update_layout(title_x=0.5, showlegend=False)
    fig.update_yaxes(tickformat=".1e")
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

As can be seen, Asia, Europe and South America have the highest geometric mean of PageRank, making their articles more popular in general than those of other continents.

On the next plot, we compute the median, which gives a similar result.

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:
    fig_name = "median_pagerank_by_continet"
    # Fix: the plotted statistic is the ordinary median (Series.median), not a "geometric median".
    fig_title = "PageRank by continent (Median)"
    fig_xlabel = "Continent"
    fig_ylabel = "PageRank"

    # --- matplotlib bar chart ---
    ax = plt.bar(pagerank_continent_median.index, pagerank_continent_median, color=[continents_colors[continent] for continent in pagerank_continent_median.index])
    plt.xticks(rotation=45)

    plt.xlabel(fig_xlabel)
    plt.ylabel(fig_ylabel)
    plt.title(fig_title)
    plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
    plt.show()

    # --- plotly bar chart; the hex strings in `color` are used as literal colours ---
    fig = px.bar(
        x=pagerank_continent_median.index,
        y=pagerank_continent_median,
        labels={"x": fig_xlabel, "y": fig_ylabel},
        title=fig_title,
        color=[continents_colors_int[continent] for continent in pagerank_continent_median.index],
        color_discrete_map="identity",
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT
    )
    fig.update_layout(
        showlegend=False,
        title_x=0.5
    )
    fig.update_yaxes(tickformat=".1e")
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

## Analysis of paths

In this section we want to compute all the statistics about the paths: 
- Number of backclicks 
- Steps taken
- Number of unique articles on the path
- If it is finished or unfinished
- Duration on the path
- The rating for each path
- The start and target articles and their categories and pagerank
- Frequency of each article on all the paths


In [None]:
def _add_path_features(df_raw):
    """Add, in place, the columns derived from the raw ';'-separated path string.

    Adds "backclicks" (number of "<" characters), "pathSteps" (number of ';'-separated
    tokens), "uniqueArticles" (steps minus backclicks), replaces "path" by the list of
    tokens and adds "start" (first token).
    """
    raw_path = df_raw["path"]
    df_raw["backclicks"] = raw_path.str.count("<")
    df_raw["pathSteps"] = raw_path.str.count(";") + 1
    df_raw["uniqueArticles"] = df_raw["pathSteps"] - df_raw["backclicks"]
    df_raw["path"] = raw_path.str.split(";")
    df_raw["start"] = df_raw["path"].str[0]


_PATHS_DIR = os.path.join("Data", "wikispeedia_paths-and-graph")

# Both files are tab-separated, have no header row and use "#" for comment lines.
df_paths_finished = pd.read_csv(
    os.path.join(_PATHS_DIR, "paths_finished.tsv"),
    sep="\t",
    comment="#",
    skip_blank_lines=True,
    header=None,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
)
df_paths_unfinished = pd.read_csv(
    os.path.join(_PATHS_DIR, "paths_unfinished.tsv"),
    sep="\t",
    comment="#",
    skip_blank_lines=True,
    header=None,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "motif"],
)

# Finished paths: the target is the last element of the path.
_add_path_features(df_paths_finished)
df_paths_finished["target"] = df_paths_finished["path"].str[-1]
df_paths_finished["isFinished"] = True

# Unfinished paths: the target is already a column of the file.
_add_path_features(df_paths_unfinished)
df_paths_unfinished["isFinished"] = False

display(df_paths_finished.head())
display(df_paths_unfinished.head())

# Stack both kinds of path; columns present in only one file are NaN for the other.
df_paths = pd.concat([df_paths_finished, df_paths_unfinished])
display(df_paths.head())

In [None]:
# Attach the article attributes (continent, categories, length, PageRank, ...) of the
# target and then of the start article to every path. Each df_articles column is renamed
# to "<role><Column>", e.g. "continent" -> "targetContinent".
# The merges are inner joins, so paths whose target or start article is not in
# df_articles are dropped.
# Fix: the malformed one-element `suffixes=["", ]` argument was removed. It only worked
# because no column names overlap; pandas unpacks two suffixes as soon as one does.
df_analysis = df_paths
for role in ("target", "start"):
    tmp = df_articles.rename(columns=lambda column: role + column[0].upper() + column[1:])
    article_key = f"{role}Article"
    df_analysis = (
        df_analysis
        .merge(tmp, left_on=role, right_on=article_key)
        .drop(columns=article_key)
    )

# Integer (0/1) copy of the outcome, used by the mean-based statistics and the regressions.
df_analysis["isFinishedInt"] = df_analysis["isFinished"].astype(int)

display(df_analysis.head())
print("Size:", df_analysis.shape)

In [None]:
df_analysis_finished = df_analysis[df_analysis["isFinished"]]
df_analysis_unfinished = df_analysis[~df_analysis["isFinished"]]

# One row per continent; every statistic is computed twice, grouping the paths by the
# continent of their start article and by the continent of their target article.
df_analysis_per_continent = pd.DataFrame()
for name in ["start", "target"]:
    continent_col = f"{name}Continent"
    finished_by_continent = df_analysis_finished.groupby(continent_col)
    unfinished_by_continent = df_analysis_unfinished.groupby(continent_col)

    # Number of paths (finished or not) whose start/target article belongs to the continent
    df_analysis_per_continent[f"{name}Articles"] = df_analysis.groupby(continent_col).size()

    # Average backclicks. Fix: the arithmetic mean replaces the geometric mean, because a
    # path without backclicks contributes a 0 and a single 0 collapses the geometric mean
    # of the whole group to 0.
    df_analysis_per_continent[f"{name}BackcliksFinished"] = finished_by_continent["backclicks"].mean()
    df_analysis_per_continent[f"{name}BackcliksUnfinished"] = unfinished_by_continent["backclicks"].mean()

    # Average path steps (geometric mean; pathSteps is always >= 1)
    df_analysis_per_continent[f"{name}PathStepsFinished"] = finished_by_continent["pathSteps"].agg(scipy.stats.gmean)
    df_analysis_per_continent[f"{name}PathStepsUnfinished"] = unfinished_by_continent["pathSteps"].agg(scipy.stats.gmean)

    # Number of finished / unfinished paths and the share of finished paths in percent
    df_analysis_per_continent[f"{name}PathsFinished"] = df_analysis_finished[continent_col].value_counts()
    df_analysis_per_continent[f"{name}PathsUnfinished"] = df_analysis_unfinished[continent_col].value_counts()
    df_analysis_per_continent[f"{name}PathsFinishedPercentage"] = df_analysis.groupby(continent_col)["isFinishedInt"].mean() * 100

df_analysis_per_continent.index.name = "continent"

Let's now look at some statistics by continent, like the number of target and start articles.

In [None]:
# Continents ranked by the share of finished paths among the paths targeting them.
continent_data = df_analysis_per_continent.sort_values(
    "targetPathsFinishedPercentage",
    ascending=False,
)
display(continent_data)
print("Size:", continent_data.shape)

In [None]:
# Continents ranked by the share of finished paths among the paths starting from them.
# This overwrites continent_data, so the plots below use this ordering.
continent_data = df_analysis_per_continent.sort_values(
    "startPathsFinishedPercentage",
    ascending=False,
)
display(continent_data)
print("Size:", continent_data.shape)

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:
    fig_name = "count_target_by_continet"
    fig_title = "Target articles by continent"
    fig_xlabel = "Continent"
    fig_ylabel = "Count"

    continent_order = continent_data.index
    target_counts = continent_data["targetArticles"]

    # --- matplotlib bar chart ---
    ax = plt.bar(
        continent_order,
        target_counts,
        color=[continents_colors[name] for name in continent_order],
    )
    plt.title(fig_title)
    plt.xlabel(fig_xlabel)
    plt.ylabel(fig_ylabel)
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
    plt.show()

    # --- plotly bar chart; the hex strings in `color` are used as literal colours ---
    fig = px.bar(
        x=continent_order,
        y=target_counts,
        title=fig_title,
        labels=dict(x=fig_xlabel, y=fig_ylabel),
        color=[continents_colors_int[name] for name in continent_order],
        color_discrete_map="identity",
    )
    fig.update_layout(
        title_x=0.5,
        showlegend=False,
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT,
    )
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:
    fig_name = "count_start_by_continet"
    fig_title = "Start articles by continent"
    fig_xlabel = "Continent"
    fig_ylabel = "Count"

    continent_order = continent_data.index
    start_counts = continent_data["startArticles"]

    # --- matplotlib bar chart ---
    ax = plt.bar(
        continent_order,
        start_counts,
        color=[continents_colors[name] for name in continent_order],
    )
    plt.title(fig_title)
    plt.xlabel(fig_xlabel)
    plt.ylabel(fig_ylabel)
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
    plt.show()

    # --- plotly bar chart; the hex strings in `color` are used as literal colours ---
    fig = px.bar(
        x=continent_order,
        y=start_counts,
        title=fig_title,
        labels=dict(x=fig_xlabel, y=fig_ylabel),
        color=[continents_colors_int[name] for name in continent_order],
        color_discrete_map="identity",
    )
    fig.update_layout(
        title_x=0.5,
        showlegend=False,
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT,
    )
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

Now we want to compare frequency of articles in the paths. 

In [None]:
df_article_path_stats = pd.DataFrame()
df_article_path_stats["article"] = df_articles["article"]
df_article_path_stats["continent"] = df_articles["continent"]

# How often each article is the target / the start of a finished / an unfinished path.
# Articles that never appear in that role get 0.
for role in ("target", "start"):
    for outcome, df_source in (("Finished", df_paths_finished), ("Unfinished", df_paths_unfinished)):
        role_counts = df_source[role].value_counts()
        df_article_path_stats[f"{role}{outcome}"] = df_articles["article"].map(role_counts).fillna(0)

# Flatten all path lists into one long Series of visited tokens.
# NOTE(review): the lists come from splitting on ";" and therefore still contain the "<"
# back-click tokens, which count towards the lengths used as denominator below.
paths_finished = pd.Series(np.concatenate(df_paths_finished.path.values))
paths_unfinished = pd.Series(np.concatenate(df_paths_unfinished.path.values))

# Number of times each article appears anywhere in finished and in unfinished paths
df_article_path_stats["anyFinished"] = df_articles["article"].map(paths_finished.value_counts()).fillna(0)
df_article_path_stats["anyUnfinished"] = df_articles["article"].map(paths_unfinished.value_counts()).fillna(0)
df_article_path_stats["totalOccurrences"] = df_article_path_stats["anyFinished"] + df_article_path_stats["anyUnfinished"]

# Fraction of all path tokens that are this article (comparable with the pagerank)
total_tokens = len(paths_finished) + len(paths_unfinished)
df_article_path_stats["anyPercentage"] = df_article_path_stats["totalOccurrences"] / total_tokens

# Join with the pagerank; articles missing from the pagerank table keep NaN here
df_article_path_stats = df_article_path_stats.merge(df_pagerank, on='article', how='left')

# Difference between the observed occurrence fraction and the pagerank
df_article_path_stats['diffOccPageRank'] = df_article_path_stats["anyPercentage"] - df_article_path_stats['pageRank']


In [None]:
# The five articles that account for the largest share of all path tokens.
display(df_article_path_stats.sort_values(by="anyPercentage", ascending=False).head(5))
print("Size:", df_article_path_stats.shape)

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:
    fig_name = "article_occurrence_by_continet"
    fig_title = "Occurrence of article in path per continent (Median)"
    fig_xlabel = "Continent"
    fig_ylabel = "Percentage"

    # Median occurrence fraction of the articles of each continent.
    tmp = df_article_path_stats.groupby("continent")["anyPercentage"].median()

    # --- matplotlib bar chart (shown only, not saved) ---
    ax = plt.bar(tmp.index, tmp, color=[continents_colors[name] for name in tmp.index])
    plt.title(fig_title)
    plt.xlabel(fig_xlabel)
    plt.ylabel(fig_ylabel)
    plt.xticks(rotation=45)
    plt.show()

    # --- plotly bar chart; the hex strings in `color` are used as literal colours ---
    fig = px.bar(
        x=tmp.index,
        y=tmp,
        title=fig_title,
        labels=dict(x=fig_xlabel, y=fig_ylabel),
        color=[continents_colors_int[name] for name in tmp.index],
        color_discrete_map="identity",
    )
    fig.update_layout(
        title_x=0.5,
        showlegend=False,
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT,
    )
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:
    # Fix: this figure used to share the file name "article_occurrence_by_continet" with
    # the median plot, so exporting it overwrote that figure's HTML/PDF files.
    fig_name = "article_total_occurrences_by_continent"
    fig_title = "Occurrence of article in path per continent Total"
    fig_xlabel = "Continent"
    fig_ylabel = "Total"

    # Total number of path occurrences summed over the articles of each continent.
    tmp = df_article_path_stats.groupby("continent")
    tmp = tmp.totalOccurrences.sum()

    # --- matplotlib bar chart (shown only, not saved) ---
    ax = plt.bar(tmp.index, tmp, color=[continents_colors[continent] for continent in tmp.index])
    plt.xticks(rotation=45)

    plt.xlabel(fig_xlabel)
    plt.ylabel(fig_ylabel)
    plt.title(fig_title)
    plt.show()

    # --- plotly bar chart; the hex strings in `color` are used as literal colours ---
    fig = px.bar(
        x=tmp.index,
        y=tmp,
        labels={"x": fig_xlabel, "y": fig_ylabel},
        title=fig_title,
        color=[continents_colors_int[continent] for continent in tmp.index],
        color_discrete_map="identity"
    )

    fig.update_layout(
        showlegend=False,
        title_x=0.5,
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT
    )
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

As we can see in the previous plot, European articles are more widely chosen. However, this is as well because most of the articles are european

In [None]:
# Disabled by default: change the condition to True to regenerate the figures.
if False:
    # Fix: the file name was copy-pasted from the median PageRank plot
    # ("median_pagerank_by_continet"), so exporting this figure overwrote that one.
    fig_name = "finished_paths_percentage_by_continent"
    fig_title = "Percentage of finished paths by continent"
    fig_xlabel = "Continent"
    fig_ylabel = "Percentage"

    # Share of finished paths among the paths whose target lies in each continent.
    ax = plt.bar(continent_data.index, continent_data["targetPathsFinishedPercentage"], color=[continents_colors[continent] for continent in continent_data.index])
    plt.xticks(rotation=45)
    plt.xlabel(fig_xlabel)
    plt.ylabel(fig_ylabel)
    plt.title(fig_title)
    plt.savefig(os.path.join(PLOTS_PATH_PLT, f"{fig_name}_bar.pdf"))
    plt.show()

    # --- plotly bar chart; the hex strings in `color` are used as literal colours ---
    fig = px.bar(
        x=continent_data.index,
        y=continent_data["targetPathsFinishedPercentage"],
        labels={"x": fig_xlabel, "y": fig_ylabel},
        title=fig_title,
        color=[continents_colors_int[continent] for continent in continent_data.index],
        color_discrete_map="identity"
    )
    fig.update_layout(
        showlegend=False,
        title_x=0.5,
        width=FIGURE_WIDTH,
        height=FIGURE_HEIGHT
    )
    fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_bar.html"))
    fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_bar.pdf"))
    fig.show()

Now we are going to run a statistical test to check whether the proportion of finished paths differs significantly between paths whose target article is in Europe and paths targeting the rest of the world.

In [None]:
# Treatment group: paths whose target article is labelled as European.
df_analysis["treatment"] = df_analysis["targetContinent"].eq("Europe")

In [None]:
# Welch's t-test (unequal variances) on the 0/1 finished indicator: European targets vs. the rest.
scipy.stats.ttest_ind(
    df_analysis.loc[df_analysis["treatment"], "isFinishedInt"],
    df_analysis.loc[~df_analysis["treatment"], "isFinishedInt"],
    equal_var=False,
)

As can be seen, the p-value is lower than 0.05. Therefore, we reject the hypothesis that the proportion of finished paths is the same for European targets as for the rest of the world.

In [None]:
# Contingency table: rows = treatment (False/True), columns = isFinished (False/True),
# with an extra "Total" row and column.
cross_tab = pd.crosstab(df_analysis['treatment'], df_analysis['isFinished'], margins=True, margins_name='Total')

# Share of unfinished / finished paths within each treatment group (Total row and column dropped).
proportions = cross_tab.div(cross_tab['Total'], axis=0).iloc[:-1, :-1]

# Standard error of a proportion p estimated from n observations: sqrt(p * (1 - p) / n).
# Fix: the previous expression x**0.5 * ((1 - x) / x / n)**0.5 simplifies to
# sqrt((1 - x) / n), which is not the standard error, and divides by zero when x is 0.
group_sizes = cross_tab['Total'].iloc[:-1]
standard_errors = (proportions * (1 - proportions)).div(group_sizes, axis=0) ** 0.5

print(proportions, standard_errors)


# Disabled by default: bar chart of the finished proportion per group with error bars.
if False:
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=[f'Treatment {group}' for group in proportions.index],
        y=proportions[True],
        error_y=dict(type='data', array=standard_errors[True]),
    ))

    fig.update_layout(
        title='Proportion of Finished Samples',
        xaxis=dict(title='Treatment'),
        yaxis=dict(title='Proportion'),
        barmode='group',
    )

    fig.show()


# Testing dependence of variables

In [None]:
# Full article list: one name per line, no header, "#" starts a comment line.
df_articles_all = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "articles.tsv"),
    sep="\t",
    comment="#",
    skip_blank_lines=True,
    header=None,
    names=["name"],
    encoding="UTF-8",
)

# Article name -> row position in the file (used below to index the distance matrix).
name_to_index = {name: position for position, name in df_articles_all["name"].items()}

display(df_articles_all.head())
print("Size:", df_articles_all.shape)

In [None]:
# Parse the shortest-path matrix: every data line is one row and every character one cell.
# A digit is the distance; "_" is stored as -1.
sp_lenght = []
s = []  # raw data lines, kept as read
with open(os.path.join("Data", "wikispeedia_paths-and-graph", "shortest-path-distance-matrix.txt")) as file:
    for line in file:
        line = line.strip()
        # Skip blank lines and "#" comment lines.
        if not line or line.startswith("#"):
            continue
        s.append(line)
        sp_lenght.append([-1 if cell == "_" else int(cell) for cell in line])

sp_lenght = np.array(sp_lenght)

print(sp_lenght[:10, :10])

In [None]:
# Translate article names into row/column positions of the distance matrix.
# Series.map does one dictionary lookup per row; Series.replace with a dictionary of this
# size is much slower and silently leaves unknown names in place as strings.
df_analysis["startIdx"] = df_analysis["start"].map(name_to_index)
df_analysis["targetIdx"] = df_analysis["target"].map(name_to_index)

# Fail early, with the offending names, if a start/target is missing from the article list.
unknown_rows = df_analysis["startIdx"].isna() | df_analysis["targetIdx"].isna()
if unknown_rows.any():
    unknown_names = sorted(
        set(df_analysis.loc[df_analysis["startIdx"].isna(), "start"])
        | set(df_analysis.loc[df_analysis["targetIdx"].isna(), "target"])
    )
    raise KeyError(f"Articles missing from articles.tsv: {unknown_names[:10]}")

# Vectorised lookup of the start -> target distance (one matrix cell per path),
# replacing the row-by-row DataFrame.apply.
df_analysis["shortestPath"] = sp_lenght[
    df_analysis["startIdx"].to_numpy(dtype=int),
    df_analysis["targetIdx"].to_numpy(dtype=int),
]

In [None]:
# Quick look at the path table with the newly added index and shortest-path columns.
df_analysis.head(n=5)

In [None]:
# Indicator columns for the continent and the main categories of the start and target articles.
one_hot_frames = []
for col in ["startContinent", "startCategoryMain", "targetContinent", "targetCategoryMain"]:
    # explode() gives one row per list element (category columns hold lists); summing the
    # dummies by the original row label folds them back into one row per path.
    one_hot = pd.get_dummies(df_analysis[col].explode()).groupby(level=0).sum()
    one_hot = one_hot.add_prefix(f"{col}OneHot")
    one_hot_frames.append(one_hot)

df_analysis_onehot = pd.concat([df_analysis.copy(), *one_hot_frames], axis="columns")

# Replace spaces in the column names (the names are later used inside model formulas).
df_analysis_onehot.columns = [col.replace(" ", "_") for col in df_analysis_onehot.columns]

In [None]:
# Correlation of every numeric feature and one-hot column with the completion flag.
onehot_cols = [col for col in df_analysis_onehot.columns if "OneHot" in col]
corr_cols = [
    "backclicks",
    "pathSteps",
    "uniqueArticles",
    "targetLength",
    "targetPageRank",
    "startLength",
    "startPageRank",
    "isFinished",
    "shortestPath",
] + onehot_cols

corr_input = df_analysis_onehot[corr_cols]
display(corr_input.corr()["isFinished"])            # default method (Pearson)
display(corr_input.corr("spearman")["isFinished"])  # Spearman rank correlation


In [None]:
# Logistic regression of path completion on the main-category indicators plus
# article length, PageRank and shortest-path features.
category_features = [col for col in df_analysis_onehot.columns if "CategoryMainOneHot" in col]
numeric_features = ["startLength", "startPageRank", "targetLength", "targetPageRank", "shortestPath"]
features = category_features + numeric_features

rhs = " + ".join(features)
eq = f"isFinishedInt ~ {rhs}"

model = smf.logit(eq, df_analysis_onehot).fit()
print(model.summary())

In [None]:
# Coefficient p-values of the fitted model; print those below the 5% level.
pvalues = model.pvalues
print(pvalues.loc[pvalues < 0.05])

In [None]:
# Refit using only the terms with p < 0.05 in `pvalues`; "Intercept" is excluded
# from the term list because it is not a column name.
significant_terms = [term for term in pvalues[pvalues < 0.05].index if term != "Intercept"]
eq = "isFinishedInt ~ " + " + ".join(significant_terms)

model = smf.logit(eq, df_analysis_onehot).fit()
print(model.summary())

In [None]:
# Start article length, treated vs. control: a static seaborn overview first,
# then interactive Plotly figures that are also exported as PDF and HTML.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

ax = sns.histplot(df_analysis[df_analysis.treatment]["startLength"], stat='probability', color='blue', label='treated', element="step", ax=axes[0])
ax = sns.histplot(df_analysis[~df_analysis.treatment]["startLength"], stat='probability', color='orange', label='control', element="step", ax=axes[0])

bx = sns.boxplot(data=df_analysis, y="startLength", x="treatment", ax=axes[1], width=0.3, palette=["orange", "blue"])

ax.set(title='Start Length distribution comparison', xlabel="Start Length", ylabel='Start Length probability')
bx.set(title='Start Length boxplot', xlabel="Treatment", ylabel='Start Length')

# Only the histogram has labelled artists. The boxplot groups are identified by their
# x tick labels, so no legend is attached to it.
ax.legend()
plt.show()

fig_name = "start_length_distribution"

# Interactive histogram (overlaid, normalised to probabilities).
fig = px.histogram(
    df_analysis,
    x="startLength",
    color="treatment",
    labels={"startLength": "Start Length", "count": "Count", "treatment": "Treatment"},
    histnorm="probability",
    #nbins=30,
    barmode='overlay',
)

fig.update_layout(
    title_text="Start Length Distribution Comparison",
    title_x=0.5,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT,
)

fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_hist.pdf"))
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_hist.html"))
fig.show()

# Interactive boxplot with all individual points drawn.
fig = px.box(
    df_analysis,
    x="treatment",
    y="startLength",
    color="treatment",
    labels={"startLength": "Start Length", "treatment": "Treatment"},
    points="all"
)

fig.update_layout(
    title_text="Start Length Boxplot",
    showlegend=False,
    title_x=0.5,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT
)

fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_box.pdf"))
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_box.html"))
fig.show()


In [None]:

# Target article length, treated vs. control: a static seaborn overview first,
# then interactive Plotly figures that are also exported as PDF and HTML.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

ax = sns.histplot(df_analysis[df_analysis.treatment]["targetLength"], stat='probability', color='blue', label='treated', element="step", ax=axes[0])
ax = sns.histplot(df_analysis[~df_analysis.treatment]["targetLength"], stat='probability', color='orange', label='control', element="step", ax=axes[0])

# The boxplot must show the same variable as the histogram and as its own title/labels.
bx = sns.boxplot(x="treatment", y="targetLength", data=df_analysis, ax=axes[1], palette=["orange", "blue"], width=0.3)

ax.set(title='Target Length distribution comparison', xlabel="Target Length", ylabel='Target Length probability')
bx.set(title='Target Length boxplot', xlabel="Treatment", ylabel='Target Length')

# Only the histogram has labelled artists; the boxplot groups are identified by their x tick labels.
ax.legend()
plt.show()

fig_name = "target_length_distribution"

# Interactive histogram (overlaid, normalised to probabilities).
fig = px.histogram(
    df_analysis,
    x="targetLength",
    color="treatment",
    labels={"targetLength": "Target Length", "count": "Count", "treatment": "Treatment"},
    histnorm="probability",
    #nbins=30,
    barmode='overlay',
)

fig.update_layout(
    title_text="Target Length Distribution Comparison",
    title_x=0.5,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT,
)

fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_hist.pdf"))
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_hist.html"))
fig.show()

# Interactive boxplot with all individual points drawn.
fig = px.box(
    df_analysis,
    x="treatment",
    y="targetLength",
    color="treatment",
    labels={"targetLength": "Target Length", "treatment": "Treatment"},
    points="all"
)

fig.update_layout(
    title_text="Target Length Boxplot",
    showlegend=False,
    title_x=0.5,
    width=FIGURE_WIDTH,
    height=FIGURE_HEIGHT
)

fig.write_image(os.path.join(PLOTS_PATH_PX, f"{fig_name}_box.pdf"))
fig.write_html(os.path.join(PLOTS_PATH_HTML, f"{fig_name}_box.html"))
fig.show()

In [None]:
# Start and target article length side by side:
# top row = overlaid histograms (treated vs. control), bottom row = boxplots per group.
cols = ["startLength", "targetLength"]
names = ["Start Length", "Target Length"]

fig, axes = plt.subplots(nrows=2, ncols=len(cols), figsize=(15, 10), sharey="row")

treated_rows = df_analysis[df_analysis.treatment]
control_rows = df_analysis[~df_analysis.treatment]

for i, col in enumerate(cols):
    display_name = names[i]

    # Histograms: treated is drawn first, control on top.
    for group_rows, group_color, group_label in (
        (treated_rows, 'blue', 'treated'),
        (control_rows, 'orange', 'control'),
    ):
        ax = sns.histplot(group_rows[col], stat='probability', color=group_color, label=group_label, element="step", ax=axes[0, i])
    ax.set(title=f'{display_name} distribution comparison', xlabel=display_name, ylabel=f'{display_name} probability')
    ax.legend()

    # Boxplot of the same variable, split by the treatment flag.
    ax = sns.boxplot(x="treatment", y=col, data=df_analysis, ax=axes[1, i], palette=["orange", "blue"], width=0.3)
    ax.set(title=f'{display_name} Boxplot', xlabel='Treatment', ylabel=display_name)

plt.show()

In [None]:

# Start article PageRank, treated vs. control (log-scaled histogram plus boxplot).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# `stat='probability'` matches the y-axis label and the other comparison plots.
ax = sns.histplot(df_analysis[df_analysis.treatment]["startPageRank"], stat='probability', color='blue', label='treated', element="step", ax=axes[0], log_scale=True)
ax = sns.histplot(df_analysis[~df_analysis.treatment]["startPageRank"], stat='probability', color='orange', label='control', element="step", ax=axes[0], log_scale=True)

bx = sns.boxplot(x="treatment", y="startPageRank", data=df_analysis, ax=axes[1], palette=["orange", "blue"], width=0.3)

ax.set(title='Start PageRank distribution comparison', xlabel="Start PageRank", ylabel='Start PageRank probability')
bx.set(title='Start PageRank boxplot', xlabel="Treatment", ylabel='Start PageRank')

# Attach the legend to the histogram explicitly: `plt.legend()` would act on the
# current axes, which after `plt.subplots` is the last one created (the boxplot).
ax.legend()
plt.show()



In [None]:
# Target article PageRank, treated vs. control (log-scaled histogram plus boxplot).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

ax = sns.histplot(df_analysis[df_analysis.treatment]["targetPageRank"], stat='probability', color='blue', label='treated', element="step", ax=axes[0], log_scale=True)
ax = sns.histplot(df_analysis[~df_analysis.treatment]["targetPageRank"], stat='probability', color='orange', label='control', element="step", ax=axes[0], log_scale=True)

# The boxplot must show the same variable as the histogram and as its own title/labels.
bx = sns.boxplot(x="treatment", y="targetPageRank", data=df_analysis, ax=axes[1], palette=["orange", "blue"], width=0.3)

ax.set(title='Target PageRank distribution comparison', xlabel="Target PageRank", ylabel='Target PageRank probability')
bx.set(title='Target PageRank boxplot', xlabel="Treatment", ylabel='Target PageRank')

# Attach the legend to the histogram explicitly: `plt.legend()` would act on the
# current axes, which after `plt.subplots` is the last one created (the boxplot).
ax.legend()
plt.show()


In [None]:
# Start and target PageRank side by side:
# top row = overlaid histograms with a log-scaled x axis, bottom row = boxplots per group.
cols = ["startPageRank", "targetPageRank"]
names = ["Start PageRank", "Target PageRank"]

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 10), sharey="row")

treated_rows = df_analysis[df_analysis.treatment]
control_rows = df_analysis[~df_analysis.treatment]

for i, col in enumerate(cols):
    display_name = names[i]

    # Histograms: treated is drawn first, control on top.
    for group_rows, group_color, group_label in (
        (treated_rows, 'blue', 'treated'),
        (control_rows, 'orange', 'control'),
    ):
        ax = sns.histplot(group_rows[col], stat='probability', color=group_color, label=group_label, element="step", ax=axes[0, i], log_scale=(True, False))
    ax.set(title=f'{display_name} distribution comparison', xlabel=display_name, ylabel=f'{display_name} probability')
    ax.legend()

    # Boxplot of the same variable, split by the treatment flag.
    ax = sns.boxplot(x="treatment", y=col, data=df_analysis, ax=axes[1, i], palette=["orange", "blue"], width=0.3)
    ax.set(title=f'{display_name} Boxplot', xlabel='Treatment', ylabel=display_name)

plt.show()

In [None]:
# Shortest-path length between start and target, treated vs. control.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Path lengths are integers: `discrete=True` centres one unit-wide bin on each value.
# A 0.8-wide bin grid drifts relative to the integers and leaves spurious empty bins.
ax = sns.histplot(df_analysis[df_analysis.treatment]["shortestPath"], stat='probability', color='blue', label='treated', element="step", ax=axes[0], discrete=True)
ax = sns.histplot(df_analysis[~df_analysis.treatment]["shortestPath"], stat='probability', color='orange', label='control', element="step", ax=axes[0], discrete=True)
ax.set(title='Shortest Path distribution comparison', xlabel="Shortest Path", ylabel='Shortest Path probability')
ax.legend()

bx = sns.boxplot(x="treatment", y="shortestPath", data=df_analysis, ax=axes[1], palette=["orange", "blue"], width=0.3)
bx.set(title='Shortest Path boxplot', xlabel="Treatment", ylabel='Shortest Path')

plt.show()

In [None]:
# Frequency of each shortest-path length in the analysis frame.
df_analysis["shortestPath"].value_counts()

In [None]:
# Alternative, shorter way to draw the comparisons: let seaborn split by `treatment`
# through `hue`. Note that colours and bar placement differ from the hand-built figures.
cols = ["startLength", "targetLength", "startPageRank", "targetPageRank", "shortestPath"]
log_scaled_cols = ("startPageRank", "targetPageRank")

for col in cols:
    use_log_scale = True if col in log_scaled_cols else None
    ax = sns.histplot(
        data=df_analysis,
        x=col,
        hue="treatment",
        stat='proportion',
        element="step",
        bins=10,
        log_scale=use_log_scale,
        common_norm=False,  # normalise each treatment group separately
    )
    plt.show()

In [None]:
# Show the covariate columns listed in `cols`.
df_analysis.loc[:, cols]

In [None]:
# Geometric mean of each covariate, computed separately per treatment group.
# NOTE(review): a geometric mean is only meaningful for positive values. shortestPath
# stores -1 for "_" entries of the distance matrix; confirm none are present here.
by_treatment = df_analysis.groupby("treatment")

geom_mean_sLen = by_treatment["startLength"].agg(scipy.stats.gmean)
geom_mean_tLen = by_treatment["targetLength"].agg(scipy.stats.gmean)
geom_mean_sPG = by_treatment["startPageRank"].agg(scipy.stats.gmean)
geom_mean_tPG = by_treatment["targetPageRank"].agg(scipy.stats.gmean)
geom_mean_sPath = by_treatment["shortestPath"].agg(scipy.stats.gmean)

print(geom_mean_sLen, geom_mean_tLen, geom_mean_sPG, geom_mean_tPG, geom_mean_sPath)


In [None]:
# Geometric means per treatment group with bootstrapped 95% confidence intervals.
# (The first draft of this cell was generated with ChatGPT.)
length_vars = ["startLength", "targetLength"]
pagerank_vars = ["startPageRank", "targetPageRank"]

# Summary tables: one geometric mean per treatment group and variable.
geom_means_ci_1 = df_analysis.groupby('treatment').agg(
    startLength=('startLength', scipy.stats.gmean),
    targetLength=('targetLength', scipy.stats.gmean),
)

geom_means_ci_2 = df_analysis.groupby('treatment').agg(
    startPageRank=('startPageRank', scipy.stats.gmean),
    targetPageRank=('targetPageRank', scipy.stats.gmean),
)

# Reshape the summary tables to long format for display.
geom_means_ci_1 = geom_means_ci_1.stack().reset_index().rename(columns={0: 'Geometric Mean', 'level_1': 'Variable'})
geom_means_ci_2 = geom_means_ci_2.stack().reset_index().rename(columns={0: 'Geometric Mean', 'level_1': 'Variable'})

display(geom_means_ci_1, geom_means_ci_2)

# The bar plots are built from the raw observations, not from the summary tables:
# a confidence interval cannot be bootstrapped from a single aggregated value per bar.
for variables in (length_vars, pagerank_vars):
    observations = df_analysis.melt(
        id_vars="treatment",
        value_vars=variables,
        var_name="Variable",
        value_name="Value",
    )

    fig, ax = plt.subplots()
    sns.barplot(
        x='Variable',
        y='Value',
        hue='treatment',
        data=observations,
        estimator=scipy.stats.gmean,  # bar height = geometric mean of the group
        errorbar=('ci', 95),          # bootstrapped 95% confidence interval
        n_boot=1000,
        seed=0,                       # fixed seed so the intervals are reproducible
        capsize=0.1,
        palette=["orange", "blue"],
        ax=ax,
    )

    ax.set(title='Geometric Means with 95% Confidence Intervals', xlabel='Variable', ylabel='Geometric Mean')
    ax.legend(title='Treatment')
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats

# Worked example on synthetic data: 95% confidence interval for a geometric mean.
np.random.seed(42)  # fixed seed so the example is reproducible
data = np.random.lognormal(mean=0, sigma=0.1, size=100)

# On the log scale the geometric mean becomes an ordinary arithmetic mean.
log_data = np.log(data)

# Sample standard deviation (ddof=1) of the logs and the standard error of their mean.
s = log_data.std(ddof=1)
se = s / np.sqrt(data.size)

# Geometric mean = exp(mean of logs).
geometric_mean = np.exp(log_data.mean())

# Two-sided normal quantile for the chosen confidence level.
confidence_level = 0.95
z_score = scipy.stats.norm.ppf(1 - (1 - confidence_level) / 2)

# Build the interval on the log scale, then transform back with exp.
log_center = np.log(geometric_mean)
margin = z_score * se
confidence_interval = np.exp((log_center - margin, log_center + margin))

# Visualise the synthetic example: one bar for the geometric mean with the
# confidence interval drawn as a vertical line at the same x position.
fig, ax = plt.subplots()

bar_position = 'Geometric Mean'
ax.bar(x=bar_position, height=geometric_mean, color='blue', alpha=0.7, label='Geometric Mean')
ax.plot([bar_position, bar_position], confidence_interval, color='red', linewidth=2, label='95% Confidence Interval')

ax.set_title('Geometric Mean with 95% Confidence Interval')
ax.set_ylabel('Value')
ax.legend()

plt.show()


In [None]:
# A propensity score is the estimated probability of receiving the treatment given
# the covariates, so the logit is fitted on the treatment indicator, not on the outcome.
# `treatment` is boolean; an integer copy is used as the response in the formula.
propensity_data = df_analysis.assign(treatmentInt=df_analysis["treatment"].astype(int))

eq = "treatmentInt ~ startLength + startPageRank + targetLength + targetPageRank"

model = smf.logit(eq, propensity_data).fit()

# Predict on the frame explicitly so the scores are aligned with the row index.
df_analysis["propensityScore"] = model.predict(propensity_data)

model.summary()

In [None]:
def get_similarity(propensity_score1, propensity_score2):
    '''Return 1 minus the absolute difference between the two propensity scores.'''
    score_gap = np.abs(propensity_score1 - propensity_score2)
    return 1 - score_gap

In [None]:
# Sizes of both groups. A notebook cell only displays its last bare expression,
# so each shape is printed explicitly.
print("Treated:", df_analysis[df_analysis["treatment"]].shape)
print("Control:", df_analysis[~df_analysis["treatment"]].shape)

In [None]:
# Every treated/control pair is inspected (quadratic cost), so only the first
# MATCHING_SAMPLE_SIZE rows of each group take part in the matching.
MATCHING_SAMPLE_SIZE = 500

treatment_df = df_analysis[df_analysis["treatment"]].head(MATCHING_SAMPLE_SIZE)
control_df = df_analysis[~df_analysis["treatment"]].head(MATCHING_SAMPLE_SIZE)

display(treatment_df.head())


def _matching_records(df):
    '''Return one (index label, start categories, target categories, shortest path, propensity score) tuple per row.'''
    return list(zip(
        df.index,
        [set(categories) for categories in df["startCategoryMain"]],
        [set(categories) for categories in df["targetCategoryMain"]],
        df["shortestPath"],
        df["propensityScore"],
    ))


# Built once up front instead of re-iterating the treated frame for every control row.
treatment_records = _matching_records(treatment_df)
control_records = _matching_records(control_df)

G = nx.Graph()
for control_id, control_start, control_target, control_path, control_score in control_records:
    for treatment_id, treatment_start, treatment_target, treatment_path, treatment_score in treatment_records:
        # A pair is eligible when it shares at least one start category, at least
        # one target category, and has the same shortest-path length.
        if (treatment_start & control_start) \
        and (treatment_target & control_target) \
        and treatment_path == control_path:
            weight = get_similarity(treatment_score, control_score)
            G.add_edge(treatment_id, control_id, weight=weight)

matching = nx.max_weight_matching(G)

In [None]:
# `matching` is a set of node pairs, and the nodes are index labels of `df_analysis`.
# Rows must therefore be selected by label (`.loc`); positional `.iloc` picks the wrong
# rows whenever the index is not a default 0..n-1 range.
matched = [pair[0] for pair in matching] + [pair[1] for pair in matching]
df_balanced = df_analysis.loc[matched]

In [None]:
# Covariate distributions per treatment group within the matched sample `df_balanced`.
cols = ["startLength", "targetLength", "startPageRank", "targetPageRank", "shortestPath"]
log_scaled_cols = ("startPageRank", "targetPageRank")

for col in cols:
    use_log_scale = True if col in log_scaled_cols else None
    ax = sns.histplot(
        data=df_balanced,
        x=col,
        hue="treatment",
        stat='proportion',
        element="step",
        bins=10,
        log_scale=use_log_scale,
        common_norm=False,  # normalise each treatment group separately
    )
    plt.show()

In [None]:
# Number of rows in the matched sample.
len(df_balanced["treatment"])

In [None]:
# Two-sample t-test without the equal-variance assumption (equal_var=False):
# completion flag of treated vs. control rows in the matched sample.
treated_outcome = df_balanced.loc[df_balanced["treatment"], "isFinishedInt"]
control_outcome = df_balanced.loc[~df_balanced["treatment"], "isFinishedInt"]
scipy.stats.ttest_ind(treated_outcome, control_outcome, equal_var=False)
