# Analysing Dead-Ends in Wikispeedia

In [16]:
# Enable IPython's autoreload extension in mode 2, so imported modules (here
# the project's `src` helpers) are reloaded before each cell is executed.
%load_ext autoreload
%autoreload 2

## Imports

In [96]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
import plotly.graph_objects as go


from src.data.dataloader import *
from src.utils.plots import *
from src.utils.functions import *

In [103]:
# Folder handed to every project loader below.
path='data/wikispeedia_paths-and-graph/'
# Project loaders (imported from src.data.dataloader / src.utils via `*`);
# each one receives the same folder.
articles = load_articles_dataframe(path=path)
categories = load_categories_dataframe(path=path)
finished_paths = load_path_finished_dataframe(path=path)
unfinished_paths = load_path_unfinished_distance_dataframe(path=path)
# Pre-computed per-article table; the first CSV column is used as the index
# (later cells treat that index as the article name).
df_country_clicks_links = pd.read_csv('data/country_clicks_links.csv', index_col=0)

### Topic 1: What's the distribution of countries among start and stop articles?

In [104]:
# Build the per-article table from both path logs and the article list; later
# cells read columns such as `total_click_count` and `failure_ratio_unique`
# from it.
df_articles_count = process_article_paths(finished_paths, unfinished_paths, articles)

In [105]:
# Order the articles from highest to lowest `click_count` and preview the
# first five rows.
df_country_clicks_links_sorted = df_country_clicks_links.sort_values(
    'click_count',
    ascending=False,
)
df_country_clicks_links_sorted.head()

Unnamed: 0,Top_1_name,click_count,num_links_in,name_links_in,num_links_out,name_links_out
United_States,united states,12370,1551.0,"['%E2%82%AC2_commemorative_coins', '15th_Marin...",294.0,"['Abraham_Lincoln', 'Advertising', 'Agricultur..."
Europe,,5553,933.0,"['13th_century', '14th_century', '15th_century...",159.0,"['18th_century', 'Africa', 'Albania', 'Amsterd..."
United_Kingdom,united kingdom,5278,972.0,"['%C3%85land', '%C3%89ire', '%E2%82%AC2_commem...",168.0,"['Acts_of_Union_1707', 'Adam_Smith', 'Airbus',..."
England,,4304,751.0,"['%C3%93engus_I_of_the_Picts', '11th_century',...",172.0,"['10th_century', '19th_century', 'Acts_of_Unio..."
Earth,,4134,269.0,"['16_Cygni_Bb', '4_Vesta', '55_Cancri', '55_Ca...",118.0,"['Acid_rain', 'Africa', 'Afrikaans', 'Aluminiu..."


### Compute All Paths Done in Wikispeedia to find start and stop articles

In [106]:
# Pool the raw path strings of finished and unfinished games. The row labels
# of each source frame are kept as they are.
all_paths = pd.concat([finished_paths["path"], unfinished_paths["path"]])

# Split every ';'-separated path into its list of hops. The vectorised string
# accessor replaces the per-row lambda and lets a missing path pass through as
# NaN instead of raising AttributeError.
all_paths_merged = all_paths.str.split(';')

# Extract start and stop articles
# NOTE(review): the last hop is taken verbatim; if a path can end with the '<'
# back-click marker, 'stop' will hold '<' rather than an article - confirm.
df_paths = pd.DataFrame({
    'start': all_paths_merged.str[0],  # First article in path
    'stop': all_paths_merged.str[-1]  # Last article in path
})

In [108]:
# Expose the article names (the index of the loaded table) as an explicit
# 'article' column. The frame is rebuilt from `df_country_clicks_links` instead
# of being reassigned from itself, so re-running this cell always produces the
# same result and cannot add a second 'article' column.
df_country_clicks_links_sorted = (
    df_country_clicks_links
    .sort_values(by='click_count', ascending=False)
    .rename_axis('article')   # name the index so reset_index() yields 'article'
    .reset_index()
)
df_country_clicks_links_sorted

Unnamed: 0,article,Top_1_name,click_count,num_links_in,name_links_in,num_links_out,name_links_out
0,United_States,united states,12370,1551.0,"['%E2%82%AC2_commemorative_coins', '15th_Marin...",294.0,"['Abraham_Lincoln', 'Advertising', 'Agricultur..."
1,Europe,,5553,933.0,"['13th_century', '14th_century', '15th_century...",159.0,"['18th_century', 'Africa', 'Albania', 'Amsterd..."
2,United_Kingdom,united kingdom,5278,972.0,"['%C3%85land', '%C3%89ire', '%E2%82%AC2_commem...",168.0,"['Acts_of_Union_1707', 'Adam_Smith', 'Airbus',..."
3,England,,4304,751.0,"['%C3%93engus_I_of_the_Picts', '11th_century',...",172.0,"['10th_century', '19th_century', 'Acts_of_Unio..."
4,Earth,,4134,269.0,"['16_Cygni_Bb', '4_Vesta', '55_Cancri', '55_Ca...",118.0,"['Acid_rain', 'Africa', 'Afrikaans', 'Aluminiu..."
...,...,...,...,...,...,...,...
4595,Lesothosaurus,lesotho,0,,,10.0,"['Animal', 'Chordate', 'Dinosaur', 'Fossil', '..."
4596,Leo_%28constellation%29,,0,,,14.0,"['Alchemy', 'Astrology', 'Astronomy', 'Extraso..."
4597,Lemon_myrtle,australia,0,1.0,['Lemon'],9.0,"['Australia', 'Bill_Clinton', 'Binomial_nomenc..."
4598,Lemon_balm,united states,0,4.0,"['Absinthe', 'Essential_oil', 'Lemon', 'List_o...",11.0,"['Bee', 'Binomial_nomenclature', 'Carolus_Linn..."


In [None]:
# Step 3: Map start and stop articles to countries
# Build the article -> country table once. Filtering the whole frame with a
# boolean mask on every call repeats a full scan for each start and stop
# article; a dictionary makes each lookup constant-time.
_ARTICLE_TO_COUNTRY = (
    df_country_clicks_links_sorted
    .drop_duplicates(subset='article', keep='first')  # first row wins, as with .iloc[0]
    .set_index('article')['Top_1_name']
    .to_dict()
)


def article_to_country(article):
    """Gets the country of the article from df_country_clicks_links_sorted.

    The lookup table is a snapshot taken when this cell runs; re-run the cell
    if `df_country_clicks_links_sorted` changes.

    Args:
        article: Article name as it appears in the 'article' column.

    Returns:
        The article's 'Top_1_name' value (NaN when the table has no country
        for that article), or None when the article is not in the table.
    """
    try:
        return _ARTICLE_TO_COUNTRY.get(article)
    except TypeError:
        # Unhashable input (e.g. a list) cannot be an article name; keep the
        # best-effort contract and report "no match".
        return None


df_paths['start_country'] = df_paths['start'].map(article_to_country)
df_paths['stop_country'] = df_paths['stop'].map(article_to_country)

df_paths.head(5)

Unnamed: 0,start,stop,start_country,stop_country
0,14th_century,African_slave_trade,italy,
1,14th_century,African_slave_trade,italy,
2,14th_century,African_slave_trade,italy,
3,14th_century,Greece,italy,greece
4,14th_century,John_F._Kennedy,italy,united states


In [126]:
from src.scripts.graphs import *
# Tally how often each country appears as a path's start and as its stop,
# place the two tallies side by side (0 where a country is absent from one of
# them) and expose the country as a regular column.
start_tally = df_paths['start_country'].value_counts().rename('start')
stop_tally = df_paths['stop_country'].value_counts().rename('stop')
df_counts = (
    pd.concat([start_tally, stop_tally], axis=1)
    .fillna(0)
    .reset_index()
    .rename(columns={'index': 'country'})
)


def get_top_10_df(sort_by="start"):
    """Return the ten rows of the global `df_counts` with the largest `sort_by` values."""
    top_rows = df_counts.nlargest(n=10, columns=sort_by)
    return top_rows


# One top-10 table per count column, then hand both to the project's plotting
# helper.
df_top_start, df_top_stop = (get_top_10_df(column) for column in ("start", "stop"))

plot_start_stop_count(df_top_start, df_top_stop)

## Find unique dead end countries 
We'd like to have unique countries in the dataframe to get a clear view of each country's overall impact as a dead-end article without redundancy.
It's important to avoid duplicate entries, which would skew metrics like total click counts and success/failure ratios. 

The following approach creates a single, summarized entry for each country, preserving meaningful metrics and avoiding arbitrary duplicate drops. It allows for a comprehensive view of each country’s overall engagement and dead-end behavior within the dataset.

In [None]:
# Rank articles by `failure_ratio_unique` (descending); ties are broken by
# `total_click_count`, also descending.
ranking_columns = ['failure_ratio_unique', 'total_click_count']
top_dead_end_articles = df_articles_count.sort_values(by=ranking_columns, ascending=False)

print("Top Dead-End Articles (Most Likely to Cause Players to Stop):")
preview_columns = ['article', 'total_click_count', 'failure_ratio_unique', 'unique_failure_count']
top_dead_end_articles.loc[:, preview_columns].head(10)

In [5]:
# prepare for merge: turn the article names (the index) into an 'article' column
df_country_click_links_reset = (
    df_country_clicks_links
    .reset_index()
    .rename(columns={'index': 'article'})
)

# merge to get the country; the inner join keeps only articles found in both tables
dead_end_countries = df_country_click_links_reset.merge(
    top_dead_end_articles,
    how='inner',
    on='article',
)

print("Top Dead-End Articles (Most Likely to Cause Players to Stop) related to link counts:")
(
    dead_end_countries
    .sort_values(by=['failure_ratio_unique', 'total_click_count'], ascending=False)
    .loc[:, ['article', 'num_links_in', 'num_links_out', 'failure_ratio_unique']]
    .head(10)
)


Top Dead-End Articles (Most Likely to Cause Players to Stop) related to link counts:


Unnamed: 0,article,num_links_in,num_links_out,failure_ratio_unique
1925,Hilda_of_Whitby,2.0,16.0,1.0
2048,Hurricane_Gloria,2.0,6.0,1.0
2621,Malwa_%28Madhya_Pradesh%29,3.0,29.0,1.0
1353,Eliminative_materialism,1.0,12.0,1.0
3616,San_Diego_and_Arizona_Railway,2.0,8.0,1.0
504,Battle_of_Smolensk_%281943%29,2.0,13.0,1.0
2574,Lynton_and_Barnstaple_Railway,3.0,11.0,1.0
3490,Richard_O%27Connor,1.0,35.0,1.0
313,Architecture_of_Windows_NT,,3.0,1.0
844,CF7,2.0,6.0,1.0


In [6]:
# group by 'Top_1_name' (country) and aggregate the data
def _rounded_total(values):
    """Sum one group's values (pandas skips NaN by default) and return a plain int."""
    return int(round(values.sum()))


# Named aggregation: each keyword is the output column, each tuple is
# (source column, reducer). Counts are summed, ratios are averaged.
unique_dead_end_countries = (
    dead_end_countries
    .groupby('Top_1_name', as_index=False)
    .agg(
        click_count=('click_count', 'sum'),
        sum_num_links_out=('num_links_out', _rounded_total),
        sum_num_links_in=('num_links_in', _rounded_total),
        unique_success_count=('unique_success_count', 'sum'),
        unique_failure_count=('unique_failure_count', 'sum'),
        mean_success_ratio_total=('success_ratio_total', 'mean'),
        mean_failure_ratio_total=('failure_ratio_total', 'mean'),
        mean_success_ratio_unique=('success_ratio_unique', 'mean'),
        mean_failure_ratio_unique=('failure_ratio_unique', 'mean'),
    )
    # Most-clicked countries first.
    .sort_values(by='click_count', ascending=False)
)

unique_dead_end_countries.head(10)

Unnamed: 0,Top_1_name,click_count,sum_num_links_out,sum_num_links_in,unique_success_count,unique_failure_count,mean_success_ratio_total,mean_failure_ratio_total,mean_success_ratio_unique,mean_failure_ratio_unique
185,united states,36078,11710,9562,34993,13220,0.604929,0.2489,0.609996,0.251265
184,united kingdom,17150,6453,6315,16252,5614,0.645435,0.249759,0.649921,0.251556
64,germany,9244,3284,3750,9448,3026,0.706367,0.220213,0.711659,0.222303
59,france,8476,3487,4487,7933,2912,0.670494,0.247028,0.676006,0.249227
87,italy,8288,2530,3152,7700,2928,0.657015,0.239025,0.661918,0.240995
39,china,6158,2454,2696,6449,2120,0.724648,0.222597,0.728767,0.224174
161,south africa,5815,1236,1261,5464,1418,0.730293,0.185385,0.73429,0.186762
10,australia,5814,2534,2241,5969,2102,0.638846,0.225703,0.643508,0.228051
67,greece,5599,1672,1732,5274,1892,0.675084,0.224611,0.683117,0.227597
80,india,5462,2767,2425,5535,1960,0.686685,0.22849,0.692581,0.230496


If we scale the click count by the number of outgoing links, which puts all countries on the same scale, we get a better intuition of which countries players most often stop the game on.

In [7]:
# Scale the click count by each country's summed number of outgoing links.
# A country whose total is 0 would otherwise get an infinite score and jump to
# the top of the scaled ranking; masking the zero denominator yields NaN, which
# sort_values places last.
links_out_total = unique_dead_end_countries["sum_num_links_out"]
unique_dead_end_countries["scaled_click_count"] = (
    unique_dead_end_countries["click_count"] / links_out_total.where(links_out_total > 0)
)

In [65]:
# don't add title to the plot, will be added as html
# TITLE = "Top Country-Related Dead-End Articles (Before/After Scaling)"
def plot_top_dead_end_countries_plotly(unique_dead_end_countries, top_n=10):
    """
    Show an interactive horizontal bar chart of the leading dead-end countries.

    Two bar traces are built, one ranked by raw `click_count` and one ranked by
    `scaled_click_count`; two buttons toggle which trace is visible and update
    the x-axis title accordingly. Bars are coloured by `sum_num_links_out`,
    with the colour range fixed to that column's min/max over the whole input
    frame so both views share one scale.

    Args:
        unique_dead_end_countries (pd.DataFrame): Per-country frame with the
            columns `Top_1_name`, `click_count`, `scaled_click_count` and
            `sum_num_links_out`.
        top_n (int): Number of countries shown in each view (default is 10).

    Returns:
        None. The figure is displayed with `fig.show()`.
    """
    links_out = unique_dead_end_countries["sum_num_links_out"]
    colour_floor, colour_ceiling = links_out.min(), links_out.max()

    def ranked_bar(value_column, label):
        """Bar trace for the `top_n` rows with the largest `value_column`."""
        leaders = unique_dead_end_countries.sort_values(
            by=value_column, ascending=False
        ).head(top_n)
        return go.Bar(
            x=leaders[value_column],
            y=leaders["Top_1_name"],
            orientation="h",
            marker=dict(
                color=leaders["sum_num_links_out"],
                colorscale="YlGnBu",
                cmin=colour_floor,
                cmax=colour_ceiling,
                # Both traces place their colour bar at the same spot; only one
                # trace is visible at a time.
                colorbar=dict(title="Sum Links Out", x=1.02),
            ),
            name=label,
        )

    def toggle_button(label, visibility, x_title):
        """Button that applies a per-trace visibility mask and retitles the x-axis."""
        return dict(
            label=label,
            method="update",
            args=[
                {"visible": visibility},
                {"xaxis.title.text": x_title},
            ],
        )

    button_bar = dict(
        type="buttons",
        direction="left",
        x=0.7,
        y=1.2,
        showactive=True,
        buttons=[
            toggle_button("Before Scaling", [True, False], "Click Count"),
            toggle_button(
                "After Scaling",
                [False, True],
                "Scaled Click Count by Sum Links Out",
            ),
        ],
    )

    fig = go.Figure(
        data=[
            ranked_bar("click_count", "Before Scaling"),
            ranked_bar("scaled_click_count", "After Scaling"),
        ],
        layout=go.Layout(
            xaxis=dict(title="Click Count"),
            yaxis=dict(title="Country"),
            updatemenus=[button_bar],
        ),
    )

    # Start on the unscaled view.
    fig.data[0].visible = True
    fig.data[1].visible = False

    fig.show()
    #fig.write_html('graphs/top_country_dead_end_articles.html')

In [66]:
# Display the interactive chart for the ten leading countries in each view
# (raw click count vs. click count scaled by summed outgoing links).
plot_top_dead_end_countries_plotly(unique_dead_end_countries, top_n=10)

This analysis again provides interesting insights!

- In the first plot, we see that highly connected countries like the United States and United Kingdom frequently appear as the last countries in unfinished paths, likely due to their high click count and connectivity. Their numerous outgoing links make them common endpoints when players get stuck in navigation.
- The scaled plot, however, reveals a different trend: countries with fewer outgoing links, such as the United Arab Emirates and French Polynesia, stand out as prominent dead ends. This indicates that users often land on these pages not merely due to connectivity but due to specific navigational patterns or interests that naturally lead them to these locations.

Scaling therefore reveals the countries that serve as genuine dead ends despite their limited outgoing links, reflecting meaningful user engagement, specific pathways within the game, and failure when players encounter these countries' articles.

### Pushing it further : 
#### Get the count of all articles that come before the "<" and link it with the failure ratio
Which country articles make players want to go back?

### Dead ends analysis Text 1