In [1]:
import seaborn as sn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import config
from src.utils.HTMLParser import HTMLParser
from src.data.data_loader import *
from src.utils.helpers import *
from src.models.networks import *
from src.models.similarity_matrices import *
from src.models.markov_chains import *

parser = HTMLParser()
parser.load_pickle()

In [5]:
# Load and preprocess the data
df_article_names, df_html_stats, df_categories, df_links, df_shortest_path, df_unfinished, df_finished, df_article = read_all()

# Load and preprocess the data
df_article_names, df_html_stats, df_categories, df_links, df_shortest_path, df_unfinished, df_finished, df_article = read_all()

# save old categories, needed to make the flourish sankey plot 
df_categories_og=df_categories.copy()

df_categories_original = read_categories()
# Count the number of articles with multiples categorie
df_categories_original.groupby("article")["article"].size().value_counts()

world_regions_categories = ['Geography of Great Britain', 'Geography of Asia', 'Geography of Oceania Australasia', 'North American Geography', 'European Geography', 'African Geography', 'Central and South American Geography', 'Antarctica', 'Geography of the Middle East', 'African Countries', 'Countries']
df_categories = assign_world_region_categories(df_categories, world_regions_categories)

labels_filtered, parents_filtered, values_filtered, ids_filtered = create_treemap_data(df_categories, show_articles=True)

# Sort paths into voyage and non-voyage
df_finished = game_voyage_sorting(df_finished, df_categories)
df_unfinished = game_voyage_sorting(df_unfinished, df_categories)

# Assign the categories "World Regions" to the concerned articles
category_map = dict(zip(df_categories["article"], df_categories["level_1"]))
df_article["category"] = df_article["article"].map(category_map)

voyage_count = (df_finished['Wikispeedia_Voyage'] == 1).sum() + (df_unfinished['Wikispeedia_Voyage'] == 1).sum()
non_voyage_count = (df_finished['Wikispeedia_Voyage'] == 0).sum() + (df_unfinished['Wikispeedia_Voyage'] == 0).sum()

df_merged = pd.concat([df_finished, df_unfinished], ignore_index=True, sort=False)


Unfinished Paths
---------------- 
Number of rows before filtering: 24875
Invalid target articles found: {'Rat', 'The Rock', 'Great', 'Bogota', 'Black ops 2', 'Macedonia', 'Adolph Hitler', 'Test', 'Kashmir', 'The', 'Western Australia', 'Rss', 'Sportacus', 'Netbook', 'Fats', 'Mustard', ' Zebra', 'Christmas', 'Charlottes web', 'Pikachu', 'Podcast', 'Georgia', 'English', 'Long peper', 'Usa'}
Invalid articles found in path: {'Pikachu', 'Private Peaceful', 'Osteomalacia', 'Wikipedia Text of the GNU Free Documentation License', 'Local community'}
Number of 'timeout' games with a duration of less than 30 minutes: 7
Number of rows after filtering: 24709 

Finished Paths
-------------- 
Number of rows before filtering: 51318
Invalid articles found in path: {'Wikipedia Text of the GNU Free Documentation License', 'Local community', 'Pikachu', 'Osteomalacia'}
Number of rows after filtering: 51196
Unfinished Paths
---------------- 
Number of rows before filtering: 24875
Invalid target articles fou

In [6]:
#  sankey diagram : users paths -----------------------------------------------------------------

# get paths data and their voyage status 
df_merged_ = df_merged.copy()
df_merged_=df_merged_[['source_maincategory', 'target_maincategory', 'Wikispeedia_Voyage', 'Category Path', 'type']]

# get intermediate step:
def process_row(row):
    series = row['Category Path']
    type_value = row['type']
    
    if pd.isna(type_value):
        if len(series) <= 2:
            return None  # Mark for dropping
        else:
            # Check excluding the first and last elements
            series_to_check = series[1:-1]
    else:
        if len(series) <= 1:
            return None  # Mark for dropping
        else:
            # Check excluding the first element
            series_to_check = series[1:]
    
    # Check for 'World Regions' in the filtered series
    return 'World Regions' if 'World Regions' in series_to_check else 'Others'

df_merged_['intermediate_step'] = df_merged_.apply(process_row, axis=1)
# Drop rows too short to have intermediate steps
df_merged_ = df_merged_.dropna(subset=['intermediate_step'])

# Replace source and target categories that are not 'World Regions' to 'Others' 
df_merged_.loc[df_merged_['source_maincategory']!='World Regions', 'source_maincategory'] = 'Others'
df_merged_.loc[df_merged_['target_maincategory']!='World Regions', 'target_maincategory'] = 'Others'
df_merged_

Unnamed: 0,source_maincategory,target_maincategory,Wikispeedia_Voyage,Category Path,type,intermediate_step
0,Others,Others,True,"[History, History, History, Geography, Geograp...",,World Regions
1,Others,Others,True,"[History, World Regions, World Regions, Histor...",,World Regions
2,Others,Others,True,"[History, World Regions, World Regions, Histor...",,World Regions
3,Others,World Regions,False,"[History, History, History, World Regions]",,Others
4,Others,Others,True,"[History, World Regions, Religion, Science, Pe...",,World Regions
...,...,...,...,...,...,...
75899,World Regions,Others,False,"[World Regions, World Regions, World Regions, ...",restart,World Regions
75900,Others,World Regions,False,"[People, Science, Citizenship, Science, World ...",restart,World Regions
75902,Others,Others,False,"[IT, Language and literature, Everyday life, C...",timeout,Others
75903,World Regions,Others,False,"[World Regions, World Regions, History, History]",timeout,World Regions


In [7]:

#The first transition describes the transition fron the source of the path to an intermediate category 
#count first steps of users paths 
transitions_1_to_2 = df_merged_.groupby(['source_maincategory', 'intermediate_step']).count().reset_index()
transitions_1_to_2_count = transitions_1_to_2[['source_maincategory', 'intermediate_step', 'Wikispeedia_Voyage']].rename(columns={'source_maincategory': 'source', 'intermediate_step': 'target', 'Wikispeedia_Voyage':'count'})
transitions_1_to_2_count['step'] = 1
transitions_1_to_2_count

Unnamed: 0,source,target,count,step
0,Others,Others,22728,1
1,Others,World Regions,37280,1
2,World Regions,Others,3007,1
3,World Regions,World Regions,6906,1


In [8]:
#The second transition describes the transition fron the intermediate of the path tothe target category 
#count first steps of users paths 
transitions_2_to_3 = df_merged_.groupby(['intermediate_step', 'target_maincategory']).count().reset_index()
transitions_2_to_3_count = transitions_2_to_3[['intermediate_step', 'target_maincategory', 'Wikispeedia_Voyage']].rename(columns={'intermediate_step': 'source', 'target_maincategory': 'target', 'Wikispeedia_Voyage':'count'})
transitions_2_to_3_count['step'] = 2
transitions_2_to_3_count

Unnamed: 0,source,target,count,step
0,Others,Others,24884,2
1,Others,World Regions,851,2
2,World Regions,Others,30764,2
3,World Regions,World Regions,13422,2


In [9]:
final_sankey_transitions=pd.concat([transitions_1_to_2_count, transitions_2_to_3_count])
final_sankey_transitions

Unnamed: 0,source,target,count,step
0,Others,Others,22728,1
1,Others,World Regions,37280,1
2,World Regions,Others,3007,1
3,World Regions,World Regions,6906,1
0,Others,Others,24884,2
1,Others,World Regions,851,2
2,World Regions,Others,30764,2
3,World Regions,World Regions,13422,2


In [None]:
import plotly.graph_objects as go

# Define the nodes
nodes = ["Start - World Regions", "Start - Others", 
         "Path - World Regions", "Path - Others",
         "Target - World Regions", "Target - Others"]

# Define the links and highlight the specific path
links = dict(
    source=[0, 0, 1, 1, 2, 2, 3, 3],  # Source nodes
    target=[2, 3, 2, 3, 4, 5, 4, 5],  # Target nodes
    value=[50, 20, 10, 20, 40, 10, 15, 15],  # Flow values
    color=[
        "lightblue", "lightblue",  # Start -> Path
        "orange", "lightblue",        # Start -> Path (Others -> World Regions)
        "lightblue", "orange",    # Path -> Target (World Regions -> Others highlighted)
        "lightblue", "lightblue"   # Path -> Target
    ]
)

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15, thickness=20,
        label=nodes, color=["blue", "green", "blue", "green", "blue", "green"]
    ),
    link=dict(
        source=links['source'],
        target=links['target'],
        value=links['value'],
        color=links['color']  # Custom colors for highlighting
    )
))

# Update layout
fig.update_layout(title_text="Highlighting the Path: Others → World Regions → Others", font_size=12)
fig.show()