In [1]:
import seaborn as sn
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import config
from src.utils.HTMLParser import HTMLParser
from src.data.data_loader import *
from src.utils.helpers import *
from src.models.networks import *
from src.models.similarity_matrices import *
from src.models.markov_chains import *

parser = HTMLParser()
parser.load_pickle()

In [8]:
# Load and preprocess the data
df_article_names, df_html_stats, df_categories, df_links, df_shortest_path, df_unfinished, df_finished, df_article = read_all()

Unfinished Paths
---------------- 
Number of rows before filtering: 24875
Invalid target articles found: {'Test', 'Black ops 2', 'Netbook', 'Adolph Hitler', 'Fats', 'Pikachu', 'Rat', 'Sportacus', 'Rss', 'Christmas', 'The', 'English', 'Usa', ' Zebra', 'Podcast', 'The Rock', 'Long peper', 'Western Australia', 'Georgia', 'Macedonia', 'Great', 'Kashmir', 'Mustard', 'Charlottes web', 'Bogota'}
Invalid articles found in path: {'Local community', 'Osteomalacia', 'Pikachu', 'Wikipedia Text of the GNU Free Documentation License', 'Private Peaceful'}
Number of 'timeout' games with a duration of less than 30 minutes: 7
Number of rows after filtering: 24709 

Finished Paths
-------------- 
Number of rows before filtering: 51318
Invalid articles found in path: {'Pikachu', 'Local community', 'Wikipedia Text of the GNU Free Documentation License', 'Osteomalacia'}
Number of rows after filtering: 51196


In [43]:
world_regions_categories = ['Geography of Great Britain', 'Geography of Asia', 'Geography of Oceania Australasia', 'North American Geography', 'European Geography', 'African Geography', 'Central and South American Geography', 'Antarctica', 'Geography of the Middle East', 'African Countries', 'Countries']
df_categories = assign_world_region_categories(df_categories, world_regions_categories)

# Sort paths into voyage and non-voyage
df_finished = game_voyage_sorting(df_finished, df_categories)
df_unfinished = game_voyage_sorting(df_unfinished, df_categories)

# Assign the categories "World Regions" to the concerned articles
category_map = dict(zip(df_categories["article"], df_categories["level_1"]))
df_article["category"] = df_article["article"].map(category_map)

df_merged = pd.concat([df_finished, df_unfinished], ignore_index=True, sort=False)

In [44]:
df_merged

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,source,target,cosine_similarity,shortest_path,path_length,back_clicks,categories_similarity,Transition Category Path,Category Path,source_maincategory,target_maincategory,Wikispeedia_Voyage,type
0,6a3701d319fc3754,2011-02-15 03:26:49,166,14th century;15th century;16th century;Pacific...,,14th century,African slave trade,0.353137,3,9,0,1.0,"[History, Geography, World Regions, History]","[History, History, History, Geography, Geograp...",History,History,True,
1,3824310e536af032,2012-08-12 06:36:52,88,14th century;Europe;Africa;Atlantic slave trad...,3.0,14th century,African slave trade,0.353137,3,5,0,1.0,"[History, World Regions, History]","[History, World Regions, World Regions, Histor...",History,History,True,
2,415612e93584d30e,2012-10-03 21:10:40,138,14th century;Niger;Nigeria;British Empire;Slav...,,14th century,African slave trade,0.353137,3,8,0,1.0,"[History, World Regions, History, Citizenship,...","[History, World Regions, World Regions, Histor...",History,History,True,
3,64dd5cd342e3780c,2010-02-08 07:25:25,37,14th century;Renaissance;Ancient Greece;Greece,,14th century,Greece,0.371986,2,4,0,0.0,"[History, World Regions]","[History, History, History, World Regions]",History,World Regions,False,
4,015245d773376aab,2013-04-23 15:27:08,175,14th century;Italy;Roman Catholic Church;HIV;R...,3.0,14th century,John F. Kennedy,0.331395,3,7,0,0.0,"[History, World Regions, Religion, Science, Pe...","[History, World Regions, Religion, Science, Pe...",History,People,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75900,109ed71f571d86e9,2014-01-15 12:06:45,180,Franz Kafka;Tuberculosis;World Health Organiza...,,Franz Kafka,Cholera,0.453761,3,8,1,0.0,"[People, Science, Citizenship, Science, World ...","[People, Science, Citizenship, Science, World ...",People,World Regions,False,restart
75901,232f992e57d43e8d,2014-01-15 12:08:17,6,Modern history,,Modern history,Hollandic,0.328068,3,1,0,0.0,[History],[History],History,History,False,restart
75902,2e09a7224600a7cd,2014-01-15 15:06:40,1900,Computer programming;Linguistics;Culture;Popul...,,Computer programming,The Beatles,0.395061,3,5,1,0.0,"[IT, Language and literature, Everyday life, C...","[IT, Language and literature, Everyday life, C...",IT,<,False,timeout
75903,60af9e2138051b96,2014-01-15 15:24:41,1903,Jamaica;United Kingdom;World War II;Battle of ...,,Jamaica,Alan Turing,0.369151,3,4,0,0.0,"[World Regions, History]","[World Regions, World Regions, History, History]",World Regions,History,False,timeout


In [70]:
#  sankey diagram : users paths -----------------------------------------------------------------

# get paths data and their voyage status 
df_merged_ = df_merged.copy()
df_merged_=df_merged_[['source_maincategory', 'target_maincategory', 'Wikispeedia_Voyage', 'Category Path', 'type']]

# get intermediate step:
def process_row(row):
    series = row['Category Path']
    type_value = row['type']
    
    if pd.isna(type_value):
        # Check excluding the first and last elements
        series_to_check = series[1:-1]
    else:
        # Check excluding the first element
        series_to_check = series[1:]
    # Check for 'World Regions' in the filtered series
    return 'World Regions' if 'World Regions' in series_to_check else 'Others'

df_merged_['intermediate_step'] = df_merged_.apply(process_row, axis=1)

# Replace source and target categories that are not 'World Regions' to 'Others' 
df_merged_.loc[df_merged_['source_maincategory']!='World Regions', 'source_maincategory'] = 'Others'
df_merged_.loc[df_merged_['target_maincategory']!='World Regions', 'target_maincategory'] = 'Others'
df_merged_["Count"] = 1
df_merged_

Unnamed: 0,source_maincategory,target_maincategory,Wikispeedia_Voyage,Category Path,type,intermediate_step,Count
0,Others,Others,True,"[History, History, History, Geography, Geograp...",,World Regions,1
1,Others,Others,True,"[History, World Regions, World Regions, Histor...",,World Regions,1
2,Others,Others,True,"[History, World Regions, World Regions, Histor...",,World Regions,1
3,Others,World Regions,False,"[History, History, History, World Regions]",,Others,1
4,Others,Others,True,"[History, World Regions, Religion, Science, Pe...",,World Regions,1
...,...,...,...,...,...,...,...
75900,Others,World Regions,False,"[People, Science, Citizenship, Science, World ...",restart,World Regions,1
75901,Others,Others,False,[History],restart,Others,1
75902,Others,Others,False,"[IT, Language and literature, Everyday life, C...",timeout,Others,1
75903,World Regions,Others,False,"[World Regions, World Regions, History, History]",timeout,World Regions,1


In [71]:
#The first transition describes the transition fron the source of the path to an intermediate category 
#count first steps of users paths 
transitions_1_to_2 = df_merged_.groupby(['source_maincategory', 'intermediate_step']).count().reset_index()
transitions_1_to_2_count = transitions_1_to_2[['source_maincategory', 'intermediate_step', 'Count']].rename(columns={'source_maincategory': 'source', 'intermediate_step': 'target', 'Count':'count'})
transitions_1_to_2_count['step'] = 1

# Calculate percentages within each step
transitions_1_to_2_count['percentage'] = transitions_1_to_2_count['count'].transform(lambda x: (x / x.sum()) * 100)

transitions_1_to_2_count

Unnamed: 0,source,target,count,step,percentage
0,Others,Others,27723,1,36.523286
1,Others,World Regions,37314,1,49.158817
2,World Regions,Others,3996,1,5.264475
3,World Regions,World Regions,6872,1,9.053422


In [72]:
#The second transition describes the transition fron the intermediate of the path tothe target category 
#count first steps of users paths 
transitions_2_to_3 = df_merged_.groupby(['intermediate_step', 'target_maincategory']).count().reset_index()
transitions_2_to_3_count = transitions_2_to_3[['intermediate_step', 'target_maincategory', 'Count']].rename(columns={'intermediate_step': 'source', 'target_maincategory': 'target', 'Count':'count'})
transitions_2_to_3_count['step'] = 2

# Calculate percentages within each step
transitions_2_to_3_count['percentage'] = transitions_2_to_3_count['count'].transform(lambda x: (x / x.sum()) * 100)

transitions_2_to_3_count

Unnamed: 0,source,target,count,step,percentage
0,Others,Others,29824,2,39.291219
1,Others,World Regions,1895,2,2.496542
2,World Regions,Others,30772,2,40.540149
3,World Regions,World Regions,13414,2,17.67209


In [73]:
final_sankey_transitions=pd.concat([transitions_1_to_2_count, transitions_2_to_3_count])
final_sankey_transitions

Unnamed: 0,source,target,count,step,percentage
0,Others,Others,27723,1,36.523286
1,Others,World Regions,37314,1,49.158817
2,World Regions,Others,3996,1,5.264475
3,World Regions,World Regions,6872,1,9.053422
0,Others,Others,29824,2,39.291219
1,Others,World Regions,1895,2,2.496542
2,World Regions,Others,30772,2,40.540149
3,World Regions,World Regions,13414,2,17.67209


In [74]:
# Calculate percentages within each step
final_sankey_transitions['percentage'] = (
    final_sankey_transitions.groupby('step')['count']
    .transform(lambda x: (x / x.sum()) * 100)
)

final_sankey_transitions

Unnamed: 0,source,target,count,step,percentage
0,Others,Others,27723,1,36.523286
1,Others,World Regions,37314,1,49.158817
2,World Regions,Others,3996,1,5.264475
3,World Regions,World Regions,6872,1,9.053422
0,Others,Others,29824,2,39.291219
1,Others,World Regions,1895,2,2.496542
2,World Regions,Others,30772,2,40.540149
3,World Regions,World Regions,13414,2,17.67209


In [77]:
df = final_sankey_transitions.copy()

# Step 1: Modify source and target labels based on step
df['source_label'] = df.apply(lambda row: f"Start - {row['source']}" if row['step'] == 1 else f"Path - {row['source']}", axis=1)
df['target_label'] = df.apply(lambda row: f"Path - {row['target']}" if row['step'] == 1 else f"Target - {row['target']}", axis=1)

# Step 2: Create a unique list of all nodes
all_nodes = pd.concat([df['source_label'], df['target_label']]).unique()
node_mapping = {node: i for i, node in enumerate(all_nodes)}

# Step 3: Map nodes to indices
df['source_index'] = df['source_label'].map(node_mapping)
df['target_index'] = df['target_label'].map(node_mapping)

# Step 4: Prepare Sankey inputs
sources = df['source_index']
targets = df['target_index']
values = df['percentage']


df

Unnamed: 0,source,target,count,step,percentage,source_label,target_label,source_index,target_index
0,Others,Others,27723,1,36.523286,Start - Others,Path - Others,0,2
1,Others,World Regions,37314,1,49.158817,Start - Others,Path - World Regions,0,3
2,World Regions,Others,3996,1,5.264475,Start - World Regions,Path - Others,1,2
3,World Regions,World Regions,6872,1,9.053422,Start - World Regions,Path - World Regions,1,3
0,Others,Others,29824,2,39.291219,Path - Others,Target - Others,2,4
1,Others,World Regions,1895,2,2.496542,Path - Others,Target - World Regions,2,5
2,World Regions,Others,30772,2,40.540149,Path - World Regions,Target - Others,3,4
3,World Regions,World Regions,13414,2,17.67209,Path - World Regions,Target - World Regions,3,5


In [None]:
# finaly we stop the idea but for really obtain the good percentage for the flow voyages you need to soutraire le percentage de start world regions who go to others and the percentage of others who go to target world
# aka the two layers are independant and we need to soustract the percentage of the first layer to the percentage of the second layer

In [78]:
import plotly.graph_objects as go

# Define the links and highlight the specific path
links = dict(
    source= sources,  # Source nodes
    target= targets,  # Target nodes
    value= values,  # Flow values
    color=[
        "lightblue", "orange", 
        "lightblue", "lightblue",     
        "lightblue", "lightblue",   
        "orange", "lightblue"   
    ]
)

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15, thickness=20,
        label=list(node_mapping.keys()),  # Dynamically generate node labels
        color=["blue", "green", "blue", "green", "blue", "green"]
    ),
    link=dict(
        source=links['source'],
        target=links['target'],
        value=links['value'],
        color=links['color']  # Custom colors for highlighting
    )
))

# Update layout
fig.update_layout(title_text="Highlighting the Path: Others → World Regions → Others", font_size=12)
fig.show()

### Example brut

In [15]:
import plotly.graph_objects as go

# Define the nodes
nodes = ["Start - World Regions", "Start - Others", 
         "Path - World Regions", "Path - Others",
         "Target - World Regions", "Target - Others"]

# Define the links and highlight the specific path
links = dict(
    source=[0, 0, 1, 1, 2, 2, 3, 3],  # Source nodes
    target=[2, 3, 2, 3, 4, 5, 4, 5],  # Target nodes
    value=[50, 20, 10, 20, 40, 10, 15, 15],  # Flow values
    color=[
        "lightblue", "lightblue",  # Start -> Path
        "orange", "lightblue",        # Start -> Path (Others -> World Regions)
        "lightblue", "orange",    # Path -> Target (World Regions -> Others highlighted)
        "lightblue", "lightblue"   # Path -> Target
    ]
)

# Create the Sankey diagram
fig = go.Figure(go.Sankey(
    node=dict(
        pad=15, thickness=20,
        label=nodes, color=["blue", "green", "blue", "green", "blue", "green"]
    ),
    link=dict(
        source=links['source'],
        target=links['target'],
        value=links['value'],
        color=links['color']  # Custom colors for highlighting
    )
))

# Update layout
fig.update_layout(title_text="Highlighting the Path: Others → World Regions → Others", font_size=12)
fig.show()