In [None]:
import ipywidgets as widgets
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sn
from IPython.display import display
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.stats import kurtosis

from src.utils.HTMLParser import HTMLParser
from src.data.data_loader import *
from src.utils.helpers import *
from src.models.networks import *

from src.models.similarity_matrices import *


# Instantiate the project's HTMLParser and restore its state via load_pickle() (called without arguments).
# NOTE(review): presumably this reads a previously cached parse result from a default path — confirm,
# and only load pickle files from a trusted source (unpickling can execute arbitrary code).
parser = HTMLParser()
parser.load_pickle()

In [None]:
# Load every input table used by the rest of the notebook.
# The read_* helpers are not defined in this notebook; they presumably come from the star imports
# at the top (src.data.data_loader) — TODO confirm.
df_article_names = read_articles() 
# HTML-derived per-article statistics, taken from the parser created in the first cell.
df_html_stats = parser.get_df_html_stats()
df_categories = read_categories()
df_links = read_links()
df_shortest_path = read_shortest_path_matrix()
# Game paths, split by whether the game was completed.
df_unfinished = read_unfinished_paths()
df_finished = read_finished_paths() 
# Article-to-article and category-based similarity matrices (used below to score start/target pairs).
df_sm = read_similartiy_matrix() 
df_scat = read_categories_matrix()

In [None]:
# Color palette that highlights the 'Countries' and 'Geography' categories and draws
# every other category (including the catch-all 'Others') in a shade of grey.
categories_others = [
    'Art',
    'Business Studies',
    'Citizenship',
    'Countries',
    'Design and Technology',
    'Everyday life',
    'Geography',
    'History',
    'IT',
    'Language and literature',
    'Mathematics',
    'Music',
    'People',
    'Religion',
    'Science',
    'Others',
]

# Accent colors for the two highlighted categories.
highlight_colors = {'Countries': '#2CB5AE', 'Geography': '#16A2F3'}

# Every category without an accent color gets its own grey. The number of shades is derived
# from this list (rather than a hard-coded "- 2") so that zip() below can never silently drop
# a category when the highlighted set changes.
non_custom_categories = [cat for cat in categories_others if cat not in highlight_colors]
num_greys = len(non_custom_categories)
grey_shades = [mcolors.to_hex((v, v, v)) for v in np.linspace(0.2, 0.4, num_greys)]
grey_palette = dict(zip(non_custom_categories, grey_shades))

# Final lookup: category name -> hex color (accent colors first, then the greys).
palette_category_dict = {**highlight_colors, **grey_palette}

In [None]:
# Work on a private copy of the article list.
df_article = pd.DataFrame(df_article_names).copy()

# Link counts per article: in-degree = times the article is a link target,
# out-degree = times it is a link source.
in_degree = df_links.groupby('linkTarget').size().reset_index(name="in_degree")
out_degree = df_links.groupby('linkSource').size().reset_index(name="out_degree")

# Attach both counts to the article list; articles that never appear in df_links
# come out of the left joins as NaN and are turned into 0 before the integer cast.
df_article = (
    df_article
    .merge(in_degree, left_on='article', right_on='linkTarget', how='left')
    .merge(out_degree, left_on='article', right_on='linkSource', how='left')
    .drop(columns=['linkTarget', 'linkSource'])
    .fillna(0)
    .astype({'in_degree': 'int', 'out_degree': 'int'})
)

# Align the key column name of the HTML stats, then keep only articles present in both tables.
df_html_stats = df_html_stats.rename(columns={'article_name': 'article'})
df_article = df_article.merge(df_html_stats, how='inner')

# Top-level category (level_1) of each article.
category_map = dict(zip(df_categories["article"], df_categories["level_1"]))
df_article["category"] = df_article["article"].map(category_map)

In [None]:
# df_article (degrees, HTML stats and category) is already fully built by the previous cell;
# rebuilding it here would only repeat the same merges, so just preview the result.
df_article.head()

In [None]:
# Enrich both path tables (unfinished first, then finished) with the same per-game metrics.
# find_shortest_distance is a project helper applied row by row; the matrix passed to it
# decides which metric is looked up (similarity, shortest path, category similarity).
for df_paths in (df_unfinished, df_finished):
    df_paths['cosine_similarity'] = df_paths.apply(lambda row: find_shortest_distance(row, df_sm), axis=1)
    df_paths['shortest_path'] = df_paths.apply(lambda row: find_shortest_distance(row, df_shortest_path), axis=1)
    # Entries in a path are ';'-separated, so the entry count is separators + 1.
    df_paths['path_length'] = df_paths['path'].apply(lambda path: path.count(';') + 1)
    # Each '<' in a path is counted as one back-click.
    df_paths['back_clicks'] = df_paths['path'].apply(lambda path: path.count('<'))
    df_paths['categories_similarity'] = df_paths.apply(lambda row: find_shortest_distance(row, df_scat), axis=1)

## Difficulty Metrics

### Whether a game was finished or not 

In [None]:
# Build one table of all games, tagged with whether they were finished.
# NOTE(review): filter_most_specific_category presumably keeps a single (most specific) category
# per article — confirm against the project helpers.
df_categories_filtered = filter_most_specific_category(df_categories)
# NOTE(review): game_voyage_sorting is an opaque helper; the later cells rely on its result having a
# boolean "voyage" column. The third positional argument mirrors the finished/unfinished status and
# the meaning of n=3 is not visible here — TODO confirm both.
df_finished_voyage = game_voyage_sorting(df_finished, df_categories_filtered, True, n=3)
df_finished_voyage["finished"] = True
# "cte" is a constant column; the violin plots below use it as a shared x position.
df_finished_voyage["cte"] = 1
df_unfinished_voyage = game_voyage_sorting(df_unfinished, df_categories_filtered, False, n=3)
df_unfinished_voyage["finished"] = False
df_unfinished_voyage["cte"] = 1
# Stack finished and unfinished games (original row indices are kept).
df_voyage = pd.concat([df_finished_voyage, df_unfinished_voyage])

In [None]:
# Number of games for every (finished, voyage) combination; size() counts the rows of each group
# directly instead of counting the grouping column itself.
df_voyage_comparison = (
    df_voyage
    .groupby(["finished", "voyage"])
    .size()
    .reset_index(name="count")
    .sort_values(by="finished", ascending=False)
)

# Share of finished vs. unfinished games within each game type (voyage / non-voyage), in percent.
df_voyage_comparison["percentage"] = (
    df_voyage_comparison
    .groupby("voyage")["count"]
    .transform(lambda x: (x / x.sum()) * 100)
    .round(1)
)

# Human-readable labels for the plot axes and legend.
df_voyage_comparison["voyage_label"] = df_voyage_comparison["voyage"].map({False: "Non-Voyage", True: "Voyage"})
df_voyage_comparison["finished_label"] = df_voyage_comparison["finished"].map({False: "Unfinished", True: "Finished"})

# Bars per game type, split by completion status; each segment is annotated with its percentage.
fig = px.bar(
    df_voyage_comparison,
    x="voyage_label",
    y="count",
    color="finished_label",
    text="percentage",
    title="Completion Ratios for Voyage and Non-Voyage Games",
    labels={"finished_label": "Path Status"},
)

fig.update_traces(texttemplate="%{text}%")
fig.update_layout(
    yaxis_title="Count",
    xaxis_title="Game Type",
)
fig.show()

### Game duration

In [None]:
# Mean of durationInSec for each finished/voyage combination.
df_duration_voyage = (
    df_voyage
    .groupby(["finished", "voyage"])["durationInSec"]
    .mean()
    .reset_index()
)
df_duration_voyage

In [None]:
# Overlaid, density-normalised histograms of game duration for FINISHED games,
# split by game type (non-voyage vs. voyage).
fig = go.Figure()

for is_voyage, trace_name, trace_color in [
    (False, "Non-Voyage", "gray"),
    (True, "Voyage", '#16A2F3'),
]:
    fig.add_trace(
        go.Histogram(
            x=df_finished_voyage[df_finished_voyage["voyage"] == is_voyage]["durationInSec"],
            # nbinsx is only an upper bound on the number of bins plotly picks per trace,
            # so the two traces are not guaranteed to share identical bin edges.
            nbinsx=1000,
            marker_color=trace_color,
            name=trace_name,
            histnorm="probability density",  # normalise so the two groups are comparable
            opacity=0.6,  # keep both distributions visible where they overlap
            legendgroup=trace_name,
            showlegend=True,
        )
    )

fig.update_layout(
    title_text="Comparison of Voyage and Non-Voyage Game Durations",
    xaxis_title="Duration (seconds)",
    yaxis_title="Probability Density",
    barmode="overlay",  # draw the histograms on top of each other
    # The legend entries are the game types, not the finished/unfinished status.
    legend_title="Game Type",
)

# Only show the first 1000 seconds on the x axis.
fig.update_xaxes(range=[0, 1000])

fig.show()


In [None]:
def remove_outliers(df, col, k=1.5):
    """Return the rows of ``df`` whose ``col`` value lies within the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive), where Q1/Q3 are
    the 25th/75th percentiles of ``df[col]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter; it is not modified.
        col: Name of the numeric column used to detect outliers.
        k: Fence multiplier. The default 1.5 is the usual box-plot rule.

    Returns:
        A DataFrame with the same columns as ``df`` containing only the rows inside the
        fences. Rows whose ``col`` value is NaN are dropped because they compare as
        outside the range.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - k * iqr, q3 + k * iqr
    # between() is inclusive on both ends by default.
    return df[df[col].between(lower, upper)]

In [None]:
# Split violin of finished-game durations: voyage games on the left half, non-voyage games
# on the right half. Outliers are removed per group (IQR rule) before plotting.
df_voyage_duration = df_finished_voyage[df_finished_voyage["voyage"] == True]
df_voyage_duration = remove_outliers(df_voyage_duration, "durationInSec")

df_non_voyage_duration = df_finished_voyage[df_finished_voyage["voyage"] == False]
df_non_voyage_duration = remove_outliers(df_non_voyage_duration, "durationInSec")

fig = go.Figure()

for duration_frame, trace_name, group, half, color in [
    (df_voyage_duration, "Voyage", "Yes", "negative", '#16A2F3'),
    (df_non_voyage_duration, "Non-Voyage", "No", "positive", "gray"),
]:
    fig.add_trace(go.Violin(
        # The constant "cte" column puts both halves at the same x position.
        x=duration_frame["cte"],
        y=duration_frame["durationInSec"],
        legendgroup=group,
        scalegroup=group,
        name=trace_name,
        side=half,
        line_color=color,
    ))

fig.update_traces(meanline_visible=True, box_visible=True)
fig.update_layout(
    title="Duration Distribution for Voyage and Non-Voyage Games",
    violingap=0.4,
    violinmode="overlay",
    xaxis=dict(
        # Both halves share one x position, so per-tick "Voyage"/"Non-Voyage" labels would be
        # misleading; the legend identifies the two halves instead.
        showticklabels=False,
        showgrid=False,
    ),
    yaxis_title="Duration in Seconds",
    xaxis_title="Game Type",
    width=800,
    height=600,
)

fig.show()

### Difficulty rating given for finished paths

In [None]:
def _rating_counts(games):
    """Count finished games per rating; missing ratings are grouped under the label 'NaN'.

    Returns a DataFrame with the columns "rating" (string) and "count". The input frame is
    not modified: the string version of the rating is built on a copy via assign(), which
    avoids assigning a column on a filtered slice.
    """
    return (
        games
        .assign(rating=games["rating"].fillna('NaN').astype(str))
        .groupby("rating")["rating"]
        .count()
        .reset_index(name="count")
    )


# Rating counts of finished games, computed separately for each game type.
df_voyage_rating = _rating_counts(df_finished_voyage[df_finished_voyage["voyage"] == True])
df_non_voyage_rating = _rating_counts(df_finished_voyage[df_finished_voyage["voyage"] == False])

fig = make_subplots(rows=1, cols=2, subplot_titles=("Voyage Game", "Non-Voyage Game"))

for rating_counts, trace_name, color, panel_col in [
    (df_voyage_rating, "Voyage Game", '#16A2F3', 1),
    (df_non_voyage_rating, "Non-Voyage Game", "gray", 2),
]:
    fig.add_trace(
        go.Bar(x=rating_counts["rating"], y=rating_counts["count"], marker_color=color, name=trace_name),
        row=1, col=panel_col
    )

fig.update_layout(
    title="Rating Distribution for Voyage and Non-Voyage Games",
    barmode='group',
    legend_title="Game Type",
    showlegend=True
)
# Layout-level xaxis_title / yaxis_title only reach the first subplot; update_xaxes / update_yaxes
# without row/col apply the titles and tick format to both panels.
fig.update_xaxes(title_text="Rating")
fig.update_yaxes(title_text="Count", tickformat='.0f')
fig.show()

### Number of back-clicks needed

In [None]:
# Rebuild the combined table of finished and unfinished games, then run it through the two
# project helpers in sequence (category path extraction first, back-click analysis second).
df_voyage = (
    pd.concat([df_finished_voyage, df_unfinished_voyage])
    .pipe(extract_category_path, df_categories_filtered)
    .pipe(backtrack)
)

# Mean of the "back_nb" column per game type.
df_voyage.groupby(["voyage"])["back_nb"].mean().reset_index()

In [None]:
# Display the completion table (one row per finished/voyage combination, with its count,
# percentage and the plot labels) that the summary figure below reads from.
df_voyage_comparison

In [None]:
# 2x2 summary figure combining the duration, completion and rating views built above.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        "Duration Distribution",
        "Completion Ratios",
        "Rating Distribution for Voyage Game",
        "Rating Distribution for Non-Voyage Game"
    )
)

# ==== PLOT 1 (Violin Plot: Duration Distribution) ====
# Split violin: voyage games on the left half, non-voyage games on the right half.
for duration_frame, trace_name, group, half, color in [
    (df_voyage_duration, "Voyage", "Yes", "negative", '#16A2F3'),
    (df_non_voyage_duration, "Non-Voyage", "No", "positive", "gray"),
]:
    fig.add_trace(
        go.Violin(
            x=duration_frame["cte"],
            y=duration_frame["durationInSec"],
            legendgroup=group,
            scalegroup=group,
            name=trace_name,
            side=half,
            line_color=color,
            box_visible=True,
            meanline_visible=True,
            showlegend=False,  # the bar traces of plot 2 already provide the legend entries
        ),
        row=1, col=1
    )

# ==== PLOT 2 (Bar Plot: Completion Ratios) ====
for voyage_label, color in [("Voyage", '#16A2F3'), ("Non-Voyage", 'gray')]:
    filtered_data = df_voyage_comparison[df_voyage_comparison["voyage_label"] == voyage_label]
    fig.add_trace(
        go.Bar(
            x=filtered_data["finished_label"],
            y=filtered_data["count"],
            text=filtered_data["percentage"],
            name=voyage_label,
            marker_color=color,
            texttemplate="%{text}%",
        ),
        row=1, col=2
    )

# ==== PLOTS 3 and 4 (Bar Plots: Rating Distribution per game type) ====
for rating_counts, trace_name, color, panel_col in [
    (df_voyage_rating, "Voyage", '#16A2F3', 1),
    (df_non_voyage_rating, "Non-Voyage", "gray", 2),
]:
    fig.add_trace(
        go.Bar(
            x=rating_counts["rating"],
            y=rating_counts["count"],
            marker_color=color,
            name=trace_name,
            showlegend=False
        ),
        row=2, col=panel_col
    )

# ==== Final Layout Update ====
fig.update_layout(
    height=1000, width=1000,  # overall figure size in pixels
    title="Summary of Voyage and Non-Voyage Game Metrics",
    showlegend=True,
    legend_title="Legend",
    violingap=0.4,
    violinmode="overlay"
)

# Layout-level xaxis_title / yaxis_title only label the first subplot (the duration violin, for
# which "Count/Percentage" is wrong), so every panel gets its own axis titles instead.
# The violin halves share one constant x position, hence no tick labels on that axis.
fig.update_xaxes(title_text="Game Type", showticklabels=False, row=1, col=1)
fig.update_yaxes(title_text="Duration in Seconds", row=1, col=1)
fig.update_xaxes(title_text="Path Status", row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=2)
for panel_col in (1, 2):
    fig.update_xaxes(title_text="Rating", row=2, col=panel_col)
    fig.update_yaxes(title_text="Count", row=2, col=panel_col)

fig.show()


In [None]:
# NOTE(review): same table as displayed before the summary figure; nothing visible in between
# modifies df_voyage_comparison, so this cell looks redundant — confirm before removing.
df_voyage_comparison