In [None]:
from src.data.data_loader import read_all
from src.utils.helpers import game_voyage_sorting , plot_sankey_voyage
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import silhouette_score

In [None]:
# read_all() is unpacked into nine values; only the 3rd, 6th and 7th are kept
# (bound to df_categories, df_unfinished and df_finished), the others are discarded.
_, _ , df_categories, _ , _ , df_unfinished, df_finished, _, _ = read_all()

## Statistical correlation between path metrics and voyage status (finished/unfinished) SEPARATELY

In [None]:
from scipy.stats import pearsonr, spearmanr

def calculate_correlations_with_voyage(df, column_name):
    """
    Calculate Pearson and Spearman correlations between 'voyage' and a specified column.

    The 'voyage' column is converted to integers (True/False -> 1/0) on a local
    Series only; the DataFrame passed in is left untouched.

    Args:
        df (pd.DataFrame): The DataFrame containing the data. Must have a 'voyage'
            column that can be cast to int, and the column named by `column_name`.
        column_name (str): The column to calculate correlation with 'voyage'.

    Returns:
        None: Prints the correlation coefficients and their p-values.
    """
    # Numeric view of 'voyage' (True/False -> 1/0). Kept local so that a reporting
    # helper does not silently change the dtype of the caller's column.
    voyage_numeric = df['voyage'].astype(int)
    metric_values = df[column_name]

    # Calculate Pearson and Spearman correlations
    pearson_corr, pearson_p = pearsonr(voyage_numeric, metric_values)
    spearman_corr, spearman_p = spearmanr(voyage_numeric, metric_values)
    # WARNING: comparing oranges with apples here. First separate into voyage, non-voyage, then compare the means with t-test

    # Print results
    print(f"{column_name}:")
    print(f"  Pearson correlation: {pearson_corr:.4f}, p-value: {pearson_p:.4e}")
    print(f"  Spearman correlation: {spearman_corr:.4f}, p-value: {spearman_p:.4e}\n")

# Run the project helper game_voyage_sorting on both path tables.
# NOTE(review): presumably this is what adds the 'voyage' column used below — confirm in src.utils.helpers.
df_finished, df_unfinished = (
    game_voyage_sorting(df_finished, df_categories, True, n=3),
    game_voyage_sorting(df_unfinished, df_categories, True, n=3),
)

# Finished paths: replace missing ratings by the mean rating
mean_rating = df_finished['rating'].mean()
df_finished['rating'] = df_finished['rating'].fillna(mean_rating)

# Unfinished paths: binary 'timeout' flag, 1 when 'type' equals 'timeout' and 0 otherwise
df_unfinished['timeout'] = df_unfinished['type'].eq('timeout').astype(int)

# Metrics available for both tables; each table adds one metric of its own
# ('rating' for finished paths, 'timeout' for unfinished paths).
shared_metrics = ['cosine_similarity', 'shortest_path', 'path_length', 'back_clicks', 'categories_similarity']
reports = [
    ("Finished paths:", df_finished, ['durationInSec', 'rating'] + shared_metrics),
    ("Unfinished paths:", df_unfinished, ['durationInSec', 'timeout'] + shared_metrics),
]

# Print the correlation of every metric with 'voyage', one table after the other
for heading, paths, metrics in reports:
    print(heading)
    for metric in metrics:
        calculate_correlations_with_voyage(paths, metric)

## Study with paths merged

In [None]:
# Stack finished and unfinished paths into a single table, then run the
# voyage / non-voyage sorting helper on the combined table.
paths_merged = game_voyage_sorting(
    pd.concat([df_finished, df_unfinished]),
    df_categories,
    True,
    n=3,
)

# A missing 'type' is labelled 'Finished'
# (NOTE(review): presumably only finished paths have no 'type' — confirm).
path_type = paths_merged['type'].fillna('Finished')
paths_merged['type'] = path_type

In [None]:
# Preview the first two rows of the merged table
paths_merged.head(2)

In [None]:
# Columns that are not useful for the correlation analysis
columns_to_drop = ['hashedIpAddress','path','Category Path','start_maincategory','end_maincategory','target']

# Build the analysis table from a copy of the merged paths:
#   1. one-hot encode 'type',
#   2. replace missing ratings by the mean rating,
#   3. drop the columns listed above.
df = (
    pd.get_dummies(paths_merged.copy(), columns=['type'])
    .assign(rating=lambda frame: frame['rating'].fillna(frame['rating'].mean()))
    .drop(columns=columns_to_drop)
)

# Pairwise correlation between all remaining columns
correlation_matrix = df.corr()

# Heatmap of the full correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True, fmt=".2f", cbar=True)
plt.title("Feature Correlation Matrix")
plt.show()

# Correlations with 'voyage' only, strongest positive first
voyage_correlation = correlation_matrix['voyage'].sort_values(ascending=False)
print("Correlation of features with 'voyage':\n", voyage_correlation)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

# Features to cluster paths

It is not trivial to decide which features to use for clustering the paths, in order to see whether the clusters correlate with being a voyage or not...

We might cluster on multiple groups of features, or cluster on each group separately (which one to try?).

Interesting features:
- For df_finished: 
```features = ['durationInSec','rating','cosine_similarity','shortest_path','path_length', 'back_clicks', 'categories_similarity']```
- For df_unfinished:
The same list, plus ```type``` and without ```rating```

Maybe also merge all paths... -> the **silhouette_score** for 2 clusters changes depending on whether the paths are merged or not (which makes sense: the two types of paths differ in logic and length)
- df_finished : 0.6
- df_unfinished : 0.2 (better with 4 clusters (0.35)) **without type column**
- df_unfinished : 0.4 (better with 4 clusters (0.45)) **with type column**
- df_finished + df_unfinished : 0.4 **without type column** (one option is to create 3 dummies: finish, timeout, restart)

=> Better to do it separately, to avoid the clusters being confounded by intrinsic differences between finished (f) and unfinished (u) paths

**IDEA**
Cluster paths based on subsets of features that represent specific aspects of behavior or data:

- Group A: Path Characteristics
Features: ``durationInSec, path_length, back_clicks.``  
Focus: Captures navigation dynamics (e.g., efficiency, hesitation).

- Group B: Content Similarity
Features: ``cosine_similarity, categories_similarity.``   
Focus: Captures how similar the source and the target of the path are.

- Group C: Performance Metrics
Features: ``rating, shortest_path.``  
Focus: Measures subjective and objective path quality.

For the moment, the clustering is done with all features that are meaningful and not biased (towards voyage).


## K-means

In [None]:
def silhouette_score_plot(df_scaled, max_clusters):
    """
    Plot the silhouette score of K-means clusterings against the number of clusters.

    Args:
        df_scaled: Feature matrix passed to KMeans.fit_predict and silhouette_score.
        max_clusters (int): Exclusive upper bound; K-means is fitted for every
            k in range(2, max_clusters).

    Returns:
        None: Shows a matplotlib line plot.
    """
    cluster_counts = range(2, max_clusters)

    def _silhouette_for(k):
        # Fixed random_state so that repeated runs give the same labels
        model = KMeans(n_clusters=k, n_init='auto', random_state=42)
        return silhouette_score(df_scaled, model.fit_predict(df_scaled))

    scores = [
        _silhouette_for(k)
        for k in tqdm(cluster_counts, desc="Calculating silhouette scores")
    ]

    plt.plot(cluster_counts, scores, marker='o')
    plt.title('Silhouette Score vs Number of Clusters')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Silhouette Score')
    plt.show()

## df_finished

In [None]:
# Number of finished paths per 'voyage' value
voyage_counts = df_finished['voyage'].value_counts()
# Slice labels are the 'voyage' values rendered as text
slice_labels = voyage_counts.index.astype(str)

# Pie chart of the voyage / non-voyage split
plt.figure()
plt.pie(voyage_counts, labels=slice_labels, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Voyages in Finished Paths')
plt.tight_layout()
plt.show()

In [None]:
# Columns used to cluster the finished paths.
# 'rating' is left out for now: should we use the mean value, or only cluster paths that have a rating?
features = [
    'durationInSec',
    'cosine_similarity',
    'shortest_path',
    'path_length',
    'back_clicks',
    'categories_similarity',
]

# Rescale the features with StandardScaler before clustering
scaler = StandardScaler()
finished_features = df_finished[features]
df_scaled_f = scaler.fit(finished_features).transform(finished_features)

In [None]:
# Silhouette scores on the scaled finished-path features; with max_clusters=6 the helper
# iterates over range(2, 6), i.e. k = 2, 3, 4, 5.
silhouette_score_plot(df_scaled_f, 6)

In [None]:
# Two-cluster K-means on the scaled finished-path features; labels stored in 'cluster'
kmeans = KMeans(n_clusters=2, n_init='auto', random_state=42)
df_finished['cluster'] = kmeans.fit_predict(df_scaled_f)

# Project the same scaled features onto two principal components, for plotting only
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(df_scaled_f)
df_finished['pca1'] = reduced_data[:, 0]
df_finished['pca2'] = reduced_data[:, 1]

# Scatter plot in PCA space: colour encodes the cluster, marker style encodes 'voyage'
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_finished,
    x='pca1',
    y='pca2',
    hue='cluster',
    style='voyage',
    palette='viridis',
)
plt.title('Clusters and Voyage Status')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Voyage / Cluster')
plt.grid(True)
plt.show()

In [None]:
# Confusion-matrix-like table: for each 'voyage' value (rows), the share of its paths
# that falls in each cluster (columns); combinations that never occur become 0.
cluster_share_by_voyage = df_finished.groupby('voyage')['cluster'].value_counts(normalize=True)
confusion_matrix = cluster_share_by_voyage.unstack().fillna(0)

# Heatmap of that table
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix, annot=True, fmt=".2f", cbar=True)
plt.title("Cluster vs Voyage Confusion Matrix")
plt.xlabel("Cluster")
plt.ylabel("Voyage (True/False)")
plt.show()

# Relative size of each cluster, in percent of all finished paths
cluster_percentages = df_finished['cluster'].value_counts(normalize=True).mul(100)

# Report one line per cluster
print("Percentage size of each cluster:")
for cluster, percentage in cluster_percentages.items():
    print(f"Cluster {cluster}: {percentage:.2f}%")

## df_unfinished

In [None]:
# Number of unfinished paths per 'voyage' value
voyage_counts = df_unfinished['voyage'].value_counts()
# Slice labels are the 'voyage' values rendered as text
slice_labels = voyage_counts.index.astype(str)

# Pie chart of the voyage / non-voyage split
plt.figure()
plt.pie(voyage_counts, labels=slice_labels, autopct='%1.1f%%', startangle=90)
plt.title('Distribution of Voyages in Unfinished Paths')
plt.tight_layout()
plt.show()

In [None]:
# Columns used to cluster the unfinished paths.
# NOTE: the binary 'timeout' column (original note: 0 is for restart and 1 is for timeout)
# is not part of this list.
features = [
    'durationInSec',
    'cosine_similarity',
    'shortest_path',
    'path_length',
    'back_clicks',
    'categories_similarity',
]

# Rescale the features with StandardScaler before clustering
scaler = StandardScaler()
unfinished_features = df_unfinished[features]
df_scaled_u = scaler.fit(unfinished_features).transform(unfinished_features)

In [None]:
# Silhouette scores on the scaled unfinished-path features; with max_clusters=6 the helper
# iterates over range(2, 6), i.e. k = 2, 3, 4, 5.
silhouette_score_plot(df_scaled_u, 6)

In [None]:
# Two-cluster K-means on the scaled unfinished-path features; labels stored in 'cluster'
kmeans = KMeans(n_clusters=2, n_init='auto', random_state=42)
df_unfinished['cluster'] = kmeans.fit_predict(df_scaled_u)

# Project the same scaled features onto two principal components, for plotting only
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(df_scaled_u)
df_unfinished['pca1'] = reduced_data[:, 0]
df_unfinished['pca2'] = reduced_data[:, 1]

# Scatter plot in PCA space: colour encodes the cluster, marker style encodes 'voyage'
plt.figure(figsize=(8, 6))
sns.scatterplot(
    data=df_unfinished,
    x='pca1',
    y='pca2',
    hue='cluster',
    style='voyage',
    palette='viridis',
)
plt.title('Clusters and Voyage Status')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend(title='Voyage / Cluster')
plt.grid(True)
plt.show()

In [None]:
# Confusion-matrix-like table: for each 'voyage' value (rows), the share of its paths
# that falls in each cluster (columns); combinations that never occur become 0.
cluster_share_by_voyage = df_unfinished.groupby('voyage')['cluster'].value_counts(normalize=True)
confusion_matrix = cluster_share_by_voyage.unstack().fillna(0)

# Heatmap of that table
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix, annot=True, fmt=".2f", cbar=True)
plt.title("Cluster vs Voyage Confusion Matrix")
plt.xlabel("Cluster")
plt.ylabel("Voyage (True/False)")
plt.show()

# Relative size of each cluster, in percent of all unfinished paths
cluster_percentages = df_unfinished['cluster'].value_counts(normalize=True).mul(100)

# Report one line per cluster
print("Percentage size of each cluster:")
for cluster, percentage in cluster_percentages.items():
    print(f"Cluster {cluster}: {percentage:.2f}%")


# Hierarchical Clustering 
Warning: $O(n^{2})$ memory complexity when applied to the paths...

In [None]:
# NOTE(review): disabled experiment, kept as a bare string literal so that none of it executes.
# The snippet refers to `df_scaled`, which no visible cell defines (the visible cells define
# df_scaled_f and df_scaled_u), so it needs adapting before it can be re-enabled.
"""from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt

# Perform hierarchical clustering
# Method: 'ward' minimizes variance; try 'single', 'complete', 'average' for other linkage criteria
linkage_matrix = linkage(df_scaled, method='ward')


# Plot the dendrogram
plt.figure(figsize=(12, 8))
dendrogram(linkage_matrix, truncate_mode='level', p=5, leaf_rotation=90., leaf_font_size=10.)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.grid(True)
plt.show()

# Assign cluster labels based on a distance threshold or number of clusters
# Example: Cutting the dendrogram to form 3 clusters
cluster_labels = fcluster(linkage_matrix, t=3, criterion='maxclust')

# Add cluster labels to your dataset (if applicable)
# df['cluster'] = cluster_labels  # Uncomment and replace df with your DataFrame

# Print example cluster assignments
print("Cluster labels:", cluster_labels)"""

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Step 1: toy transition data as (source, target, weight) triples, where the weight
# is a transition count. The edges are chosen so that node C is central.
edges = [
    ('A', 'C', 25),  # High weight to C from A
    ('B', 'C', 30),  # High weight to C from B
    ('C', 'D', 20),  # C connects to D with significant weight
    ('C', 'E', 15),  # C connects to E with significant weight
    ('D', 'C', 10),  # Return edge to C from D
    ('E', 'C', 12),  # Return edge to C from E
    ('F', 'C', 18),  # F strongly connects to C
    ('C', 'F', 10)   # C connects back to F
]

# Directed graph; every triple becomes an edge carrying a 'weight' attribute
G = nx.DiGraph()
G.add_weighted_edges_from(edges)

# Step 2: centrality metrics
pagerank = nx.pagerank(G, weight='weight')  # weighted PageRank
in_degree_centrality = nx.in_degree_centrality(G)  # in-degree centrality

# Step 3: frequent chains = edges ordered from highest to lowest weight
frequent_chains = sorted(
    G.edges(data=True),
    key=lambda edge: edge[2]['weight'],
    reverse=True,
)

# Visualization: spring layout with a fixed seed, edge weights drawn as edge labels
plt.figure(figsize=(8, 6))
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=500, font_size=10)
labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=labels)
plt.title("Directed Graph with Transition Counts")
plt.show()

# Print results
print("PageRank (centrality measure):")
for node, rank in pagerank.items():
    print(f"{node}: {rank:.4f}")

print("\nIn-degree centrality:")
for node, centrality in in_degree_centrality.items():
    print(f"{node}: {centrality:.4f}")

print("\nFrequent chains (high-weight edges):")
for u, v, data in frequent_chains:
    print(f"{u} -> {v} (weight: {data['weight']})")
