In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import ast
from plotly.subplots import make_subplots
import warnings
import re
from river import cluster

# NOTE(review): this installs a blanket "ignore" filter, so every warning raised
# after this cell is hidden (including deprecation notices from the libraries
# imported above). Consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings("ignore")

In [2]:
def clean_np_literals(s):
    """Strip NumPy scalar wrappers from a repr string so it parses as a plain literal.

    Text such as ``np.float64(0.57)`` or ``np.int64(3)`` is replaced by the bare
    value (``0.57`` / ``3``). Every sized float / int / uint wrapper is handled
    (``np.float32``, ``np.int32``, ``np.uint8``, ...), not only the 64-bit ones.

    Parameters
    ----------
    s : Any
        Candidate string. Non-string values are returned unchanged.

    Returns
    -------
    Any
        The cleaned string, or ``s`` itself when it is not a string.

    Notes
    -----
    Only the wrapper is removed; the wrapped text is kept verbatim. A value such
    as ``np.float64(nan)`` therefore becomes a bare ``nan``, which
    ``ast.literal_eval`` does not accept.
    """
    if not isinstance(s, str):
        return s
    # Remove np.floatNN(...), np.intNN(...) and np.uintNN(...) calls, keeping the argument.
    # Repeat until nothing changes so that a wrapper nested inside another wrapper
    # is unwrapped as well.
    replaced = 1
    while replaced:
        s, replaced = re.subn(r'np\.(?:float|u?int)\d+\(([^)]+)\)', r'\1', s)
    return s

In [3]:
# Read the centroid-based and full-search tuning results from their Excel exports.
centroid_df = pd.read_excel("../data/results/hyperparameter_tuning_centroid_vs_full/centroid_results_kmeans500_v2_l2.xlsx")
full_df = pd.read_excel("../data/results/hyperparameter_tuning_centroid_vs_full/full_results_kmeans500_v2_l2.xlsx")

# The metrics columns hold dict reprs as text: strip the NumPy scalar wrappers,
# evaluate the literal, then evaluate once more for any cell that is still a string.
for frame, metrics_col in ((centroid_df, 'centroid_metrics'), (full_df, 'full_metrics')):
    evaluated = frame[metrics_col].apply(clean_np_literals).apply(ast.literal_eval)
    frame[metrics_col] = evaluated.apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

# Combined "<top_k>_<top_n_clusters>" label for each centroid configuration.
top_k_label = centroid_df['top_k'].astype(str)
top_n_label = centroid_df['top_n_clusters'].astype(str)
centroid_df['x_axis'] = top_k_label + "_" + top_n_label

In [4]:
# Preview the first rows of the centroid results, including the derived x_axis column.
centroid_df.head()

Unnamed: 0.1,Unnamed: 0,top_k,top_n_clusters,centroid_metrics,centroid_time,x_axis
0,0,3,5,"{'doc_accuracy': 0.5764677678968938, 'chunk_ac...",2201.52147,3_5
1,1,3,10,"{'doc_accuracy': 0.5995616388314935, 'chunk_ac...",2382.507434,3_10
2,2,3,20,"{'doc_accuracy': 0.6154636468452837, 'chunk_ac...",2775.859128,3_20
3,3,3,35,"{'doc_accuracy': 0.6242080389045537, 'chunk_ac...",3575.666577,3_35
4,4,5,5,"{'doc_accuracy': 0.5978492905170151, 'chunk_ac...",2254.86549,5_5


In [5]:
# Show the parsed metrics of the row labelled 0 to see which metric keys are available.
centroid_df.loc[0, "centroid_metrics"]

{'doc_accuracy': 0.5764677678968938,
 'chunk_accuracy': 0.6632153335083734,
 'doc_precision': 0.8496396817969115,
 'doc_recall': 0.6096159011549825,
 'doc_f1': 0.7098877898111584,
 'chunk_precision': 1.0,
 'chunk_recall': 0.6037872683319904,
 'chunk_f1': 0.7529518163090991,
 'correct_chunk_accuracy': 0.26499541471173566,
 'doc_true_positives': 45392,
 'doc_true_negatives': 5106,
 'doc_false_positives': 8033,
 'doc_false_negatives': 29068,
 'chunk_true_positives': 44958,
 'chunk_true_negatives': 13139,
 'chunk_false_positives': 0,
 'chunk_false_negatives': 29502}

In [6]:
# Metric families to compare: figure title -> metric keys drawn in that figure.
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    "F1 Score": ["doc_f1", "chunk_f1"],
    "Precision": ["doc_precision", "chunk_precision"],
    "Recall": ["doc_recall", "chunk_recall"],
    "Correct chunk accuracy": ["correct_chunk_accuracy"],
}

# One fixed colour per metric key. Without this every trace gets its own default
# colour, so the same metric is drawn in different colours in the two subplots
# and listed twice in the legend.
palette = px.colors.qualitative.Plotly

# One figure per metric family: centroid search on the left, full search on the right.
for title, keys in groups.items():
    fig = make_subplots(rows=1, cols=2, subplot_titles=(f"Centroid - {title}", f"Full - {title}"))

    for idx, key in enumerate(keys):
        color = palette[idx % len(palette)]

        # Left panel: centroid results, one point per top_k / top_n_clusters combination.
        fig.add_trace(go.Scatter(
            x=centroid_df['x_axis'],
            y=centroid_df['centroid_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key,
            legendgroup=key,
            line=dict(color=color),
        ), row=1, col=1)

        # Right panel: full-search results, one point per top_k.
        # Same legend group and colour as the left trace; the legend entry is
        # shown only once and toggles both traces together.
        fig.add_trace(go.Scatter(
            x=full_df['top_k'],
            y=full_df['full_metrics'].apply(lambda m: m.get(key)),
            mode='lines+markers',
            name=key,
            legendgroup=key,
            showlegend=False,
            line=dict(color=color),
        ), row=1, col=2)

    # All plotted metrics are ratios, so both panels share a fixed [0, 1.1] range.
    fig.update_yaxes(range=[0, 1.1], row=1, col=1)
    fig.update_yaxes(range=[0, 1.1], row=1, col=2)

    fig.update_xaxes(title_text='top k with top n clusters', row=1, col=1)
    fig.update_xaxes(title_text='top k', row=1, col=2)

    fig.show()

# 1. kérdéshez (Question 1): OnlineKMeans accuracy per similarity threshold and missing-answer ratio

In [7]:
# Experiment grid: every similarity threshold is combined with every missing-ratio code.
similarity_thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
missing_ratios = ["03", "05", "015", "005"]

# One result frame per (similarity threshold, missing ratio) pair, each tagged
# with the settings it was produced with.
dfs = [
    pd.read_excel(f"../data/results/1_kerdes/onlinekmeans_v2_sim={sim}__missing={miss}.xlsx").assign(
        similarity_threshold=sim,
        missing_ratio=miss,
    )
    for sim in similarity_thresholds
    for miss in missing_ratios
]

In [8]:
def _parse_metrics_cell(value):
    """Convert one raw ``metrics`` cell into a Python object.

    Dicts are returned unchanged, so re-running this cell after the column has
    already been parsed is a no-op instead of raising ``ValueError`` from
    ``ast.literal_eval``. Anything else is cleaned with ``clean_np_literals``
    and evaluated as a literal.
    """
    if isinstance(value, dict):
        return value
    parsed = ast.literal_eval(clean_np_literals(value))
    # A cell whose literal is itself a quoted string needs a second evaluation.
    return ast.literal_eval(parsed) if isinstance(parsed, str) else parsed


# Parse the textual metrics column of every experiment frame in place.
for df in dfs:
    df['metrics'] = df['metrics'].apply(_parse_metrics_cell)

In [13]:
# --- Groups to plot ---
# Figure title -> [document-level metric key, chunk-level metric key].
groups = {
    "Accuracy": ["doc_accuracy", "chunk_accuracy"],
    # "Recall": ["doc_recall", "chunk_recall"]
}

# Colors per similarity threshold
color_map = {
    0.3: "blue",
    0.4: "orange",
    0.5: "green",
    0.6: "red",
    0.7: "purple",
    0.8: "brown",
    0.9: "pink"
}

# Display labels for the missing-ratio codes; unknown codes are shown as-is.
missing_labels = {"03": "30%", "05": "50%", "015": "15%", "005": "5%"}


# ===========================================================
# MAIN LOOP → Create diagrams for each missing_ratio separately
# ===========================================================
for miss in missing_ratios:

    # Keep only the frames tagged with this missing_ratio. Empty frames are
    # skipped because they have no first row to read the tag from.
    dfs_miss = [
        df for df in dfs
        if not df.empty and df["missing_ratio"].iloc[0] == miss
    ]
    missing_str = missing_labels.get(miss, miss)

    print(f"\n📊 Plotting missing_ratio = {miss} ...")

    for title, keys in groups.items():

        fig = make_subplots(
            rows=1,
            cols=2,
            subplot_titles=(f"Document-level {title}", f"Chunk-level {title}")
        )

        # Every plotted value, collected to derive a shared y-range for both panels.
        all_values = []

        # loop through all similarity thresholds for this missing ratio
        for df in dfs_miss:
            sim = df["similarity_threshold"].iloc[0]
            color = color_map[sim]

            # Column 1 gets the first key (document level), column 2 the second
            # (chunk level). A group with a single key only fills the left panel.
            for col, key in enumerate(keys[:2], start=1):
                y_values = df["metrics"].apply(lambda m: m.get(key))
                all_values.extend(y_values.dropna())

                fig.add_trace(
                    go.Scatter(
                        x=df["batch"],
                        y=y_values,
                        mode="lines+markers",
                        name=f"sim={sim}",
                        # Shared group: one legend entry toggles both panels' traces.
                        legendgroup=f"sim={sim}",
                        showlegend=(col == 1),  # legend only for left plot
                        line=dict(color=color)
                    ),
                    row=1, col=col
                )

        # --- Dynamic y-axis scaling ---
        if all_values:
            ymin, ymax = np.min(all_values), np.max(all_values)
            # 5% headroom on each side; fixed padding when all values are equal.
            padding = (ymax - ymin) * 0.05 if ymax != ymin else 0.05
            yrange = [ymin - padding, ymax + padding]
        else:
            yrange = [0.5, 1]

        fig.update_yaxes(title_text=title, range=yrange, row=1, col=1)
        fig.update_yaxes(title_text=title, range=yrange, row=1, col=2)
        fig.update_xaxes(title_text="Batch", row=1, col=1)
        fig.update_xaxes(title_text="Batch", row=1, col=2)

        fig.update_layout(
            title=f"{title} for OnlineKMeans with {missing_str} missing answers",
            template="plotly_white",
            width=1100,
            height=500,
            legend_title="Similarity threshold"
        )

        fig.show()


📊 Plotting missing_ratio = 03 ...



📊 Plotting missing_ratio = 05 ...



📊 Plotting missing_ratio = 015 ...



📊 Plotting missing_ratio = 005 ...
