# **Analysis**

In [71]:
import pandas as pd
from scipy import stats
import numpy as np
from itertools import combinations
import math
#plt
import matplotlib.pyplot as plt

## **Dev**

### **Importing Dataframes**

In [72]:
df_base = pd.read_csv('./Validation_Results/df_base.csv')
df_decomposition = pd.read_csv('./Validation_Results/df_decomposition.csv')
df_hyde = pd.read_csv('./Validation_Results/df_hyde.csv')

In [73]:
df_gpt_base = pd.read_csv('./Validation_Results/df_gpt_base.csv')
df_gpt_decomposition = pd.read_csv('./Validation_Results/df_gpt_decomposition.csv')
df_gpt_hyde = pd.read_csv('./Validation_Results/df_gpt_hyde.csv')

### **Concat Dataframes**

In [74]:
df_gemma  = pd.concat([
    df_base.assign(model='Gemma', tech='Base'),
    df_decomposition.assign(model='Gemma', tech='Decomposition'),
    df_hyde.assign(model='Gemma', tech='HyDE')
])

df_gpt = pd.concat([
    df_gpt_base.assign(model='GPT', tech='Base'),
    df_gpt_decomposition.assign(model='GPT', tech='Decomposition'),
    df_gpt_hyde.assign(model='GPT', tech='HyDE')
])

df_all = pd.concat([df_gemma, df_gpt], ignore_index=True)
df_all

Unnamed: 0,rouge1-P,rouge1-R,rouge1-F1,rouge2-P,rouge2-R,rouge2-F1,rougeL-P,rougeL-R,rougeL-F1,METEOR,BERTScore Precision,BERTScore Recall,BERTScore F1,MATTR,model,tech
0,0.674419,0.213235,0.324022,0.171875,0.054054,0.082243,0.372093,0.117647,0.178771,0.135188,0.881479,0.852133,0.866558,0.784615,Gemma,Base
1,0.622222,0.0625,0.11359,0.204545,0.020134,0.03666,0.355556,0.035714,0.064909,0.035565,0.882748,0.793773,0.8359,0.8,Gemma,Base
2,0.646018,0.13059,0.217262,0.241071,0.048387,0.080597,0.442478,0.089445,0.14881,0.0815,0.877768,0.818323,0.847004,0.758125,Gemma,Base
3,0.614035,0.064457,0.116667,0.232143,0.023985,0.043478,0.473684,0.049724,0.09,0.042399,0.883883,0.800092,0.839903,0.78,Gemma,Base
4,0.74359,0.148338,0.247335,0.324675,0.064103,0.107066,0.435897,0.086957,0.144989,0.102564,0.894963,0.814272,0.852713,0.713103,Gemma,Base
5,0.57265,0.123162,0.202723,0.206897,0.044199,0.072838,0.393162,0.084559,0.139183,0.081424,0.886397,0.837483,0.861246,0.755882,Gemma,Base
6,0.612732,0.292035,0.395548,0.228723,0.108861,0.147513,0.275862,0.131479,0.178082,0.18513,0.88232,0.866433,0.874305,0.778293,Gemma,Base
7,0.632479,0.31828,0.423462,0.171674,0.086207,0.114778,0.333333,0.167742,0.223176,0.191558,0.875856,0.852887,0.864219,0.755459,Gemma,Base
8,0.43342,0.348008,0.386047,0.086387,0.069328,0.076923,0.190601,0.15304,0.169767,0.198817,0.865793,0.856556,0.86115,0.669701,Gemma,Base
9,0.657143,0.187755,0.292063,0.23741,0.067485,0.105096,0.378571,0.108163,0.168254,0.110851,0.878814,0.840818,0.859396,0.787033,Gemma,Base


In [75]:
num_cols = df_all.select_dtypes('number').columns

# Calculate the mean for each model and tech
mean_table = (
    df_all
      .groupby(['model', 'tech'])[num_cols]
      .mean()
      .round(4)
)
mean_table

Unnamed: 0_level_0,Unnamed: 1_level_0,rouge1-P,rouge1-R,rouge1-F1,rouge2-P,rouge2-R,rouge2-F1,rougeL-P,rougeL-R,rougeL-F1,METEOR,BERTScore Precision,BERTScore Recall,BERTScore F1,MATTR
model,tech,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
GPT,Base,0.3501,0.6883,0.4518,0.133,0.2584,0.1705,0.1784,0.3562,0.2308,0.358,0.8632,0.8925,0.8775,0.7392
GPT,Decomposition,0.2742,0.6756,0.388,0.0952,0.2359,0.135,0.1425,0.3526,0.2019,0.3282,0.8527,0.8887,0.8703,0.7392
GPT,HyDE,0.7062,0.3677,0.4746,0.2778,0.1464,0.1881,0.3566,0.1851,0.239,0.222,0.8948,0.8662,0.8802,0.7457
Gemma,Base,0.6209,0.1888,0.2719,0.2105,0.0587,0.0867,0.3651,0.1024,0.1506,0.1165,0.881,0.8333,0.8562,0.7582
Gemma,Decomposition,0.5569,0.3712,0.4336,0.1499,0.1019,0.118,0.2634,0.1724,0.2028,0.2099,0.8727,0.86,0.8662,0.7579
Gemma,HyDE,0.5706,0.2417,0.3174,0.1667,0.0713,0.0941,0.3134,0.1256,0.1677,0.1469,0.878,0.8421,0.8595,0.7669


In [76]:
def find_strategy_winners_verbose(df, model_name):
    """
    Identifies and prints:
     - average of each metric by technique,
     - winner per metric,
     - total win counts,
    for a given model.
    
    Args:
        df (DataFrame): Complete dataset.
        model_name (str): Model to filter (e.g., "Gemma" or "GPT").
    """
    # 1) Filter only the specified model
    model_df = df[df["model"] == model_name].copy()
    
    # 2) Choose numeric metrics
    metric_cols = [
        c for c in model_df.select_dtypes("number").columns
        if model_df[c].notna().any()
    ]
    
    # 3) Average by technique
    mean_table = model_df.groupby("tech")[metric_cols].mean()
    winner_per_metric = mean_table.idxmax()
    win_counts = winner_per_metric.value_counts()
    
    # 4) Print outputs
    print(f"\n=== AVERAGE OF EACH METRIC BY TECHNIQUE ({model_name}) ===")
    print(mean_table.T.round(4))
    
    print(f"\n=== TECHNIQUE WINNER PER METRIC ({model_name}) ===")
    print(winner_per_metric)
    
    print(f"\n=== WIN COUNTS ({model_name}) ===")
    print(win_counts)
    
    return mean_table, winner_per_metric, win_counts

In [77]:
def ttest_cohen_d_between_techniques(df, model_name):
    """
    Performs paired t-tests and calculates Cohen's d between all pairs
    of techniques for a specified model.
    
    Args:
        df (DataFrame): Complete dataset.
        model_name (str): Model to filter (e.g., "Gemma" or "GPT").
    
    Returns:
        stats_df (DataFrame): DataFrame containing metric, tech_A, tech_B, p_value, and cohen_d.
    """
    from itertools import combinations
    from scipy import stats

    def cohen_d_paired(x, y):
        diff = x - y
        return diff.mean() / diff.std(ddof=1)
    
    # Filter model
    model_df = df[df["model"] == model_name].copy()
    
    # Identify numeric metrics
    metric_cols = [
        c for c in model_df.select_dtypes("number").columns
        if model_df[c].notna().any()
    ]
    
    # Compare techniques pairwise
    rows = []
    for tech_a, tech_b in combinations(model_df["tech"].unique(), 2):
        a = model_df[model_df.tech == tech_a]
        b = model_df[model_df.tech == tech_b]
        for m in metric_cols:
            x = a[m].dropna().reset_index(drop=True)
            y = b[m].dropna().reset_index(drop=True)
            x, y = x.align(y, join="inner")  # Pair by original index
            if len(x) < 2:
                continue
            t, p = stats.ttest_rel(x, y)
            d = cohen_d_paired(x, y)
            rows.append({
                "metric": m,
                "tech_A": tech_a,
                "tech_B": tech_b,
                "p_value": round(p, 4),
                "cohen_d": round(d, 2)
            })
    
    stats_df = pd.DataFrame(rows)
    
    print(f"\n=== t-test + Cohen's d Between Techniques ({model_name}) ===")
    print(stats_df.round(4))
    
    return stats_df

In [78]:
def analyze_wins_from_stats(stats_df, model_name):
    """
    Filters statistically significant results and identifies winners per technique.
    
    Args:
        stats_df (DataFrame): DataFrame containing metric, tech_A, tech_B, p_value, and cohen_d.
        model_name (str): Model name to use in printed outputs (e.g., "Gemma" or "GPT").
    
    Returns:
        sig (DataFrame): Significant comparisons with winner assigned.
        win_counts (Series): Number of wins per technique.
    """
    import numpy as np
    
    # 1) Filter by p < 0.05 and |d| >= 0.5
    sig = stats_df[(stats_df["p_value"] < 0.05) & (stats_df["cohen_d"].abs() >= 0.5)].copy()
    
    # 2) Decide the winner based on the sign of Cohen's d
    sig["winner"] = np.where(sig["cohen_d"] > 0, sig["tech_A"], sig["tech_B"])
    
    # 3) Count wins
    win_counts = sig["winner"].value_counts()
    
    # 4) Print outputs
    print(f"\n### Winner per technique ({model_name}; p<0.05 & |d|>=0.5) ###")
    print(win_counts)
    
    # 5) List metrics won by each technique
    for tech in win_counts.index:
        metrics = sig.loc[sig["winner"] == tech, "metric"].tolist()
        print(f"\n{tech} won: {', '.join(metrics)}")
    
    return sig, win_counts

In [79]:
def analyze_semantic_metrics(gemma, gpt, stat_gemma, stat_gpt, semantic_metrics, top_k=3, confidence_level=0.95):
    """
    Performs semantic metric analysis, printing:
      1) Top-k techniques per metric by mean score with confidence intervals.
      2) Statistical wins per technique (p<0.05 and |d|>=0.5).
    
    Args:
        gemma (DataFrame): Subset of the dataset filtered for the Gemma model.
        gpt (DataFrame): Subset of the dataset filtered for the GPT model.
        stat_gemma (DataFrame): Statistical test results (paired t-test and Cohen's d) for Gemma.
        stat_gpt (DataFrame): Statistical test results for GPT.
        semantic_metrics (list): List of semantic metric names to analyze.
        top_k (int, optional): Number of top techniques to display. Default is 3.
        confidence_level (float, optional): Confidence level for intervals (e.g., 0.95 for 95%). Default is 0.95.
    
    Returns:
        None
    """
    z = 1.96  # z-score for 95% confidence interval
    
    print(f"\n=== Semantic Metric Analysis – Top {top_k} Techniques (Confidence Level: {int(confidence_level * 100)}%) ===")
    for m in semantic_metrics:
        print(f"\n{m}:")
        
        # GEMMA
        grp_g = gemma.groupby('tech')[m]
        desc_g = grp_g.mean().sort_values(ascending=False)
        print("  GEMMA:")
        for rank, tech in enumerate(desc_g.index[:top_k], start=1):
            mu = desc_g.loc[tech]
            std = grp_g.std()[tech]
            n = grp_g.count()[tech]
            sem = std / math.sqrt(n)
            ic = z * sem
            print(f"    {rank}º: {tech}: {mu:.3f} [95% CI: {mu-ic:.3f} – {mu+ic:.3f}] (n={n})")
        
        # GPT
        grp_p = gpt.groupby('tech')[m]
        desc_p = grp_p.mean().sort_values(ascending=False)
        print("  GPT:")
        for rank, tech in enumerate(desc_p.index[:top_k], start=1):
            mu = desc_p.loc[tech]
            std = grp_p.std()[tech]
            n = grp_p.count()[tech]
            sem = std / math.sqrt(n)
            ic = z * sem
            print(f"    {rank}º: {tech}: {mu:.3f} [95% CI: {mu-ic:.3f} – {mu+ic:.3f}] (n={n})")
    
    print("\n=== Statistical Wins by Metric (p<0.05 and |d|>=0.5) ===")
    for m in semantic_metrics:
        cnt_g = stat_gemma[stat_gemma.metric == m]['winner'].value_counts()
        cnt_p = stat_gpt[stat_gpt.metric == m]['winner'].value_counts()
        print(f"\n{m}:")
        print(f"  Gemma: {cnt_g.to_dict() or 'no statistically significant wins'}")
        print(f"  GPT  : {cnt_p.to_dict() or 'no statistically significant wins'}")

In [80]:
def summary(df_all, semantic_metrics=None, top_k=3, confidence_level=0.95):
    """
    Runs a complete performance analysis across techniques for Gemma and GPT models.
    
    Steps:
    1) Victories by average (mean).
    2) Victories by statistical significance (paired t-test, p<0.05 and |d|>=0.5).
    3) Semantic metrics detailed analysis: top-k by mean, statistical wins.
    4) Summarizes all results into a structured table (grouped by Model first, then Criterion).
    
    Args:
        df_all (DataFrame): Complete dataset containing all models, techniques, and metrics.
        semantic_metrics (list, optional): List of semantic metric names to analyze. Defaults to common metrics.
        top_k (int, optional): Number of top techniques to display per metric. Default is 3.
        confidence_level (float, optional): Confidence level for intervals. Default is 0.95 (95%).
    
    Returns:
        summary_table (DataFrame): Final structured table.
    """
    import math
    import numpy as np
    import pandas as pd
    from itertools import combinations
    from scipy import stats
    
    # 1) Filter Gemma and GPT
    gemma = df_all[df_all['model'] == 'Gemma']
    gpt   = df_all[df_all['model'] == 'GPT']

    # 2) Identify numeric metric columns
    metric_cols = [
        c for c in df_all.select_dtypes(include=np.number).columns
        if df_all[c].notna().any()
    ]
    
    # ————————————————————————————————
    # Block 1: Wins by Average
    # ————————————————————————————————
    mean_gemma = gemma.groupby('tech')[metric_cols].mean()
    mean_gpt   = gpt.groupby('tech')[metric_cols].mean()

    win_mean_gemma = mean_gemma.idxmax().value_counts()
    win_mean_gpt   = mean_gpt.idxmax().value_counts()

    print("1) Which strategy wins within Gemma (by average)?")
    print(win_mean_gemma)

    print("\n2) Which strategy wins within GPT (by average)?")
    print(win_mean_gpt)

    # ————————————————————————————————
    # Block 2: Wins by Statistical Test
    # ————————————————————————————————
    def cohen_d_paired(x, y):
        diff = x - y
        return diff.mean() / diff.std(ddof=1)

    def head_to_head(df_model):
        rows = []
        for tech_a, tech_b in combinations(df_model['tech'].unique(), 2):
            a = df_model[df_model.tech == tech_a]
            b = df_model[df_model.tech == tech_b]
            for m in metric_cols:
                xa = a[m].dropna().reset_index(drop=True)
                xb = b[m].dropna().reset_index(drop=True)
                xa, xb = xa.align(xb, join='inner')
                if len(xa) < 2:
                    continue
                t, p = stats.ttest_rel(xa, xb)
                d = cohen_d_paired(xa, xb)
                if (p < 0.05) and (abs(d) >= 0.5):
                    winner = tech_a if d > 0 else tech_b
                    rows.append({'metric': m, 'winner': winner})
        return pd.DataFrame(rows)

    stat_gemma = head_to_head(gemma)
    stat_gpt   = head_to_head(gpt)

    win_stat_gemma = stat_gemma['winner'].value_counts()
    win_stat_gpt   = stat_gpt['winner'].value_counts()

    print("\n3) Which strategy wins statistically within Gemma (p<0.05 & |d|>=0.5)?")
    print(win_stat_gemma)

    print("\n4) Which strategy wins statistically within GPT (p<0.05 & |d|>=0.5)?")
    print(win_stat_gpt)

    # ————————————————————————————————
    # Block 3: Build the final structured table
    # ————————————————————————————————
    summary_table = pd.DataFrame([
        {'Model': 'Gemma', 'Criterion': 'By Average',          'Base': win_mean_gemma.get('Base', 0), 'HyDE': win_mean_gemma.get('HyDE', 0), 'Decomposition': win_mean_gemma.get('Decomposition', 0)},
        {'Model': 'Gemma', 'Criterion': 'By Statistical Test', 'Base': win_stat_gemma.get('Base', 0), 'HyDE': win_stat_gemma.get('HyDE', 0), 'Decomposition': win_stat_gemma.get('Decomposition', 0)},
        {'Model': 'GPT',   'Criterion': 'By Average',          'Base': win_mean_gpt.get('Base', 0),   'HyDE': win_mean_gpt.get('HyDE', 0),   'Decomposition': win_mean_gpt.get('Decomposition', 0)},
        {'Model': 'GPT',   'Criterion': 'By Statistical Test', 'Base': win_stat_gpt.get('Base', 0),   'HyDE': win_stat_gpt.get('HyDE', 0),   'Decomposition': win_stat_gpt.get('Decomposition', 0)}
    ])

    summary_table = summary_table.set_index(['Model', 'Criterion'])

    print("\n=== Final Summary Table ===")
    print(summary_table)

    return summary_table

### **Which strategy achieves the best performance within the Gemma model?**

In [81]:
mean_table_gemma, winner_per_metric_gemma, win_counts_gemma = find_strategy_winners_verbose(df_all, "Gemma")


=== AVERAGE OF EACH METRIC BY TECHNIQUE (Gemma) ===
tech                   Base  Decomposition    HyDE
rouge1-P             0.6209         0.5569  0.5706
rouge1-R             0.1888         0.3712  0.2417
rouge1-F1            0.2719         0.4336  0.3174
rouge2-P             0.2105         0.1499  0.1667
rouge2-R             0.0587         0.1019  0.0713
rouge2-F1            0.0867         0.1180  0.0941
rougeL-P             0.3651         0.2634  0.3134
rougeL-R             0.1024         0.1724  0.1256
rougeL-F1            0.1506         0.2028  0.1677
METEOR               0.1165         0.2099  0.1469
BERTScore Precision  0.8810         0.8727  0.8780
BERTScore Recall     0.8333         0.8600  0.8421
BERTScore F1         0.8562         0.8662  0.8595
MATTR                0.7582         0.7579  0.7669

=== TECHNIQUE WINNER PER METRIC (Gemma) ===
rouge1-P                        Base
rouge1-R               Decomposition
rouge1-F1              Decomposition
rouge2-P                  

In [82]:
stats_df_gemma = ttest_cohen_d_between_techniques(df_all, "Gemma")


=== t-test + Cohen's d Between Techniques (Gemma) ===
                 metric         tech_A         tech_B  p_value  cohen_d
0              rouge1-P           Base  Decomposition   0.1244     0.54
1              rouge1-R           Base  Decomposition   0.0031    -1.26
2             rouge1-F1           Base  Decomposition   0.0010    -1.50
3              rouge2-P           Base  Decomposition   0.0294     0.82
4              rouge2-R           Base  Decomposition   0.0169    -0.93
5             rouge2-F1           Base  Decomposition   0.0402    -0.76
6              rougeL-P           Base  Decomposition   0.0200     0.89
7              rougeL-R           Base  Decomposition   0.0048    -1.18
8             rougeL-F1           Base  Decomposition   0.0056    -1.14
9                METEOR           Base  Decomposition   0.0059    -1.13
10  BERTScore Precision           Base  Decomposition   0.1380     0.51
11     BERTScore Recall           Base  Decomposition   0.0112    -1.00
12       

In [83]:
sig_gemma, win_counts_gemma = analyze_wins_from_stats(stats_df_gemma, "Gemma")


### Winner per technique (Gemma; p<0.05 & |d|>=0.5) ###
winner
Decomposition    12
Base              3
Name: count, dtype: int64

Decomposition won: rouge1-R, rouge1-F1, rouge2-R, rouge2-F1, rougeL-R, rougeL-F1, METEOR, BERTScore Recall, BERTScore F1, rouge1-R, rouge1-F1, rougeL-R

Base won: rouge2-P, rougeL-P, rouge2-P


### **Which strategy performed best within the GPT model?**

In [84]:
mean_table_gpt, winner_per_metric_gpt, win_counts_gpt = find_strategy_winners_verbose(df_all, "GPT")


=== AVERAGE OF EACH METRIC BY TECHNIQUE (GPT) ===
tech                   Base  Decomposition    HyDE
rouge1-P             0.3501         0.2742  0.7062
rouge1-R             0.6883         0.6756  0.3677
rouge1-F1            0.4518         0.3880  0.4746
rouge2-P             0.1330         0.0952  0.2778
rouge2-R             0.2584         0.2359  0.1464
rouge2-F1            0.1705         0.1350  0.1881
rougeL-P             0.1784         0.1425  0.3566
rougeL-R             0.3562         0.3526  0.1851
rougeL-F1            0.2308         0.2019  0.2390
METEOR               0.3580         0.3282  0.2220
BERTScore Precision  0.8632         0.8527  0.8948
BERTScore Recall     0.8925         0.8887  0.8662
BERTScore F1         0.8775         0.8703  0.8802
MATTR                0.7392         0.7392  0.7457

=== TECHNIQUE WINNER PER METRIC (GPT) ===
rouge1-P               HyDE
rouge1-R               Base
rouge1-F1              HyDE
rouge2-P               HyDE
rouge2-R               Base
r

In [85]:
stats_df_gpt = ttest_cohen_d_between_techniques(df_all, "GPT")


=== t-test + Cohen's d Between Techniques (GPT) ===
                 metric         tech_A         tech_B  p_value  cohen_d
0              rouge1-P           Base  Decomposition   0.0394     0.76
1              rouge1-R           Base  Decomposition   0.4850     0.23
2             rouge1-F1           Base  Decomposition   0.0325     0.80
3              rouge2-P           Base  Decomposition   0.0621     0.67
4              rouge2-R           Base  Decomposition   0.1416     0.51
5             rouge2-F1           Base  Decomposition   0.0541     0.70
6              rougeL-P           Base  Decomposition   0.0597     0.68
7              rougeL-R           Base  Decomposition   0.7699     0.10
8             rougeL-F1           Base  Decomposition   0.0570     0.69
9                METEOR           Base  Decomposition   0.0149     0.95
10  BERTScore Precision           Base  Decomposition   0.0371     0.77
11     BERTScore Recall           Base  Decomposition   0.2032     0.43
12         

  return diff.mean() / diff.std(ddof=1)


In [86]:
sig_gpt, win_counts_gpt = analyze_wins_from_stats(stats_df_gpt, "GPT")


### Winner per technique (GPT; p<0.05 & |d|>=0.5) ###
winner
HyDE             12
Base             10
Decomposition     5
Name: count, dtype: int64

HyDE won: rouge1-P, rouge2-P, rougeL-P, BERTScore Precision, rouge1-P, rouge1-F1, rouge2-P, rouge2-F1, rougeL-P, rougeL-F1, BERTScore Precision, BERTScore F1

Base won: rouge1-P, rouge1-F1, METEOR, BERTScore Precision, BERTScore F1, rouge1-R, rouge2-R, rougeL-R, METEOR, BERTScore Recall

Decomposition won: rouge1-R, rouge2-R, rougeL-R, METEOR, BERTScore Recall


### **Overview**

In [87]:
semantic_metrics = [
    'BERTScore Precision',
    'BERTScore Recall',
    'BERTScore F1',
    'MATTR'
]

analyze_semantic_metrics(
    gemma=df_all[df_all['model'] == 'Gemma'],
    gpt=df_all[df_all['model'] == 'GPT'],
    stat_gemma=sig_gemma,
    stat_gpt=sig_gpt,
    semantic_metrics=semantic_metrics,
    top_k=3,               
    confidence_level=0.95 
)


=== Semantic Metric Analysis – Top 3 Techniques (Confidence Level: 95%) ===

BERTScore Precision:
  GEMMA:
    1º: Base: 0.881 [95% CI: 0.876 – 0.886] (n=10)
    2º: HyDE: 0.878 [95% CI: 0.872 – 0.884] (n=10)
    3º: Decomposition: 0.873 [95% CI: 0.865 – 0.880] (n=10)
  GPT:
    1º: HyDE: 0.895 [95% CI: 0.888 – 0.901] (n=10)
    2º: Base: 0.863 [95% CI: 0.853 – 0.874] (n=10)
    3º: Decomposition: 0.853 [95% CI: 0.847 – 0.859] (n=10)

BERTScore Recall:
  GEMMA:
    1º: Decomposition: 0.860 [95% CI: 0.852 – 0.868] (n=10)
    2º: HyDE: 0.842 [95% CI: 0.826 – 0.859] (n=10)
    3º: Base: 0.833 [95% CI: 0.818 – 0.849] (n=10)
  GPT:
    1º: Base: 0.892 [95% CI: 0.885 – 0.900] (n=10)
    2º: Decomposition: 0.889 [95% CI: 0.883 – 0.895] (n=10)
    3º: HyDE: 0.866 [95% CI: 0.855 – 0.877] (n=10)

BERTScore F1:
  GEMMA:
    1º: Decomposition: 0.866 [95% CI: 0.859 – 0.873] (n=10)
    2º: HyDE: 0.859 [95% CI: 0.850 – 0.869] (n=10)
    3º: Base: 0.856 [95% CI: 0.849 – 0.864] (n=10)
  GPT:
    1º: H

In [88]:
summary(df_all)

1) Which strategy wins within Gemma (by average)?
Decomposition    9
Base             4
HyDE             1
Name: count, dtype: int64

2) Which strategy wins within GPT (by average)?
HyDE    9
Base    5
Name: count, dtype: int64

3) Which strategy wins statistically within Gemma (p<0.05 & |d|>=0.5)?
winner
Decomposition    12
Base              3
Name: count, dtype: int64

4) Which strategy wins statistically within GPT (p<0.05 & |d|>=0.5)?
winner
HyDE             12
Base             10
Decomposition     5
Name: count, dtype: int64

=== Final Summary Table ===
                           Base  HyDE  Decomposition
Model Criterion                                     
Gemma By Average              4     1              9
      By Statistical Test     3     0             12
GPT   By Average              5     9              0
      By Statistical Test    10    12              5


  return diff.mean() / diff.std(ddof=1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Base,HyDE,Decomposition
Model,Criterion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gemma,By Average,4,1,9
Gemma,By Statistical Test,3,0,12
GPT,By Average,5,9,0
GPT,By Statistical Test,10,12,5


## **Engineering**

### **Importing Dataframes**

In [89]:
df_eng_base = pd.read_csv('./Validation_Results/df_eng_base.csv')
df_eng_decomposition = pd.read_csv('./Validation_Results/df_eng_decomposition.csv')
df_eng_hyde = pd.read_csv('./Validation_Results/df_eng_hyde.csv')

In [90]:
df_eng_gpt_base = pd.read_csv('./Validation_Results/df_eng_gpt_base.csv')
df_eng_gpt_decomposition = pd.read_csv('./Validation_Results/df_eng_gpt_decomposition.csv')
df_eng_gpt_hyde = pd.read_csv('./Validation_Results/df_eng_gpt_hyde.csv')

### **Concat Dataframes**

In [91]:
df_gemma  = pd.concat([
    df_eng_base.assign(model='Gemma', tech='Base'),
    df_eng_decomposition.assign(model='Gemma', tech='Decomposition'),
    df_eng_hyde.assign(model='Gemma', tech='HyDE')
])

df_gpt = pd.concat([
    df_eng_gpt_base.assign(model='GPT', tech='Base'),
    df_eng_gpt_decomposition.assign(model='GPT', tech='Decomposition'),
    df_eng_gpt_hyde.assign(model='GPT', tech='HyDE')
])

df_all = pd.concat([df_gemma, df_gpt], ignore_index=True)
df_all

Unnamed: 0,rouge1-P,rouge1-R,rouge1-F1,rouge2-P,rouge2-R,rouge2-F1,rougeL-P,rougeL-R,rougeL-F1,METEOR,BERTScore Precision,BERTScore Recall,BERTScore F1,MATTR,model,tech
0,0.3,0.482759,0.370044,0.100719,0.162791,0.124444,0.214286,0.344828,0.264317,0.303169,0.876839,0.887158,0.881968,0.802857,Gemma,Base
1,0.258786,0.522581,0.346154,0.0576,0.116505,0.077088,0.124601,0.251613,0.166667,0.258973,0.856648,0.873663,0.865072,0.806343,Gemma,Base
2,0.373874,0.525316,0.436842,0.090498,0.127389,0.10582,0.157658,0.221519,0.184211,0.313936,0.879132,0.884566,0.88184,0.801965,Gemma,Base
3,0.289283,0.785558,0.42285,0.112097,0.304825,0.163915,0.120064,0.326039,0.175501,0.362792,0.868069,0.89535,0.881498,0.798523,Gemma,Base
4,0.259259,0.753846,0.385827,0.10424,0.304124,0.155263,0.123457,0.358974,0.183727,0.397709,0.862435,0.90805,0.884655,0.81556,Gemma,Base
5,0.214844,0.760369,0.335025,0.100391,0.356481,0.156663,0.125,0.442396,0.194924,0.366189,0.862882,0.885754,0.874169,0.770629,Gemma,Base
6,0.403651,0.496259,0.44519,0.148374,0.1825,0.163677,0.225152,0.276808,0.248322,0.284623,0.860501,0.862375,0.861437,0.678514,Gemma,Base
7,0.373134,0.724638,0.492611,0.270677,0.529412,0.358209,0.30597,0.594203,0.403941,0.427665,0.897906,0.926077,0.911774,0.687059,Gemma,Base
8,0.257218,0.723247,0.379477,0.094612,0.266667,0.13967,0.139108,0.391144,0.205227,0.345948,0.874368,0.901394,0.887675,0.816101,Gemma,Base
9,0.484487,0.618902,0.543507,0.212919,0.272171,0.238926,0.23389,0.29878,0.262383,0.350207,0.88142,0.882854,0.882136,0.665081,Gemma,Base


In [92]:
num_cols = df_all.select_dtypes('number').columns

mean_table = (
    df_all
      .groupby(['model', 'tech'])[num_cols]
      .mean()
      .round(4)
)
mean_table

Unnamed: 0_level_0,Unnamed: 1_level_0,rouge1-P,rouge1-R,rouge1-F1,rouge2-P,rouge2-R,rouge2-F1,rougeL-P,rougeL-R,rougeL-F1,METEOR,BERTScore Precision,BERTScore Recall,BERTScore F1,MATTR
model,tech,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
GPT,Base,0.6834,0.3678,0.4514,0.3068,0.1595,0.1982,0.3747,0.2031,0.2483,0.2357,0.9011,0.8784,0.8895,0.7763
GPT,Decomposition,0.6282,0.34,0.4066,0.2675,0.1316,0.1656,0.3323,0.1805,0.2155,0.2169,0.8875,0.8704,0.8787,0.7997
GPT,HyDE,0.8065,0.1156,0.1913,0.3945,0.0504,0.0855,0.5916,0.0816,0.1363,0.0816,0.9017,0.8201,0.8588,0.8087
Gemma,Base,0.3215,0.6393,0.4158,0.1292,0.2623,0.1684,0.1769,0.3506,0.2289,0.3411,0.872,0.8907,0.8812,0.7643
Gemma,Decomposition,0.5267,0.4626,0.4671,0.1789,0.1563,0.1592,0.2336,0.2125,0.2109,0.2709,0.8769,0.8781,0.8774,0.8092
Gemma,HyDE,0.5976,0.3188,0.4032,0.2018,0.1038,0.1326,0.3002,0.1622,0.2042,0.1943,0.8857,0.8689,0.8772,0.8015


### **Which strategy achieves the best performance within the Gemma model?**


In [93]:
mean_table_gemma, winner_per_metric_gemma, win_counts_gemma = find_strategy_winners_verbose(df_all, "Gemma")


=== AVERAGE OF EACH METRIC BY TECHNIQUE (Gemma) ===
tech                   Base  Decomposition    HyDE
rouge1-P             0.3215         0.5267  0.5976
rouge1-R             0.6393         0.4626  0.3188
rouge1-F1            0.4158         0.4671  0.4032
rouge2-P             0.1292         0.1789  0.2018
rouge2-R             0.2623         0.1563  0.1038
rouge2-F1            0.1684         0.1592  0.1326
rougeL-P             0.1769         0.2336  0.3002
rougeL-R             0.3506         0.2125  0.1622
rougeL-F1            0.2289         0.2109  0.2042
METEOR               0.3411         0.2709  0.1943
BERTScore Precision  0.8720         0.8769  0.8857
BERTScore Recall     0.8907         0.8781  0.8689
BERTScore F1         0.8812         0.8774  0.8772
MATTR                0.7643         0.8092  0.8015

=== TECHNIQUE WINNER PER METRIC (Gemma) ===
rouge1-P                        HyDE
rouge1-R                        Base
rouge1-F1              Decomposition
rouge2-P                  

In [94]:
stats_df_gemma = ttest_cohen_d_between_techniques(df_all, "Gemma")


=== t-test + Cohen's d Between Techniques (Gemma) ===
                 metric         tech_A         tech_B  p_value  cohen_d
0              rouge1-P           Base  Decomposition   0.0074    -1.09
1              rouge1-R           Base  Decomposition   0.0320     0.80
2             rouge1-F1           Base  Decomposition   0.0371    -0.77
3              rouge2-P           Base  Decomposition   0.0441    -0.74
4              rouge2-R           Base  Decomposition   0.0174     0.92
5             rouge2-F1           Base  Decomposition   0.5635     0.19
6              rougeL-P           Base  Decomposition   0.0512    -0.71
7              rougeL-R           Base  Decomposition   0.0028     1.28
8             rougeL-F1           Base  Decomposition   0.0209     0.88
9                METEOR           Base  Decomposition   0.0237     0.86
10  BERTScore Precision           Base  Decomposition   0.3803    -0.29
11     BERTScore Recall           Base  Decomposition   0.0656     0.66
12       

In [95]:
sig_gpt, win_counts_gpt = analyze_wins_from_stats(stats_df_gpt, "Gemma")


### Winner per technique (Gemma; p<0.05 & |d|>=0.5) ###
winner
HyDE             12
Base             10
Decomposition     5
Name: count, dtype: int64

HyDE won: rouge1-P, rouge2-P, rougeL-P, BERTScore Precision, rouge1-P, rouge1-F1, rouge2-P, rouge2-F1, rougeL-P, rougeL-F1, BERTScore Precision, BERTScore F1

Base won: rouge1-P, rouge1-F1, METEOR, BERTScore Precision, BERTScore F1, rouge1-R, rouge2-R, rougeL-R, METEOR, BERTScore Recall

Decomposition won: rouge1-R, rouge2-R, rougeL-R, METEOR, BERTScore Recall


### **Which strategy performed best within the GPT model?**

In [96]:
mean_table_gpt, winner_per_metric_gpt, win_counts_gpt = find_strategy_winners_verbose(df_all, "GPT")


=== AVERAGE OF EACH METRIC BY TECHNIQUE (GPT) ===
tech                   Base  Decomposition    HyDE
rouge1-P             0.6834         0.6282  0.8065
rouge1-R             0.3678         0.3400  0.1156
rouge1-F1            0.4514         0.4066  0.1913
rouge2-P             0.3068         0.2675  0.3945
rouge2-R             0.1595         0.1316  0.0504
rouge2-F1            0.1982         0.1656  0.0855
rougeL-P             0.3747         0.3323  0.5916
rougeL-R             0.2031         0.1805  0.0816
rougeL-F1            0.2483         0.2155  0.1363
METEOR               0.2357         0.2169  0.0816
BERTScore Precision  0.9011         0.8875  0.9017
BERTScore Recall     0.8784         0.8704  0.8201
BERTScore F1         0.8895         0.8787  0.8588
MATTR                0.7763         0.7997  0.8087

=== TECHNIQUE WINNER PER METRIC (GPT) ===
rouge1-P               HyDE
rouge1-R               Base
rouge1-F1              Base
rouge2-P               HyDE
rouge2-R               Base
r

In [97]:
stats_df_gpt = ttest_cohen_d_between_techniques(df_all, "GPT")


=== t-test + Cohen's d Between Techniques (GPT) ===
                 metric         tech_A         tech_B  p_value  cohen_d
0              rouge1-P           Base  Decomposition   0.1126     0.56
1              rouge1-R           Base  Decomposition   0.3054     0.34
2             rouge1-F1           Base  Decomposition   0.0544     0.70
3              rouge2-P           Base  Decomposition   0.1953     0.44
4              rouge2-R           Base  Decomposition   0.0298     0.81
5             rouge2-F1           Base  Decomposition   0.0120     0.99
6              rougeL-P           Base  Decomposition   0.1717     0.47
7              rougeL-R           Base  Decomposition   0.0212     0.88
8             rougeL-F1           Base  Decomposition   0.0114     1.00
9                METEOR           Base  Decomposition   0.1097     0.56
10  BERTScore Precision           Base  Decomposition   0.0249     0.85
11     BERTScore Recall           Base  Decomposition   0.0112     1.01
12         

In [98]:
sig_gpt, win_counts_gpt = analyze_wins_from_stats(stats_df_gpt, "GPT")


### Winner per technique (GPT; p<0.05 & |d|>=0.5) ###
winner
Base             16
Decomposition     9
HyDE              7
Name: count, dtype: int64

Base won: rouge2-R, rouge2-F1, rougeL-R, rougeL-F1, BERTScore Precision, BERTScore Recall, BERTScore F1, rouge1-R, rouge1-F1, rouge2-R, rouge2-F1, rougeL-R, rougeL-F1, METEOR, BERTScore Recall, BERTScore F1

Decomposition won: rouge1-R, rouge1-F1, rouge2-R, rouge2-F1, rougeL-R, rougeL-F1, METEOR, BERTScore Recall, BERTScore F1

HyDE won: rouge1-P, rouge2-P, rougeL-P, rouge1-P, rouge2-P, rougeL-P, BERTScore Precision


### **Overview**

In [99]:
semantic_metrics = [
    'BERTScore Precision',
    'BERTScore Recall',
    'BERTScore F1',
    'MATTR'
]

analyze_semantic_metrics(
    gemma=df_all[df_all['model'] == 'Gemma'],
    gpt=df_all[df_all['model'] == 'GPT'],
    stat_gemma=sig_gemma,
    stat_gpt=sig_gpt,
    semantic_metrics=semantic_metrics,
    top_k=3,               
    confidence_level=0.95 
)


=== Semantic Metric Analysis – Top 3 Techniques (Confidence Level: 95%) ===

BERTScore Precision:
  GEMMA:
    1º: HyDE: 0.886 [95% CI: 0.877 – 0.895] (n=10)
    2º: Decomposition: 0.877 [95% CI: 0.867 – 0.887] (n=10)
    3º: Base: 0.872 [95% CI: 0.864 – 0.880] (n=10)
  GPT:
    1º: HyDE: 0.902 [95% CI: 0.899 – 0.905] (n=10)
    2º: Base: 0.901 [95% CI: 0.895 – 0.907] (n=10)
    3º: Decomposition: 0.888 [95% CI: 0.876 – 0.899] (n=10)

BERTScore Recall:
  GEMMA:
    1º: Base: 0.891 [95% CI: 0.880 – 0.902] (n=10)
    2º: Decomposition: 0.878 [95% CI: 0.869 – 0.887] (n=10)
    3º: HyDE: 0.869 [95% CI: 0.863 – 0.875] (n=10)
  GPT:
    1º: Base: 0.878 [95% CI: 0.871 – 0.886] (n=10)
    2º: Decomposition: 0.870 [95% CI: 0.865 – 0.876] (n=10)
    3º: HyDE: 0.820 [95% CI: 0.806 – 0.834] (n=10)

BERTScore F1:
  GEMMA:
    1º: Base: 0.881 [95% CI: 0.873 – 0.890] (n=10)
    2º: Decomposition: 0.877 [95% CI: 0.870 – 0.885] (n=10)
    3º: HyDE: 0.877 [95% CI: 0.872 – 0.883] (n=10)
  GPT:
    1º: B

In [100]:
summary(df_all)

1) Which strategy wins within Gemma (by average)?
Base             8
HyDE             4
Decomposition    2
Name: count, dtype: int64

2) Which strategy wins within GPT (by average)?
Base    9
HyDE    5
Name: count, dtype: int64

3) Which strategy wins statistically within Gemma (p<0.05 & |d|>=0.5)?
winner
Base             10
Decomposition     9
HyDE              6
Name: count, dtype: int64

4) Which strategy wins statistically within GPT (p<0.05 & |d|>=0.5)?
winner
Base             16
Decomposition     9
HyDE              7
Name: count, dtype: int64

=== Final Summary Table ===
                           Base  HyDE  Decomposition
Model Criterion                                     
Gemma By Average              8     4              2
      By Statistical Test    10     6              9
GPT   By Average              9     5              0
      By Statistical Test    16     7              9


Unnamed: 0_level_0,Unnamed: 1_level_0,Base,HyDE,Decomposition
Model,Criterion,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Gemma,By Average,8,4,2
Gemma,By Statistical Test,10,6,9
GPT,By Average,9,5,0
GPT,By Statistical Test,16,7,9
