LEX OLS 

## Prep

In [None]:
# 1) Imports & MongoDB-Verbindung
from pymongo import MongoClient
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Connect to the local MongoDB instance and select the transcript collection
client = MongoClient("mongodb://localhost:27018/")
db = client["transcriptions"]
coll = db["transcripts_denis"]

# Materialise every document whose `excludeGeneral` field equals 0
# (i.e. everything that was not excluded by hand)
docs = [record for record in coll.find({"excludeGeneral": 0})]


In [None]:
# 2) Fit TF-IDF on the complete corpus
# The IDF weights have to be global, so both text fields of ALL documents
# are fitted together.
lex_hyps = [entry["text_lex_denis"] for entry in docs]
lex_refs = [entry["src_lex_denis"] for entry in docs]

# Row layout of the corpus: first every `text_lex_denis` entry, then every
# `src_lex_denis` entry, both in the order of `docs`.
corpus = [*lex_hyps, *lex_refs]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)


In [None]:
# 3) Compute the pairwise cosine similarity and collect it in a DataFrame
N = len(docs)
meta_fields = ("convoID", "ambientVariant", "processedVolume", "technology", "model")

rows = []
for idx in range(N):
    # Row `idx` is the `text_lex_denis` vector of document idx, row `idx + N`
    # the `src_lex_denis` vector of the same document (corpus = hyps + refs).
    pair_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix[idx + N])

    # Metadata first, score last, so the CSV column order stays the same
    record = {field: docs[idx].get(field) for field in meta_fields}
    record["lex_cosine_sim"] = pair_sim[0][0]
    rows.append(record)

df_lex = pd.DataFrame(rows)
df_lex.to_csv("lexical_cosine_scores_full.csv", index=False, encoding="utf-8-sig")
print("Lexical Cosine Similarity scores saved to lexical_cosine_scores_full.csv")

In [None]:
# %%  
# 1) Deskriptive Statistik über alle Dokumente
import pandas as pd

# Reload the per-document scores from disk
df_lex = pd.read_csv("lexical_cosine_scores_full.csv")

# Summary figures of the similarity column
summary_funcs = ["mean", "median", "std", "min", "max"]
overall = df_lex["lex_cosine_sim"].agg(summary_funcs)

print("=== Overall Lexical Cosine Similarity ===")
print(overall, "\n")

# Persist the summary as a one-column table named "value" (optional)
overall_table = overall.to_frame(name="value")
overall_table.to_csv("lex_overall_stats.csv")


In [None]:
# %%  
# 2) Gruppiert nach Technologie & Modell
import pandas as pd

df_lex = pd.read_csv("lexical_cosine_scores_full.csv")

# One row per (technology, model) combination, one column per summary figure
grouped = df_lex.groupby(["technology", "model"])["lex_cosine_sim"]
stats = grouped.agg(["mean", "median", "std", "min", "max"])
stats = stats.reset_index()

print("=== Lexical Cosine by Technology & Model ===")
print(stats)

# Write the table to CSV without the positional index
stats.to_csv("lex_stats_by_model.csv", index=False)


# Descriptive

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
from matplotlib.ticker import MultipleLocator

# Suppress FutureWarning / UserWarning output (process-wide once executed)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# 1) Load data and build the "technology–model" key
csv_file = "lexical_cosine_scores_full.csv"
df = pd.read_csv(csv_file)
df["system"] = df["technology"] + "–" + df["model"]

# 2) Map the system key to a short display label
#    (keys that are not in the map become NaN)
label_map = {
    "recapp–gsw-CH_smoothed":          "recapp",
    "vosk–vosk-model-de-0.21":         "vosk",
    "whisper–medium":                  "whisper_medium",
    "whisper–large":                   "whisper_large",
    "whisper–turbo":                   "whisper_turbo",
    "whisper_v2–whisper_rescuespeech": "rescuespeech",
}
df["label"] = df["system"].map(label_map)

# 3) Order & colours: preferred order, restricted to labels present in the data
preferred_order = [
    "recapp", "vosk", "whisper_medium",
    "whisper_large", "whisper_turbo", "rescuespeech",
]
labels_present = df["label"].unique()
order = [name for name in preferred_order if name in labels_present]
palette = {
    "recapp":         "#c6c6c6",
    "vosk":           "#aad9e6",
    "whisper_medium": "#7bc96f",
    "whisper_large":  "#2ca02c",
    "whisper_turbo":  "#1f7a1f",
    "rescuespeech":   "#fdb064",
}

# 4) Plot style
sns.set(style="whitegrid", font_scale=1.0)

# 5) Figure and axes with absolute placement
fig = plt.figure(figsize=(3, 3))             # 3 in wide, 3 in high
ax = fig.add_axes([0.18, 0.18, 0.75, 0.75])  # [left, bottom, width, height] as figure fractions

# 6) Draw the box plot
flierprops = {"marker": "o", "color": "black", "markersize": 4}
sns.boxplot(
    x="label",
    y="lex_cosine_sim",
    data=df,
    order=order,
    palette=palette,
    width=0.4,
    flierprops=flierprops,
    ax=ax,
)

# 7) Title, axis labels and a fixed y range
ax.set_title("TF-IDF Cosine Sim. per Model", pad=12)
ax.set_xlabel("")  # no label below the x axis
ax.set_ylabel("Cosine Similarity", labelpad=8)
ax.set_ylim(0, 1)

# 8) Uniform horizontal grid lines every 0.1
ax.yaxis.set_major_locator(MultipleLocator(0.1))
ax.grid(axis="y", which="major", linestyle="--", linewidth=0.5)

# 9) Rotate the x tick labels
ax.set_xticklabels(order, rotation=45, ha="right", fontsize=10)

# 10) Result folder, placed next to the CSV file
output_dir = os.path.join(os.path.dirname(csv_file), "results_for_paper")
os.makedirs(output_dir, exist_ok=True)

# 11) Save without tight_layout so the axes placement from step 5 is kept
output_png = os.path.join(output_dir, "tfidf_cosine_boxplot_fixed.png")
fig.savefig(output_png, dpi=300, format="png")

# 12) Show the figure and report the path
plt.show()
print(f"Plot saved at: {output_png}")





In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
from matplotlib.ticker import MultipleLocator

# Suppress FutureWarning / UserWarning output (process-wide once executed)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# 1) Load data and build the "technology–model" key
csv_file = "lexical_cosine_scores_full.csv"
df = pd.read_csv(csv_file)
df["system"] = df["technology"] + "–" + df["model"]

# 2) Map the system key to a short display label
#    (keys that are not in the map become NaN)
label_map = {
    "recapp–gsw-CH_smoothed":          "recapp",
    "vosk–vosk-model-de-0.21":         "vosk",
    "whisper–medium":                  "whisper_medium",
    "whisper–large":                   "whisper_large",
    "whisper–turbo":                   "whisper_turbo",
    "whisper_v2–whisper_rescuespeech": "rescuespeech",
}
df["label"] = df["system"].map(label_map)

# 3) Order & colours: preferred order, restricted to labels present in the data
preferred_order = [
    "recapp", "vosk", "whisper_medium",
    "whisper_large", "whisper_turbo", "rescuespeech",
]
labels_present = df["label"].unique()
order = [name for name in preferred_order if name in labels_present]
palette = {
    "recapp":         "#c6c6c6",
    "vosk":           "#aad9e6",
    "whisper_medium": "#7bc96f",
    "whisper_large":  "#2ca02c",
    "whisper_turbo":  "#1f7a1f",
    "rescuespeech":   "#fdb064",
}

# 4) Plot style
sns.set(style="whitegrid", font_scale=1.0)

# 5) Figure and axes with absolute placement
fig = plt.figure(figsize=(3, 3))
# Margins intended to match the final WER plot (per the original author's
# note; that plot is not part of this notebook — TODO confirm)
ax = fig.add_axes([0.12, 0.18, 0.80, 0.75])

# 6) Draw the box plot
flierprops = {"marker": "o", "color": "black", "markersize": 4}
sns.boxplot(
    x="label",
    y="lex_cosine_sim",
    data=df,
    order=order,
    palette=palette,
    width=0.4,
    flierprops=flierprops,
    ax=ax,
)

# 7) Title, axis labels and a fixed y range
ax.set_title("TF-IDF Cosine Sim. per Model", pad=12)
ax.set_xlabel("")
ax.set_ylabel("Cosine Similarity", labelpad=8)
ax.set_ylim(0, 1)

# 8) Uniform horizontal grid lines every 0.1
ax.yaxis.set_major_locator(MultipleLocator(0.1))
ax.grid(axis="y", which="major", linestyle="--", linewidth=0.5)

# 9) Rotate the x tick labels
ax.set_xticklabels(order, rotation=45, ha="right", fontsize=10)

# 10) Result folder & target file
# NOTE(review): this is the same path the previous boxplot cell writes to, so
# running both cells leaves only this cell's image on disk.
output_dir = os.path.join(os.path.dirname(csv_file), "results_for_paper")
os.makedirs(output_dir, exist_ok=True)
output_png = os.path.join(output_dir, "tfidf_cosine_boxplot_fixed.png")

# Save with a tight bounding box plus 0.1 in of padding.
# NOTE(review): bbox_inches='tight' crops to the drawn artists, so the saved
# image need not keep the figsize / add_axes margins from step 5 — confirm
# that this is the intended output.
fig.savefig(output_png, dpi=300, format="png", pad_inches=0.1, bbox_inches='tight')

# 11) Show the figure and report the path
plt.show()
print(f"Plot saved at: {output_png}")



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os
from matplotlib.ticker import MultipleLocator

# Suppress FutureWarning / UserWarning output (process-wide once executed)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# 1) Load data and build the "technology–model" key
csv_file = "lexical_cosine_scores_full.csv"
df = pd.read_csv(csv_file)
df["system"] = df["technology"] + "–" + df["model"]

# 2) Map the system key to a short display label
label_map = {
    "recapp–gsw-CH_smoothed":          "recapp",
    "vosk–vosk-model-de-0.21":         "vosk",
    "whisper–medium":                  "whisper_medium",
    "whisper–large":                   "whisper_large",
    "whisper–turbo":                   "whisper_turbo",
    "whisper_v2–whisper_rescuespeech": "rescuespeech",
}
df["label"] = df["system"].map(label_map)

# 3) Order & colours (fixed list, not filtered by what occurs in the data)
order = [
    "recapp", "vosk", "whisper_medium",
    "whisper_large", "whisper_turbo", "rescuespeech",
]
palette = {
    "recapp":         "#c6c6c6",
    "vosk":           "#aad9e6",
    "whisper_medium": "#7bc96f",
    "whisper_large":  "#2ca02c",
    "whisper_turbo":  "#1f7a1f",
    "rescuespeech":   "#fdb064",
}

# 4) Fixed dBFS levels (facet columns) and ambient variants (facet rows)
# Display text per level, with the SNR in parentheses
vol_labels = {
    "-35dBFS": "-35 dBFS\n(SNR: 18 dB)",
    "-30dBFS": "-30 dBFS\n(SNR: 13 dB)",
    "-25dBFS": "-25 dBFS\n(SNR: 8 dB)",
    "-20dBFS": "-20 dBFS\n(SNR: 3 dB)",
    "-15dBFS": "-15 dBFS\n(SNR: –2 dB)",
}
volumes = list(vol_labels)  # keys in insertion order: -35 ... -15 dBFS
ambients = list(df["ambientVariant"].unique())
ambients.sort()

sns.set(style="whitegrid", font_scale=1.0)

# 5) Facet grid with a fixed row / column order
g = sns.catplot(
    kind="box",
    data=df,
    x="label",
    y="lex_cosine_sim",
    row="ambientVariant",
    row_order=ambients,
    col="processedVolume",
    col_order=volumes,
    order=order,
    palette=palette,
    height=3,
    aspect=1,
    sharey=True,
    margin_titles=True,
    legend=False,
)

# 6) Drop the default facet titles
g.set_titles(row_template="", col_template="")

# 7) Spacing and super title
layout = {
    "top": 0.85,     # room for the main title + "Noise Volume"
    "left": 0.18,    # room for the ambient labels
    "right": 0.98,
    "hspace": 0.35,
    "wspace": 0.25,
}
g.fig.subplots_adjust(**layout)
g.fig.suptitle("TF-IDF Cosine Similarity", fontsize=16)

# 8) "Noise Volume" caption above the columns, slightly below the main title
g.fig.text(
    0.50, 0.92,
    "Noise Volume",
    ha="center", va="center",
    fontsize=12, fontweight="bold",
)

# 9) Column headings (volume with SNR) on the axes of the first row
for col_idx, vol in enumerate(volumes):
    ax = g.axes[0, col_idx]
    ax.set_title(vol_labels[vol], pad=8)

# 10) Fine-tune every facet
for row_idx, amb in enumerate(ambients):
    for col_idx in range(len(volumes)):
        ax = g.axes[row_idx, col_idx]
        ax.set_ylim(0, 1)
        ax.yaxis.set_major_locator(MultipleLocator(0.1))
        ax.grid(axis="y", which="major", linestyle="--", linewidth=0.5)
        ax.set_xticklabels(order, rotation=90, ha="center", fontsize=10)
        ax.set_xlabel("")  # no x label
        ax.set_ylabel("")  # no y label inside the facets

    # Ambient name written by hand to the left of the row's first axes
    ax0 = g.axes[row_idx, 0]
    ax0.text(
        -0.30, 0.5, amb,
        ha="center", va="center",
        rotation=90, transform=ax0.transAxes,
        fontsize=12, fontweight="bold",
    )

# 11) A single caption at the far left of the figure
g.fig.text(
    0.05, 0.5,
    "Noise Type",
    ha="center", va="center",
    rotation="vertical",
    fontsize=12,
    fontweight="bold",
)

# 12) Create the result folder and save
output_dir = os.path.join(os.path.dirname(csv_file), "results_for_paper")
os.makedirs(output_dir, exist_ok=True)
output_png = os.path.join(output_dir, "tfidf_cosine_facetgrid.png")
g.fig.savefig(output_png, dpi=300, format="png")

# 13) Show the figure and report the path
plt.show()
print(f"Plot saved at: {output_png}")


In [None]:
import pandas as pd
import itertools
from scipy.stats import shapiro

# Load the lexical cosine scores and build a "technology_model" system key
df = pd.read_csv("lexical_cosine_scores_full.csv")
df["system"] = df["technology"] + "_" + df["model"]

# Pivot to wide format: one row per (convoID, ambientVariant, processedVolume),
# one column per system. pivot_table averages duplicate entries by default.
wide = df.pivot_table(
    index=["convoID", "ambientVariant", "processedVolume"],
    columns="system",
    values="lex_cosine_sim"
)

# Collect Shapiro-Wilk results for the paired differences of every system pair
normal_col = "Normal? (p>0.05)"
result_columns = ["System A", "System B", "Shapiro W", "Shapiro p", normal_col]

results = []
systems = [col for col in wide.columns if wide[col].notna().any()]
for a, b in itertools.combinations(systems, 2):
    x = wide[a].dropna()
    y = wide[b].dropna()
    # Conditions for which both systems have a score
    common = x.index.intersection(y.index)
    if len(common) < 30:
        continue  # too few paired observations for this pair
    diff = x.loc[common] - y.loc[common]
    W, p_norm = shapiro(diff)
    results.append({
        "System A":  a,
        "System B":  b,
        "Shapiro W": round(W, 3),
        "Shapiro p": round(p_norm, 6),
        normal_col:  p_norm > 0.05
    })

# Passing the column list keeps the frame well-formed when no pair reached the
# 30-observation minimum; without it the frame has no columns and the filter
# below raises KeyError.
res_df = pd.DataFrame(results, columns=result_columns)

# Show only the violations (normality flag is False)
violations = res_df[~res_df[normal_col].astype(bool)]
print("=== Paare mit Normalitäts-Verletzung (Shapiro p ≤ 0.05) ===")
print(violations.to_string(index=False))

In [None]:
# %%  
# Signifikanz- und Effektstärken-Tests für Lexical Cosine Similarity
import pandas as pd
import itertools
from scipy.stats import shapiro, ttest_rel, wilcoxon

# 1) Load the CSV and build the "technology_model" system label
df = pd.read_csv("lexical_cosine_scores_full.csv")
df["system"] = df["technology"] + "_" + df["model"]

# 2) Pivot: one row per (convoID, ambientVariant, processedVolume),
#    one column per system, cell value = lexical cosine similarity
condition_keys = ["convoID", "ambientVariant", "processedVolume"]
wide = df.pivot_table(
    values="lex_cosine_sim",
    index=condition_keys,
    columns="system",
)

# 3) Effect-size helper
def cohens_d_paired(x, y):
    """Return Cohen's d for two paired samples.

    The value is the mean of the element-wise differences ``x - y`` divided
    by their sample standard deviation (``ddof=1``).

    Parameters
    ----------
    x, y : array-like supporting ``x - y``; the result must offer ``.mean()``
        and ``.std(ddof=1)``. The caller in this notebook passes two pandas
        Series restricted to the same index.

    Returns
    -------
    The ratio ``mean(x - y) / std(x - y)``.
    """
    delta = x - y
    mean_delta = delta.mean()
    sd_delta = delta.std(ddof=1)
    return mean_delta / sd_delta

# 4) Run the tests for every pair of systems
results = []
systems = [col for col in wide.columns if wide[col].notna().any()]
for a, b in itertools.combinations(systems, 2):
    x = wide[a].dropna()
    y = wide[b].dropna()
    common = x.index.intersection(y.index)
    if len(common) < 30:
        # only pairs with enough shared observations are tested
        continue

    scores_a = x.loc[common]
    scores_b = y.loc[common]

    # Shapiro-Wilk on the paired differences decides which test is used
    diff = scores_a - scores_b
    W, p_norm = shapiro(diff)

    if p_norm > 0.05:
        test_name, run_test = "paired t-Test", ttest_rel
    else:
        test_name, run_test = "Wilcoxon", wilcoxon
    stat, p_val = run_test(scores_a, scores_b)

    # Cohen's d is reported for both branches
    d = cohens_d_paired(scores_a, scores_b)

    row = {
        "System A": a,
        "System B": b,
        "Test": test_name,
        "p-value": p_val,
        "Cohen's d": d,
    }
    results.append(row)

# 5) DataFrame & CSV
df_results = pd.DataFrame(results)
df_results.to_csv("lex_significance_effect_sizes.csv", index=False, encoding="utf-8-sig")
display(df_results)

# Inference

## Check Multicollinearity

VIF ≈ 1: praktisch keine Kollinearität

1 < VIF < 5: geringe bis moderate Kollinearität, in der Regel unproblematisch

VIF ≥ 5 (manche Quellen sagen ≥10): erhöhte Kollinearität, mögliche Probleme bei der Interpretation der Koeffizienten




In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Build the design matrix inside this cell. The previous version read it from
# `ols_sys`, a results object that no cell of this notebook defines, so the
# cell failed with NameError on a fresh kernel (Restart & Run All).
# NOTE(review): `ols_sys` is assumed to have used the same specification as
# the OLS cells further down (system + ambientVariant + processedVolume with
# the recapp / trafficOutside / -35dBFS baselines) — confirm.
import pandas as pd
import statsmodels.formula.api as smf

vif_data = pd.read_csv("lexical_cosine_scores_full.csv")
vif_data["system"] = vif_data["technology"] + "–" + vif_data["model"]

# Put the desired reference level first; C() uses the first category of a
# Categorical as the baseline. A baseline that does not occur is skipped.
vif_baselines = {
    "system":          "recapp–gsw-CH_smoothed",
    "ambientVariant":  "trafficOutside",
    "processedVolume": "-35dBFS",
}
for column, baseline in vif_baselines.items():
    observed = list(vif_data[column].unique())
    if baseline in observed:
        observed.remove(baseline)
        vif_data[column] = pd.Categorical(
            vif_data[column], categories=[baseline] + observed
        )

ols_sys = smf.ols(
    "lex_cosine_sim ~ C(system) + C(ambientVariant) + C(processedVolume)",
    data=vif_data,
).fit()

# Design matrix and column names of the model fitted above
X = ols_sys.model.exog
names = ols_sys.model.exog_names

# One VIF per design-matrix column (the "Intercept" row is the constant
# column and is usually not read as a collinearity diagnostic)
vif = pd.DataFrame({
    "variable": names,
    "VIF": [variance_inflation_factor(X, i) for i in range(X.shape[1])]
})

print(vif)


--> VIF: All good

In [None]:
import pandas as pd
import statsmodels.formula.api as smf

# 1) Load data
df = pd.read_csv("lexical_cosine_scores_full.csv")

# 2) System factor with the desired baseline
df["system"] = df["technology"] + "–" + df["model"]
levels = list(df["system"].unique())
# Move 'recapp–gsw-CH_smoothed' to the front (ValueError if it does not occur)
levels.insert(0, levels.pop(levels.index("recapp–gsw-CH_smoothed")))
df["system"] = pd.Categorical(df["system"], categories=levels)

# 3) ambientVariant with baseline 'trafficOutside'
av = list(df["ambientVariant"].unique())
av.insert(0, av.pop(av.index("trafficOutside")))
df["ambientVariant"] = pd.Categorical(df["ambientVariant"], categories=av)

# 4) processedVolume with baseline '-35dBFS'
pv = list(df["processedVolume"].unique())
pv.insert(0, pv.pop(pv.index("-35dBFS")))
df["processedVolume"] = pd.Categorical(df["processedVolume"], categories=pv)

# 5) Plain OLS, then standard errors clustered on convoID
formula = "lex_cosine_sim ~ C(system) + C(ambientVariant) + C(processedVolume)"
ols_model = smf.ols(formula=formula, data=df)
ols = ols_model.fit()
ols_clust = ols.get_robustcov_results(groups=df["convoID"], cov_type="cluster")

# 6) Results
print(ols_clust.summary())




# Saving

In [None]:
import os
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.formula.api as smf

# 1) Report the working directory
cwd = os.getcwd()
print(f"Arbeitsverzeichnis: {cwd}")

# 2) Create the export directory if it does not exist yet
export_dir = os.path.join(cwd, "results_for_paper/")
os.makedirs(export_dir, exist_ok=True)

# -----------------------
# 3) Load data
df = pd.read_csv("lexical_cosine_scores_full.csv")

# 4) Factors with the desired baseline. Each factor is only converted to a
#    Categorical when its baseline level occurs in the data.
df["system"] = df["technology"] + "–" + df["model"]
levels = list(df["system"].unique())
if "recapp–gsw-CH_smoothed" in levels:
    levels.insert(0, levels.pop(levels.index("recapp–gsw-CH_smoothed")))
    df["system"] = pd.Categorical(df["system"], categories=levels)

av = list(df["ambientVariant"].unique())
if "trafficOutside" in av:
    av.insert(0, av.pop(av.index("trafficOutside")))
    df["ambientVariant"] = pd.Categorical(df["ambientVariant"], categories=av)

pv = list(df["processedVolume"].unique())
if "-35dBFS" in pv:
    pv.insert(0, pv.pop(pv.index("-35dBFS")))
    df["processedVolume"] = pd.Categorical(df["processedVolume"], categories=pv)

# -----------------------
# 5) OLS regression with standard errors clustered on convoID
formula = "lex_cosine_sim ~ C(system) + C(ambientVariant) + C(processedVolume)"
ols_model = smf.ols(formula=formula, data=df)
ols = ols_model.fit()
ols_clust = ols.get_robustcov_results(groups=df["convoID"], cov_type="cluster")

print("=== OLS (cluster-robust) ===")
print(ols_clust.summary())

# Second table of summary2() as a DataFrame
summary = ols_clust.summary2()
summary_df = summary.tables[1]

# Store the OLS results as CSV and Excel
ols_csv = os.path.join(export_dir, "ols_clust_results_cosine.csv")
ols_xlsx = os.path.join(export_dir, "ols_clust_results_cosine.xlsx")
summary_df.to_csv(ols_csv)
summary_df.to_excel(ols_xlsx)
print(f"OLS-Ergebnisse gespeichert unter:\n  {ols_csv}\n  {ols_xlsx}")

# -----------------------
# 6) Multicollinearity check (VIF)
#    Design matrix taken from the fitted `ols` object
X = ols.model.exog
names = ols.model.exog_names

vif_values = [variance_inflation_factor(X, col) for col in range(X.shape[1])]
vif = pd.DataFrame({"variable": names, "VIF": vif_values})

print("=== VIF ===")
print(vif)

# Store the VIF table as CSV and Excel
vif_csv = os.path.join(export_dir, "vif_results_cosine.csv")
vif_xlsx = os.path.join(export_dir, "vif_results_cosine.xlsx")
vif.to_csv(vif_csv, index=False)
vif.to_excel(vif_xlsx, index=False)
print(f"VIF-Ergebnisse gespeichert unter:\n  {vif_csv}\n  {vif_xlsx}")

# -----------------------
# 7) Descriptive statistics of the TF-IDF cosine similarity per (model, technology)
grouped_cosine = df.groupby(["model", "technology"])["lex_cosine_sim"]
stats_cosine_df = grouped_cosine.agg(["mean", "median", "std", "min", "max"])
stats_cosine_df = stats_cosine_df.reset_index()

# Store the descriptive statistics as CSV and Excel
cosine_stats_csv = os.path.join(export_dir, "cosine_stats_by_model.csv")
cosine_stats_xlsx = os.path.join(export_dir, "cosine_stats_by_model.xlsx")
stats_cosine_df.to_csv(cosine_stats_csv, index=False)
stats_cosine_df.to_excel(cosine_stats_xlsx, index=False)
print(f"Deskriptive Statistiken gespeichert unter:\n  {cosine_stats_csv}\n  {cosine_stats_xlsx}")
