In [None]:
from utils.lang_maps import HR_MAP 
from utils.task_vectors import TaskVector
from transformers import WhisperForConditionalGeneration
# Root directory; each language's fine-tuned checkpoint is read from
# "<models_dir>/<lang>/final".
models_dir = "output_whisper-tiny"

# Every key of HR_MAP, plus each HR_MAP value joined with "_" into one name.
hr_group_names = set("_".join(group) for group in HR_MAP.values())
ALL_LANGS = list(set(HR_MAP.keys()).union(hr_group_names))

# One TaskVector per name, built from the pretrained whisper-tiny model and
# that name's fine-tuned checkpoint.
TVs = {}
for lang in ALL_LANGS:
    if lang == "":
        # The empty name (e.g. from joining an empty HR_MAP value) is skipped.
        continue
    TVs[lang] = TaskVector(
        pretrained_model=WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny"),
        finetuned_model=WhisperForConditionalGeneration.from_pretrained(f"{models_dir}/{lang}/final"),
    )


In [None]:
import pandas as pd
import json
import os
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 
import torch
# Root of the whisper-tiny evaluation results.
model = "results/whisper-tiny"
rows = []
summary_df = pd.read_csv(f"{model}/summary.csv")

# Per-language statistics, keyed by the part of each hyperparameter file name
# before the first ".".
lang_hr_gains = {}
for result in os.listdir(f"{model}/hyperparameters"):
    # Each file is JSON mapping a lambda (string key such as "0.0") to a score.
    with open(f"{model}/hyperparameters/" + result, "r") as f:
        hyps = json.load(f)
    if len(hyps) == 0:
        continue

    lang_name = result.split(".")[0]
    stats = lang_hr_gains[lang_name] = {}

    # Lowest score wins; "0.0" is the reference the gain is measured against.
    best_lambda = min(hyps, key=hyps.get)
    best_score = min(hyps.values())
    score_0 = hyps["0.0"]
    stats["lambda"] = best_lambda
    stats["hr_delta"] = abs(score_0 - best_score)

    # Task vector of the language and of its HR_MAP counterpart.
    language_tv = TVs[lang_name]
    hr_tv = TVs["_".join(HR_MAP[lang_name])]
    cossim = cosine_similarity(
        language_tv.tv_to_vector().reshape(1, -1),
        hr_tv.tv_to_vector().reshape(1, -1),
    )[0][0]

    # Flattened squared difference, one entry per key of the task vector.
    squared_difference = [
        np.square((language_tv.vector[name] - hr_tv.vector[name]).flatten())
        for name in language_tv.vector.keys()
    ]
    # Mean over every element of every entry, then one mean per entry.
    mse = np.mean(torch.cat(squared_difference).numpy())
    layer_wise_mse = [np.mean(sq.numpy()) for sq in squared_difference]

    stats["cosine_sim"] = cossim
    stats["mse"] = mse
    stats["layer_wise_mse"] = layer_wise_mse
    stats["wer"] = best_score



In [None]:
# One row per language (the outer keys of lang_hr_gains), one column per statistic.
df = pd.DataFrame(lang_hr_gains).transpose()
df

In [None]:
# Pairwise correlations; the list-valued layer_wise_mse column is dropped first.
df.drop(columns='layer_wise_mse').corr()

In [None]:
# Compare how many layer_wise_mse entries two of the languages have.
tuple(len(lang_hr_gains[code]["layer_wise_mse"]) for code in ("kbd", "ukv"))


In [None]:
import matplotlib.pyplot as plt
# Plotting all lines at once
def smooth(scalars, weight):  # Weight between 0 and 1
    """Exponentially smooth a sequence of numbers.

    The running value is seeded with the first element; each output is
    ``previous * weight + (1 - weight) * point``.

    Args:
        scalars: Indexable, sized sequence of numbers (may be empty).
        weight: Smoothing factor; 0 returns the input values unchanged,
            values closer to 1 smooth more heavily.

    Returns:
        A list with one smoothed value per input element; ``[]`` for empty input.
    """
    # Guard: indexing scalars[0] on an empty sequence would raise IndexError.
    if len(scalars) == 0:
        return []

    smoothed = []
    last = scalars[0]  # Seed with the first value
    for point in scalars:
        last = last * weight + (1 - weight) * point
        smoothed.append(last)
    return smoothed
# (language, layer_wise_mse list) pairs, in lang_hr_gains order.
mse_graph = [(code, stats["layer_wise_mse"]) for code, stats in lang_hr_gains.items()]
# x positions follow the first language's list; stacking the curves into one
# array needs every language to have a list of that same length.
xs = range(len(mse_graph[0][1]))
# Smooth each curve, then transpose so that each column is one language.
ys = np.array([smooth(curve, 0.9) for _code, curve in mse_graph]).T
labels = [code for code, _curve in mse_graph]

fig, ax = plt.subplots()

# One line per column of ys, labelled with the matching language.
ax.plot(xs, ys, label=labels)

ax.set(
    xlabel='Layer',
    ylabel='MSE',
    title='MSE for target languages and their HR counterparts at each layer',
)
# ax.set_yscale("log")
# Legend goes below the axes, five entries per row.
ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.5), ncol=5)
ax.grid(True)
plt.show()


In [None]:
import pandas as pd
import json
import os
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np 
import torch
# Root of the whisper-tiny evaluation results.
model = "results/whisper-tiny"
rows = []
summary_df = pd.read_csv(f"{model}/summary.csv")

# Per-language statistics, keyed by the part of each hyperparameter file name
# before the first ".".
lang_hr_gains = {}
for result in os.listdir(f"{model}/hyperparameters"):
    # Each file is JSON mapping a lambda (string key such as "0.0") to a score.
    with open(f"{model}/hyperparameters/" + result, "r") as f:
        hyps = json.load(f)
    if len(hyps) == 0:
        continue

    lang_name = result.split(".")[0]
    stats = lang_hr_gains[lang_name] = {}

    # Lowest score wins; "0.0" is the reference the gain is measured against.
    best_lambda = min(hyps, key=hyps.get)
    best_score = min(hyps.values())
    score_0 = hyps["0.0"]
    stats["lambda"] = best_lambda
    stats["hr_delta"] = abs(score_0 - best_score)

    # Layer-wise views of the language's task vector and of its HR counterpart.
    language_tv = TVs[lang_name]
    hr_tv = TVs["_".join(HR_MAP[lang_name])]
    lr_layerwise, layer_strs = language_tv.tv_to_layer_wise()
    hr_layerwise, _ = hr_tv.tv_to_layer_wise()

    # Cosine similarity and mean squared difference for each layer index.
    cossim_layerwise = []
    layer_wise_mse = []
    for i in range(len(lr_layerwise)):
        lr_layer = lr_layerwise[i]
        hr_layer = hr_layerwise[i]
        cossim_layerwise.append(
            cosine_similarity(lr_layer.reshape(1, -1), hr_layer.reshape(1, -1))[0][0]
        )
        layer_wise_mse.append(
            np.mean(np.square(np.array(lr_layer) - np.array(hr_layer)))
        )

    # The scalar summaries are plain means over the per-layer values.
    stats["cosine_sim"] = np.mean(cossim_layerwise)
    stats["mse"] = np.mean(layer_wise_mse)
    stats["layer_wise_mse"] = layer_wise_mse
    stats["layer_wise_cossim"] = cossim_layerwise

    stats["wer"] = best_score

# One row per language, one column per statistic.
df = pd.DataFrame(lang_hr_gains).T


In [None]:
# Display the per-language statistics table built in the previous cell.
df

In [None]:
# Start from df; every step below builds a new frame, so df itself is not modified.
df_2 = df

# Replace each list-valued column with one numbered column per list position
# (values rounded to 2 decimals): mse_1, mse_2, ... and cossim_1, cossim_2, ...
for list_col, prefix in (("layer_wise_mse", "mse"), ("layer_wise_cossim", "cossim")):
    new_cols_df = df_2[list_col].apply(pd.Series).round(2)
    new_cols_df.columns = [f"{prefix}_{pos}" for pos in range(1, new_cols_df.shape[1] + 1)]
    df_2 = pd.concat([df_2.drop(list_col, axis=1), new_cols_df], axis=1)

from sklearn.metrics import r2_score
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression

# hr_delta is the single regressor in both fits below.
delta = np.array(df_2.hr_delta)

# For each metric: fit metric ~ hr_delta, then report R-squared of the fit and
# the Spearman correlation between the metric and hr_delta.
for heading, column in (("cosine_similarity", "cosine_sim"), ("mse", "mse")):
    metric = np.array(df_2[column])
    model = LinearRegression()
    model.fit(delta.reshape(-1, 1), metric)
    pred = model.predict(delta.reshape(-1, 1))
    r2 = r2_score(metric, pred)
    spearman = spearmanr(metric, delta)
    print(heading)
    print(f"R-squared: {r2}")
    print("spearman", spearman)

In [None]:
# Display the table with the list columns expanded into mse_* / cossim_* columns.
df_2

In [None]:

# Display the raw per-language statistics dictionary.
lang_hr_gains

In [None]:
import matplotlib.pyplot as plt
# Plotting all lines at once
def smooth(scalars, weight):  # Weight between 0 and 1
    """Exponentially smooth a sequence of numbers.

    The running value is seeded with the first element; each output is
    ``previous * weight + (1 - weight) * point``.

    Args:
        scalars: Indexable, sized sequence of numbers (may be empty).
        weight: Smoothing factor; 0 returns the input values unchanged,
            values closer to 1 smooth more heavily.

    Returns:
        A list with one smoothed value per input element; ``[]`` for empty input.
    """
    # Guard: indexing scalars[0] on an empty sequence would raise IndexError.
    if len(scalars) == 0:
        return []

    smoothed = []
    last = scalars[0]  # Seed with the first value
    for point in scalars:
        last = last * weight + (1 - weight) * point
        smoothed.append(last)
    return smoothed
# (language, layer_wise_cossim list) pairs. Despite its name, mse_graph holds
# cosine similarities in this cell.
mse_graph = [(lang, item["layer_wise_cossim"]) for lang, item in lang_hr_gains.items()]
xs = range(len(mse_graph[0][1]))
# Raw (unsmoothed) curves, one column per language. A smoothed array used to be
# computed here and was overwritten on the next line, so only the raw values
# were ever plotted; that dead computation has been removed.
ys = np.array([v[1] for v in mse_graph]).T

labels = [v[0] for v in mse_graph]

fig, ax = plt.subplots()

# One line per column of ys, labelled with the matching language.
ax.plot(xs, ys, label=labels)

ax.set_xlabel('Layer')
ax.set_ylabel('Cosine Similarity')
# ax.set_yscale("log")
ax.set_title('Cosine Similarity for target languages and their HR counterparts at each layer')
# Legend goes below the axes, five entries per row.
ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.5), ncol=5)
ax.grid(True)
plt.show()


In [None]:
import pandas as pd
import json
import os
from sklearn.metrics.pairwise import cosine_similarity

# Root of the whisper-tiny evaluation results.
model = "results/whisper-tiny"
rows = []
summary_df = pd.read_csv(f"{model}/summary.csv")
# Tab-separated table that is joined to the per-language results on "lang".
token_freq_df = pd.read_csv("token_frequency/measures.tsv", delimiter="\t")

lang_hr_gains = {}
for result in os.listdir(f"{model}/hyperparameters"):
    # Each file is JSON mapping a lambda (string key such as "0.0") to a score.
    with open(f"{model}/hyperparameters/" + result, "r") as f:
        hyps = json.load(f)
    if len(hyps) == 0:
        continue

    lang_name = result.split(".")[0]
    entry = lang_hr_gains[lang_name] = {}

    # Lowest score wins; "0.0" is the reference the gain is measured against.
    best_lambda = min(hyps, key=hyps.get)
    best_score = hyps[best_lambda]
    score_0 = hyps["0.0"]
    # NOTE(review): "wer" holds the lambda "0.0" score here, whereas the
    # task-vector cells store the best score under the same key -- confirm intent.
    entry.update({
        "lambda": float(best_lambda),
        "hr_delta": float(abs(score_0 - best_score)),
        "wer": score_0,
    })
    # Disabled: cosine similarity between the language's task vector and its
    # HR counterpart.
    # language_tv = TVs[lang_name]
    # hr_tv = TVs["_".join(HR_MAP[lang_name])]
    # cossim = cosine_similarity(language_tv.tv_to_vector().reshape(1, -1), hr_tv.tv_to_vector().reshape(1, -1))[0][0]
    # lang_hr_gains[lang_name]["tv_cossim"] = cossim

# One row per language with the language code as a regular "lang" column,
# joined to the token-frequency table on that column.
df = pd.DataFrame(lang_hr_gains).T.rename_axis("lang").reset_index()
merged_df = df.merge(token_freq_df, on="lang")
merged_df

In [None]:
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import LinearRegression
import numpy as np

import seaborn
# Response variable of every fit below: the "cosine" column of merged_df
# (presumably supplied by token_frequency/measures.tsv -- confirm).
cossim = np.array(merged_df.cosine)
# The three single-variable regressors.
delta = np.array(merged_df.hr_delta)
lamb = np.array(merged_df["lambda"])
wer = np.array(merged_df["wer"])

regressions = (
    ("cosine_similarity predictor for delta", delta),
    ("cosine similarity predictor for lambda", lamb),
    ("cosine similarity predictor for wer", wer),
)
for heading, predictor in regressions:
    # Fit cossim ~ predictor, then report R-squared of the fit and the rank and
    # linear correlations between the two variables.
    features = predictor.reshape(-1, 1)
    model = LinearRegression()
    model.fit(features, cossim)
    pred = model.predict(features)
    r2 = r2_score(cossim, pred)
    spearman = spearmanr(cossim, predictor)
    pearson = pearsonr(cossim, predictor)
    print(heading)
    print(f"R-squared: {r2}")
    print("spearman", spearman)
    print("pearson", pearson)


# Scatter plot with fitted regression line: cosine vs. the selected lambda.
seaborn.regplot(data=merged_df, x="cosine", y="lambda")

In [None]:
# Scatter plot with fitted regression line: the "cosine" column vs. hr_delta.
seaborn.regplot(data=merged_df, x="cosine", y="hr_delta")

In [None]:
# This cell needs a "tv_cossim" column. If merged_df does not carry one, it is
# computed here (on a copy, merged_df is not modified) as the cosine similarity
# between each language's task vector and that of its HR_MAP counterpart.
# NOTE(review): TVs is loaded from whisper-tiny checkpoints, so the fallback
# assumes merged_df was built from the whisper-tiny results -- confirm.
if "tv_cossim" in merged_df.columns:
    tv_cossim_df = merged_df
else:
    tv_cossim_df = merged_df.assign(
        tv_cossim=[
            cosine_similarity(
                TVs[lang].tv_to_vector().reshape(1, -1),
                TVs["_".join(HR_MAP[lang])].tv_to_vector().reshape(1, -1),
            )[0][0]
            for lang in merged_df["lang"]
        ]
    )

tv_cossim = np.array(tv_cossim_df["tv_cossim"])
cossim = np.array(tv_cossim_df.cosine)

# Fit cosine ~ tv_cossim, then report R-squared of the fit and the Spearman
# correlation between the two variables.
model = LinearRegression()
model.fit(tv_cossim.reshape(-1,1), cossim)
pred = model.predict(tv_cossim.reshape(-1,1))
# Calculate R-squared
r2 = r2_score(cossim, pred)
spearman = spearmanr(cossim, tv_cossim)
print("TV cosine similarity predictor for token cossim")
print(f"R-squared: {r2}")
print("spearman", spearman)

seaborn.regplot(data=tv_cossim_df, x="tv_cossim", y="cosine")

In [None]:
import pandas as pd
import json
import os
from sklearn.metrics.pairwise import cosine_similarity

# Root of the whisper-large-v3 evaluation results.
model = "results/whisper-large-v3"
rows = []
summary_df = pd.read_csv(f"{model}/summary.csv")
# Tab-separated table that is joined to the per-language results on "lang".
token_freq_df = pd.read_csv("token_frequency/measures.tsv", delimiter="\t")

lang_hr_gains = {}
for result in os.listdir(f"{model}/hyperparameters"):
    # Each file is JSON mapping a lambda (string key such as "0.0") to a score.
    with open(f"{model}/hyperparameters/" + result, "r") as f:
        hyps = json.load(f)
    if len(hyps) == 0:
        continue

    lang_name = result.split(".")[0]
    entry = lang_hr_gains[lang_name] = {}

    # Lowest score wins; "0.0" is the reference the gain is measured against.
    best_lambda = min(hyps, key=hyps.get)
    best_score = hyps[best_lambda]
    score_0 = hyps["0.0"]
    entry.update({
        "lambda": float(best_lambda),
        "hr_delta": float(abs(score_0 - best_score)),
        "wer": score_0,
    })
    # Disabled: cosine similarity between the language's task vector and its
    # HR counterpart.
    # NOTE(review): TVs is loaded from whisper-tiny checkpoints, not
    # whisper-large-v3, so re-enabling this as-is would mix models.
    # language_tv = TVs[lang_name]
    # hr_tv = TVs["_".join(HR_MAP[lang_name])]
    # cossim = cosine_similarity(language_tv.tv_to_vector().reshape(1, -1), hr_tv.tv_to_vector().reshape(1, -1))[0][0]
    # lang_hr_gains[lang_name]["tv_cossim"] = cossim

# One row per language with the language code as a regular "lang" column,
# joined to the token-frequency table on that column.
df = pd.DataFrame(lang_hr_gains).T.rename_axis("lang").reset_index()
merged_df = df.merge(token_freq_df, on="lang")
# Bare expression in the middle of the cell: evaluated but not displayed.
merged_df
# Fit wer ~ hr_delta on the merged table, then report R-squared of the fit and
# the rank and linear correlations between the two variables.
# Relies on np, LinearRegression, r2_score, spearmanr and pearsonr having been
# imported by earlier cells.
wer = np.array(merged_df["wer"])
hr_delta = np.array(merged_df["hr_delta"])
features = hr_delta.reshape(-1, 1)

model = LinearRegression()
model.fit(features, wer)
pred = model.predict(features)

r2 = r2_score(wer, pred)
spearman = spearmanr(wer, hr_delta)
pearson = pearsonr(wer, hr_delta)

print("wer predictor for delta")
print(f"R-squared: {r2}")
print("spearman", spearman)
print("pearson", pearson)