In [0]:
import os
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance

# Maps a dataset file stem (the CSV name up to its first '.') to the
# human-readable label used in the 'Type' column of the summary tables.
maping = {
    'AAVE-dataset': 'African American English',
    'AAVE-Filler-words-dataset': 'African American English - Filler Words',
    'AAVE-hashtags-words-dataset': 'African American English - Hashtags',
    'AAVE-emojis-words-dataset': 'African American English - emojis',
    'AAVE-misspelling-words-dataset': 'African American English - misspelling',
    'AAVE-misplaced-dangling-modifiers-dataset': 'African American English - Modifiers',
    'original_dataset': 'Original',
}

def get_log_prob_datasets(model):
    """Load the per-dataset log-probability CSVs stored for ``model``.

    Reads every ``*.csv`` file in ``./original_{model}/log_prob/`` except the
    two summary files that this notebook writes into that same directory
    (``wasserstein_distance.csv`` and ``mean_log_prob.csv``).

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, which would otherwise make the row order of every table
    built from the result depend on the file system.

    Args:
        model: Model name interpolated into the directory path
            (for example ``'gemma'``).

    Returns:
        A ``(dataset_list, original)`` tuple. ``dataset_list`` is a list of
        ``(DataFrame, name)`` pairs, where ``name`` is the file name up to its
        first ``'.'``. ``original`` is the frame read from
        ``original_dataset.csv``, or an empty ``DataFrame`` when that file is
        not present.

    Raises:
        FileNotFoundError: If the directory does not exist (raised by
            ``os.listdir``).
    """
    directory_path = f"./original_{model}/log_prob/"
    # Outputs of this notebook that live alongside the inputs; never load them.
    summary_files = {'wasserstein_distance.csv', 'mean_log_prob.csv'}

    dataset_list = []
    original = pd.DataFrame()

    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".csv") or filename in summary_files:
            continue
        dataset = pd.read_csv(os.path.join(directory_path, filename))
        name = filename.split('.')[0]
        if name == 'original_dataset':
            # The baseline is returned separately rather than in the list.
            original = dataset
        else:
            dataset_list.append((dataset, name))

    return dataset_list, original

In [0]:
# Prints a fixed marker string; no value is stored.
print("Python")

In [0]:
# Load the variant frames and the original-dataset frame for each model.
llama_datasets, llama_original_dataset = get_log_prob_datasets('llama2')
mistral_datasets, mistral_original_dataset = get_log_prob_datasets('mistral')
gemma_datasets, gemma_original_dataset = get_log_prob_datasets('gemma')

# One (variant frames, original frame, model name) triple per model.
datasets = [
    (llama_datasets, llama_original_dataset, 'llama2'),
    (mistral_datasets, mistral_original_dataset, 'mistral'),
    (gemma_datasets, gemma_original_dataset, 'gemma'),
]

In [0]:
# Preview the first rows of the gemma original-dataset frame.
gemma_original_dataset.head()

In [0]:
# Element-wise exp of a copy of the mistral 'Log Prob' column;
# the source frame itself is left untouched.
col = mistral_original_dataset['Log Prob'].copy().map(np.exp)

In [0]:
# exp('Log Prob') for the mistral original dataset as a NumPy array,
# followed by its largest value as the cell output.
abc = mistral_original_dataset['Log Prob'].map(np.exp).to_numpy()
abc.max()

In [0]:
# Show the mistral original dataset's 'Log Prob' column as a NumPy array.
mistral_original_dataset['Log Prob'].to_numpy()

In [0]:
# Mean of the mistral original dataset's 'Log Prob' column.
mistral_original_dataset['Log Prob'].to_numpy().mean()

In [0]:
# Shape of the first llama2 variant frame (element 0 of the first (frame, name) pair).
llama_datasets[0][0].shape

In [0]:
# For every model: compare each variant dataset against the model's original
# dataset, print the distances, and write two summary CSVs into the model's
# log_prob directory.
for model_dataset, model_original, model_name in datasets:
    original_log_prob = model_original['Log Prob'].to_numpy()
    # The distance is computed on exp(log prob) rather than on the raw log values.
    org_prob = np.exp(original_log_prob)

    # Rows are collected in plain lists and each frame is built once at the end.
    # Growing a frame with pd.concat inside the loop is quadratic, starts from an
    # empty frame (deprecated concat behaviour in recent pandas) and, without
    # ignore_index, leaves duplicated index labels.
    mean_rows = [{'Type': 'Original', 'Mean Log Prob': np.mean(original_log_prob)}]
    wasserstein_rows = []

    for aae_dataset, aae_dataset_name in model_dataset:
        aae_log_prob = aae_dataset['Log Prob'].to_numpy()
        aae_prob = np.exp(aae_log_prob)
        distance = wasserstein_distance(org_prob, aae_prob)

        wasserstein_rows.append({'Type': aae_dataset_name, 'Wasserstein Distance Vs Standard English': distance})
        mean_rows.append({'Type': aae_dataset_name, 'Mean Log Prob': np.mean(aae_log_prob)})

        print("=======================")
        print(f"Model: {model_name}")
        print(f"AAE Dataset: {aae_dataset_name}")
        print(f"wasserstein_distance: {distance}")

    # Explicit column lists keep the CSV header even when there are no variant rows.
    mean_log_prob = pd.DataFrame(mean_rows, columns=['Type', 'Mean Log Prob'])
    wasserstein = pd.DataFrame(wasserstein_rows, columns=['Type', 'Wasserstein Distance Vs Standard English'])

    mean_log_prob.to_csv(f"./original_{model_name}/log_prob/mean_log_prob.csv", index=False)
    wasserstein.to_csv(f"./original_{model_name}/log_prob/wasserstein_distance.csv", index=False)

In [0]:
# Reload the gemma frames, then read the gemma summary tables from disk.
# NOTE(review): these two CSVs are read from ./gemma/log_prob/, whereas
# get_log_prob_datasets('gemma') reads ./original_gemma/log_prob/ - confirm
# that the differing directories are intended and that the files exist
# before this cell runs.
gemma_datasets, gemma_original_dataset = get_log_prob_datasets('gemma')
gemma_mean_log = pd.read_csv("./gemma/log_prob/mean_log_prob.csv")
gemma_cfsb = pd.read_csv("./gemma/log_prob/wasserstein_distance.csv")

In [0]:
# Gemma-only summary: same comparison as the per-model loop, but the 'Type'
# column holds the human-readable labels from `maping` instead of file stems.
gemma_log_prob = gemma_original_dataset['Log Prob'].to_numpy()
# The distance is computed on exp(log prob) rather than on the raw log values.
org_prob = np.exp(gemma_log_prob)

# Rows are collected in plain lists and each frame is built once at the end.
# Growing a frame with pd.concat inside the loop is quadratic, starts from an
# empty frame (deprecated concat behaviour in recent pandas) and, without
# ignore_index, leaves duplicated index labels.
mean_rows = [{'Type': 'Original', 'Mean Log Prob': np.mean(gemma_log_prob)}]
wasserstein_rows = []

for aae_dataset, aae_dataset_name in gemma_datasets:
    # Raises KeyError for a dataset file that has no entry in `maping`.
    label = maping[aae_dataset_name]
    aae_log_prob = aae_dataset['Log Prob'].to_numpy()
    aae_prob = np.exp(aae_log_prob)
    distance = wasserstein_distance(org_prob, aae_prob)

    wasserstein_rows.append({'Type': label, 'Wasserstein Distance Vs Standard English': distance})
    mean_rows.append({'Type': label, 'Mean Log Prob': np.mean(aae_log_prob)})

    print("=======================")
    print(f"Model: gemma")
    print(f"AAE Dataset: {aae_dataset_name}")
    print(f"wasserstein_distance: {distance}")

# Explicit column lists keep the headers even when there are no variant rows.
mean_log_prob = pd.DataFrame(mean_rows, columns=['Type', 'Mean Log Prob'])
wasserstein = pd.DataFrame(wasserstein_rows, columns=['Type', 'Wasserstein Distance Vs Standard English'])

In [0]:
# Write the gemma summary tables to ./gemma/log_prob/ without the index column.
mean_log_prob.to_csv("./gemma/log_prob/mean_log_prob.csv", index=False)
wasserstein.to_csv("./gemma/log_prob/wasserstein_distance.csv", index=False)