In [10]:
import os
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance

# Human-readable label for each dataset, keyed by the CSV file stem
# (the file name without its ".csv" extension).
maping = {
    'AAVE-dataset': 'African American English',
    'AAVE-Filler-words-dataset': 'African American English - Filler Words',
    'AAVE-hashtags-words-dataset': 'African American English - Hashtags',
    'AAVE-emojis-words-dataset': 'African American English - emojis',
    'AAVE-misspelling-words-dataset': 'African American English - misspelling',
    'AAVE-misplaced-dangling-modifiers-dataset': 'African American English - Modifiers',
    'original_dataset': 'Original',
}

def get_log_prob_datasets(model):
    """Load the per-dataset log-probability CSV files for one model.

    Every ``*.csv`` file in ``./original_{model}/log_prob/`` is read, except
    ``wasserstein_distance.csv`` and ``mean_log_prob.csv`` (the summary files
    this notebook writes into the same directory).

    Parameters
    ----------
    model : str
        Model identifier interpolated into the directory name,
        e.g. ``'llama2'``.

    Returns
    -------
    tuple
        ``(dataset_list, original)`` where ``dataset_list`` is a list of
        ``(DataFrame, stem)`` pairs, one per non-baseline CSV, ordered by
        file name, and ``original`` is the frame read from
        ``original_dataset.csv`` (an empty DataFrame when that file is
        absent).

    Raises
    ------
    FileNotFoundError
        Propagated from ``os.listdir`` when the directory does not exist.
    """
    directory_path = f"./original_{model}/log_prob/"
    generated_files = ('wasserstein_distance.csv', 'mean_log_prob.csv')

    dataset_list = []
    original = pd.DataFrame()

    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the returned list (and of everything derived from it) reproducible.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".csv") or filename in generated_files:
            continue
        dataset = pd.read_csv(os.path.join(directory_path, filename))
        # splitext drops only the final extension, so a stem that itself
        # contains dots (e.g. "set.v2.csv") is not truncated.
        stem = os.path.splitext(filename)[0]
        if stem == 'original_dataset':
            original = dataset
        else:
            dataset_list.append((dataset, stem))

    return dataset_list, original

In [11]:
# Load the variant frames and the baseline frame for each model.
llama_datasets, llama_original_dataset = get_log_prob_datasets('llama2')
mistral_datasets, mistral_original_dataset = get_log_prob_datasets('mistral')
gemma_datasets, gemma_original_dataset = get_log_prob_datasets('gemma')

# (variant frames, baseline frame, model name) triples iterated by the
# summary loop further down.
datasets = [
    (llama_datasets, llama_original_dataset, 'llama2'),
    (mistral_datasets, mistral_original_dataset, 'mistral'),
    (gemma_datasets, gemma_original_dataset, 'gemma'),
]

In [12]:
# Preview the first rows of the Gemma baseline frame.
gemma_original_dataset.head()

Unnamed: 0,sentence,sentiment,Log Prob
0,"I hope you do, because otherwise your wife mig...",negative,-11.464495
1,I constantly worry about their fight against n...,positive,-8.739827
2,i feel I've had more unhappy years than happy ...,negative,-3.669981
3,"I'm feeling I'm caring, I'm healing, I'm shari...",positive,-6.620073
4,I just feel like if I can make it through this...,positive,-6.888464


In [13]:
# Turn the Mistral baseline log-probabilities into probabilities.
# np.exp applied to the Series is vectorised and returns a new Series, so
# neither an explicit copy nor a per-element lambda is needed.
col = np.exp(mistral_original_dataset['Log Prob'])

In [14]:
# Largest per-sentence probability in the Mistral baseline: exponentiate the
# whole log-probability array in one vectorised call, then take the maximum.
abc = np.exp(mistral_original_dataset['Log Prob'].to_numpy())
abc.max()

0.4156501867683275

In [15]:
# Show the Mistral baseline 'Log Prob' column as a NumPy array.
mistral_original_dataset['Log Prob'].to_numpy()

array([ -3.84484887, -10.12582016,  -1.83078396, ...,  -3.75684142,
        -2.35400224,  -2.0245266 ])

In [16]:
# Mean log-probability of the Mistral baseline.
np.mean(mistral_original_dataset['Log Prob'].to_numpy())

-3.8802225177266423

In [8]:
# Shape of the first frame in the llama2 variant list
# (each list entry is a (frame, name) pair, hence the second [0]).
llama_datasets[0][0].shape

(1984, 3)

In [18]:
# For every model, compare each variant dataset with the model's baseline and
# write two summary tables next to the inputs:
#   * mean_log_prob.csv        - mean of the 'Log Prob' column per dataset
#   * wasserstein_distance.csv - Wasserstein distance between the baseline
#                                and variant probability samples
for model_dataset, model_original, model_name in datasets:
    original_log_prob = model_original['Log Prob'].to_numpy()
    # The distance is computed on probabilities, so exponentiate the
    # log-probabilities once per model with a vectorised call.
    org_prob = np.exp(original_log_prob)

    # Collect plain records and build each frame once after the loop; growing
    # a DataFrame with pd.concat on every iteration copies it each time.
    mean_rows = [{'Type': 'Original', 'Mean Log Prob': np.mean(original_log_prob)}]
    distance_rows = []

    for aae_dataset, aae_dataset_name in model_dataset:
        aae_log_prob = aae_dataset['Log Prob'].to_numpy()
        aae_prob = np.exp(aae_log_prob)
        distance = wasserstein_distance(org_prob, aae_prob)
        new_row = {'Type': aae_dataset_name, 'Wasserstein Distance Vs Standard English': distance}
        distance_rows.append(new_row)
        mean_rows.append({'Type': aae_dataset_name, 'Mean Log Prob': np.mean(aae_log_prob)})
        print("=======================")
        print(f"Model: {model_name}")
        print(f"AAE Dataset: {aae_dataset_name}")
        print(f"wasserstein_distance: {distance}")

    # Explicit column lists keep the CSV header even when there are no rows.
    mean_log_prob = pd.DataFrame(mean_rows, columns=['Type', 'Mean Log Prob'])
    wasserstein = pd.DataFrame(distance_rows, columns=['Type', 'Wasserstein Distance Vs Standard English'])
    mean_log_prob.to_csv(f"./original_{model_name}/log_prob/mean_log_prob.csv", index=False)
    wasserstein.to_csv(f"./original_{model_name}/log_prob/wasserstein_distance.csv", index=False)

Model: llama2
AAE Dataset: Filler Words
wasserstein_distance: 0.053735498273001114
Model: llama2
AAE Dataset: Emojis
wasserstein_distance: 0.01872039747270739
Model: llama2
AAE Dataset: Misspelling
wasserstein_distance: 0.009779570108034331
Model: llama2
AAE Dataset: Modifiers
wasserstein_distance: 0.027426618807157847
Model: llama2
AAE Dataset: Hashtags
wasserstein_distance: 0.015675713437384716
Model: mistral
AAE Dataset: Emojis
wasserstein_distance: 0.019853137461808258
Model: mistral
AAE Dataset: Filler Words
wasserstein_distance: 0.05552762460384117
Model: mistral
AAE Dataset: Hashtags
wasserstein_distance: 0.01196808550011901
Model: mistral
AAE Dataset: Modifiers
wasserstein_distance: 0.03311175348869522
Model: mistral
AAE Dataset: Misspelling
wasserstein_distance: 0.011776698069393505
Model: gemma
AAE Dataset: Filler Words
wasserstein_distance: 0.005741481874234587
Model: gemma
AAE Dataset: Emojis
wasserstein_distance: 0.10380428763379722
Model: gemma
AAE Dataset: Misspelling
wa

In [4]:
# Reload the Gemma frames, then read back the two summary tables from disk.
gemma_datasets, gemma_original_dataset = get_log_prob_datasets('gemma')
# NOTE(review): these paths point at ./gemma/, while get_log_prob_datasets
# reads from ./original_gemma/ - confirm which directory is intended.
gemma_mean_log = pd.read_csv("./gemma/log_prob/mean_log_prob.csv")
gemma_cfsb = pd.read_csv("./gemma/log_prob/wasserstein_distance.csv")

In [23]:
# Gemma-only summary: mean log-probability per dataset and the Wasserstein
# distance of each variant from the baseline, labelled through `maping`.
gemma_original_log_prob = gemma_original_dataset['Log Prob'].to_numpy()
# The distance is computed on probabilities; exponentiate in one vectorised call.
org_prob = np.exp(gemma_original_log_prob)

# Collect plain records and build each frame once after the loop instead of
# concatenating onto a growing DataFrame on every iteration.
mean_rows = [{'Type': 'Original', 'Mean Log Prob': np.mean(gemma_original_log_prob)}]
distance_rows = []

for aae_dataset, aae_dataset_name in gemma_datasets:
    aae_log_prob = aae_dataset['Log Prob'].to_numpy()
    aae_prob = np.exp(aae_log_prob)
    distance = wasserstein_distance(org_prob, aae_prob)
    # maping[...] raises KeyError for a file stem that has no label entry.
    new_row = {'Type': maping[aae_dataset_name], 'Wasserstein Distance Vs Standard English': distance}
    distance_rows.append(new_row)
    mean_rows.append({'Type': maping[aae_dataset_name], 'Mean Log Prob': np.mean(aae_log_prob)})
    print("=======================")
    print(f"Model: gemma")
    print(f"AAE Dataset: {aae_dataset_name}")
    print(f"wasserstein_distance: {distance}")

# Explicit column lists keep both columns present even when there are no rows.
mean_log_prob = pd.DataFrame(mean_rows, columns=['Type', 'Mean Log Prob'])
wasserstein = pd.DataFrame(distance_rows, columns=['Type', 'Wasserstein Distance Vs Standard English'])

Model: gemma
AAE Dataset: AAVE-dataset
wasserstein_distance: 0.004465424608789378
Model: gemma
AAE Dataset: AAVE-Filler-words-dataset
wasserstein_distance: 0.007281352111851155
Model: gemma
AAE Dataset: AAVE-hashtags-words-dataset
wasserstein_distance: 0.0035747622821106417
Model: gemma
AAE Dataset: AAVE-emojis-words-dataset
wasserstein_distance: 0.047846125327148124
Model: gemma
AAE Dataset: AAVE-misspelling-words-dataset
wasserstein_distance: 0.005231387241108779
Model: gemma
AAE Dataset: AAVE-misplaced-dangling-modifiers-dataset
wasserstein_distance: 0.007336076634154521


In [24]:
# Persist the Gemma summary tables without the DataFrame index column.
mean_log_prob.to_csv("./gemma/log_prob/mean_log_prob.csv", index=False)
wasserstein.to_csv("./gemma/log_prob/wasserstein_distance.csv", index=False)