# Accuracies - SFT Performance

In [4]:
import pandas as pd
import os

languages = ['english', 'german', 'french', 'italian', 'portuguese', 'hindi', 'spanish']


def _substring_accuracy(csv_path):
    """Return the fraction of rows whose answer appears inside the prediction.

    The CSV at ``csv_path`` must provide ``question``, ``prediction`` and
    ``answer`` columns. For each row the question text and the literal
    ``"### Answer: "`` marker are removed from the prediction, then the row
    counts as correct when the stripped, lower-cased answer is a substring of
    the stripped, lower-cased remainder.

    Missing predictions/questions are treated as empty strings instead of
    crashing on ``float.replace``; an empty file yields NaN rather than a
    ZeroDivisionError.
    """
    df_test = pd.read_csv(csv_path)
    if df_test.empty:
        return float('nan')

    # `answer` keeps the plain str() conversion (a missing answer becomes the
    # text 'nan'); the other two columns map missing values to ''.
    answers = df_test['answer'].astype(str)
    questions = df_test['question'].fillna('').astype(str)
    predictions = df_test['prediction'].fillna('').astype(str)

    correct = 0
    for answer, question, prediction in zip(answers, questions, predictions):
        generated = prediction.replace(question, '').replace("### Answer: ", '')
        if answer.strip().lower() in generated.strip().lower():
            correct += 1
    return correct / len(df_test)


# Models tuned and evaluated on the same language.
accuracies = [
    _substring_accuracy(os.path.join(f"results/outputs_8b/{language}-{language}", 'test.csv'))
    for language in languages
]
# Pretrained runs evaluated on each language.
accuracies_pretrained = [
    _substring_accuracy(os.path.join(f"results/outputs_8b/pretrained-{language}", 'test.csv'))
    for language in languages
]

df = pd.DataFrame({'language': languages, 'accuracy': accuracies, 'accuracy_pretrained': accuracies_pretrained})
# Report percentages rounded to two decimals.
df['accuracy'] = (df['accuracy'] * 100).round(2)
df['accuracy_pretrained'] = (df['accuracy_pretrained'] * 100).round(2)
df.to_csv('results/outputs_8b/accuracy.csv', index=False)
        

# Correct Overlap and Diff

In [None]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

languages = ["english", "french",  "german", "hindi", "italian", "portuguese",  "spanish"]
# languages = ["english", "french"]


def _score_predictions(language):
    """Return a boolean Series, indexed by question id, marking correct rows.

    Reads ``results/outputs/{language}-{language}/test.csv``. For each row the
    question text and the ``"### Answer: "`` marker are removed from the
    prediction; the row is correct when ANY whitespace-separated token of the
    answer occurs (case-insensitively) in what remains.

    Missing predictions/questions are treated as empty strings. When an id
    occurs more than once, only its first row is kept.
    """
    df = pd.read_csv(f'results/outputs/{language}-{language}/test.csv')
    answers = df['answer'].astype(str)
    questions = df['question'].fillna('').astype(str)
    predictions = df['prediction'].fillna('').astype(str)

    flags = []
    for answer, question, prediction in zip(answers, questions, predictions):
        generated = prediction.replace(question, '').replace("### Answer: ", '').strip().lower()
        flags.append(any(token.lower() in generated for token in answer.split()))

    scored = pd.Series(flags, index=df['id'], dtype=bool)
    return scored[~scored.index.duplicated(keep='first')]


# Score every language once instead of re-reading both files for each pair.
scored_by_language = {language: _score_predictions(language) for language in languages}

# NaN marks cells that are undefined: the diagonal, and any pair for which no
# shared question was answered correctly by either model.
overlap = np.full((len(languages), len(languages)), np.nan)
diffs = np.full((len(languages), len(languages)), np.nan)

for row, language1 in enumerate(languages):
    for col, language2 in enumerate(languages):
        if row == col:
            continue

        scored1 = scored_by_language[language1]
        scored2 = scored_by_language[language2]

        # Compare only the questions present in both test sets, aligned by id.
        common_ids = scored1.index.intersection(scored2.index)
        correct1 = scored1.loc[common_ids].to_numpy()
        correct2 = scored2.loc[common_ids].to_numpy()

        both_correct = int((correct1 & correct2).sum())
        at_least_one_correct = int((correct1 | correct2).sum())
        only_lang1 = int((correct1 & ~correct2).sum())

        if at_least_one_correct == 0:
            continue

        # overlap: share answered correctly by both, among those answered by at least one.
        overlap[row, col] = both_correct / at_least_one_correct
        # diffs: share answered correctly only by language1 (the row language);
        # the opposite direction is stored in the transposed cell.
        diffs[row, col] = only_lang1 / at_least_one_correct

# Express both matrices as percentages.
overlap = overlap * 100
diffs = diffs * 100

In [None]:
# Annotated heatmap of the `diffs` matrix (percentages). Rows are the language
# that answered correctly, columns the language that did not.
# NOTE(review): the title and output folder say Llama-1B / outputs_1b, while the
# cell that builds `diffs` reads from results/outputs/ and the overlap figure is
# labelled Llama-8B -- confirm which model this matrix belongs to.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    diffs,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    xticklabels=languages,
    yticklabels=languages,
    ax=ax,
)

ax.set_xlabel('Unknown Question for Language A')
ax.set_ylabel('Known Question for Language B')
ax.set_title('Differences between languages - Llama-1B')

fig.savefig('results/outputs_1b/diffs.png', dpi=300, bbox_inches='tight')

In [None]:
# Upper-triangle mask (diagonal included) over the full overlap matrix.
mask = np.triu(np.ones_like(overlap, dtype=bool))

# Plot only the lower triangle: the first row and the last column are dropped
# (rows 1.., columns ..5), and the same slice of the mask hides what is left of
# the upper triangle. The colour scale is pinned to [32, 67].
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    overlap[1:, :6],
    mask=mask[1:, :6],
    annot=True,
    fmt=".2f",
    cmap="viridis",
    vmin=32,
    vmax=67,
    xticklabels=languages[:6],
    yticklabels=languages[1:],
    ax=ax,
)

ax.set_xlabel('Language A')
ax.set_ylabel('Language B')
ax.set_title('Overlap between languages - Llama-8B')

fig.savefig('results/outputs_8b/overlap.png', dpi=300, bbox_inches='tight')

# CoCa-CoLa Table

In [20]:
import pandas as pd
import os
import numpy as np

languages = ["french",  "german", "hindi", "italian", "portuguese",  "spanish"]


def _coca_cola_stats(df, model):
    """Return cumulative accuracy and CoCa-CoLa ratio for one model.

    ``df`` needs ``model``, ``split`` and ``accuracy`` columns. Two rows of
    ``model`` are used: the first whose split is ``'test_english'`` and the
    first whose split is anything else (presumably the target-language test
    split -- verify against the CSV layout).

    Returns a dict with
      * ``cumulative_accuracy``: non-English accuracy + English accuracy
      * ``coca_cola_ratio``: non-English accuracy / cumulative accuracy

    Raises IndexError when either row is missing for ``model``.
    """
    rows = df[df['model'] == model]
    english = rows.loc[rows['split'] == 'test_english', 'accuracy']
    non_english = rows.loc[rows['split'] != 'test_english', 'accuracy']
    if english.empty or non_english.empty:
        raise IndexError(f"no accuracy rows for model {model!r} in both the English and non-English splits")

    non_english_accuracy = non_english.values[0]
    cumulative_accuracy = non_english_accuracy + english.values[0]
    return {
        'cumulative_accuracy': cumulative_accuracy,
        'coca_cola_ratio': non_english_accuracy / cumulative_accuracy,
    }


results_pretrained = {}
results_english_tuned = {}
results_finetuned = {}

for language in languages:
    df = pd.read_csv(f'results/coca-cola-3b/{language}.csv')
    # The model names below must match the `model` column of the CSV exactly.
    results_pretrained[language] = _coca_cola_stats(df, 'Pretrained')
    results_english_tuned[language] = _coca_cola_stats(df, 'english-tuned')
    results_finetuned[language] = _coca_cola_stats(df, 'Finetuned')


In [21]:
results_pretrained

{'french': {'cumulative_accuracy': np.float64(0.2057049306559954),
  'coca_cola_ratio': np.float64(0.5251140796197071)},
 'german': {'cumulative_accuracy': np.float64(0.1699715641246028),
  'coca_cola_ratio': np.float64(0.5652706616172867)},
 'hindi': {'cumulative_accuracy': np.float64(0.157633645424343),
  'coca_cola_ratio': np.float64(0.31925085021794325)},
 'italian': {'cumulative_accuracy': np.float64(0.1662928165018773),
  'coca_cola_ratio': np.float64(0.5667886054606229)},
 'portuguese': {'cumulative_accuracy': np.float64(0.1760045044763754),
  'coca_cola_ratio': np.float64(0.6336918792021713)},
 'spanish': {'cumulative_accuracy': np.float64(0.1917030059887202),
  'coca_cola_ratio': np.float64(0.6167541172545571)}}

In [22]:
results_english_tuned

{'french': {'cumulative_accuracy': np.float64(0.6255330059062931),
  'coca_cola_ratio': np.float64(0.14725679044959183)},
 'german': {'cumulative_accuracy': np.float64(0.4930484372709564),
  'coca_cola_ratio': np.float64(0.19642812084137395)},
 'hindi': {'cumulative_accuracy': np.float64(0.3826059599315413),
  'coca_cola_ratio': np.float64(0.10041640079727922)},
 'italian': {'cumulative_accuracy': np.float64(0.5316684976579708),
  'coca_cola_ratio': np.float64(0.162864714941002)},
 'portuguese': {'cumulative_accuracy': np.float64(0.5551631831910593),
  'coca_cola_ratio': np.float64(0.15990048986432648)},
 'spanish': {'cumulative_accuracy': np.float64(0.5755218326646897),
  'coca_cola_ratio': np.float64(0.15840943990948023)}}

In [23]:
results_finetuned

{'french': {'cumulative_accuracy': np.float64(0.5297363207217705),
  'coca_cola_ratio': np.float64(0.8944520241224033)},
 'german': {'cumulative_accuracy': np.float64(0.5700638441387316),
  'coca_cola_ratio': np.float64(0.8926006439008284)},
 'hindi': {'cumulative_accuracy': np.float64(0.39672807812342686),
  'coca_cola_ratio': np.float64(0.7747354530920902)},
 'italian': {'cumulative_accuracy': np.float64(0.4602325085673996),
  'coca_cola_ratio': np.float64(0.8791158677791949)},
 'portuguese': {'cumulative_accuracy': np.float64(0.5064018101167542),
  'coca_cola_ratio': np.float64(0.8510159034203246)},
 'spanish': {'cumulative_accuracy': np.float64(0.603767660910518),
  'coca_cola_ratio': np.float64(0.891835673426937)}}

# Visualization - Partial Training 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

models = ['llama-1b', 'llama-3b', 'llama-8b']
# languages = ['french', 'german', 'hindi']
languages = ['italian', 'portuguese', 'spanish']

# Partially tuned runs per model as (suffix of the CSV `model` value, tick label).
# Keeping both in one table stops the row lookup and the labels drifting apart.
# NOTE(review): '14-27' breaks the pattern of the other labels (0_14 -> '1-14',
# 15_31 -> '16-31'); it may be meant to read '15-27' -- confirm before changing.
partial_runs = {
    'llama-1b': [('0_5', '1-5'), ('0_10', '1-10'), ('10_16', '11-16')],
    'llama-3b': [('0_14', '1-14'), ('14_27', '14-27')],
}
# Any model not listed above falls back to this layout.
default_partial_runs = [('0_15', '1-15'), ('15_31', '16-31')]

# squeeze=False keeps `axs` two-dimensional even for a single row or column.
fig, axs = plt.subplots(len(models), len(languages), figsize=(15, 8), squeeze=False)

for i, model in enumerate(models):
    runs = partial_runs.get(model, default_partial_runs)
    for j, language in enumerate(languages):
        # NOTE(review): 'coca-coal-partial' looks like a typo of 'coca-cola-partial';
        # left unchanged because it has to match the directory name on disk.
        df = pd.read_csv(f'results/coca-coal-partial/{model}/english-{language}-full.csv')

        keys = (
            ['fully-english-trained']
            + [f'partialy_{language}-tuned_{suffix}' for suffix, _ in runs]
            + [f'fully-{language}-trained']
        )
        labels = (
            ['SFT$_{en}$']
            + [label for _, label in runs]
            + ['SFT$_{' + f'{language[:2]}' + '}$']
        )

        cumulative_acc = []
        language_acc = []
        for key in keys:
            rows = df[df['model'] == key]
            # First non-English split row and first English split row of this run.
            non_english = rows[rows['split'] != 'test_english']['accuracy'].values[0]
            english = rows[rows['split'] == 'test_english']['accuracy'].values[0]
            cumulative_acc.append(non_english + english)
            language_acc.append(non_english)

        ax = axs[i, j]
        positions = list(range(len(keys)))

        # Hatched, translucent bars: cumulative (non-English + English) accuracy.
        ax.bar(positions, cumulative_acc, color='aliceblue', edgecolor='xkcd:marine blue', label='Cum. Acc.', hatch='//', alpha=0.4)
        # Solid bars drawn on top: accuracy on the non-English split only.
        ax.bar(positions, language_acc, color='xkcd:marine blue', label=f'{language} Acc.', alpha=0.85)

        # Pin the tick positions before assigning the short labels.
        ax.set_xticks(positions)
        ax.set_xticklabels(labels, fontsize=12)

        # Shared y-axis range of 0 to 0.7 so panels are comparable.
        ax.set_ylim(0, 0.7)

        ax.set_title(f"{language}", fontsize=12)
        ax.legend(loc='upper left', fontsize=10)

    # Model name on the left of each row; the offsets are tuned for three rows.
    fig.text(0.09, 0.83 - i * 0.33, model, va='center', ha='center', rotation='vertical', fontsize=14)

# Leave the left 10% of the figure free for the row labels.
plt.tight_layout(rect=[0.1, 0, 1, 1])
plt.savefig('results/appendix_bar_plot.png', dpi=300, bbox_inches='tight')