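This notebook cleans the generated biographies, applies sanity checks to them (language detection and a minimum number of unique words), and analyzes the results of those checks.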

In [18]:
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
import json
import sqlite3
from tqdm import tqdm
from urllib.parse import unquote
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import py3langid as langid
import config
import tikzplotlib

In [32]:
# Project-relative directories (all paths end with a trailing slash).
# result_dir holds the generation JSON files and the language-detection CSV;
# fig_dir receives the exported figures.
data_dir, result_dir, fig_dir = (
    '../data/',
    '../result/generation/',
    '../report/Images/',
)

In [20]:
# Both lists come from the project configuration module.
# `models` holds the keys of config.checkpoints, in their declared order.
languages = config.languages
models = [*config.checkpoints.keys()]

# DATA IMPORTATION

In [21]:
# Prompt variants; each one is the `{p}` part of a `biographies_{m}_{p}.json` file name.
# NOTE(review): presumably 'en' = English prompt, 'lang' = prompt in the target language — confirm.
prompt = ['en', 'lang']

In [22]:
# Load every raw generation file into generations[model][prompt].
generations={}
for m in models:
    generations[m]={}
    for p in prompt:
        # JSON is UTF-8 by specification; state it explicitly so that the
        # multilingual text is not decoded with the platform's default encoding.
        with open(result_dir+f'biographies_{m}_{p}.json', 'r', encoding='utf-8') as f:
            generations[m][p]=json.load(f)

# DATA CLEANING

We keep only each model's answer, i.e. the text that follows its model-specific assistant marker in the raw generation, and remove the newlines from it.

In [5]:
# Marker after which the model's answer starts in the raw generated text,
# keyed by model family.
split_tokens = {
        'aya': '<|CHATBOT_TOKEN|>',
        'llama': 'assistant\n\n',
        'qwen': '\nassistant\n'
    }

def clean_generation(raw_text, model):
    '''Return the answer part of a raw generation, with newlines removed.

    Args:
        raw_text: full generated text, containing the model's split token.
        model: key of `split_tokens` ('aya', 'llama' or 'qwen').

    Returns:
        Everything after the FIRST occurrence of the split token, with all
        newline characters removed.

    Raises:
        KeyError: if `model` is not a key of `split_tokens`.
        IndexError: if the split token does not occur in `raw_text`.
    '''
    marker=split_tokens[model]
    # partition() splits on the first occurrence only, so an answer that
    # happens to contain the marker again is no longer truncated.
    _, found, answer=raw_text.partition(marker)
    if not found:
        raise IndexError(f'split token {marker!r} not found in generation for model {model!r}')
    # NOTE(review): newlines are removed without inserting a space, so the last
    # word of a line is glued to the first word of the next one.
    return answer.replace('\n', '')

In [6]:
# Replace every raw generation by its cleaned answer, keeping the
# link -> language code -> list of generations nesting.
# Note: the cleaned text normally no longer contains the split token, so
# running this cell a second time makes clean_generation raise.
for m in models:
    # split_tokens is keyed by the part of the model name before the first '_'.
    model_type = m.split('_')[0]
    for p in prompt:
        generations[m][p] = {
            link: {
                lang_code: [clean_generation(gen, model_type) for gen in gen_list]
                for lang_code, gen_list in lang_dict.items()
            }
            for link, lang_dict in generations[m][p].items()
        }

# SANITY CHECK

We apply the sanity checks: a generation is kept only if its detected language matches the target language and it contains at least 20 unique words.

In [7]:
def py3lang_detect(text):
    '''Return the most probable language code for the provided text.

    Only the top prediction of `langid.classify` is returned; the score that
    comes with it is discarded.
    '''
    lang, _score = langid.classify(text)
    return lang

In [8]:
def unique_words(sentence):
    '''Return the set of distinct lower-cased words in `sentence`.

    Every character that is neither alphanumeric nor whitespace is treated as
    a word separator; words are then obtained by splitting on whitespace.
    '''
    normalized = []
    for ch in sentence:
        if ch.isalnum() or ch.isspace():
            normalized.append(ch.lower())
        else:
            # Punctuation and symbols separate words.
            normalized.append(' ')
    return set(''.join(normalized).split())

In [9]:
# Thresholds of the sanity check.
min_unique_words = 20            # fewer distinct words than this -> generation dropped
expected_total = 19*486*3        # denominator of the reported percentage
                                 # NOTE(review): presumably languages x links x samples — confirm

# clean_generations[model][prompt][link][lang] -> generations that pass both checks
# rows -> one record per generation (kept or not), later turned into a DataFrame
clean_generations = {}
rows = []
with tqdm(total=len(models)*len(prompt), desc="Processing") as pbar:
    for m in models:
        clean_generations[m] = {}
        for p in prompt:
            count = 0
            kept_by_link = {}
            for link, lang_dict in generations[m][p].items():
                kept_by_lang = {}
                for lang_code, gen_list in lang_dict.items():
                    kept = []
                    for gen in gen_list:
                        pred_lang = py3lang_detect(gen)
                        nb_words = len(unique_words(gen))
                        # 'tokens' is len(gen), i.e. the number of characters.
                        rows.append({
                            'model': m,
                            'prompt': p,
                            'link': link,
                            'gold_lang': lang_code,
                            'pred_lang': pred_lang,
                            'tokens': len(gen),
                            'unique_tokens': nb_words,
                        })
                        if pred_lang == lang_code and nb_words >= min_unique_words:
                            kept.append(gen)
                    # Languages with no surviving generation are left out entirely.
                    if kept:
                        kept_by_lang[lang_code] = kept
                        count += len(kept)
                kept_by_link[link] = kept_by_lang
            clean_generations[m][p] = kept_by_link
            print(f"For {m} {p}, {count} generations left ({(count/expected_total)*100}%)")
            pbar.update(1)
       

Processing:   8%|▊         | 1/12 [00:21<03:51, 21.08s/it]

For llama_8 en, 23181 generations left (83.67987870911847%)


Processing:  17%|█▋        | 2/12 [00:46<03:56, 23.67s/it]

For llama_8 lang, 24516 generations left (88.4990253411306%)


Processing:  25%|██▌       | 3/12 [01:08<03:24, 22.73s/it]

For llama_70 en, 22590 generations left (81.54645873944119%)


Processing:  33%|███▎      | 4/12 [01:34<03:13, 24.21s/it]

For llama_70 lang, 26306 generations left (94.96065266045774%)


Processing:  42%|████▏     | 5/12 [01:57<02:45, 23.62s/it]

For qwen_7 en, 24866 generations left (89.7624720236806%)


Processing:  50%|█████     | 6/12 [02:19<02:19, 23.27s/it]

For qwen_7 lang, 24678 generations left (89.08382066276803%)


Processing:  58%|█████▊    | 7/12 [02:42<01:55, 23.11s/it]

For qwen_72 en, 20632 generations left (74.47837701249007%)


Processing:  67%|██████▋   | 8/12 [03:06<01:33, 23.26s/it]

For qwen_72 lang, 24778 generations left (89.44480542921089%)


Processing:  75%|███████▌  | 9/12 [03:29<01:09, 23.25s/it]

For aya_8 en, 19610 generations left (70.78911269944408%)


Processing:  83%|████████▎ | 10/12 [03:55<00:48, 24.18s/it]

For aya_8 lang, 22278 generations left (80.42018626813949%)


Processing:  92%|█████████▏| 11/12 [04:22<00:24, 24.91s/it]

For aya_35 en, 23414 generations left (84.52097321493032%)


Processing: 100%|██████████| 12/12 [04:48<00:00, 24.04s/it]

For aya_35 lang, 23229 generations left (83.85315139701105%)





In [None]:
# Persist the filtered generations, one JSON file per (model, prompt) pair.
for m in models:
    for p in prompt:
        # result_dir already ends with '/'; os.path.join avoids the doubled
        # separator that plain concatenation with '/clean_...' produced.
        out_path = os.path.join(result_dir, f'clean_biographies_{m}_{p}.json')
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(clean_generations[m][p], f)

In [None]:
# Turn the per-generation records into a table and store it next to the
# generation files, so the analysis below can start from the CSV.
detection_columns = [
    'model', 'prompt', 'link',
    'gold_lang', 'pred_lang',
    'tokens', 'unique_tokens',
]
lang_df = pd.DataFrame(rows, columns=detection_columns)
lang_df.to_csv(result_dir + 'lang_dectect.csv', index=False)

# ANALYSIS

We analyze the results of the sanity checks.

In [23]:
# Reload the per-generation detection table from disk.
detection_csv = result_dir + 'lang_dectect.csv'
lang_df = pd.read_csv(detection_csv)

In [24]:
# Add the language-match flag (1 when the detected language equals the gold
# language, else 0) and make sure both count columns are integers.
lang_df = lang_df.assign(
    match=(lang_df['gold_lang'] == lang_df['pred_lang']).astype(int),
    tokens=lang_df['tokens'].astype(int),
    unique_tokens=lang_df['unique_tokens'].astype(int),
)

In [25]:
# Overall share of generations whose detected language equals the gold one
# (printed as a fraction between 0 and 1).
match_share = lang_df['match'].sum() / len(lang_df)
print('Percentages of generations in correct language:', match_share)

0.8621910572040526

In [26]:
# Plot percentage of generations in correct language per subject LLM

def _match_rate(flags):
    '''Share of rows in the group whose match flag is 1.'''
    return flags.sum() / flags.count()

# One match rate per (model, prompt, gold language).
aggregated_df = (
    lang_df
    .groupby(['model', 'prompt', 'gold_lang'])
    .agg(match_norm=('match', _match_rate))
    .reset_index()
)

# Languages as rows, one column per (model, prompt) pair.
pivoted_df = aggregated_df.pivot_table(
    values='match_norm',
    index='gold_lang',
    columns=['model', 'prompt'],
)
# Flatten the two column levels into names such as 'aya_8_lang'.
pivoted_df.columns = ['_'.join(levels).strip() for levels in pivoted_df.columns]
# Rows follow the order of `languages`.
pivoted_df = pivoted_df.reindex(languages)
# Columns: families grouped together, 'lang' prompt before 'en' for each model.
column_order = [
    f'{model_name}_{prompt_kind}'
    for model_name in ('aya_8', 'aya_35', 'llama_8', 'llama_70', 'qwen_7', 'qwen_72')
    for prompt_kind in ('lang', 'en')
]
pivoted_df = pivoted_df[column_order]

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(pivoted_df, cmap="coolwarm_r", linewidths=.5, ax=ax)
ax.set_title('Heatmap of Normalized Match Scores Across Models and Prompts')
ax.set_xlabel('Model_Prompt')
ax.set_ylabel('Gold Language')
# Export the current figure as TikZ before it is shown.
tikzplotlib.save(fig_dir+"langdetect.tex")
plt.show()