In [1]:
import os
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
# Names of matplotlib's Tableau palette entries; the plotting helpers below index
# into this list to give each data size its own colour.
tab_colors = list(mcolors.TABLEAU_COLORS.keys())

""" Some helpful function on visualisation"""
def human_format(num):
    """Format a number compactly with a K/M/B/T suffix, e.g. 1234567 -> '1.23M'.

    The value is first rounded to three significant digits, then divided by
    1000 until it drops below 1000 or the largest suffix ('T') is reached.
    Values of 1e15 and above therefore stay expressed in 'T' (e.g. '1000T')
    instead of running off the end of the suffix table.

    Args:
        num: any real number accepted by ``'{:.3g}'.format``.

    Returns:
        The formatted string, with trailing zeros and a dangling decimal
        point removed.
    """
    suffixes = ['', 'K', 'M', 'B', 'T']
    num = float('{:.3g}'.format(num))
    magnitude = 0
    # Stop at the last suffix so the lookup below can never go out of range.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), suffixes[magnitude])

def percentage(f):
    """Render a fraction as a percentage label, e.g. 0.1 -> '10%', 0.125 -> '12.5%'.

    The scaled value is rounded to 10 decimal places before formatting so that
    binary floating-point noise (0.07 * 100 == 7.000000000000001) does not
    leak into the label.

    Args:
        f: fraction to convert (1.0 means 100%).

    Returns:
        The percentage as a string; whole percentages are printed without a
        decimal part.
    """
    p = round(f * 100, 10)
    return f'{int(p)}%' if p == int(p) else f'{p}%'

# Per-language corpus statistics table.
data_stat = pd.read_csv('./results/data_stat.csv')

# Lookup: language code -> value of the '#tokens_cc100' column.
tot_tokens = {code: n_tokens for code, n_tokens in zip(data_stat.lang, data_stat['#tokens_cc100'])}

# The same counts as a frame with a human-readable column, ordered by size.
df_tot_tokens = pd.DataFrame(tot_tokens.items(), columns=['lang', '#tokens'])
df_tot_tokens['readable_#tokens'] = df_tot_tokens['#tokens'].apply(human_format)
df_tot_tokens = df_tot_tokens.sort_values('#tokens')

# Fraction of each language's token count that corresponds to 5M tokens (5e6 / total).
data_stat['ratio_5M'] = data_stat['#tokens_cc100'].apply(lambda n_tokens: 5e6 / n_tokens)

In [2]:
# for i, row in data_stat[['lang', 'ratio_5M']].iterrows():
#     print('preprocess_5M {} {};'.format(row['lang'], row['ratio_5M']))

# data_stat['ratio_10M'] = data_stat['#tokens_cc100'].apply(lambda x: 10e6 / x) 
# for i, row in data_stat[['lang', 'ratio_10M']].iterrows():
#     print('preprocess_10M {} {};'.format(row['lang'], row['ratio_10M']))

# Load save dirs for different languages

In [3]:
# Table of runs; later cells read its 'Tags', 'args.save_dir' and 'args.data' columns.
res = pd.read_csv('./results/cc100/nli/varied_data_amount_save_dirs_5M.csv')

In [4]:
def obtain_tag_lang(tags):
    """Return the first comma-separated tag that is a known language code.

    Whitespace around the whole string and around each tag is ignored.
    Returns None when no tag matches.
    """
    known_langs = ('ar', 'bg', 'de', 'el', 'es', 'fr',
                   'hi', 'ru', 'sw', 'th', 'tr', 'ur',
                   'vi', 'zh')
    candidates = (piece.strip() for piece in tags.strip().split(','))
    return next((piece for piece in candidates if piece in known_langs), None)

# def obtain_tag_model_type(name):
#     return name.split('-')[-1]

def obtain_tag_model_type(tags):
    """Return the first comma-separated tag equal to 'standard' or 'forget'.

    Whitespace around the whole string and around each tag is ignored.
    Returns None when neither tag is present.
    """
    model_types = ('standard', 'forget')
    candidates = (piece.strip() for piece in tags.strip().split(','))
    return next((piece for piece in candidates if piece in model_types), None)

def obtain_model_dir(save_dir):
    """Return the part of ``save_dir`` after its last 'exps/' marker.

    When the marker is absent the path is returned unchanged.
    """
    _, _, tail = save_dir.rpartition('exps/')
    return tail

def obtain_data_size(data_dir):
    """Infer the training-data size encoded in a data directory name.

    Returns:
        1.0 when the name has no '-fixed-valid-' marker;
        5e6 when the marker is present and the name contains '5M';
        otherwise the float parsed from the text after the last '-'.
    """
    if '-fixed-valid-' not in data_dir:
        return 1.0
    if '5M' in data_dir:
        return 5e6
    size_text = data_dir.split('-')[-1]
    return float(size_text)

# Derive the analysis columns: (new column, source column, parser applied per row).
for new_col, src_col, parser in [
    ('lang', 'Tags', obtain_tag_lang),
    ('model_type', 'Tags', obtain_tag_model_type),
    ('model_dir', 'args.save_dir', obtain_model_dir),
    ('data_size', 'args.data', obtain_data_size),
]:
    res[new_col] = res[src_col].apply(parser)

# Splice the embedding and the body

Download the paths for the specific language from wandb.

In [5]:
# Run-mode switch read by the cells below.
#   True : keep checkpoints whose update count is a multiple of 5000 and use the
#          "launch_mlqa_5M.out" log / "res_varied_data_eval_mlqa.csv" results.
#   False: keep checkpoints with MIN_UPDATES < updates <= MAX_UPDATES and use the
#          "launch_mlqa_5M_5k.out" log / "res_varied_data_eval_mlqa_0-5k.csv" results.
coarse = False  # NOTE: set this before running the cells below

In [6]:
# filter specific run
# res = res[(res['data_size'] == 0.01) | (res['data_size'] == 0.1)]
# res = res[res['model_type'] == 'forget']
# res = res[res['lang'] == 'ar']

# Shell snippet that splices an adapted embedding checkpoint onto a fine-tuned body.
# Positional fields, in order: body checkpoint, embedding checkpoint, splice alias.
# Doubled braces are literal braces, so the shell's ${VAR} expansions survive
# str.format() untouched (same convention as eval_cmd_template).
cmd_template = """
export NLI_IRoberta="{}"
export ADAPT_IRoberta="{}"

EMB="${{EXP_DIR}}${{ADAPT_IRoberta}}"
BODY="${{EXP_DIR}}${{NLI_IRoberta}}"
SPLICE="${{EXP_DIR}}splice/xquad_mlqa/{}/"
python fb_sweep/splice_emb_body.py --emb_path $EMB --body_path $BODY --splice_path $SPLICE
"""
# Step window used when `coarse` is False: MIN_UPDATES < step <= MAX_UPDATES.
MAX_UPDATES = 5000
MIN_UPDATES = 0
# Fine-tuned body checkpoint (appended to $EXP_DIR in the command) per model type.
NLI_BODY_PATH = {'standard':'cc100/qa-roberta-base/QA.squad.froberta_base.adam.clip0.0.seed1.lr3e-05.warm3999.ngpu1/checkpoint_best.pt',
                 'forget':'cc100/qa-clip0.5.adamef.k1000/QA.squad.firoberta_base.adam.clip0.0.seed1.lr3e-05.warm3999.ngpu1/checkpoint_best.pt'}

# Parallel lists, one entry per emitted splice command; turned into a frame later.
model_type_list = []
data_size_list = []
num_update_list = []
alias_list = []
lang_list = []
for _, row in res.iterrows():
    lang = row['lang']
    model_dir = row['model_dir']
    model_type = row['model_type']
    data_size = row['data_size']

    # Gather the numbered checkpoints of this run as (step, file name, step text).
    checkpoints = []
    for file in os.listdir(row['args.save_dir']):
        if '_' not in file or 'best' in file or 'last' in file:
            continue
        num_updates = file.split('_')[-1].split('.')[0]
        try:
            step = int(num_updates)
        except ValueError:
            # Some other file with an underscore in its name; not a numbered checkpoint.
            continue
        checkpoints.append((step, file, num_updates))

    # os.listdir() returns entries in arbitrary order; sort by step so the printed
    # commands and the collected rows are reproducible.
    for step, file, num_updates in sorted(checkpoints):
        if coarse:
            selected = step % 5000 == 0
        else:
            selected = MIN_UPDATES < step <= MAX_UPDATES
        if not selected:
            continue

        emb_path = model_dir + '/' + file
        body_path = NLI_BODY_PATH[model_type]
        splice_path = '{}_adapt-emb-{}{}-step{}_finetune-body-en'.format(model_type, lang, data_size, num_updates)
        cmd = cmd_template.format(body_path, emb_path, splice_path)
        print('#{}'.format(splice_path))
        print(cmd)
        num_update_list.append(num_updates)
        data_size_list.append(data_size)
        model_type_list.append(model_type)
        alias_list.append(splice_path)
        lang_list.append(lang)

#forget_adapt-emb-ar5000000.0-step500_finetune-body-en

export NLI_IRoberta="cc100/qa-clip0.5.adamef.k1000/QA.squad.firoberta_base.adam.clip0.0.seed1.lr3e-05.warm3999.ngpu1/checkpoint_best.pt"
export ADAPT_IRoberta="cc100/adapt20221111-063228/forgeT.firoberta_base.adam.lr0.0007.wu10000.ms32.uf2.mu125000.s1.ngpu32/checkpoint_1_500.pt"

EMB="${EXP_DIR}${ADAPT_IRoberta}"
BODY="${EXP_DIR}${NLI_IRoberta}"
SPLICE="${EXP_DIR}splice/xquad_mlqa/forget_adapt-emb-ar5000000.0-step500_finetune-body-en/"
python fb_sweep/splice_emb_body.py --emb_path $EMB --body_path $BODY --splice_path $SPLICE

#forget_adapt-emb-ar5000000.0-step1000_finetune-body-en

export NLI_IRoberta="cc100/qa-clip0.5.adamef.k1000/QA.squad.firoberta_base.adam.clip0.0.seed1.lr3e-05.warm3999.ngpu1/checkpoint_best.pt"
export ADAPT_IRoberta="cc100/adapt20221111-063228/forgeT.firoberta_base.adam.lr0.0007.wu10000.ms32.uf2.mu125000.s1.ngpu32/checkpoint_1_1000.pt"

EMB="${EXP_DIR}${ADAPT_IRoberta}"
BODY="${EXP_DIR}${NLI_IRoberta}"
SPLICE

# Run the Splice Commands in launch_varied_data_splice.sh

In [7]:
# One row per generated splice command; exact repeats are collapsed.
splice_columns = {
    'model_type': model_type_list,
    'data_size': data_size_list,
    'num_update': num_update_list,
    'alias': alias_list,
    'lang': lang_list,
}
df = pd.DataFrame(splice_columns).drop_duplicates()

# Eval the spliced model

In [8]:
# Shell snippet that evaluates one spliced model on the MLQA dev set.
# Positional fields, in order: alias (echoed so the log can be parsed later),
# language code, alias again (exported as MODEL_ALIAS).
# Doubled braces are literal braces for the shell's ${VAR} expansions.
# `mkdir -p` is used so the command neither fails when the parent results
# directory is missing nor when the output directory already exists on a re-run.
# The string is deliberately not raw: each trailing backslash joins the next
# line, so the python invocation is emitted as a single line.
eval_cmd_template = """echo {}
export L2="{}"
export MODEL_ALIAS="{}"
export WORK_DIR="/checkpoint/yhc/inductivise-lm/inductivise-lm/"
EXP_DIR=${{WORK_DIR}}exps/
DAT_DIR=${{WORK_DIR}}datasets/
RES_DIR=${{WORK_DIR}}results/
DICT=${{DAT_DIR}}XQUAD_MLQA/$L2/bin/dict.txt
SPM=${{DAT_DIR}}cc100/$L2/spm/spm.bpe.model
SPLICED_MODEL=${{EXP_DIR}}splice/xquad_mlqa/${{MODEL_ALIAS}}/checkpoint_best.pt
SQUAD_PROCESSED_FOLDER=${{DAT_DIR}}XQUAD_MLQA/bin
OUTPUT_SAVE_DIR=${{RES_DIR}}cc100/xquad_mlqa/${{MODEL_ALIAS}}/
mkdir -p $OUTPUT_SAVE_DIR
MLQA_VALID_FOLDER=${{DAT_DIR}}MLQA/dev/
MLQA_L2_VALID=mlqa_valid_$L2
MLQA_L2_VALID_GOLD=$MLQA_VALID_FOLDER/dev-context-$L2-question-$L2.json
python scripts/eval_squad.py \
    --path $SPLICED_MODEL \
    --sentencepiece-model $SPM \
    --data $SQUAD_PROCESSED_FOLDER \
    --gold_data_file $MLQA_L2_VALID_GOLD \
    --gen_subset $MLQA_L2_VALID \
    --save_dir $OUTPUT_SAVE_DIR \
    --language $L2
"""

# Language sets of the two benchmarks; `sel_langs` picks the one to evaluate.
xquad_langs = ['ar', 'de', 'el', 'es', 'hi', 'ru', 'th', 'tr', 'vi', 'zh']
mlqa_langs = ['ar', 'de', 'es', 'hi', 'vi', 'zh']
sel_langs = mlqa_langs

# Emit one evaluation command per spliced model whose language is selected.
for _, row in df[df['lang'].isin(sel_langs)].iterrows():
    eval_cmd = eval_cmd_template.format(row['alias'], row['lang'], row['alias'])
    print(eval_cmd)

echo forget_adapt-emb-ar5000000.0-step500_finetune-body-en
export L2="ar"
export MODEL_ALIAS="forget_adapt-emb-ar5000000.0-step500_finetune-body-en"
export WORK_DIR="/checkpoint/yhc/inductivise-lm/inductivise-lm/"
EXP_DIR=${WORK_DIR}exps/
DAT_DIR=${WORK_DIR}datasets/
RES_DIR=${WORK_DIR}results/
DICT=${DAT_DIR}XQUAD_MLQA/$L2/bin/dict.txt
SPM=${DAT_DIR}cc100/$L2/spm/spm.bpe.model
SPLICED_MODEL=${EXP_DIR}splice/xquad_mlqa/${MODEL_ALIAS}/checkpoint_best.pt
SQUAD_PROCESSED_FOLDER=${DAT_DIR}XQUAD_MLQA/bin
OUTPUT_SAVE_DIR=${RES_DIR}cc100/xquad_mlqa/${MODEL_ALIAS}/
mkdir $OUTPUT_SAVE_DIR
MLQA_VALID_FOLDER=${DAT_DIR}MLQA/dev/
MLQA_L2_VALID=mlqa_valid_$L2
MLQA_L2_VALID_GOLD=$MLQA_VALID_FOLDER/dev-context-$L2-question-$L2.json
python scripts/eval_squad.py     --path $SPLICED_MODEL     --sentencepiece-model $SPM     --data $SQUAD_PROCESSED_FOLDER     --gold_data_file $MLQA_L2_VALID_GOLD     --gen_subset $MLQA_L2_VALID     --save_dir $OUTPUT_SAVE_DIR     --language $L2

echo forget_adapt-emb-ar5000

# Run the eval commands in launch_varied_data_eval.py

# Parse the logs & Visualise the acc-step curve

In [9]:
# Pick the evaluation log that matches the current run mode.
if coarse:
    log_path = 'results/launch_mlqa_5M.out'
else:
    log_path = 'results/launch_mlqa_5M_5k.out'

# Parse (alias, score) pairs from the log.
# An alias line is any line mentioning 'standard' or 'forget' that is not a
# '/checkpoint' path; a score line is any line containing 'exact_match', from
# which the number after the last ':' (up to the closing '}') is taken.
# Each score is attached to the most recent alias line, so an evaluation that
# produced no score no longer shifts every following score onto the wrong alias
# (or makes the two lists differ in length).
acc, alias = [], []
pending_alias = None
unmatched_aliases = []
with open(log_path, 'r') as f:
    for line in f:
        line = line.strip()
        if ('standard' in line or 'forget' in line) and '/checkpoint' not in line:
            if pending_alias is not None:
                unmatched_aliases.append(pending_alias)
            pending_alias = line
        if 'exact_match' in line and pending_alias is not None:
            alias.append(pending_alias)
            acc.append(float(line.split(':')[-1].split('}')[0]))
            pending_alias = None
if pending_alias is not None:
    unmatched_aliases.append(pending_alias)
if unmatched_aliases:
    print('No score found for {} evaluation(s): {}'.format(len(unmatched_aliases), unmatched_aliases))

# Attach the scores to the splice metadata and store the table for plotting.
df_log = pd.DataFrame({'alias': alias, 'acc': acc})
df_final = pd.merge(df, df_log, on='alias')
if coarse:
    df_final.to_csv('results/cc100/res_varied_data_eval_mlqa.csv')
else:
    df_final.to_csv('results/cc100/res_varied_data_eval_mlqa_0-5k.csv')

# Read the data and change the ratio into number of tokens

In [10]:
def make_legend_handle(linestyle, color, ax=None):
    """Create an empty line that only serves as a legend entry.

    Args:
        linestyle: matplotlib line style of the entry.
        color: matplotlib colour of the entry.
        ax: axes to create the line on; when omitted the line is created
            through pyplot on the current axes.

    Returns:
        The created line object.
    """
    target = plt if ax is None else ax
    return target.plot([], [], ls=linestyle, color=color)[0]


def _draw_NLI_curves(lang, coarse, ax):
    """Draw the score-vs-updates curves of one language on ``ax`` and add the legend.

    Reads the results CSV for the run mode, keeps the rows of ``lang`` and
    draws one curve per (model type, data size): the line style encodes the
    model type, the colour encodes the data size.

    Args:
        lang: language code to plot.
        coarse: falsy selects the '0-5k' results file, truthy the coarse one.
        ax: matplotlib axes to draw on.

    Returns:
        Number of result rows found for ``lang``.
    """
    if coarse:
        csv_path = 'results/cc100/res_varied_data_eval_mlqa.csv'
    else:
        csv_path = 'results/cc100/res_varied_data_eval_mlqa_0-5k.csv'
    results = pd.read_csv(csv_path)
    # .copy() so the column assignments below do not write into a slice of the full frame.
    results = results[results['lang'] == lang].copy()

    # A data_size below 1 is treated as a fraction of the language's total tokens;
    # this is decided from the first row only. An empty selection is not a ratio.
    is_ratio = (not results.empty) and results['data_size'].iloc[0] < 1
    if is_ratio:
        results['#tokens'] = results['data_size'] * tot_tokens[lang]
    else:
        results['#tokens'] = results['data_size']
    results['num_update'] = results['num_update'].astype(int)

    linestyles = {'forget': 'solid', 'standard': '--'}
    # Distinct (#tokens, data_size) pairs, largest first.
    sizes = sorted(
        (tuple(pair) for pair in results[['#tokens', 'data_size']].drop_duplicates().values.tolist()),
        reverse=True,
    )
    # Wrap around the palette instead of failing when there are more sizes than colours.
    colors = {size: tab_colors[i % len(tab_colors)] for i, size in enumerate(sizes)}

    # Legend: first the model types (black, by line style), then the sizes (by colour).
    handles = [make_legend_handle(ls, 'k', ax) for ls in linestyles.values()]
    handles += [make_legend_handle('solid', colors[size], ax) for size in sizes]
    labels = list(linestyles)
    if is_ratio:
        labels += ['{}({})'.format(percentage(size[1]), human_format(size[0])) for size in sizes]
    else:
        labels += ['{}'.format(human_format(size[0])) for size in sizes]

    for model_type, linestyle in linestyles.items():
        of_type = results[results['model_type'] == model_type]
        for size in sizes:
            # Sort by step so the line runs left to right whatever the row order in the file.
            curve = of_type[of_type['#tokens'] == size[0]].sort_values('num_update')
            ax.plot(curve['num_update'], curve['acc'],
                    color=colors[size], linestyle=linestyle)
    ax.legend(handles, labels)
    return len(results)


def plot_NLI(lang, coarse):
    """Plot the curves of one language in a new stand-alone figure.

    Prints the language and its number of result rows, then draws the curves
    and a full title.

    Args:
        lang: language code to plot.
        coarse: run-mode switch selecting the results file.
    """
    fig, ax = plt.subplots()
    n_rows = _draw_NLI_curves(lang, coarse, ax)
    print('Language: {}, Length: {}'.format(lang, n_rows))
    ax.set_title('MLQA F1 Score vs Adaptation Steps [{}]'.format(lang.upper()))


def subplot_NLI(lang, coarse, ax):
    """Plot the curves of one language on an existing axes, titled with the language.

    Args:
        lang: language code to plot.
        coarse: run-mode switch selecting the results file.
        ax: matplotlib axes (one panel of a grid) to draw on.
    """
    _draw_NLI_curves(lang, coarse, ax)
    ax.set_title('{}'.format(lang.upper()))

In [None]:
# Draw one stand-alone figure per selected language.
for lang_code in sel_langs:
    plot_NLI(lang_code, coarse)

In [None]:
# One panel per selected language, laid out in a single row. The number of
# panels follows `sel_langs`, so switching the language set cannot make two
# languages land on the same axes. squeeze=False keeps `axs` an array even
# when there is a single panel.
fig, axs = plt.subplots(1, len(sel_langs), squeeze=False)
fig.set_size_inches(64, 10)
for ax, lang_code in zip(axs.flat, sel_langs):
    subplot_NLI(lang_code, coarse, ax)

for ax in axs.flat:
    ax.set(xlabel='Adaptation Steps', ylabel='MLQA F1 Score')

fig.suptitle('MLQA F1 Score vs Adaptation Steps')

In [14]:
# Save the multi-panel figure; the file name depends on the run mode.
fig_path = ('results/cc100/mlqa_all_languages_5M.png' if coarse == True
            else 'results/cc100/mlqa_all_languages_5M_5k.png')
fig.savefig(fig_path)

# Compute Convergence Speed
Take the performance at 5K updates and compare it with the performance at 125K updates.

In [None]:
def _acc_by_model_type(results, num_update):
    """Return per-language scores of both model types at a given update count.

    Args:
        results: frame with 'num_update', 'model_type', 'lang' and 'acc' columns.
        num_update: update count to select.

    Returns:
        Frame with columns 'lang', 'acc_standard' and 'acc_forget', holding the
        languages that have both a 'standard' and a 'forget' row at that count.
    """
    at_step = results[results['num_update'] == num_update]
    standard = at_step[at_step['model_type'] == 'standard'][['lang', 'acc']]
    forget = at_step[at_step['model_type'] == 'forget'][['lang', 'acc']]
    return pd.merge(standard, forget, on='lang', suffixes=['_standard', '_forget'])


# NOTE(review): these paths differ from the CSVs written by the log-parsing cell
# above - confirm the files were moved/renamed on purpose.
df_final = pd.read_csv('results/cc100/mlqa/mlqa_res_5M_eval.csv')
both_125k = _acc_by_model_type(df_final, 125000)

df_5k = pd.read_csv('results/cc100/mlqa/mlqa_res_5M_eval_0-5k.csv')
both_5k = _acc_by_model_type(df_5k, 5000)

# Score at 5K updates as a percentage of the score at 125K updates.
both = pd.merge(both_5k, both_125k, on='lang', suffixes=['_5k', '_125k'])
both['converging_forget_5k'] = 100 * (both['acc_forget_5k'] / both['acc_forget_125k'])
both['converging_standard_5k'] = 100 * (both['acc_standard_5k'] / both['acc_standard_125k'])
both = both[['lang', 'converging_standard_5k', 'converging_forget_5k']]

both.to_csv('results/cc100/mlqa/mlqa_converging_5k.csv')

# Average over the numeric columns only: calling mean() on the whole frame would
# also try to reduce the string column 'lang', which newer pandas rejects.
c = both[['converging_standard_5k', 'converging_forget_5k']].mean().values
print('Averaging Converging Percent at 5K: Standard {}, Forget {}'.format(c[0], c[1]))

# Compute the diff between forget and standard

In [2]:
import seaborn as sns

In [6]:
# Reload the evaluation results; the cells below compare the two model types at 125000 updates.
df_final = pd.read_csv('results/cc100/mlqa/mlqa_res_5M_eval.csv')

In [7]:
# Language and score of the 'forget' rows at 125000 updates.
is_final_forget = (df_final['num_update'] == 125000) & (df_final['model_type'] == 'forget')
forget = df_final.loc[is_final_forget, ['lang', 'acc']]

In [8]:
# Language and score of the 'standard' rows at 125000 updates.
is_final_standard = (df_final['num_update'] == 125000) & (df_final['model_type'] == 'standard')
standard = df_final.loc[is_final_standard, ['lang', 'acc']]

In [9]:
# Side-by-side scores per language: columns 'acc_standard' and 'acc_forget'.
both = standard.merge(forget, on='lang', suffixes=['_standard', '_forget'])

In [10]:
# Absolute gain of 'forget' over 'standard'.
both['diff'] = both['acc_forget'].sub(both['acc_standard'])

In [11]:
# Order the languages by absolute gain, smallest first.
both = both.sort_values('diff')

In [12]:
# Gain expressed as a percentage of the 'standard' score.
both['relative_gain'] = both['diff'].mul(100).div(both['acc_standard'])

In [None]:
# Bar chart of the relative gain per language, smallest gain first.
both = both.sort_values('relative_gain')
ax = sns.barplot(data=both, x='lang', y='relative_gain', palette=sns.color_palette('pastel'))
ax.set_xlabel('Languages')
ax.set_ylabel('F1 Score Relative Gain in Percentage')
ax.set_title('Relative Gain of F1 Score on MLQA')
ax.figure.savefig('./results/cc100/MLQA_all_languages_5M_relative_gain.png')

In [None]:
# Mean of the per-language relative gains.
average_relative_gain = both['relative_gain'].mean()
print(f'Average Relative Gain: {average_relative_gain}')