In [1]:
import pandas as pd
import torch
import os
import glob
import json
import wandb


In [2]:
# Root of the ss-llm checkout. Every path below is derived from it, so the
# notebook can be pointed at another machine by setting SS_LLM_ROOT instead of
# editing six absolute paths. The default reproduces the original locations.
project_root = os.environ.get('SS_LLM_ROOT', '/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm')
nanogpt_root = os.path.join(project_root, 'nanoGPT')
natural_stories_dir = os.path.join(nanogpt_root, 'eval', 'natural_stories')

read_data_path = os.path.join(nanogpt_root, 'results', 'rundata.xlsx')
log_path = os.path.join(project_root, 'logs')
output_path = os.path.join(nanogpt_root, 'output_dump')
reading_time_path = os.path.join(natural_stories_dir, 'modelcomp_scores.csv')
model_surprisal_path = os.path.join(natural_stories_dir, 'storyword_model_surprisals.csv')
story_surprisal_key_path = os.path.join(natural_stories_dir, 'story_surprisal_keys.csv')

# Result files grouped by evaluation suite; the category name is the file
# name without its ".json" suffix.
TASKS = {
    "blimp": ["anaphor_agreement.json", "argument_structure.json", "binding.json",
              "control_raising.json", "determiner_noun_agreement.json", "ellipsis.json",
              "filler_gap.json", "irregular_forms.json", "island_effects.json",
              "npi_licensing.json", "quantifiers.json", "subject_verb_agreement.json"],
    "supplement": ["hypernym.json", "qa_congruence_easy.json", "qa_congruence_tricky.json",
               "subject_aux_inversion.json", "turn_taking.json"]
}

# Flat list of [category_name, suite] pairs: all "blimp" entries first, then
# all "supplement" entries, each in the order listed in TASKS.
blimp_categories = [
    [file_name.split(".json")[0], suite]
    for suite in ("blimp", "supplement")
    for file_name in TASKS[suite]
]

In [3]:
#Things to do:
# check if all runs in run ID have corresponding logs locally - done
# check if all runs in run ID have an output folder - done
# check if all runs in run ID have an output folder with a checkpoint - done
# check if all runs in run ID have samples, and flag them if not
# check if all runs have a train and eval loss in wandb
# check if all runs have blimp eval results in output folder
# check if all runs have results for reading time


In [4]:
# Load the run registry (sheet 'Run IDs' of the results workbook) and the raw
# evaluation tables from disk.
run_ids = pd.read_excel(read_data_path, sheet_name='Run IDs')
# The reading-time scores file is read as tab-separated despite its .csv extension.
reading_time_raw_df = pd.read_csv(reading_time_path, sep='\t')
model_surprisal_raw_df = pd.read_csv(model_surprisal_path)
# NOTE(review): loaded here but not referenced in the cells shown; confirm it is still needed.
model_surprisal_key_df = pd.read_csv(story_surprisal_key_path)

def check_run_logs(run_ids, log_path):
    """Flag which run IDs have an output log somewhere under ``log_path``.

    Every ``*.out`` file below ``log_path`` is considered (recursively). The
    part of the file name before the first ``.`` is taken as the run ID.
    Files whose name contains ``err`` are ignored, and so are files whose
    stem is not an integer (previously such a file aborted the whole scan
    with a ValueError).

    Args:
        run_ids: Iterable of integer run IDs (e.g. the ``run_id`` column).
        log_path: Root directory that is searched for log files.

    Returns:
        DataFrame with columns ``run_id`` and ``log_exists`` (bool), one row
        per entry of ``run_ids`` in the same order. The columns are present
        even when ``run_ids`` is empty.
    """
    out_log_ids = set()  # set: membership is tested once per run ID below
    for log_file in glob.glob(log_path + '/**/*.out', recursive=True):
        log_name = os.path.basename(log_file)
        if "err" in log_name:
            # error-stream logs never count as evidence that the run logged output
            continue
        try:
            out_log_ids.add(int(log_name.split('.')[0]))
        except ValueError:
            # not a "<run_id>.out" file (e.g. "slurm.out"); nothing to match against
            continue

    records = [{"run_id": run_id, "log_exists": run_id in out_log_ids} for run_id in run_ids]
    return pd.DataFrame(records, columns=["run_id", "log_exists"])

def check_output_dump(run_ids, output_path):
    """Match each run ID to its folder under ``output_path``.

    Only entries whose name starts with ``out`` are considered. The run ID is
    read from the last ``-``-separated segment when the name contains
    ``curr``, otherwise from the fifth segment. The segment is accepted when
    it is all digits, or when it contains ``s`` or ``nm`` and the part before
    the first ``_`` is an integer (e.g. ``7678383_s42``). Entries that do not
    fit are skipped; too-short names and non-numeric IDs are reported on
    stdout instead of aborting the scan.

    Args:
        run_ids: Series named ``run_id`` (or a DataFrame with a ``run_id``
            column) listing the runs of interest.
        output_path: Directory whose entries are scanned.

    Returns:
        ``run_ids`` left-joined with ``output_folder_name`` ("" when no
        folder matched) and a boolean ``output_exists`` column.
    """
    output_dump_ids = []
    for out_id in os.listdir(output_path):
        if not out_id.startswith('out'):
            continue
        segments = out_id.split('-')
        try:
            a_id = segments[-1] if "curr" in out_id else segments[4]
        except IndexError as e:
            print(e, out_id)
            continue
        if a_id.isdigit():
            output_dump_ids.append((int(a_id), out_id))
        elif "s" in a_id or "nm" in a_id:
            try:
                output_dump_ids.append((int(a_id.split("_")[0]), out_id))
            except ValueError as e:
                # e.g. "...-mask": contains "s" but carries no numeric ID
                print(e, out_id)

    ####
    #Find a way to use remote ssh thingy to look for output folders in the remote server
    #Create a list of output folders in the remote server
    #Find A union B of both folders
    #Add flag to keep track of where it is
    #Extend this to ensure huge files don't necessarily have to copied
    #And checkpoint is there atleast in one place

    # Left join so that every requested run ID is kept, matched or not.
    output_dump_df = pd.DataFrame(output_dump_ids, columns=['run_id', 'output_folder_name'])
    output_dump_df = pd.merge(run_ids, output_dump_df, on='run_id', how='left')
    output_dump_df['output_exists'] = output_dump_df['output_folder_name'].notna()

    # Unmatched runs get an empty folder name rather than NaN.
    output_dump_df['output_folder_name'] = output_dump_df['output_folder_name'].fillna("")

    return output_dump_df

def check_od_checkpoint(run_ids, output_path):
    """Flag which runs have a ``ckpt.pt`` file inside their output folder.

    Args:
        run_ids: DataFrame with columns ``run_id``, ``output_folder_name``
            and ``output_exists``.
        output_path: Directory that contains the output folders.

    Returns:
        DataFrame with columns ``run_id`` and ``ckpt_exists``, one row per
        row of ``run_ids``. Runs without an output folder are reported as
        ``False``.
    """
    # Explicit copy: a column is added below, and assigning into a filtered
    # slice of the caller's frame is a chained-assignment hazard.
    cpt_ids = run_ids.loc[run_ids['output_exists'] == True, ['run_id', 'output_folder_name']].copy()

    cpt_ids['ckpt_exists'] = [
        os.path.exists(os.path.join(output_path, folder_name, 'ckpt.pt'))
        for folder_name in cpt_ids['output_folder_name']
    ]

    # Re-attach to the full run list; runs that were filtered out come back as NaN.
    cpt_ids = pd.merge(run_ids[['run_id']], cpt_ids[['run_id', 'ckpt_exists']], on='run_id', how='left')
    cpt_ids['ckpt_exists'] = cpt_ids['ckpt_exists'].apply(lambda x: False if pd.isnull(x) else x)

    return cpt_ids
    
def check_samples(run_ids, output_path, sample_filename='sample_news_article_44000.txt'):
    """Flag which runs have a generated sample file inside their output folder.

    Args:
        run_ids: DataFrame with columns ``run_id``, ``output_folder_name``
            and ``output_exists``.
        output_path: Directory that contains the output folders.
        sample_filename: Name of the sample file to look for in each folder.
            Defaults to the file name that was previously hard-coded.

    Returns:
        DataFrame with columns ``run_id`` and ``sample_exists``, one row per
        row of ``run_ids``. Runs without an output folder are reported as
        ``False``.
    """
    # Explicit copy: a column is added below, and assigning into a filtered
    # slice of the caller's frame is a chained-assignment hazard.
    sample_ids = run_ids.loc[run_ids['output_exists'] == True, ['run_id', 'output_folder_name']].copy()

    sample_ids['sample_exists'] = [
        os.path.exists(os.path.join(output_path, folder_name, sample_filename))
        for folder_name in sample_ids['output_folder_name']
    ]

    # Re-attach to the full run list; runs that were filtered out come back as NaN.
    sample_ids = pd.merge(run_ids[['run_id']], sample_ids[['run_id', 'sample_exists']], on='run_id', how='left')
    sample_ids['sample_exists'] = sample_ids['sample_exists'].apply(lambda x: False if pd.isnull(x) else x)

    return sample_ids
    
def get_blimp_eval(run_ids, output_path):
    """Collect zero-shot BLiMP / supplement accuracies for every run.

    For each run with an output folder, looks for
    ``<output_path>/<folder>/zeroshot/<category>/eval_results.json`` for every
    category in the module-level ``blimp_categories`` and reads its
    ``eval_accuracy`` value.

    Args:
        run_ids: DataFrame with columns ``run_id``, ``output_folder_name``
            and ``output_exists``.
        output_path: Directory that contains the output folders.

    Returns:
        DataFrame with one row per row of ``run_ids``: ``run_id``,
        ``blimp_exists`` (True when the ``zeroshot`` folder exists), one
        column per category holding the accuracy multiplied by 100 (NaN when
        missing), and the row-wise means ``blimp_avg``, ``supplement_avg``
        and ``total_avg``.
    """
    blimp_ids = run_ids.loc[run_ids['output_exists'] == True, ['run_id', 'output_folder_name']]
    category_names = [category for category, _ in blimp_categories]

    records = []
    for _, row in blimp_ids.iterrows():
        zeroshot_dir = os.path.join(output_path, row['output_folder_name'], 'zeroshot')
        record = {"run_id": row['run_id'], "blimp_exists": os.path.exists(zeroshot_dir)}
        if record["blimp_exists"]:
            for category in category_names:
                results_path = os.path.join(zeroshot_dir, category, "eval_results.json")
                if os.path.exists(results_path):
                    with open(results_path, 'r') as f:
                        record[category] = json.load(f)["eval_accuracy"]
        records.append(record)

    if records:
        blimp_exists_df = pd.DataFrame(records)
    else:
        # No run has an output folder: build the key columns explicitly so the
        # merge below still finds 'run_id' (an empty frame has no columns).
        blimp_exists_df = pd.DataFrame({
            'run_id': pd.Series(dtype=run_ids['run_id'].dtype),
            'blimp_exists': pd.Series(dtype=object),
        })

    blimp_exists_df = pd.merge(run_ids[["run_id"]], blimp_exists_df, on='run_id', how='left')
    blimp_exists_df['blimp_exists'] = blimp_exists_df['blimp_exists'].apply(lambda x: False if pd.isnull(x) else x)

    # A category for which no run produced results never became a column;
    # add it as all-NaN so the averages below do not fail with a KeyError.
    for category in category_names:
        if category not in blimp_exists_df.columns:
            blimp_exists_df[category] = float('nan')

    # Accuracies are stored as fractions; report them as percentages.
    for category in category_names:
        blimp_exists_df[category] = blimp_exists_df[category] * 100

    blimp_cols = [name for name, suite in blimp_categories if suite == "blimp"]
    supplement_cols = [name for name, suite in blimp_categories if suite == "supplement"]

    # Row-wise means; NaN entries are skipped by DataFrame.mean.
    blimp_exists_df["blimp_avg"] = blimp_exists_df[blimp_cols].mean(axis=1)
    blimp_exists_df["supplement_avg"] = blimp_exists_df[supplement_cols].mean(axis=1)
    blimp_exists_df["total_avg"] = blimp_exists_df[blimp_cols + supplement_cols].mean(axis=1)

    return blimp_exists_df

def get_loss_data():
    """Fetch final losses and model hyper-parameters for every wandb run.

    Iterates over the runs of the wandb project "abishekthamma/wikipedia" and
    keys each run by the numeric run ID parsed from its ``out_dir`` config
    value.

    Returns:
        DataFrame with one row per distinct key: ``run_id`` (cast to int) plus
        ``train_loss``, ``val_loss``, ``n_layer``, ``block_size``, ``n_head``,
        ``n_embd``, ``batch_size``, ``learning_rate``, ``dataset`` and
        ``seed``.

    Raises:
        Exception: re-raises whatever went wrong while parsing a run's key,
            unless the run's name contains "6607670".
    """
    api = wandb.Api()
    runs = api.runs("abishekthamma/wikipedia")
    
    # Keyed by run ID; a later run with the same key overwrites an earlier one.
    wandb_run_data = {}
    for run in runs:
        try:
            # Use only the folder name when out_dir is a path containing "output_dump".
            run_data_key = run.config["out_dir"].split("/")[-1] if "output_dump" in run.config["out_dir"] else run.config["out_dir"]
            if run_data_key.startswith('out'):
                # Names containing "curr" carry the ID in the last '-' segment,
                # all other names in the fifth segment.
                if "curr" in run_data_key:
                    try:
                        a_id = run_data_key.split('-')[-1]
                    except Exception as e:
                        print(e,run_data_key)
                        continue
                else:
                    try:
                        a_id = run_data_key.split('-')[4]
                    except Exception as e:
                        # Fewer than five segments: report the name and skip this run.
                        print(e,run_data_key)
                        continue
                if a_id.isdigit():
                    run_data_key = int(a_id)
                elif "s" in a_id or "nm" in a_id:
                    # ID followed by an underscore-separated suffix, e.g. "7678383_s42".
                    run_data_key = int(a_id.split("_")[0])
                # NOTE(review): if neither branch matches, or the name does not start
                # with "out", the key stays a string and the astype(int) below would
                # fail; presumably no such run exists in the project. Verify.
                    
        except Exception as e:
            # Catches anything raised above, including a missing "out_dir" key or a
            # failed int() conversion. One run is special-cased by name.
            if "6607670" in run.name:
                run_data_key = "6607670"
            else:
                raise e

        wandb_run_data[run_data_key] = {
            "train_loss": run.summary.get("train/loss"),
            "val_loss": run.summary.get("val/loss"),
            "n_layer": run.config.get("n_layer"),
            "block_size": run.config.get("block_size"),
            "n_head": run.config.get("n_head"),
            "n_embd": run.config.get("n_embd"),
            "batch_size": run.config.get("batch_size"),
            "learning_rate": run.config.get("learning_rate"),
            "dataset": run.config.get("dataset"),
            # Falls back to 1337 when the run config has no "torch_seed_default".
            "seed": run.config.get("torch_seed_default", 1337),
        }
        
    # The dict is keyed by run, so transpose to get one row per run, then turn the
    # index (the keys) into the run_id column.
    wandb_run_data_df = pd.DataFrame(wandb_run_data).T.reset_index()
    wandb_run_data_df = wandb_run_data_df.rename(columns={"index":"run_id"})
    wandb_run_data_df['run_id'] = wandb_run_data_df['run_id'].astype(int)
    
    return wandb_run_data_df

def get_reading_time_data(reading_time_raw_df):
    """Attach run IDs to the raw reading-time scores and average them per run.

    Only rows whose ``model_id`` starts with ``out`` are kept. The run ID is
    read from the last ``-``-separated segment when the name contains
    ``curr``, otherwise from the fifth segment. The segment must be all
    digits, or contain ``s`` or ``nm`` with an integer before the first ``_``
    (e.g. ``7678383_s42``). Rows whose ID cannot be parsed are reported on
    stdout and skipped, so ``run_id`` only ever holds integers.

    Args:
        reading_time_raw_df: DataFrame with at least the columns
            ``model_id``, ``subject_id`` and ``dataset`` plus the score
            columns to average.

    Returns:
        tuple: ``(reading_time_df, reading_time_subjectwise_df)`` where the
        first holds the per-run mean of the remaining columns plus
        ``reading_time_exists = True``, and the second holds the kept rows
        with ``run_id`` added and ``model_id`` / ``dataset`` dropped.
    """
    rdt_dict = []
    for rd_raw_row in reading_time_raw_df.to_dict(orient='records'):
        model_id = rd_raw_row['model_id']
        if not model_id.startswith('out'):
            continue

        segments = model_id.split('-')
        try:
            a_id = segments[-1] if "curr" in model_id else segments[4]
        except IndexError as e:
            print(e, model_id)
            continue

        if not (a_id.isdigit() or "s" in a_id or "nm" in a_id):
            # Previously such rows were kept with the folder name as run_id,
            # mixing strings into an otherwise integer column.
            print("unrecognised run id segment", repr(a_id), model_id)
            continue
        try:
            # For an all-digit segment the split is a no-op.
            rd_raw_row['run_id'] = int(a_id.split("_")[0])
        except ValueError as e:
            # e.g. "...-mask": contains "s" but carries no numeric ID
            print(e, model_id)
            continue

        rdt_dict.append(rd_raw_row)

    reading_time_df = pd.DataFrame(rdt_dict)

    reading_time_subjectwise_df = reading_time_df.drop(columns=['model_id', "dataset"])
    # Average every remaining column over the rows of each run.
    reading_time_df = (
        reading_time_df
        .drop(columns=["subject_id", "model_id", "dataset"])
        .groupby('run_id')
        .mean()
        .reset_index()
    )
    reading_time_df["reading_time_exists"] = True

    return reading_time_df, reading_time_subjectwise_df

def get_model_surprisal_data(model_surprisal_raw_df):
    """Rename ``model_id`` to ``run_id`` and derive a per-run existence flag.

    The input frame is left untouched; the renamed and flagged table is
    returned as a new frame.

    Args:
        model_surprisal_raw_df: Raw surprisal table with a ``model_id``
            column (or an already renamed ``run_id`` column).

    Returns:
        tuple: ``(surprisal_df, exists_df)`` where ``surprisal_df`` is the
        input with ``model_id`` renamed to ``run_id`` and a constant
        ``model_surprisal_data_exists = True`` column, and ``exists_df``
        holds the distinct ``(run_id, model_surprisal_data_exists)`` pairs.
    """
    surprisal_df = (
        model_surprisal_raw_df
        .rename(columns={"model_id": "run_id"})
        .assign(model_surprisal_data_exists=True)
    )
    # One row per run that has any surprisal data at all.
    model_surprisal_exist_df = surprisal_df[['run_id', 'model_surprisal_data_exists']].drop_duplicates()

    return surprisal_df, model_surprisal_exist_df
    

def _curriculum_type(folder_name):
    """Return the curriculum tag of a folder name containing "curr", else None."""
    if "curr" not in folder_name:
        return None
    segments = folder_name.split('-')
    # The tag is the text after the last "_" of the second-to-last segment.
    return segments[-2].split('_')[-1] if len(segments) >= 2 else None


def _raw_mask_token(folder_name):
    """Return the mask segment (4th '-'-separated part) of a folder name, or "Non"."""
    if "nomask" in folder_name:
        return "Non"
    segments = folder_name.split('-')
    # Names with fewer than four segments (including "" for runs without an
    # output folder) have no mask segment at the expected position.
    return segments[3] if len(segments) > 3 else "Non"


def _mask_schedule_name(raw_mask_type):
    """Map a raw mask token such as ``mask_ee0p5_em10`` to its schedule name."""
    # Order matters: "ee" and "log" must be tested before the bare "e".
    if "lin" in raw_mask_type:
        return "linear"
    if "ee" in raw_mask_type:
        return "exponential_new"
    if "log" in raw_mask_type:
        return "logarithmic"
    if "e" in raw_mask_type:
        return "exponential"
    if "sig" in raw_mask_type:
        return "sigmoid"
    return "Non"


def _mask_decay_rate(raw_mask_type):
    """Parse the decay rate from a raw mask token; 0.0 for unmasked and linear runs."""
    if "ee2000" in raw_mask_type:
        return 2000.0
    if ("Non" in raw_mask_type) or ("nomask" in raw_mask_type) or ("lin" in raw_mask_type):
        return 0.0
    # Last three characters of the second "_" part, e.g. "ee001" -> "001".
    rate_token = raw_mask_type.split('_')[1][-3:]
    return 0.5 if rate_token == "0p5" else float(rate_token)


def _echoic_memory_size(raw_mask_type):
    """Return the echoic memory size from a three-part mask token, else 1."""
    parts = raw_mask_type.split('_')
    # Third part looks like "em10": drop the two-letter prefix.
    return int(parts[-1][2:]) if len(parts) == 3 else 1


def add_additional_run_details_columns(run_ids):
    """Derive run-configuration columns from ``output_folder_name``.

    Adds, in this order: ``curriculum_learning``, ``curriculum_type``,
    ``masking``, ``mask_type``, ``mask_decay_rate`` and ``echoic_memory``.
    Folder names with fewer than four ``-``-separated segments (including the
    empty name used for runs without an output folder) are printed for
    inspection and treated as having mask type "Non" instead of raising an
    IndexError.

    Args:
        run_ids: DataFrame with a string column ``output_folder_name``.

    Returns:
        A copy of ``run_ids`` with the derived columns appended; the input
        frame is not modified.

    Raises:
        IndexError, ValueError: if a mask segment has no ``_``-separated
            second part, or that part does not end in a number
            (e.g. ``mask_``).
    """
    run_ids = run_ids.copy()
    folder_names = run_ids['output_folder_name']

    run_ids['curriculum_learning'] = folder_names.apply(lambda name: "curr" in name)
    run_ids['curriculum_type'] = folder_names.apply(_curriculum_type)
    run_ids["masking"] = folder_names.apply(lambda name: "nomask" not in name)

    # Surface the rows whose name is too short to carry a mask segment.
    short_names = run_ids[folder_names.apply(lambda name: len(name.split('-')) < 4)]
    if not short_names.empty:
        print(short_names)

    raw_mask_types = folder_names.apply(_raw_mask_token)
    run_ids["mask_type"] = raw_mask_types.apply(_mask_schedule_name)
    run_ids["mask_decay_rate"] = raw_mask_types.apply(_mask_decay_rate)
    run_ids["echoic_memory"] = raw_mask_types.apply(_echoic_memory_size)

    return run_ids

    

# --- Local artefacts: logs, output folders, checkpoints, samples ---
# The first two checks only need the ID column; the last two need the
# output-folder columns added by check_output_dump.
run_ids = run_ids.merge(check_run_logs(run_ids['run_id'], log_path), on='run_id', how='left')
run_ids = run_ids.merge(check_output_dump(run_ids['run_id'], output_path), on='run_id', how='left')
run_ids = run_ids.merge(check_od_checkpoint(run_ids, output_path), on='run_id', how='left')
run_ids = run_ids.merge(check_samples(run_ids, output_path), on='run_id', how='left')

# --- BLiMP / supplement evaluation results ---
blimp_eval_df = get_blimp_eval(run_ids, output_path)

blimp_cols = [name for name, suite in blimp_categories if suite == "blimp"]
supplement_cols = [name for name, suite in blimp_categories if suite == "supplement"]

# --- wandb losses and hyper-parameters ---
train_val_loss_df = get_loss_data()

run_ids = run_ids.merge(blimp_eval_df[['run_id', 'blimp_exists']], on='run_id', how='left')
run_ids = run_ids.merge(train_val_loss_df, on='run_id', how='left')
# A run counts as present in wandb when it has a train loss.
run_ids['wandb_exists'] = run_ids['train_loss'].map(pd.notnull)
run_ids = run_ids.drop(columns=['train_loss', 'val_loss'])
# Keep the losses in their own table, aligned with the full run list.
train_val_loss_df = run_ids[['run_id', 'wandb_exists']].merge(
    train_val_loss_df[["run_id", "train_loss", "val_loss"]], on='run_id', how='left'
)

# --- Reading-time and surprisal evaluations ---
reading_time_df, reading_time_subjectwise_df = get_reading_time_data(reading_time_raw_df)
model_surprisal_raw_df, model_surprisal_exist_df = get_model_surprisal_data(model_surprisal_raw_df)

run_ids = run_ids.merge(reading_time_df[["run_id", "reading_time_exists"]], on='run_id', how='left')
run_ids['reading_time_exists'] = run_ids['reading_time_exists'].map(pd.notnull)

run_ids = run_ids.merge(model_surprisal_exist_df, on='run_id', how='left')
run_ids['model_surprisal_data_exists'] = run_ids['model_surprisal_data_exists'].map(pd.notnull)


######## Patchwork #########
# One folder was saved with an incomplete mask tag ("mask_"); substitute the
# full tag ("mask_log001") before the folder names are parsed below.
malformed_folder_name = "out-babylm_full_bpe_8k-6x6-mask_-6617787"
run_ids.loc[run_ids['output_folder_name'] == malformed_folder_name, 'output_folder_name'] = "out-babylm_full_bpe_8k-6x6-mask_log001-6617787"

run_ids = add_additional_run_details_columns(run_ids)




list index out of range out-wikipedia_char-mask-e002
list index out of range out-wikipedia_char-mask_e010
list index out of range out-wikipedia_bpe-nomask
list index out of range out-wikipedia-char-mask_e100
list index out of range out-wikipedia_bpe-mask_lin
list index out of range out-wikipedia-char-mask_e500
list index out of range out-wikipedia_char-mask_e004
list index out of range out-wikipedia_100M-char-mask
list index out of range out-wikipedia-char-mask_e250
list index out of range out-wikipedia-char
list index out of range out-wikipedia_100M-char
list index out of range out-wikipedia-char
list index out of range out-wikipedia_100M-char
list index out of range out-wikipedia_100M-char-mask
list index out of range out-wikipedia_100M-char-mask
list index out of range out-wikipedia_100M-char-mask
list index out of range out-wikipedia-char-mask
list index out of range out-wikipedia-char-mask
list index out of range out-wikipedia-char-mask
list index out of range out-wikipedia-char-m

In [5]:
# Display the consolidated run table built in the previous cell.
run_ids

Unnamed: 0,run_id,log_exists,output_folder_name,output_exists,ckpt_exists,sample_exists,blimp_exists,n_layer,block_size,n_head,...,seed,wandb_exists,reading_time_exists,model_surprisal_data_exists,curriculum_learning,curriculum_type,masking,mask_type,mask_decay_rate,echoic_memory
0,5444724,True,out-babylm_full_bpe-4x4-nomask-5444724,True,True,False,True,4,128,4,...,1337,True,True,True,False,,False,Non,0.0,1
1,5445338,True,out-babylm_wocdes_full_bpe-4x4-nomask-5445338,True,True,False,True,4,128,4,...,1337,True,True,True,False,,False,Non,0.0,1
2,5492054,True,out-babylm_full_bpe-8x8-nomask-5492054,True,True,True,True,8,512,8,...,1337,True,True,True,False,,False,Non,0.0,1
3,5492134,True,out-babylm_full_bpe-6x6-nomask-5492134,True,True,True,True,6,256,6,...,1337,True,True,True,False,,False,Non,0.0,1
4,5496426,True,out-babylm_full_bpe_8k-8x8-nomask-5496426,True,True,True,True,8,512,8,...,1337,True,True,True,False,,False,Non,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,7678429,True,out-babylm_full_bpe_8k-6x6-mask_ee0p5_em10-767...,True,True,True,False,6,256,6,...,616,True,False,False,False,,True,exponential_new,0.5,10
184,7678430,True,out-babylm_full_bpe_8k-6x6-mask_ee0p5_em10-767...,True,True,True,False,6,256,6,...,466,True,False,False,False,,True,exponential_new,0.5,10
185,7679909,True,out-babylm_full_bpe_8k-6x6-mask_ee0p5_em05-767...,True,True,True,False,6,256,6,...,2347,True,False,False,False,,True,exponential_new,0.5,5
186,7761451,True,out-babylm_full_bpe_100M_8k-12x12-nomask-77614...,True,True,True,True,12,256,12,...,1337,True,False,False,False,,False,Non,0.0,1


In [6]:
# Put the columns in a fixed order: identifiers, model hyper-parameters,
# masking and curriculum settings, dataset, then the availability flags.
ordered_columns = [
    'run_id', 'output_folder_name',
    'n_layer', 'n_head', 'block_size', 'n_embd', 'batch_size', 'learning_rate', "seed",
    'masking', 'mask_type', 'mask_decay_rate', 'echoic_memory',
    'curriculum_learning', 'curriculum_type',
    'dataset',
    'log_exists', 'output_exists', 'ckpt_exists', 'sample_exists',
    'blimp_exists', 'wandb_exists', 'reading_time_exists', 'model_surprisal_data_exists',
]
run_ids = run_ids[ordered_columns]

run_ids

Unnamed: 0,run_id,output_folder_name,n_layer,n_head,block_size,n_embd,batch_size,learning_rate,seed,masking,...,curriculum_type,dataset,log_exists,output_exists,ckpt_exists,sample_exists,blimp_exists,wandb_exists,reading_time_exists,model_surprisal_data_exists
0,5444724,out-babylm_full_bpe-4x4-nomask-5444724,4,4,128,256,32,0.001,1337,False,...,,babylm_full_bpe,True,True,True,False,True,True,True,True
1,5445338,out-babylm_wocdes_full_bpe-4x4-nomask-5445338,4,4,128,256,32,0.0005,1337,False,...,,babylm_wocdes_full_bpe,True,True,True,False,True,True,True,True
2,5492054,out-babylm_full_bpe-8x8-nomask-5492054,8,8,512,512,32,0.0005,1337,False,...,,babylm_full_bpe,True,True,True,True,True,True,True,True
3,5492134,out-babylm_full_bpe-6x6-nomask-5492134,6,6,256,384,32,0.0005,1337,False,...,,babylm_full_bpe,True,True,True,True,True,True,True,True
4,5496426,out-babylm_full_bpe_8k-8x8-nomask-5496426,8,8,512,512,32,0.0005,1337,False,...,,babylm_full_bpe_8k,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,7678429,out-babylm_full_bpe_8k-6x6-mask_ee0p5_em10-767...,6,6,256,384,32,0.0005,616,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
184,7678430,out-babylm_full_bpe_8k-6x6-mask_ee0p5_em10-767...,6,6,256,384,32,0.0005,466,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
185,7679909,out-babylm_full_bpe_8k-6x6-mask_ee0p5_em05-767...,6,6,256,384,32,0.0005,2347,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
186,7761451,out-babylm_full_bpe_100M_8k-12x12-nomask-77614...,12,12,256,768,32,0.00001,1337,False,...,,babylm_full_bpe_100M_8k,True,True,True,True,True,True,False,False


In [7]:
# For every availability flag, list the run IDs for which it is False.
# Each entry pairs the message to print with the flag column it reports on.
missing_artifact_reports = [
    ("No logs for the following run ids", 'log_exists'),
    ("Copy output dump to local for the following run ids", 'output_exists'),
    ("Copy checkpoint to local for the following run ids", 'ckpt_exists'),
    ("No samples for the following run ids", 'sample_exists'),
    ("No blimp eval for the following run ids", 'blimp_exists'),
    ("No wandb data for the following run ids", 'wandb_exists'),
    ("No reading time data for the following run ids", 'reading_time_exists'),
    ("No model surprisal data for the following run ids", 'model_surprisal_data_exists'),
]

for report_message, flag_column in missing_artifact_reports:
    flagged_run_ids = list(run_ids.loc[run_ids[flag_column] == False, 'run_id'])
    print(report_message, flagged_run_ids, "\n")

No logs for the following run ids [] 

Copy output dump to local for the following run ids [] 

Copy checkpoint to local for the following run ids [] 

No samples for the following run ids [5444724, 5445338] 

No blimp eval for the following run ids [7678383, 7678384, 7678385, 7678386, 7678387, 7678388, 7678389, 7678390, 7678391, 7678392, 7678393, 7678394, 7678395, 7678396, 7678397, 7678398, 7678399, 7678400, 7678401, 7678402, 7678403, 7678404, 7678405, 7678406, 7678407, 7678408, 7678409, 7678410, 7678411, 7678412, 7678413, 7678414, 7678415, 7678416, 7678417, 7678418, 7678419, 7678420, 7678421, 7678422, 7678423, 7678425, 7678426, 7678427, 7678428, 7678429, 7678430, 7679909] 

No wandb data for the following run ids [] 

No reading time data for the following run ids [7678383, 7678384, 7678385, 7678386, 7678387, 7678388, 7678389, 7678390, 7678391, 7678392, 7678393, 7678394, 7678395, 7678396, 7678397, 7678398, 7678399, 7678400, 7678401, 7678402, 7678403, 7678404, 7678405, 7678406, 767840

In [8]:
# Print one babylm_eval.py invocation for every run that has a local output
# folder but no BLiMP results yet.
needs_blimp_eval = (run_ids['blimp_exists'] == False) & (run_ids['output_exists'] == True)
for index, run_id_row in run_ids[needs_blimp_eval].iterrows():
    dataset_name = run_id_row['dataset']
    if pd.isnull(dataset_name):
        # 'dataset' comes from wandb; for a run missing there it is NaN and
        # os.path.join would raise a TypeError, so report it and move on.
        print(f"# skipped {run_id_row['output_folder_name']}: dataset unknown (no wandb data)", "\n")
        continue

    output_dir = os.path.join(output_path, run_id_row['output_folder_name'])
    # The evaluation data directory is named after the run's dataset.
    eval_data_dir = os.path.join('/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data/', dataset_name)
    print(f'python babylm_eval.py --model_type=nanogpt --output_dir {output_dir} --data_dir {eval_data_dir} --tasks all', "\n")



python babylm_eval.py --model_type=nanogpt --output_dir /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/output_dump/out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678383_s42 --data_dir /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data/babylm_full_bpe_8k --tasks all 

python babylm_eval.py --model_type=nanogpt --output_dir /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/output_dump/out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678384_s2347 --data_dir /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data/babylm_full_bpe_8k --tasks all 

python babylm_eval.py --model_type=nanogpt --output_dir /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/output_dump/out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678385_s616 --data_dir /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data/babylm_full_bpe_8k --tasks all 

python babylm_eval.py --model_type=nanogpt --output_dir /home/abishekthamma/PycharmPr

In [9]:
# Output folders of the runs that still need reading-time evaluation
# (output folder present locally, no reading-time results).
needs_reading_time = (run_ids['reading_time_exists'] == False) & (run_ids['output_exists'] == True)
reading_time_list = list(run_ids.loc[needs_reading_time, 'output_folder_name'])
reading_time_list


['out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678383_s42',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678384_s2347',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678385_s616',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678386_s466',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678387_s42',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678388_s2347',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678389_s616',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678390_s466',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678391_s42',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678392_s2347',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678393_s616',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678394_s466',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_em01-7678395_s42',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_em01-7678396_s2347',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_em01-7678397_s616',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_em01-7678398_s466',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_

In [10]:
# Output folders of the runs that still need model surprisal data
# (output folder present locally, no surprisal results).
needs_model_surprisal = (run_ids['model_surprisal_data_exists'] == False) & (run_ids['output_exists'] == True)
model_surprisal_list = list(run_ids.loc[needs_model_surprisal, 'output_folder_name'])
model_surprisal_list


['out-babylm_full_bpe_8k-6x6-mask_log001-6617787',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678383_s42',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678384_s2347',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678385_s616',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678386_s466',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678387_s42',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678388_s2347',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678389_s616',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678390_s466',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678391_s42',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678392_s2347',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678393_s616',
 'out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678394_s466',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_em01-7678395_s42',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_em01-7678396_s2347',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_em01-7678397_s616',
 'out-babylm_full_bpe_8k-6x6-mask_ee003_em01-7678

In [11]:
# Print one sbatch submission of job_exp4.sh for every run that has a local
# output folder but no BLiMP results yet. The job receives the output folder
# name through the exported variable output_dir_dynamic.
# NOTE(review): what job_exp4.sh runs is not visible here; confirm it is the
# BLiMP evaluation job.
pending_blimp_eval = (run_ids['blimp_exists'] == False) & (run_ids['output_exists'] == True)
for output_folder_name in run_ids.loc[pending_blimp_eval, 'output_folder_name']:
    # Use the folder name directly instead of joining it onto output_path and
    # splitting it back off on "output_dump/".
    print(f'sbatch --export=output_dir_dynamic={output_folder_name} job_exp4.sh')


sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678383_s42 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678384_s2347 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678385_s616 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em01-7678386_s466 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678387_s42 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678388_s2347 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678389_s616 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em05-7678390_s466 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_em10-7678391_s42 job_exp4.sh
sbatch --export=output_dir_dynamic=out-babylm_full_bpe_8k-6x6-mask_ee001_e

In [12]:
# Display the runs that have a local output folder but no BLiMP results yet.
run_ids[(run_ids['blimp_exists'] == False) & (run_ids['output_exists'] == True)]

Unnamed: 0,run_id,output_folder_name,n_layer,n_head,block_size,n_embd,batch_size,learning_rate,seed,masking,...,curriculum_type,dataset,log_exists,output_exists,ckpt_exists,sample_exists,blimp_exists,wandb_exists,reading_time_exists,model_surprisal_data_exists
138,7678383,out-babylm_full_bpe_8k-6x6-mask_ee001_em01-767...,6,6,256,384,32,0.0005,42,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
139,7678384,out-babylm_full_bpe_8k-6x6-mask_ee001_em01-767...,6,6,256,384,32,0.0005,2347,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
140,7678385,out-babylm_full_bpe_8k-6x6-mask_ee001_em01-767...,6,6,256,384,32,0.0005,616,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
141,7678386,out-babylm_full_bpe_8k-6x6-mask_ee001_em01-767...,6,6,256,384,32,0.0005,466,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
142,7678387,out-babylm_full_bpe_8k-6x6-mask_ee001_em05-767...,6,6,256,384,32,0.0005,42,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
143,7678388,out-babylm_full_bpe_8k-6x6-mask_ee001_em05-767...,6,6,256,384,32,0.0005,2347,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
144,7678389,out-babylm_full_bpe_8k-6x6-mask_ee001_em05-767...,6,6,256,384,32,0.0005,616,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
145,7678390,out-babylm_full_bpe_8k-6x6-mask_ee001_em05-767...,6,6,256,384,32,0.0005,466,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
146,7678391,out-babylm_full_bpe_8k-6x6-mask_ee001_em10-767...,6,6,256,384,32,0.0005,42,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False
147,7678392,out-babylm_full_bpe_8k-6x6-mask_ee001_em10-767...,6,6,256,384,32,0.0005,2347,True,...,,babylm_full_bpe_8k,True,True,True,True,False,True,False,False


In [13]:
raw_gptwee_path = r"/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/results/raw/GPT_wee_raw.csv"

# Label our BLiMP results with their output folder name, then stack the rows
# of the GPT-wee CSV underneath them. Columns that exist in only one of the
# two tables are filled with NaN by concat.
blimp_with_folder_names = blimp_eval_df.merge(run_ids[['run_id', 'output_folder_name']], on='run_id', how='left')
pd.concat([blimp_with_folder_names, pd.read_csv(raw_gptwee_path)])

Unnamed: 0,run_id,blimp_exists,anaphor_agreement,argument_structure,binding,control_raising,determiner_noun_agreement,ellipsis,filler_gap,irregular_forms,...,hypernym,qa_congruence_easy,qa_congruence_tricky,subject_aux_inversion,turn_taking,blimp_avg,supplement_avg,total_avg,output_folder_name,model
0,5444724.0,True,81.134969,69.980601,68.091422,66.813964,86.343145,64.953811,68.098350,78.524173,...,48.837209,59.3750,32.727273,86.045377,64.642857,68.421423,58.325543,65.452047,out-babylm_full_bpe-4x4-nomask-5444724,
1,5445338.0,True,61.707566,54.886033,49.747700,58.837826,48.329356,48.325635,53.330221,56.793893,...,51.162791,40.6250,41.818182,47.914125,55.000000,52.411162,47.304020,50.909062,out-babylm_wocdes_full_bpe-4x4-nomask-5445338,
2,5492054.0,True,76.175869,69.835112,67.616503,67.764030,83.691329,63.625866,66.355431,80.508906,...,46.395349,56.2500,43.030303,74.823128,69.285714,67.340828,57.956899,64.580849,out-babylm_full_bpe-8x8-nomask-5492054,
3,5492134.0,True,85.531697,72.478177,69.249035,68.603624,87.562981,66.108545,67.864924,80.254453,...,47.209302,56.2500,38.181818,78.238595,64.285714,70.570998,56.833086,66.530435,out-babylm_full_bpe-6x6-nomask-5492134,
4,5496426.0,True,75.971370,69.786615,71.000297,65.775519,82.962079,64.896074,66.588858,81.730280,...,50.000000,54.6875,36.363636,75.384240,64.642857,67.986423,56.215647,64.524430,out-babylm_full_bpe_8k-8x8-nomask-5496426,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,,,73.820000,71.910000,68.970000,66.260000,88.360000,54.560000,68.670000,86.060000,...,49.420000,57.8100,28.480000,80.090000,54.290000,,,,,16k
5,,,82.870000,69.510000,65.240000,63.210000,85.520000,55.430000,66.650000,77.560000,...,50.930000,53.1200,33.330000,83.460000,56.790000,,,,,16k (cu.)
6,,,63.800000,70.600000,67.100000,66.500000,78.500000,62.000000,63.800000,67.500000,...,50.000000,54.7000,31.500000,80.300000,57.100000,,,,,OPT
7,,,81.500000,67.100000,67.300000,67.900000,90.800000,76.400000,63.500000,87.400000,...,49.400000,31.3000,32.100000,71.700000,53.200000,,,,,RoBERTa


In [14]:
# Inspect the per-run reading-time table (one row per run_id; pandas elides the middle rows when rendering).
reading_time_df

Unnamed: 0,run_id,data_points,mse_surprisal,rmse_surprisal,r2_surprisal,explained_variance,corr_surprisal,reading_time_exists
0,5444724,5141.103659,24067.383866,141.230611,-0.145010,0.011777,0.118032,True
1,5445338,5141.103659,24081.020288,141.282510,-0.145426,0.011221,0.113489,True
2,5492054,5141.103659,24135.468743,141.427194,-0.147981,0.009245,0.107022,True
3,5492134,5141.103659,24106.138202,141.344541,-0.146702,0.010494,0.112315,True
4,5496426,5141.103659,24062.746080,141.209002,-0.145139,0.012731,0.120776,True
...,...,...,...,...,...,...,...,...
134,7047464,5141.103659,24017.103053,141.087669,-0.143389,0.014377,0.128256,True
135,7047466,5141.103659,24017.970779,141.083083,-0.143224,0.014418,0.128322,True
136,7047467,5141.103659,24025.225297,141.112226,-0.144050,0.014034,0.127015,True
137,7047468,5141.103659,24047.835047,141.165226,-0.144814,0.013435,0.123885,True


In [15]:
# DataFrames to write: Run IDs, Run Details, BLiMP eval, train/val loss, reading time.
#
# NOTE: this overwrites the workbook at `read_data_path`, which is the same
# file the run IDs were originally read from, so the 'Run IDs' sheet is
# written back alongside the derived tables.

# Resolve every frame up front so that a missing variable raises before the
# writer opens the target file.
sheets_to_write = {
    'Run IDs': run_ids[["run_id"]],
    'Run Details': run_ids,
    'BLIMP': blimp_eval_df,
    'Loss': train_val_loss_df,
    'Reading Time': reading_time_df,
    'Reading Time Subjectwise': reading_time_subjectwise_df,
}

# The context manager closes the writer (saving the workbook and releasing the
# file handle) even if one of the to_excel calls raises.
with pd.ExcelWriter(read_data_path, engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets_to_write.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [21]:
# Show every run_id in the table as a plain Python list.
run_ids.loc[:, "run_id"].tolist()

[5444724,
 5445338,
 5492054,
 5492134,
 5496426,
 5496427,
 5734459,
 5734464,
 5734465,
 5734467,
 5734550,
 5757736,
 5757737,
 5983308,
 5983309,
 5988018,
 5988019,
 5988020,
 5988022,
 5989080,
 5989082,
 6486043,
 6486044,
 6603578,
 6603579,
 6603580,
 6607670,
 6617787,
 6620547,
 6620548,
 6621801,
 6681922,
 6681937,
 6681938,
 6681939,
 6681940,
 6681941,
 6681942,
 6681944,
 6681945,
 6681946,
 6681947,
 6681948,
 6681949,
 6681950,
 6681951,
 6683308,
 6683309,
 6683310,
 6683311,
 6689751,
 6689752,
 6689753,
 6810203,
 6810205,
 6810296,
 6810297,
 6810320,
 6810321,
 6810323,
 6810325,
 6810326,
 6839399,
 6839402,
 6839403,
 6839404,
 6839405,
 6839409,
 6839410,
 6839411,
 6839412,
 6839413,
 6839414,
 6839416,
 6839417,
 6839418,
 6839419,
 6839420,
 6839421,
 6839422,
 6839423,
 6839424,
 6839425,
 6839426,
 6839427,
 6839428,
 6839429,
 6839430,
 6839431,
 6849723,
 6849725,
 6864685,
 6890225,
 6890228,
 6890229,
 6890230,
 6890231,
 6890232,
 6890233,
 6890234,
