In [25]:
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split, cross_validate
import tqdm
import pandas as pd
import os
from sklearn.metrics import make_scorer
 
import pickle
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer


In [26]:
def load_tokenizer(data_dir):
    """
    Build the tokenizer used for the natural stories evaluation.

    If ``data_dir`` contains a ``meta.pkl`` whose ``custom_tokenizer`` entry is
    truthy, the tokenizer is loaded from ``data_dir`` itself through
    ``AutoTokenizer``. If there is no ``meta.pkl`` at all, the default GPT-2
    tokenizer is used instead. Any other ``meta.pkl`` is rejected.

    Args:
        data_dir (str): Directory that may hold ``meta.pkl`` and the tokenizer files.

    Returns:
        tokenizer: The loaded tokenizer, with an EOS token and a pad token set
            and ``padding_side`` forced to ``"left"``.

    Raises:
        NotImplementedError: If ``meta.pkl`` exists but does not flag a custom
            tokenizer (whether or not it carries an ``stoi`` entry).

    """
    meta_file = os.path.join(data_dir, 'meta.pkl')

    if os.path.exists(meta_file):
        # NOTE: unpickling can execute arbitrary code; only use trusted data directories.
        with open(meta_file, 'rb') as handle:
            meta = pickle.load(handle)

        # Reject everything that is not a custom tokenizer before doing any loading.
        if not meta.get("custom_tokenizer", False):
            if meta.get("stoi", False):
                raise NotImplementedError("stoi/itos not supported yet")
            raise NotImplementedError("No stoi/itos found")

        print(f"Loading custom tokenizer from {data_dir}")
        tokenizer = AutoTokenizer.from_pretrained(data_dir, use_fast=False)
    else:
        print("No meta.pkl found, using default GPT-2 tokenizer")
        tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")

    # Make sure the special tokens needed downstream exist: add an EOS token
    # when the tokenizer has none, then reuse EOS as the pad token if needed.
    if not tokenizer.eos_token:
        tokenizer.add_special_tokens({"eos_token": "</s>"})
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    tokenizer.padding_side = "left"  # original author's open question: "Add if needed?"
    return tokenizer


def load_RT_data(rt_root=r'naturalstories_RTS'):
    """
    Read the three tab-separated reading-time files stored under ``rt_root``.

    In these tables ``item`` is the story number and ``zone`` is the word's
    position within the story, so sorting on them restores reading order.

    :param rt_root: Directory containing ``processed_RTs.tsv``,
        ``processed_wordinfo.tsv`` and ``all_stories.tok``.
    :return: Tuple ``(processed_RTs, processed_wordinfo, all_stories)``.
        ``processed_RTs`` is sorted by ``item``, ``WorkerId``, ``zone``;
        the other two are sorted by ``item``, ``zone``.
    """

    def _read_sorted(file_name, sort_keys):
        # Every file uses the same format: tab-separated with a header row.
        table = pd.read_csv(os.path.join(rt_root, file_name), sep='\t')
        return table.sort_values(by=sort_keys)

    return (
        _read_sorted('processed_RTs.tsv', ['item', 'WorkerId', 'zone']),
        _read_sorted('processed_wordinfo.tsv', ['item', 'zone']),
        _read_sorted('all_stories.tok', ['item', 'zone']),
    )

def extract_stories_from_df(stories_df):
    """
    Rebuild each story's text from its word-level rows.

    :param stories_df: Dataframe with one row per word and the columns
        ``item`` (story id), ``zone`` (word position) and ``word``.
    :return: stories: Dictionary mapping every story id, in order of first
        appearance, to the story's words joined by single spaces in ``zone`` order.
    """
    return {
        story_id: (
            stories_df[stories_df["item"] == story_id]
            .sort_values(by=['zone'])['word']
            .str.cat(sep=' ')
        )
        for story_id in stories_df["item"].unique()
    }



def build_linear_model_kfold(data_df, x_type = "surprisal", y_type = "log", per_subject=False, subject_id=None):
    """
    Cross-validate a linear regression that predicts reading times from surprisal.

    The caller is responsible for passing the matching dataframe: one with
    ``WorkerId`` and ``RT`` columns when ``per_subject`` is True, otherwise one
    with a ``meanItemRT`` column.

    :param data_df: Dataframe holding the surprisal predictor columns and the reading-time target.
    :param x_type: Predictor set: ``"surprisal"`` (current word only), ``"lag 1"``
        (adds ``surprisal_lag_1``) or ``"lag 2"`` (adds ``surprisal_lag_1`` and ``surprisal_lag_2``).
    :param y_type: ``"log"`` regresses on the natural log of the target; any other value uses the raw target.
    :param per_subject: If True, fit on the ``RT`` values of a single subject; if False, fit on ``meanItemRT``.
    :param subject_id: ``WorkerId`` to keep when ``per_subject`` is True.
    :return: scores: The dictionary returned by ``cross_validate`` for a 5-fold run with train scores
        included, covering r2, negative MSE, negative RMSE, explained variance and the
        Pearson correlation between true and predicted values.
    :raises ValueError: If ``per_subject`` is True without a ``subject_id``, or if ``x_type`` is not recognised.
    """
    if per_subject and subject_id is None:
        raise ValueError("subject_id should be provided if per_subject is True")
    if per_subject:
        data_df = data_df[data_df['WorkerId'] == subject_id]

    # Predictor columns for each supported x_type.
    predictor_sets = (
        ("surprisal", ['surprisal']),
        ("lag 1", ["surprisal", 'surprisal_lag_1']),
        ("lag 2", ['surprisal', 'surprisal_lag_1', 'surprisal_lag_2']),
    )
    predictor_cols = next((cols for label, cols in predictor_sets if x_type == label), None)
    if predictor_cols is None:
        raise ValueError("Invalid x_type, should be surprisal, lag 1 or lag 2")
    X = data_df[predictor_cols].to_numpy()

    # Individual RTs for a single subject, item-level mean RTs otherwise.
    target_col = 'RT' if per_subject else 'meanItemRT'
    y = data_df[target_col].to_numpy()
    if y_type == "log":
        y = np.log(y)

    def custom_correlation_coeff_scorer(y_true, y_pred):
        # Off-diagonal entry of the 2x2 correlation matrix, i.e. Pearson r.
        return np.corrcoef(y_true, y_pred)[0, 1]

    scoring = {
        'r2': 'r2',
        'neg_mean_squared_error': 'neg_mean_squared_error',
        'neg_root_mean_squared_error': 'neg_root_mean_squared_error',
        "explained_variance": "explained_variance",
        "custom_correlation_coeff_scorer": make_scorer(custom_correlation_coeff_scorer),
    }
    regr = linear_model.LinearRegression()
    scores = cross_validate(regr, X, y, cv=5, scoring=scoring, return_train_score=True)
    return scores

def all_subject_summary_kfold(processed_RTs, x_type="lag 2", min_data_points=1000):
    """
    Run the per-subject k-fold regression for every subject and tabulate the results.

    :param processed_RTs: Dataframe with a ``WorkerId`` column plus the columns
        required by ``build_linear_model_kfold`` in per-subject mode.
    :param x_type: Predictor set forwarded to ``build_linear_model_kfold``.
    :param min_data_points: Subjects with fewer rows than this are skipped. The default of
        1000 comes from the original assumption that one story has at least 1000 data points.
    :return: Dataframe with one row per retained subject: ``subject_id``, ``data_points`` and the
        test-fold averages ``mse_surprisal``, ``rmse_surprisal``, ``r2_surprisal``,
        ``explained_variance`` and ``corr_surprisal``. Empty if no subject qualifies.
    """
    subject_id_list = processed_RTs['WorkerId'].unique()
    # Count rows per subject once instead of re-scanning the whole frame inside the loop.
    points_per_subject = processed_RTs['WorkerId'].value_counts()
    subject_summary = []

    for subject_id in tqdm.tqdm(subject_id_list):
        # value_counts drops missing ids, so a NaN subject counts as 0 rows and is skipped.
        n_points = int(points_per_subject.get(subject_id, 0))
        if n_points < min_data_points:
            continue

        scores = build_linear_model_kfold(processed_RTs, x_type=x_type, per_subject=True, subject_id=subject_id)

        # The error scorers are negated by scikit-learn, so flip the sign back.
        subject_summary.append({"subject_id": subject_id,
                                "data_points": n_points,
                                "mse_surprisal": -1*np.mean(scores['test_neg_mean_squared_error']),
                                "rmse_surprisal": -1*np.mean(scores['test_neg_root_mean_squared_error']),
                                "r2_surprisal": np.mean(scores['test_r2']),
                                "explained_variance": np.mean(scores['test_explained_variance']),
                                "corr_surprisal": np.mean(scores['test_custom_correlation_coeff_scorer'])})

    return pd.DataFrame(subject_summary)


In [27]:
# Root directory holding the tokenizer/data folders. Set the NANOGPT_DATA_ROOT
# environment variable to run on another machine; the fallback is the original local path.
tokenizers_root = os.environ.get(
    "NANOGPT_DATA_ROOT",
    r"/home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data",
)
data_folder = r'babylm_full_bpe_8k'
data_dir = os.path.join(tokenizers_root, data_folder)

story_surprisal_keys_df = pd.read_csv("story_surprisal_keys.csv")
storyword_model_surprisals_df = pd.read_csv("storyword_model_surprisals.csv")

#pick a random model_id to test
model_id = 6892213
tokenizer = load_tokenizer(data_dir)

# Surprisals of the chosen model, joined to the key table to recover (item, zone, word).
story_surprisals_df = storyword_model_surprisals_df[storyword_model_surprisals_df['model_id'] == model_id]
story_surprisals_df = pd.merge(story_surprisals_df, story_surprisal_keys_df, on='storyword_UID')[["item", "zone", "word", "surprisal"]]

processed_RTs, processed_wordinfo, all_stories = load_RT_data(rt_root=r'naturalstories_RTS')
stories = extract_stories_from_df(all_stories)

processed_RTs = processed_RTs.merge(story_surprisals_df, on=['item', 'zone', 'word'], how='left')

# The left merge keeps RT rows that have no matching surprisal (NaN). Report them,
# because NaN predictors make the regression step fail.
n_missing_surprisal = int(processed_RTs['surprisal'].isna().sum())
if n_missing_surprisal:
    print(f"Warning: {n_missing_surprisal} RT rows have no surprisal for model_id {model_id}")

# Lagged surprisals (previous 1 and 2 rows) computed separately within each (story, worker).
# Rows with no predecessor in their group are filled with the mean surprisal of the
# whole story (mean over all RT rows of that item), not a per-word mean.
surprisal_by_story_worker = processed_RTs.groupby(['item', 'WorkerId'])['surprisal']
story_mean_surprisal = processed_RTs.groupby(['item'])['surprisal'].transform('mean')
for lag in (1, 2):
    processed_RTs[f'surprisal_lag_{lag}'] = surprisal_by_story_worker.shift(lag).fillna(story_mean_surprisal)

scores_df = all_subject_summary_kfold(processed_RTs, x_type="lag 2")

print(scores_df)

Loading custom tokenizer from /home/abishekthamma/PycharmProjects/masters_thesis/ss-llm/nanoGPT/data/babylm_full_bpe_8k


100%|██████████| 180/180 [00:29<00:00,  6.01it/s]

         subject_id  data_points  mse_surprisal  rmse_surprisal  r2_surprisal  \
0    A117RW2F1MNBQ8         5198       0.078024        0.271054     -1.210958   
1    A11AUVZ4MCA7VU        10224       0.094180        0.306308      0.023275   
2    A11GA4B4SEYK44         4211       0.077781        0.270245     -0.128370   
3    A11KMPAZSE5Q0Q         5199       0.040240        0.196792     -0.077737   
4    A127R5QI5OGBIK         5194       0.091215        0.296610     -1.164325   
..              ...          ...            ...             ...           ...   
159   ASOBUAZ0IQYSJ         4115       0.143625        0.375962     -0.012565   
160   AVG2BI8CS5YKX         5033       0.096687        0.303938     -0.611411   
161   AWMGC78CSF6YL         3958       0.097738        0.311061      0.003784   
162   AWZ3AH7JH0DRO         5033       0.071312        0.263539     -0.146477   
163  A3AA8NU3WAJ3ED         2266       0.150780        0.384913     -0.587501   

     explained_variance  co




In [29]:
# Average, over the subjects kept in scores_df, of each subject's mean test-fold
# correlation between true and predicted RTs (Series.mean skips NaN entries by default).
scores_df["corr_surprisal"].mean()

0.1470293678498752

In [None]:
# This cell repeats the evaluation pipeline of the earlier cell for the same model_id
# and stores the result in scores_df2. It relies on data_dir, story_surprisal_keys_df
# and storyword_model_surprisals_df already being defined.
#pick a random model_id to test
model_id = 6892213
tokenizer = load_tokenizer(data_dir)

# Surprisals of the chosen model, joined to the key table to recover (item, zone, word).
story_surprisals_df = storyword_model_surprisals_df[storyword_model_surprisals_df['model_id'] == model_id]
story_surprisals_df = pd.merge(story_surprisals_df, story_surprisal_keys_df, on='storyword_UID')[["item", "zone", "word", "surprisal"]]

processed_RTs, processed_wordinfo, all_stories = load_RT_data(rt_root=r'naturalstories_RTS')
stories = extract_stories_from_df(all_stories)

processed_RTs = processed_RTs.merge(story_surprisals_df, on=['item', 'zone', 'word'], how='left')

# The left merge keeps RT rows that have no matching surprisal (NaN). Report them,
# because NaN predictors make the regression step fail.
n_missing_surprisal = int(processed_RTs['surprisal'].isna().sum())
if n_missing_surprisal:
    print(f"Warning: {n_missing_surprisal} RT rows have no surprisal for model_id {model_id}")

# Lagged surprisals (previous 1 and 2 rows) computed separately within each (story, worker).
# Rows with no predecessor in their group are filled with the mean surprisal of the
# whole story (mean over all RT rows of that item), not a per-word mean.
surprisal_by_story_worker = processed_RTs.groupby(['item', 'WorkerId'])['surprisal']
story_mean_surprisal = processed_RTs.groupby(['item'])['surprisal'].transform('mean')
for lag in (1, 2):
    processed_RTs[f'surprisal_lag_{lag}'] = surprisal_by_story_worker.shift(lag).fillna(story_mean_surprisal)

scores_df2 = all_subject_summary_kfold(processed_RTs, x_type="lag 2")


In [8]:
# Preview the key table: each storyword_UID with its story (item), position (zone),
# word, token ids and tokenizer name.
story_surprisal_keys_df.head()

Unnamed: 0,storyword_UID,item,zone,word,tokens,tokenizer
0,0,1,1,If,[333],babylm_full_bpe_8k
1,1,1,2,you,[208],babylm_full_bpe_8k
2,2,1,3,were,[394],babylm_full_bpe_8k
3,3,1,4,to,[196],babylm_full_bpe_8k
4,4,1,5,journey,[4751],babylm_full_bpe_8k


In [9]:
# Preview the surprisal table: rows of (model_id, storyword_UID, surprisal).
storyword_model_surprisals_df.head()

Unnamed: 0,model_id,storyword_UID,surprisal
0,5445338,20512,13.324539
1,5445338,20513,3.733041
2,5445338,20514,3.349813
3,5445338,20515,2.085532
4,5445338,20516,10.150342
