In [1]:
import cmlreaders as cml
from cmlreaders import CMLReader, get_data_index
import pandas as pd 
import numpy as np
import os 


# Experiment 1

In [2]:
# Fetch the 'ltp' data index and keep only the rows of the VFFR experiment.
data = cml.get_data_index(kind='ltp')
data = data[data['experiment'] == 'VFFR']

# Load the task events of every indexed session, one frame per session.
sess_evs_list = []
for _, row in data.iterrows():
    try:
        reader = CMLReader(subject=row['subject'], session=row['session'], experiment=row['experiment'])
        sess_evs = reader.load('task_events')
        sess_evs_list.append(sess_evs)
    except Exception:
        # Best-effort load: print the index row of the session that failed and
        # move on. `Exception` (rather than a bare `except`) lets a kernel
        # interrupt / KeyboardInterrupt stop the loop instead of being swallowed.
        print(row)

all_events                                                        NaN
experiment                                                       VFFR
import_type                                                     build
math_events                                                       NaN
original_session                                                    0
session                                                             0
subject                                                        LTP414
subject_alias                                                  LTP414
task_events         protocols/ltp/subjects/LTP414/experiments/VFFR...
Name: 6653, dtype: object
all_events                                                        NaN
experiment                                                       VFFR
import_type                                                     build
math_events                                                       NaN
original_session                                                

In [6]:
# Stack the per-session event frames row-wise; each frame keeps its own index labels.
exp1_all_evs = pd.concat(sess_evs_list, axis=0, ignore_index=False)

In [7]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('dataframes', exist_ok=True)
exp1_all_evs.to_csv('dataframes/exp1_all_evs.csv', index=False)

In [8]:
# Keep only the rows whose event type is WORD or FFR_REC_WORD.
exp1_ffr_evs = exp1_all_evs[exp1_all_evs['type'].isin(["WORD", "FFR_REC_WORD"])]

In [9]:
# Number each distinct word by the position of its first WORD event (1-based).
item_col = 'item_name'
item_num_col = 'item_num'
item_num_df = (
    exp1_ffr_evs
    .query("type=='WORD'")
    .drop_duplicates(subset=item_col, ignore_index=True)[item_col]
    .reset_index()
    .rename(columns={'index': item_num_col})
)
item_num_df[item_num_col] += 1  # shift the 0-based position so numbering starts at 1

# Left merge keeps every event row. If the events already carry a column named
# like one in item_num_df, the merged-in copy receives the '_new' suffix.
events_new = exp1_ffr_evs.merge(
    item_num_df, how='left', on=item_col, sort=False, suffixes=('', '_new'))
# Rows without a matching numbered word get the sentinel -999 in 'item_num_new'.
events_new = events_new.fillna({'item_num_new': -999})

In [10]:
# Rebind the Experiment 1 events name to the merged frame that carries item numbers.
# NOTE(review): this overwrites the input of the item-numbering cell, so re-running
# that cell afterwards would merge into an already-merged frame.
exp1_ffr_evs = events_new

In [11]:
# Count the distinct sessions of each subject (columns: subject, session).
n_sess_df = (
    exp1_ffr_evs
    .groupby('subject')
    .agg({'session': 'nunique'})
    .reset_index()
)

In [12]:
# Subjects with fewer than 7 distinct sessions.
few_sess_subs_7 = n_sess_df.loc[n_sess_df['session'] < 7, 'subject'].tolist()

In [13]:
# Flag subjects whose proportion correct falls below .7 in at least one session.
prop_correct_df = (
    exp1_ffr_evs[exp1_ffr_evs['type'] == "WORD"]
    .groupby(['subject', 'session'])
    .agg({'correct': 'sum'})
    .reset_index()
)
# The denominator 24 * 24 is presumably lists x words per session -- TODO confirm.
prop_correct_df['prop_correct'] = prop_correct_df['correct'].div(24 * 24)
low_prop_correct_subs = prop_correct_df.loc[
    prop_correct_df['prop_correct'] < .7, 'subject'
].tolist()

In [14]:
# Drop every excluded subject and keep only sessions 4 through 10 (inclusive).
exclude_subs = [*few_sess_subs_7, *low_prop_correct_subs]
exp1_ffr_evs_KateEtal22 = exp1_ffr_evs[
    ~exp1_ffr_evs['subject'].isin(exclude_subs)
    & exp1_ffr_evs['session'].between(4, 10)
]

In [15]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('dataframes', exist_ok=True)
exp1_ffr_evs_KateEtal22.to_csv('dataframes/KateEtal22_filter_exp1_ffr_evs.csv', index=False)

In [16]:
from gensim.models import KeyedVectors
# Location of the word2vec binary. Defaults to the original cluster path, but can be
# overridden with the WORD2VEC_PATH environment variable so the notebook also runs
# on machines where that absolute path does not exist.
word2vec_path = os.environ.get("WORD2VEC_PATH", "/scratch/rafla/GoogleNews-vectors-negative300.bin")
word2vec_vectors = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)



In [17]:
def word_similarity(df, col1, col2, keyed_vector=None):
    """Return the similarity of the two words held in one row, or NaN on failure.

    Parameters
    ----------
    df : mapping-like row
        Indexed with ``col1`` and ``col2`` to obtain the two words (for
        example the row handed over by ``DataFrame.apply(..., axis=1)``).
    col1, col2 : hashable
        Keys of the two words inside ``df``.
    keyed_vector : object with a ``similarity(word_a, word_b)`` method, optional
        Receives both words lower-cased.

    Returns
    -------
    The value returned by ``keyed_vector.similarity``, or ``np.nan`` when the
    lookup raises an ordinary exception (e.g. a cell that is not a string, a
    word the model rejects, or ``keyed_vector`` left as ``None``).
    """
    try:
        return keyed_vector.similarity(df[col1].lower(), df[col2].lower())
    except Exception:
        # Not a bare `except`: KeyboardInterrupt / SystemExit must propagate so a
        # long row-wise `apply` can still be interrupted.
        return np.nan

In [18]:
# Build every ordered pair (item_1, item_2) of numbered items, self-pairs included.
items = item_num_df['item_name'].values
sem_sim_df = (
    pd.MultiIndex
    .from_product(iterables=[items, items], names=['item_1', 'item_2'])
    .to_frame(index=False)
)

In [19]:
# Score each pair row by row with the word2vec model; failed lookups yield NaN.
sem_sim_df['similarity'] = sem_sim_df.apply(
    lambda pair: word_similarity(pair, 'item_1', 'item_2', keyed_vector=word2vec_vectors),
    axis=1,
)

In [20]:
# Attach the item number of each pair member (columns suffixed _1 / _2), then drop
# the raw pair columns.
sem_sim_num_df = (
    sem_sim_df
    .merge(item_num_df, left_on='item_1', right_on=item_col)
    .merge(item_num_df, left_on='item_2', right_on=item_col, suffixes=('_1', '_2'))
    .drop(columns=['item_1', 'item_2'])
)

In [21]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('dataframes', exist_ok=True)
sem_sim_num_df.to_csv('dataframes/exp1_sem_sim_num_df.csv', index=False)

# Experiment 2

In [22]:
# Fetch the 'ltp' data index and keep only the rows of the ltpRepFR experiment.
data = cml.get_data_index(kind='ltp')
data = data[data['experiment'] == 'ltpRepFR']

# Load the task events of every indexed session, one frame per session.
sess_evs_list = []
for _, row in data.iterrows():
    try:
        reader = CMLReader(subject=row['subject'], session=row['session'], experiment=row['experiment'])
        sess_evs = reader.load('task_events')
        sess_evs_list.append(sess_evs)
    except Exception:
        # Best-effort load: print the index row of the session that failed and
        # move on. `Exception` (rather than a bare `except`) lets a kernel
        # interrupt / KeyboardInterrupt stop the loop instead of being swallowed.
        print(row)

In [23]:
# Stack the per-session event frames row-wise; each frame keeps its own index labels.
exp2_all_evs = pd.concat(sess_evs_list, axis=0, ignore_index=False)

In [24]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('dataframes', exist_ok=True)
exp2_all_evs.to_csv('dataframes/exp2_all_evs.csv', index=False)

In [25]:
# Keep only the rows whose event type is WORD, REC_WORD or FFR_REC_WORD.
exp2_ffr_evs = exp2_all_evs[exp2_all_evs['type'].isin(["WORD", "REC_WORD", "FFR_REC_WORD"])]

In [26]:
# Number each distinct word by the position of its first WORD event (1-based).
item_col = 'item_name'
item_num_col = 'item_num'
item_num_df = (
    exp2_ffr_evs
    .query("type=='WORD'")
    .drop_duplicates(subset=item_col, ignore_index=True)[item_col]
    .reset_index()
    .rename(columns={'index': item_num_col})
)
item_num_df[item_num_col] += 1  # shift the 0-based position so numbering starts at 1

# Left merge keeps every event row. If the events already carry a column named
# like one in item_num_df, the merged-in copy receives the '_new' suffix.
events_new = exp2_ffr_evs.merge(
    item_num_df, how='left', on=item_col, sort=False, suffixes=('', '_new'))
# Rows without a matching numbered word get the sentinel -999 in 'item_num_new'.
events_new = events_new.fillna({'item_num_new': -999})

In [27]:
# Rebind the Experiment 2 events name to the merged frame that carries item numbers.
# NOTE(review): this overwrites the input of the item-numbering cell, so re-running
# that cell afterwards would merge into an already-merged frame.
exp2_ffr_evs = events_new

In [28]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('dataframes', exist_ok=True)
exp2_ffr_evs.to_csv('dataframes/exp2_ffr_evs.csv', index=False)

In [29]:
# NOTE(review): a helper with this same name and signature is also defined earlier
# in this notebook; consider keeping a single definition.
def word_similarity(df, col1, col2, keyed_vector=None):
    """Return the similarity of the two words held in one row, or NaN on failure.

    Parameters
    ----------
    df : mapping-like row
        Indexed with ``col1`` and ``col2`` to obtain the two words (for
        example the row handed over by ``DataFrame.apply(..., axis=1)``).
    col1, col2 : hashable
        Keys of the two words inside ``df``.
    keyed_vector : object with a ``similarity(word_a, word_b)`` method, optional
        Receives both words lower-cased.

    Returns
    -------
    The value returned by ``keyed_vector.similarity``, or ``np.nan`` when the
    lookup raises an ordinary exception (e.g. a cell that is not a string, a
    word the model rejects, or ``keyed_vector`` left as ``None``).
    """
    try:
        return keyed_vector.similarity(df[col1].lower(), df[col2].lower())
    except Exception:
        # Not a bare `except`: KeyboardInterrupt / SystemExit must propagate so a
        # long row-wise `apply` can still be interrupted.
        return np.nan

In [30]:
# Build every ordered pair (item_1, item_2) of numbered items, self-pairs included.
items = item_num_df['item_name'].values
sem_sim_df = (
    pd.MultiIndex
    .from_product(iterables=[items, items], names=['item_1', 'item_2'])
    .to_frame(index=False)
)

In [33]:
# Score each pair row by row with the word2vec model; failed lookups yield NaN.
sem_sim_df['similarity'] = sem_sim_df.apply(
    lambda pair: word_similarity(pair, 'item_1', 'item_2', keyed_vector=word2vec_vectors),
    axis=1,
)

In [34]:
# Attach the item number of each pair member (columns suffixed _1 / _2), then drop
# the raw pair columns.
sem_sim_num_df = (
    sem_sim_df
    .merge(item_num_df, left_on='item_1', right_on=item_col)
    .merge(item_num_df, left_on='item_2', right_on=item_col, suffixes=('_1', '_2'))
    .drop(columns=['item_1', 'item_2'])
)

In [35]:
# Create the output folder first: to_csv does not create missing directories.
os.makedirs('dataframes', exist_ok=True)
sem_sim_num_df.to_csv('dataframes/exp2_sem_sim_num_df.csv', index=False)