## Simulation of Pilot Data

### Setup

In [1]:
#| code-summary: code -- load dependencies and data

from narrative_cmr.evaluation import semantic_data_likelihood, semantic_objective_function
from narrative_cmr.models import LandscapeRevised, Semantic_CMR
from scipy.optimize import differential_evolution
from numba.typed import List, Dict
from numba.core import types
from numba import njit
from psifr import fr
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer, util
import spacy
import warnings
warnings.filterwarnings('ignore')

# boundary constants: the smallest positive float spacing and its complement,
# used later as lower/upper search limits
lb = np.finfo(float).eps
ub = 1 - lb

# load the recall data frame and keep only the rows where time_test equals 2
data = pd.read_csv('data/psifr_sbs.csv')
data = data[data['time_test'] == 2]
events = fr.merge_free_recall(
    data,
    list_keys=['item_index', 'cycle', 'story_index', 'story_name', 'time_test'],
)

# walk the six story indices once, collecting for each story
#   - a (subject, list) x output table of `input` values as an int64 array,
#     with missing cells filled with 0
#   - the matching slice of the merged events frame
pilot_trials = []
pilot_events = []
for story_index in range(6):
    story_events = events.loc[events.story_index == story_index]
    trials_df = story_events.pivot_table(
        index=['subject', 'list'], columns='output', values='input')
    pilot_trials.append(trials_df.to_numpy(na_value=0).astype('int64'))
    pilot_events.append(story_events)

# convert the collected arrays into a numba typed List
pilot_trials = List(pilot_trials)

In [2]:
#| code-summary: code -- prepare connectivity matrices
# 
# prepare baseline semantic connections using extracted story units
# sentence-embedding model names noted as alternatives for the line below:
#   paraphrase-MiniLM-L12-v2
#   average_word_embeddings_glove.6B.300d
#   average_word_embeddings_glove.840B.300d
#   stsb-distilbert-base
model = SentenceTransformer('paraphrase-MiniLM-L12-v2')

# one row per (story_name, input) holding the first `item` text seen for it
units = events.pivot_table(index=['story_name', 'input'], values='item', aggfunc='first').reset_index()
connections = {}
remove_stopwords = False
nlp = spacy.load('en_core_web_sm')

story_names = ['Fisherman', 'Supermarket', 'Flight', 'Cat', 'Fog', 'Beach']

for story_name in story_names:

    # select the column by label: `.item` attribute access on a DataFrame is
    # easy to confuse with a method and breaks if pandas ever adds one
    sentences = units.loc[units.story_name == story_name, 'item'].tolist()

    if remove_stopwords:
        # rebuild each sentence from its non-stopword tokens
        clean_sentences = [
            ' '.join(word.text for word in nlp(sentence) if not word.is_stop)
            for sentence in sentences
        ]
    else:
        clean_sentences = list(sentences)

    # compute embeddings
    embeddings = model.encode(clean_sentences, convert_to_tensor=True)

    # compute cosine similarity of each sentence with each other sentence.
    # `.cpu()` is a no-op for CPU tensors and is required before `.numpy()`
    # when the encoder produced the embeddings on a GPU.
    # TODO: Figure out if abs is ok (the code currently shifts by +1 instead).
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy() + 1

    # NOTE(review): after the +1 shift off-diagonal scores lie in [0, 2] while
    # the diagonal is forced to 1 -- confirm this is the intended self-weight.
    np.fill_diagonal(cosine_scores, 1)
    connections[story_name] = cosine_scores

# numba typed List of per-story matrices, in the fixed story order above
base_connections = List([connections[key] for key in story_names])

# prepare LM-R semantic connections using baseline connections
#
# `experiences[story_name]` is a list of cycles; each cycle is the list of
# 0-based unit positions that share one consecutive `cycle` value.
experiences = {}

cycle_table = events.pivot_table(index=['story_name'], columns='input', values='cycle')

for story_name in ['Fisherman', 'Supermarket', 'Flight', 'Cat', 'Fog', 'Beach']:
    # cycle label per unit; NaN marks inputs this story does not have
    v = cycle_table.loc[story_name].values
    v = v[~np.isnan(v)]

    next_experience = []
    current_cycle = None  # sentinel: no cycle seen yet, so the first unit always opens a group
    experiences[story_name] = []

    for unit_index, cycle_index in enumerate(v):
        if current_cycle != cycle_index:
            # close the previous group; skip the empty one that exists only
            # before the first unit has been seen
            if next_experience:
                experiences[story_name].append(next_experience)
            next_experience = [unit_index]
            current_cycle = cycle_index
        else:
            next_experience.append(unit_index)

    # flush the final group: without this the last cycle of every story
    # would never be recorded
    if next_experience:
        experiences[story_name].append(next_experience)

# run each story's cycles through a LandscapeRevised model and keep the
# connection matrix it ends up with
sim_connections = {}
for story_name in ['Fisherman', 'Supermarket', 'Flight', 'Cat', 'Fog', 'Beach']:

    # start from this story's baseline connection matrix
    # (note: this rebinds `model`, which earlier held the sentence encoder)
    model = LandscapeRevised(connections[story_name])

    # feed the cycles in order, one single-cycle experience at a time
    for cycle in experiences[story_name]:
        model.experience([cycle])

    # sanity check: no connection may be negative after the updates
    assert np.all(model.connections >= 0)
    sim_connections[story_name] = model.connections.copy()

# numba typed List of the resulting matrices, in the same fixed story order
landscape_connections = List(
    [sim_connections[key] for key in ['Fisherman', 'Supermarket', 'Flight', 'Cat', 'Fog', 'Beach']]
)

### Regular CMR

In [3]:
!nbdev_build_lib
# Development helper: reload narrative_cmr.models in the running kernel and
# re-import Semantic_CMR so the name used below comes from the reloaded module
# (this follows the `!nbdev_build_lib` call at the top of the cell).
import narrative_cmr
import importlib
importlib.reload(narrative_cmr.models)
from narrative_cmr.models import Semantic_CMR

Converted Cutler_Poster_Reproduction.ipynb.
Converted 00_Text_Preprocessing.ipynb.
Converted 01_Data_Preparation.ipynb.
Converted Data_Likelihood_Under_Model.ipynb.
Converted Model_Comparison.ipynb.
Converted Retrieval_Clustering_by_Representational_Similarity.ipynb.
Converted Retrieval_Support_by_Unit_Connectivity.ipynb.
Converted Landscape_Model.ipynb.
Converted Semantic_CMR.ipynb.


In [4]:
#| code-summary: code -- 1) fit Semantic_CMR to the pilot trials with semantic_scale fixed at 0
#| output: false

# Free parameters and their search bounds, declared side by side so the names
# given to the objective and the bounds given to the optimizer stay aligned.
cmr_parameter_bounds = {
    'encoding_drift_rate': (lb, ub),
    'start_drift_rate': (lb, ub),
    'recall_drift_rate': (lb, ub),
    'shared_support': (lb, ub),
    'item_support': (lb, ub),
    'learning_rate': (lb, ub),
    'primacy_scale': (lb, 100),
    'primacy_decay': (lb, 100),
    'stop_probability_scale': (lb, ub),
    'stop_probability_growth': (lb, 10),
    'choice_sensitivity': (lb, 10),
    # 'semantic_scale': (lb, ub),  # not searched here; supplied as 0 below
}
cmr_free_parameters = tuple(cmr_parameter_bounds)
cmr_bounds = list(cmr_parameter_bounds.values())


@njit(fastmath=True, nogil=True)
def init_cmr(presentation_count, similarities, parameters):
    """Build a Semantic_CMR from the three arguments, passed through unchanged.

    NOTE(review): nothing in this cell calls this wrapper; the objective
    below is handed Semantic_CMR directly.
    """
    return Semantic_CMR(presentation_count, similarities, parameters)


# cost function to be minimized
# ours scales inversely with the probability that the data could have been
# generated using the specified parameters and our model
# The dict argument supplies semantic_scale = 0 (presumably as a fixed
# parameter -- confirm against semantic_objective_function).
cost_function = semantic_objective_function(
    pilot_trials,
    base_connections,
    Semantic_CMR,
    {'semantic_scale': 0},
    cmr_free_parameters,
)

# NOTE(review): the saved output of this cell is an IndexError
# (index 26 out of bounds for size 26); its cause is not visible from here.
cmr_result = differential_evolution(cost_function, cmr_bounds, disp=True)
cmr_result

IndexError: index 26 is out of bounds for axis 0 with size 26

In [None]:
!nbdev_build_lib
# Development helper: reload narrative_cmr.models in the running kernel and
# re-import Semantic_CMR so the name used below comes from the reloaded module
# (this follows the `!nbdev_build_lib` call at the top of the cell).
import narrative_cmr
import importlib
importlib.reload(narrative_cmr.models)
from narrative_cmr.models import Semantic_CMR

In [None]:
#| code-summary: code -- 2) fit Semantic_CMR to the pilot trials with semantic_scale free (baseline connections)
#| output: false

# Free parameters and their search bounds, declared side by side so the names
# given to the objective and the bounds given to the optimizer stay aligned.
cmr_parameter_bounds = {
    'encoding_drift_rate': (lb, ub),
    'start_drift_rate': (lb, ub),
    'recall_drift_rate': (lb, ub),
    'shared_support': (lb, ub),
    'item_support': (lb, ub),
    'learning_rate': (lb, ub),
    'primacy_scale': (lb, 100),
    'primacy_decay': (lb, 100),
    'stop_probability_scale': (lb, ub),
    'stop_probability_growth': (lb, 10),
    'choice_sensitivity': (lb, 10),
    'semantic_scale': (lb, 10),
}
cmr_free_parameters = tuple(cmr_parameter_bounds)
cmr_bounds = list(cmr_parameter_bounds.values())


@njit(fastmath=True, nogil=True)
def init_cmr(presentation_count, similarities, parameters):
    """Build a Semantic_CMR from the three arguments, passed through unchanged.

    NOTE(review): nothing in this cell calls this wrapper; the objective
    below is handed Semantic_CMR directly.
    """
    return Semantic_CMR(presentation_count, similarities, parameters)


# cost function to be minimized
# ours scales inversely with the probability that the data could have been
# generated using the specified parameters and our model
# Uses the baseline connection matrices; the dict argument is empty, and
# semantic_scale is among the free parameters.
cost_function = semantic_objective_function(
    pilot_trials,
    base_connections,
    Semantic_CMR,
    {},
    cmr_free_parameters,
)

semantic_cmr_result = differential_evolution(cost_function, cmr_bounds, disp=True)
semantic_cmr_result

In [None]:
#| code-summary: code -- 3) fit Semantic_CMR to the pilot trials with semantic_scale free (landscape connections)
#| output: false

# Free parameters and their search bounds, declared side by side so the names
# given to the objective and the bounds given to the optimizer stay aligned.
cmr_parameter_bounds = {
    'encoding_drift_rate': (lb, ub),
    'start_drift_rate': (lb, ub),
    'recall_drift_rate': (lb, ub),
    'shared_support': (lb, ub),
    'item_support': (lb, ub),
    'learning_rate': (lb, ub),
    'primacy_scale': (lb, 100),
    'primacy_decay': (lb, 100),
    'stop_probability_scale': (lb, ub),
    'stop_probability_growth': (lb, 10),
    'choice_sensitivity': (lb, 10),
    'semantic_scale': (lb, 10),
}
cmr_free_parameters = tuple(cmr_parameter_bounds)
cmr_bounds = list(cmr_parameter_bounds.values())


@njit(fastmath=True, nogil=True)
def init_cmr(presentation_count, similarities, parameters):
    """Build a Semantic_CMR from the three arguments, passed through unchanged.

    NOTE(review): nothing in this cell calls this wrapper; the objective
    below is handed Semantic_CMR directly.
    """
    return Semantic_CMR(presentation_count, similarities, parameters)


# cost function to be minimized
# ours scales inversely with the probability that the data could have been
# generated using the specified parameters and our model
# Uses the landscape connection matrices; the dict argument is empty, and
# semantic_scale is among the free parameters.
cost_function = semantic_objective_function(
    pilot_trials,
    landscape_connections,
    Semantic_CMR,
    {},
    cmr_free_parameters,
)

landscape_cmr_result = differential_evolution(cost_function, cmr_bounds, disp=True)
landscape_cmr_result