In this notebook, we will create two .csv files to store the free recall data scored on the level of "idea units".

Furthermore, we will create sentence-level (sl) scores: a sentence counts as recalled when at least 50% of its idea units were recalled (a `>=50%` rule).

* AL1 study **-->** **psifr_al1.csv** (idea units) and **psifr_al1_sl.csv** (sentence level)

* AL1 + ML1 study **-->** **psifr_al1_ml1.csv** (idea units) and **psifr_al1_ml1_sl.csv** (sentence level)

## AL1

In [1]:
# | code-summary: import dependencies, load raw data
import os
import glob
import pandas as pd
import numpy as np
from psifr import fr
import pandas as pd

data_path_AL1 = "C:/Users/elija/Vanderbilt/Aboud, Katherine - Team/Projects/AL1_NIH_DP5/AL1_Data_Backup/AL1_scanned/*/audio/*_free_recall*.xlsx"
# glob returns matches in arbitrary order; sorting keeps the subject indices
# (assigned by position below) reproducible across runs and machines
data_paths_AL1 = sorted(glob.glob(data_path_AL1))

# first worksheet of every scoring workbook, one DataFrame per workbook
datasets_AL1 = [
    pd.read_excel(data_path, sheet_name=0, engine="openpyxl")
    for data_path in data_paths_AL1
]

datasets = datasets_AL1


#| code-summary: track study and recall events in a long table, pairing with trial-level variables

data_columns = [
    "subject",
    "list",
    "trial_type",
    "position",
    "item",
    "item_string",
    "subject_id",
    "story_list",
    "passage",
    "visit",
    "session",
    "modality",
    "list_length",
    "recTex",
]


def clean_recall_position(value):
    """Return a posRec cell unchanged, mapping the string "NA" to NaN.

    Args:
        value: raw cell value from the "posRec" column.

    Returns:
        The value itself when it is not a string, NaN for "NA"
        (surrounding whitespace ignored).

    Raises:
        ValueError: for any other string.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    if stripped == "NA":
        return np.nan
    raise ValueError("Unexpected value in posRec column: {}".format(stripped))


def build_event_rows(datasets):
    """Convert scoring sheets into study and recall event rows.

    Args:
        datasets (list[DataFrame]): one scoring sheet per subject. A sheet
            must provide "Passage index" or "Passage", plus "Visit",
            "Session", "List", "Modality", "origText", "posRec", "serialPos"
            and "recTex"; its first column is read as the subject identifier.

    Returns:
        list[list]: one entry per event, with values ordered like
        `data_columns`. Study events carry the placeholder "NA" as recTex.

    Raises:
        KeyError: if a sheet has no "recTex" column.
        ValueError: if "posRec" contains a string other than "NA".
    """
    rows = []

    for subject_index, data in enumerate(datasets):
        if "Passage index" in data.columns:
            groupby_columns = ["Passage index", "Visit", "Session", "List"]
        else:
            groupby_columns = ["Passage", "Visit", "Session", "List"]

        grouped = data.groupby(groupby_columns)
        for trial_index, (group_key, trial) in enumerate(grouped):
            # group_key holds the (passage, visit, session, list) values of
            # this trial; trial is the dataframe for that group

            # identify trial-level variables
            passage_index = int(group_key[0])
            visit = int(group_key[1])
            session = int(group_key[2])
            subject_id = trial[data.columns[0]].values[0]
            story_list = trial["List"].values[0]
            modality = trial["Modality"].values[0]

            if "recTex" not in trial.columns:
                raise KeyError(
                    "{} does not have 'recTex' column.".format(subject_id)
                )

            # sequence of idea units encoded during study
            source_units = trial["origText"].values
            trial_info = [
                subject_id,
                story_list,
                passage_index,
                visit,
                session,
                modality,
                len(source_units),
            ]

            # build study event list based on extracted story information
            for unit_index, unit in enumerate(source_units):
                rows.append(
                    [
                        subject_index,
                        trial_index,
                        "study",
                        unit_index + 1,
                        unit_index,
                        unit,
                        *trial_info,
                        "NA",
                    ]
                )

            # clean the recall positions on a copy (no write into the groupby
            # slice) and order the rows by recall position; NaN sorts last
            trial = trial.assign(
                posRec=[clean_recall_position(v) for v in trial["posRec"]]
            ).sort_values(by="posRec")

            recalled_text = trial["recTex"].to_list()
            serial_positions = trial["serialPos"].values

            # build recall event list based on extracted story information
            output_position = 1
            for row_index, recall_position in enumerate(trial["posRec"].values):
                if pd.isna(recall_position):
                    # remaining rows were not recalled
                    break

                # move to next entry if value can't be cast as integer
                try:
                    serial_index = int(serial_positions[row_index]) - 1
                except ValueError:
                    continue

                rows.append(
                    [
                        subject_index,
                        trial_index,
                        "recall",
                        output_position,
                        serial_index,
                        source_units[serial_index],
                        *trial_info,
                        # text of THIS row: indexing by output_position would
                        # drift after any skipped row
                        recalled_text[row_index],
                    ]
                )
                output_position += 1

    return rows


results = build_event_rows(datasets)

df = pd.DataFrame(results, columns=data_columns)

df["item_string"] = [string.strip() for string in df["item_string"].to_list()]

#| code-summary: score and save psifr recall dataframe
# data_columns[5:-1] ("item_string" ... "list_length") are passed as list keys;
# "recTex" is passed as a recall key (study rows only carry the "NA" placeholder)
merged = fr.merge_free_recall(df, list_keys=data_columns[5:-1], recall_keys=["recTex"])
# to_csv does not create missing directories; make sure the output folder exists
os.makedirs("data", exist_ok=True)
merged.to_csv("data/psifr_al1.csv", index=False)

## AL1 + ML1

In [2]:
# | code-summary: import dependencies, load raw data
import os
import glob
import pandas as pd
import numpy as np
from psifr import fr
import pandas as pd

data_path_AL1 = "C:/Users/elija/Vanderbilt/Aboud, Katherine - Team/Projects/AL1_NIH_DP5/AL1_Data_Backup/AL1_scanned/*/audio/*_free_recall*.xlsx"
# glob returns matches in arbitrary order; sorting keeps the subject indices
# (assigned by position below) reproducible across runs and machines
data_paths_AL1 = sorted(glob.glob(data_path_AL1))

# first worksheet of every scoring workbook, one DataFrame per workbook
datasets_AL1 = [
    pd.read_excel(data_path, sheet_name=0, engine="openpyxl")
    for data_path in data_paths_AL1
]

data_path_ML1 = "C:/Users/elija/Vanderbilt/Aboud, Katherine - Team/Projects/ML1_xTech_tACS/ML1_Data_Backup/ML1_scanned/*/*_free_recall*.xlsx"
data_paths_ML1 = sorted(glob.glob(data_path_ML1))
datasets_ML1 = [
    pd.read_excel(data_path, sheet_name=0, engine="openpyxl")
    for data_path in data_paths_ML1
]

# AL1 sheets first, then ML1 sheets
datasets = datasets_AL1 + datasets_ML1


#| code-summary: track study and recall events in a long table, pairing with trial-level variables

data_columns = [
    "subject",
    "list",
    "trial_type",
    "position",
    "item",
    "item_string",
    "subject_id",
    "story_list",
    "passage",
    "visit",
    "session",
    "modality",
    "list_length",
    "recTex",
]


# The helpers are defined inside this cell so that it can be run on its own.
def clean_recall_position(value):
    """Return a posRec cell unchanged, mapping the string "NA" to NaN.

    Args:
        value: raw cell value from the "posRec" column.

    Returns:
        The value itself when it is not a string, NaN for "NA"
        (surrounding whitespace ignored).

    Raises:
        ValueError: for any other string.
    """
    if not isinstance(value, str):
        return value
    stripped = value.strip()
    if stripped == "NA":
        return np.nan
    raise ValueError("Unexpected value in posRec column: {}".format(stripped))


def build_event_rows(datasets):
    """Convert scoring sheets into study and recall event rows.

    Args:
        datasets (list[DataFrame]): one scoring sheet per subject. A sheet
            must provide "Passage index" or "Passage", plus "Visit",
            "Session", "List", "Modality", "origText", "posRec", "serialPos"
            and "recTex"; its first column is read as the subject identifier.

    Returns:
        list[list]: one entry per event, with values ordered like
        `data_columns`. Study events carry the placeholder "NA" as recTex.

    Raises:
        KeyError: if a sheet has no "recTex" column.
        ValueError: if "posRec" contains a string other than "NA".
    """
    rows = []

    for subject_index, data in enumerate(datasets):
        if "Passage index" in data.columns:
            groupby_columns = ["Passage index", "Visit", "Session", "List"]
        else:
            groupby_columns = ["Passage", "Visit", "Session", "List"]

        grouped = data.groupby(groupby_columns)
        for trial_index, (group_key, trial) in enumerate(grouped):
            # group_key holds the (passage, visit, session, list) values of
            # this trial; trial is the dataframe for that group

            # identify trial-level variables
            passage_index = int(group_key[0])
            visit = int(group_key[1])
            session = int(group_key[2])
            subject_id = trial[data.columns[0]].values[0]
            story_list = trial["List"].values[0]
            modality = trial["Modality"].values[0]

            if "recTex" not in trial.columns:
                raise KeyError(
                    "{} does not have 'recTex' column.".format(subject_id)
                )

            # sequence of idea units encoded during study
            source_units = trial["origText"].values
            trial_info = [
                subject_id,
                story_list,
                passage_index,
                visit,
                session,
                modality,
                len(source_units),
            ]

            # build study event list based on extracted story information
            for unit_index, unit in enumerate(source_units):
                rows.append(
                    [
                        subject_index,
                        trial_index,
                        "study",
                        unit_index + 1,
                        unit_index,
                        unit,
                        *trial_info,
                        "NA",
                    ]
                )

            # clean the recall positions on a copy (no write into the groupby
            # slice) and order the rows by recall position; NaN sorts last
            trial = trial.assign(
                posRec=[clean_recall_position(v) for v in trial["posRec"]]
            ).sort_values(by="posRec")

            recalled_text = trial["recTex"].to_list()
            serial_positions = trial["serialPos"].values

            # build recall event list based on extracted story information
            output_position = 1
            for row_index, recall_position in enumerate(trial["posRec"].values):
                if pd.isna(recall_position):
                    # remaining rows were not recalled
                    break

                # move to next entry if value can't be cast as integer
                try:
                    serial_index = int(serial_positions[row_index]) - 1
                except ValueError:
                    continue

                rows.append(
                    [
                        subject_index,
                        trial_index,
                        "recall",
                        output_position,
                        serial_index,
                        source_units[serial_index],
                        *trial_info,
                        # text of THIS row: indexing by output_position would
                        # drift after any skipped row
                        recalled_text[row_index],
                    ]
                )
                output_position += 1

    return rows


results = build_event_rows(datasets)

df = pd.DataFrame(results, columns=data_columns)

df["item_string"] = [string.strip() for string in df["item_string"].to_list()]

#| code-summary: score and save psifr recall dataframe
# data_columns[5:-1] ("item_string" ... "list_length") are passed as list keys;
# "recTex" is passed as a recall key (study rows only carry the "NA" placeholder)
merged = fr.merge_free_recall(df, list_keys=data_columns[5:-1], recall_keys=["recTex"])
# to_csv does not create missing directories; make sure the output folder exists
os.makedirs("data", exist_ok=True)
merged.to_csv("data/psifr_al1_ml1.csv", index=False)


## Sentence-Level Scoring

In [11]:
# (0) Define scoring function

def sentence_scorer(merged, threshold=0.5):
    """Transform idea unit scoring to sentence level scoring

    Within each trial (one "subject"/"list" group) consecutive idea units are
    pooled until a unit whose text ends with "." closes the sentence. Idea
    units that follow the last closing unit of a trial are not emitted.

    Args:
        merged (DataFrame): scored free recall dataframe on level of idea
            units. Must contain the columns read below ("item_string",
            "recall", "output", "recTex", "prior_list", "prior_input", ...).
        threshold (float): minimum proportion of a sentence's idea units that
            must be recalled for the sentence to count as recalled.

    Returns:
        DataFrame: one row per sentence, with the columns of `merged`.
        Trial-level values are copied from the last idea unit of the sentence;
        "output" is the within-trial rank of the mean output position of the
        sentence's recalled idea units (NaN when not recalled).

    Raises:
        ValueError: if a recalled idea unit of a recalled sentence has a
            missing (non-text) "recTex" value.
    """
    records = []

    for _, trial in merged.groupby(["subject", "list"]):
        # an idea unit closes a sentence when its text ends with a period;
        # empty or missing text never closes one
        closes_sentence = [
            isinstance(text, str) and text.strip().endswith(".")
            for text in trial["item_string"]
        ]
        # number of sentence items in this trial (counting "." characters
        # instead would be inflated by abbreviations such as "Dr.")
        list_length = sum(closes_sentence)

        trial_records = []
        start = 0
        for stop, is_end in enumerate(closes_sentence, start=1):
            if not is_end:
                continue

            # rows of the sentence that ends here
            sentence = trial.iloc[start:stop]
            start = stop
            row = sentence.iloc[-1]

            # Determine if recalled or not
            recall = bool(sentence["recall"].mean() >= threshold)
            if recall:
                output = sentence["output"].mean()
                # recalled text of the sentence's units, in output order
                texts = (
                    sentence.dropna(subset=["output"])
                    .sort_values(by=["output"])["recTex"]
                    .to_list()
                )
                try:
                    recTex = " ".join(texts)
                except TypeError as err:
                    raise ValueError(
                        "Missing recTex for a recalled idea unit in sentence: "
                        "{}".format(sentence["item_string"].to_list())
                    ) from err
            else:
                output = np.nan
                recTex = np.nan

            trial_records.append(
                {
                    # Trial-level variables
                    "subject": row["subject"],
                    "list": row["list"],
                    "study": row["study"],
                    "repeat": row["repeat"],
                    "intrusion": row["intrusion"],
                    "subject_id": row["subject_id"],
                    "story_list": row["story_list"],
                    "passage": row["passage"],
                    "visit": row["visit"],
                    "session": row["session"],
                    "prior_list": row["prior_list"],
                    "prior_input": row["prior_input"],
                    "modality": row["modality"],
                    # Sentence-level variables
                    "item": len(trial_records),
                    "input": len(trial_records) + 1,
                    "output": output,
                    "recall": recall,
                    "item_string": " ".join(sentence["item_string"].to_list()),
                    "list_length": list_length,
                    "recTex": recTex,
                }
            )

        # Convert mean output positions into ranks within the trial.
        # NOTE: method="first" breaks ties (equal mean outputs) by order of
        # appearance, i.e. the earlier sentence gets the lower rank.
        ranks = pd.Series(
            [record["output"] for record in trial_records], dtype=float
        ).rank(method="first")
        for record, rank in zip(trial_records, ranks):
            record["output"] = rank

        records.extend(trial_records)

    # build the frame once instead of concatenating row by row
    return pd.DataFrame(records, columns=merged.columns)
    

# (1) Score free recall data on sentence-level

# Load the idea-unit tables written by the cells above
df_al1, df_al1_ml1 = (
    pd.read_csv(csv_path)
    for csv_path in ("data/psifr_al1.csv", "data/psifr_al1_ml1.csv")
)

# Collapse idea units into sentences
df_al1_sl, df_al1_ml1_sl = (
    sentence_scorer(idea_units) for idea_units in (df_al1, df_al1_ml1)
)

# Save the sentence-level tables; index=False keeps the row index out of the csv
for scored_frame, out_path in (
    (df_al1_sl, "data/psifr_al1_sl.csv"),
    (df_al1_ml1_sl, "data/psifr_al1_ml1_sl.csv"),
):
    scored_frame.to_csv(out_path, index=False)

###### Testing how `rank(method="first")` handles ties when ranking column values from least to greatest (tied values are ranked in their order of appearance).

In [4]:
# Toy table with one tie: "dog" and "snake" both cost 200
df = pd.DataFrame(
    {"cost": [80, 200, 200, 350]},
    index=["cat", "dog", "snake", "bird"],
)

# Rank the costs in ascending order; the cell displays the resulting table
df.head().rank(method="first")

Unnamed: 0,cost
cat,1.0
dog,2.0
snake,3.0
bird,4.0


## Scoring Validation

In [5]:
# FIXME: Add code that checks that every recall session was scored correctly:
## 1) No posRec value is repeated within a trial
## 2) If recall == 1, recTex should not be empty
## 3) Use a language model to flag cases where the "idea unit" and its "recTex" are very different