### SparklyRGT Template: Baseline and Acquisition Analysis 

**Requirements**
* The data must be an Excel file from MEDPC2XL (trial-by-trial data) 
* The data, sparklyRGT.py file, and this notebook must all be in the same folder

**Getting started: Please make a copy of this (sparklyRGT_template_2) for each analysis**
- Refer to sparklyRGT_documentation for function information
- Note: depending on your analysis, you will only have to complete certain sections of the sparklyRGT_documentation
- Note: feel free to create a personal template once you've become comfortable - this is just an example

In [1]:
# Move the working directory up one level BEFORE importing sparklyRGT.
# NOTE(review): presumably sparklyRGT.py lives in the parent folder -- confirm.
# Every relative path used later in this notebook is resolved from this new
# working directory, and re-running this cell moves up one more level each time.
import os
os.chdir('..')
import sparklyRGT as rgt 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import scipy.stats as stats
# turn off pandas' chained-assignment (SettingWithCopy) warning
pd.options.mode.chained_assignment = None
# show at most 100 rows when a DataFrame/Series is displayed
pd.set_option('display.max_rows',100)

I am being executed!


***

# 1) Load data into Python



In [3]:
#remove the leading 'M' from the TF file subject numbers and convert to integer/float 

In [4]:
#add header names from BH03 to other files so they can be concatenated properly 

In [5]:
from os import listdir
from os.path import isfile, join
#data needs to be loaded in from OSF
#either download them from OSF and upload to github, or load in directly from OSF 

#OSF files are CSVs (except BH03 is xlsx) and load_multiple_data loads in excel files
#sparklyrgt.py needs to be edited so that either excel files or CSVs can be loaded in

path = '../sparklyRGT_tutorial/data/'
# listdir() returns entries in arbitrary order and also lists sub-directories and
# hidden files (e.g. .DS_Store, .ipynb_checkpoints). Keep regular, non-hidden
# files only and sort them so the files are always concatenated in the same order.
file_names = sorted(
    f for f in listdir(path)
    if isfile(join(path, f)) and not f.startswith('.')
)

df = rgt.load_multiple_data(file_names, path, reset_sessions = False)

# save the combined frame; a later cell loads a file named output.csv
df.to_csv('../sparklyRGT_tutorial/output.csv')

# preview as the last expression so the cell actually displays it
df.head()

In [28]:

# set reset_sessions = false
# run all exclusions
# add:
# check whether any rats have fewer than 20 sessions -- exclude them

# figure out which file is printing -- order of file 
# then run reset sessions --> set session number 1 -> n for each subject number
# rats will have at least 20 sessions (after exclusion above)
# write function where it checks whether each subject has 1-20 session number

# save a csv of session 18-20 (after reset), 2 files, cue and classic


def check_sessions(df): ##checks that the 'Session' column has correct, and non-missing session numbers
    """Print the highest 'Trial' value for every (Subject, StartDate, Session) group.

    Use the printout to verify that the 'Session' column holds the expected,
    non-missing session numbers for each subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Subject', 'StartDate', 'Session' and 'Trial'.

    Returns
    -------
    None
        The summary is printed in full. The 'display.max_rows' option that was
        active before the call is still active afterwards.
    """
    ##lift the row limit only while printing (otherwise pandas will ... the middle rows);
    ##option_context restores the previous setting on exit, even if the groupby raises
    with pd.option_context('display.max_rows', None):
        print(df.groupby(['Subject','StartDate','Session'])['Trial'].max())

    

In [6]:
# NOTE(review): listdir is imported again here but not used in this cell.
from os import listdir
# NOTE(review): `path` is assigned but never passed on -- load_data below only
# receives the bare file name; confirm where it looks for output.csv.
path = '../sparklyRGT_tutorial/'
file_names = ['output.csv']

# load output.csv through sparklyRGT without resetting the session numbers
df = rgt.load_data(file_names, reset_sessions = False)
# df = df.drop(df.columns[[0]], axis=1)

# for each task name (MSN): the unique subject numbers that have rows under that task
task_list = df.groupby(['MSN'])['Subject'].unique()

  df = rgt.load_data(file_names, reset_sessions = False)


In [7]:
# one row per (Subject, Session) pair
subjects = df.drop_duplicates(subset=['Subject', 'Session'])
subjects_n = subjects[['Subject', 'Session']]
# subjects that have a session numbered 0; a boolean mask (rather than iterrows,
# which turns every row into a single-dtype Series) keeps the Subject values in
# their original dtype and avoids a Python-level loop
zero_session = subjects_n.loc[subjects_n['Session'] == 0, 'Subject'].tolist()

# drop subjects that had a 0 session
df = rgt.drop_subjects(df, zero_session)

## Data cleaning

### Check session numbers for each rat and drop subjects

In [8]:
#missing data

#we need to only take rats that have consecutive sessions from 1 to 5
#some rats will be missing sessions and need to be excluded

# number of leading sessions (per subject) that must be consecutive
N_CHECK = 5

# removes subjects where the first 5 sessions are not consecutive
subjects = df.drop_duplicates(subset=['Subject', 'Session'])
subjects_n = subjects[['Subject', 'Session']]
# sort sessions in ascending order to check for consecutive sessions
subjects_n = subjects_n.sort_values(by=['Subject', 'Session'])

drop_subs = []
# Group by subject so each check only sees that subject's own sessions. Taking
# head() of the whole remaining table would pull in the next subject's rows
# whenever a subject has fewer than N_CHECK sessions.
for num, sessions in subjects_n.groupby('Subject')['Session']:
    # first (up to) N_CHECK session numbers of this subject, already ascending and unique
    first_sessions = [int(s) for s in sessions.head(N_CHECK)]
    # a gap-free run starting at the subject's lowest session number
    # NOTE(review): like the previous version, this does not require the run to
    # start at session 1 -- confirm whether that is intended.
    expected = list(range(first_sessions[0], first_sessions[0] + len(first_sessions)))
    if first_sessions != expected:
        drop_subs.append(num)
print(drop_subs)
# drop subjects that did not have consecutive sessions (5)
df2 = rgt.drop_subjects(df, drop_subs)
df2

[425, 426, 427, 428, 925, 926, 927, 929, 930, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1118, 1318, 1322, 1326]


Unnamed: 0.1,Unnamed: 0,MSN,StartDate,StartTime,Subject,Group,Box,Experiment,Comment,Session,...,Pun_Persev_H5,Pun_HeadEntry,Pun_Dur,Premature_Resp,Premature_Hole,Rew_Persev_H1,Rew_Persev_H2,Rew_Persev_H3,Rew_Persev_H4,Rew_Persev_H5
0,0,rGT_A-cue,01/23/16,8:13:19,173,0.0,1,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,rGT_A-cue,01/23/16,8:13:19,173,0.0,1,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,rGT_A-cue,01/23/16,8:13:19,173,0.0,1,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,rGT_A-cue,01/23/16,8:13:19,173,0.0,1,0.0,,1.0,...,2.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,rGT_A-cue,01/23/16,8:13:19,173,0.0,1,0.0,,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
521536,721550,rGT_B-cue,11/15/18,16:10:57,1408,0.0,8,GB01,,40.0,...,1.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521537,721551,rGT_B-cue,11/15/18,16:10:57,1408,0.0,8,GB01,,40.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521538,721552,rGT_B-cue,11/15/18,16:10:57,1408,0.0,8,GB01,,40.0,...,0.0,4.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
521539,721553,rGT_B-cue,11/15/18,16:10:57,1408,0.0,8,GB01,,40.0,...,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
#check if there are any subjects that were run on more than one task version (including 5CSRT, FC)

#to display all the task names in the dataframe:
# df2.MSN.unique()
# one row per distinct (Subject, MSN) combination
tasks = df2.drop_duplicates(subset=['Subject', 'MSN'])
# keep only the subject and task columns, ordered by subject number
tasks_n = tasks[['Subject', 'MSN']].sort_values(by=['Subject'])
# a subject that still occupies more than one row was run on more than one task
is_repeated = tasks_n.duplicated(['Subject'], keep=False)
tasks_n_dup = tasks_n[is_repeated].Subject.unique()
# plain list of those subject numbers
duplicate_tasks = list(tasks_n_dup)
print(duplicate_tasks)
#drop any subjects that were run on more than one task 
final_subjects = rgt.drop_subjects(df2, duplicate_tasks)

        Unnamed: 0        MSN StartDate StartTime  Subject  Group  Box  \
0                0  rGT_A-cue  01/23/16   8:13:19      173    0.0    1   
58              58  rGT_A-cue  01/23/16   8:13:22      174    0.0    2   
127            127  rGT_B-cue  01/23/16   8:13:26      175    0.0    3   
226            882  rGT_A-cue  01/23/16   8:52:34      185    0.0    1   
324            980  rGT_A-cue  01/23/16   8:52:38      186    0.0    2   
...            ...        ...       ...       ...      ...    ...  ...   
476254      688893  rGT_B-cue  10/15/18  15:15:48     1404    0.0    4   
476302      688941  rGT_A-cue  10/15/18  15:15:55     1405    0.0    5   
476413      689052  rGT_B-cue  10/15/18  15:16:00     1406    0.0    6   
476542      689181  rGT_A-cue  10/15/18  15:16:08     1407    0.0    7   
476651      689290  rGT_B-cue  10/15/18  15:16:16     1408    0.0    8   

       Experiment  Comment  Session  ...  Pun_Persev_H5  Pun_HeadEntry  \
0             0.0      NaN      1.0  

In [35]:
# minimum number of distinct sessions a subject needs in order to be kept
MIN_SESSIONS = 20

less_20 = final_subjects.drop_duplicates(subset=['Subject','Session'])
# look at subjects and sessions
less = less_20[['Subject', 'Session']]
less = less.sort_values(by=['Subject', 'Session'])
candidates = less['Subject'].unique()
# count the (Subject, Session) rows per subject in a single pass instead of
# re-filtering the whole table once for every candidate subject
sessions_per_subject = less.groupby('Subject').size()
rejects = sessions_per_subject[sessions_per_subject < MIN_SESSIONS].index.tolist()
# print(rejects)
over_20_sessions = rgt.drop_subjects(final_subjects, rejects)
print(over_20_sessions)

In [40]:
# reset sessions
# assuming order of data doesn't matter in ML algorithm
candidates = over_20_sessions['Subject'].unique()
for i in candidates:
    # all rows that belong to the current subject
    temp = over_20_sessions.loc[over_20_sessions['Subject'] == i]
    # sort temp by sessions, and remove duplicate rows
    # NOTE(review): iterating over a DataFrame yields its column labels, not its
    # rows, so this loop prints the column names once per subject. The session
    # renumbering described in these comments is not implemented yet.
    for r in temp:
        #iterate through rows in temp and replace row that contain same values in the main df:over_20_sessions  
        print(r)

        Unnamed: 0        MSN StartDate StartTime  Subject  Group  Box  \
394360      688477  rGT_B-cue  10/15/18  14:21:22     1416    0.0    8   
394361      688478  rGT_B-cue  10/15/18  14:21:22     1416    0.0    8   
394362      688479  rGT_B-cue  10/15/18  14:21:22     1416    0.0    8   
394363      688480  rGT_B-cue  10/15/18  14:21:22     1416    0.0    8   
394364      688481  rGT_B-cue  10/15/18  14:21:22     1416    0.0    8   
...            ...        ...       ...       ...      ...    ...  ...   
426682      720799  rGT_B-cue  11/15/18  15:08:55     1416    0.0    8   
426683      720800  rGT_B-cue  11/15/18  15:08:55     1416    0.0    8   
426684      720801  rGT_B-cue  11/15/18  15:08:55     1416    0.0    8   
426685      720802  rGT_B-cue  11/15/18  15:08:55     1416    0.0    8   
426686      720803  rGT_B-cue  11/15/18  15:08:55     1416    0.0    8   

       Experiment  Comment  Session  ...  Pun_Persev_H5  Pun_HeadEntry  \
394360       GB01      NaN     21.0  

In [36]:
# Write the cleaned data to sockeye_data.csv, relative to the current working directory.
# NOTE(review): this saves final_subjects, i.e. the frame from before subjects with
# fewer than 20 sessions were removed (over_20_sessions) -- confirm which frame is intended.
final_subjects.to_csv('sockeye_data.csv')

In [None]:
# split the dataframe by cued and classic rats, and save them as two separate CSVs (both with column headers)

#all cued tasks will have 'cue' in the MSN (A and B version)
#all classic tasks should have 'Classic' (A and B) - either rGT or RGT 

#upload to sparklyRGT/data 