### SparklyRGT Template: Baseline and Acquisition Analysis 

**Requirements**
* The data must be an excel file from MEDPC2XL (trial by trial data) 
* The data, sparklyRGT.py file, and this notebook must all be in the same folder

**Getting started: Please make a copy of this (sparklyRGT_template_2) for each analysis**
- Refer to sparklyRGT_documentation for function information
- Note: depending on your analysis, you will only have to complete certain sections of the sparklyRGT_documentation
- Note: feel free to create a personal template once you've become comfortable - this is just an example

In [1]:
import os
os.chdir('C:\\Users\\dexte\\sparklyRGT\\sparklyRGT_tutorial')
import sparklyRGT as rgt 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import scipy.stats as stats
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows',100)

I am being executed!


***

# 1) Load data into Python



In [2]:
file_names = ['BH07_raw_free_S29-30.xlsx'] 

df = rgt.load_data(file_names)

df.head()
# df[['Chosen', 'option']]


Unnamed: 0,MSN,StartDate,StartTime,Subject,Group,Box,Experiment,Comment,Session,Trial,...,Pun_Persev_H5,Pun_HeadEntry,Pun_Dur,Premature_Resp,Premature_Hole,Rew_Persev_H1,Rew_Persev_H2,Rew_Persev_H3,Rew_Persev_H4,Rew_Persev_H5
0,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,1.0,...,3,3,30,0,0,0,0,0,0,0
1,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,2.1,...,0,0,0,1,5,0,0,0,0,0
2,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,2.0,...,3,2,30,0,0,0,0,0,0,0
3,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,3.0,...,0,0,0,0,0,0,0,0,0,0
4,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,4.0,...,2,2,30,0,0,0,0,0,0,0


***
# 2A) Baseline & Acquisition Analysis


In [3]:
control_group = [] #In this example: Tg negative rats

exp_group = [] #In this example: Tg positive rats

group_names = {0: '',
              1: ''} 

group_list = []

title = '' #for plotting

startsess = 29 #first session you would like to include in figures
endsess = 30 #last session you would like to include in figures

## Data cleaning

### Check session numbers for each rat

In [4]:
rgt.check_sessions(df)

Subject  StartDate   Session
1        2020-10-09  29         131.1
         2020-10-10  30         124.0
2        2020-10-09  29          76.1
         2020-10-10  30          81.0
3        2020-10-09  29          49.0
         2020-10-10  30          45.0
4        2020-10-09  29         103.0
         2020-10-10  30          97.0
5        2020-10-09  29          68.1
         2020-10-10  30          69.0
6        2020-10-09  29          88.0
         2020-10-10  30          75.0
7        2020-10-09  28          53.0
         2020-10-10  29          65.0
         2020-10-13  30          56.1
8        2020-10-09  29         124.0
         2020-10-10  30         121.0
9        2020-10-09  29          62.0
         2020-10-10  30          61.0
11       2020-10-09  29         132.0
         2020-10-10  30         136.1
12       2020-10-09  29          54.0
         2020-10-10  30          72.0
13       2020-10-09  29          67.0
         2020-10-10  30          60.0
14       2020-10-09  

### Drop/edit session numbers

In [5]:
df2 = rgt.drop_sessions(df, [28])

### Check that you dropped/edited the desired session(s)

In [6]:
rgt.check_sessions(df2) 

Subject  StartDate   Session
1        2020-10-09  29         131.1
         2020-10-10  30         124.0
2        2020-10-09  29          76.1
         2020-10-10  30          81.0
3        2020-10-09  29          49.0
         2020-10-10  30          45.0
4        2020-10-09  29         103.0
         2020-10-10  30          97.0
5        2020-10-09  29          68.1
         2020-10-10  30          69.0
6        2020-10-09  29          88.0
         2020-10-10  30          75.0
7        2020-10-10  29          65.0
         2020-10-13  30          56.1
8        2020-10-09  29         124.0
         2020-10-10  30         121.0
9        2020-10-09  29          62.0
         2020-10-10  30          61.0
11       2020-10-09  29         132.0
         2020-10-10  30         136.1
12       2020-10-09  29          54.0
         2020-10-10  30          72.0
13       2020-10-09  29          67.0
         2020-10-10  30          60.0
14       2020-10-09  29          80.1
         2020-10-10  

## Data processing

### Calculate variables for each rat


In [7]:
df_sum = rgt.get_summary_data(df2) #change to df instead of df2 if you didn't do any session editing
df_sum 

Unnamed: 0,29P1,29P2,29P3,29P4,30P1,30P2,30P3,30P4,risk29,risk30,collect_lat29,collect_lat30,choice_lat29,choice_lat30,omit29,omit30,trial29,trial30,prem29,prem30
1,90.8397,0.0,8.39695,0.763359,83.0645,0.0,16.9355,0.0,81.6794,66.129,1.077168,1.286471,0.638321,0.655323,0,0,131.1,124.0,26.404494,24.848485
2,9.33333,65.3333,10.6667,14.6667,4.93827,76.5432,0.0,18.5185,49.3333,62.963,1.607407,1.387458,1.057733,1.249012,1,0,76.1,81.0,30.275229,33.606557
3,2.17391,8.69565,56.5217,32.6087,0.0,11.6279,67.4419,20.9302,-78.2609,-76.7442,1.023333,1.026522,2.833261,3.652791,3,2,49.0,45.0,3.921569,6.25
4,5.88235,73.5294,4.90196,15.6863,13.4021,64.9485,3.09278,18.5567,58.8235,56.701,1.263699,1.133429,1.00098,0.885258,1,0,103.0,97.0,13.445378,24.21875
5,0.0,1.53846,98.4615,0.0,9.375,7.8125,79.6875,3.125,-96.9231,-65.625,0.807188,1.098889,3.135846,3.17625,3,5,68.1,69.0,8.108108,16.86747
6,2.5,70.0,0.0,27.5,2.94118,63.2353,0.0,33.8235,45.0,32.3529,1.055714,1.061136,2.172,1.859265,8,7,88.0,75.0,9.278351,15.730337
7,3.07692,10.7692,64.6154,21.5385,1.81818,10.9091,69.0909,18.1818,-72.3077,-74.5455,0.839412,0.929643,1.012615,1.194545,0,0,65.0,56.1,16.666667,34.52381
8,2.41935,92.7419,0.806452,4.03226,2.47934,94.2149,1.65289,1.65289,90.3226,93.3884,1.791649,1.724421,1.850484,1.956033,0,0,124.0,121.0,10.144928,12.94964
9,3.27869,3.27869,90.1639,3.27869,5.26316,7.01754,82.4561,5.26316,-86.8852,-75.4386,0.748,0.711429,2.300328,2.754211,1,4,62.0,61.0,20.512821,11.594203
11,10.687,88.5496,0.763359,0.0,8.14815,91.1111,0.0,0.740741,98.4733,98.5185,1.406731,1.543853,1.950153,1.667852,1,0,132.0,136.1,2.222222,7.534247


### Get the risk status of the rats


In [8]:
df_sum, risky, optimal = rgt.get_risk_status(df_sum, startsess, endsess)

print(df_sum[['mean_risk','risk_status']]) 
print(risky, optimal) 

    mean_risk  risk_status
1   73.904211          1.0
2   56.148148          1.0
3  -77.502528          2.0
4   57.762280          1.0
5  -81.274038          2.0
6   38.676471          1.0
7  -73.426573          2.0
8   91.855505          1.0
9  -81.161921          2.0
11  98.495900          1.0
12  32.942097          1.0
13 -33.705357          2.0
14  43.214286          1.0
15 -41.344124          2.0
16 -30.296896          2.0
17   0.633446          1.0
18  41.629689          1.0
19   4.054054          1.0
20  91.116505          1.0
21  11.868687          1.0
22  62.978469          1.0
23  75.345912          1.0
24  71.178218          1.0
25 -98.437500          2.0
26  35.270270          1.0
27  44.732745          1.0
28  43.102453          1.0
29 -30.433996          2.0
30  94.573643          1.0
31  80.796731          1.0
32   1.587302          1.0
[3, 5, 7, 9, 13, 15, 16, 25, 29] [1, 2, 4, 6, 8, 11, 12, 14, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 30, 31, 32]


### Export your data to an Excel file 


In [9]:
# rgt.export_to_excel(df_sum, groups = group_list, column_name = '', new_file_name = '', asin = True)

## Calculate means and SEMs for your experimental groups



In [10]:
mean_scores, SEM = rgt.get_means_sem(df_sum, groups = group_list, group_names = group_names)
mean_scores
# SEM

Unnamed: 0,29P1,29P2,29P3,29P4,30P1,30P2,30P3,30P4,risk29,risk30,...,choice_lat29,choice_lat30,omit29,omit30,trial29,trial30,prem29,prem30,mean_risk,risk_status


### Calculate means and SEMS for your experimental groups, split by risk status and/or sex

In [11]:
# control_risky = [subject for subject in control_group if subject in risky]
# exp_risky = [subject for subject in exp_group if subject in risky]

# control_optimal = [subject for subject in control_group if subject in optimal]
# exp_optimal = [subject for subject in exp_group if subject in optimal]


# group_list_risk = [control_risky,exp_risky, control_optimal, exp_optimal]

# #make sure the group names are in the same order as the group list!
# group_names_risk = {0:'Control risky', 
#                     1: 'Experimental risky',
#                     2: 'Control optimal',
#                     3: 'Experimental optimal'}

# mean_scores_risk, SEM_risk = rgt.get_means_sem(df_sum, group_list_risk, group_names_risk)

# mean_scores_risk

# 2B) Baseline & Acquisition Analysis: Plotting


## Bar plot of P1-P4 % choice


In [12]:
# rgt.choice_bar_plot(startsess, endsess, mean_scores, SEM)

# #To save figure:
# plt.savefig('BH07 Choice S29-30',facecolor = 'white')

## Line plot of other variables


In [14]:
# rgt.rgt_plot('risk', startsess, endsess, title, mean_scores, SEM, group_names = group_names, y_label = 'Risk score') 

# rgt.rgt_plot('prem', startsess, endsess, title, mean_scores, SEM, group_names = group_names, y_label = 'Premature responding') 

## Bar plot of other variables



In [None]:
rgt.rgt_bar_plot('risk', startsess, endsess, title, mean_scores, SEM, group_names, y_label = 'Risk score')

## Plotting by risk status 

In [None]:
rgt.choice_bar_plot(startsess, endsess, mean_scores_risk, SEM_risk)

In [None]:
rgt.rgt_plot('risk', startsess, endsess, title, mean_scores_risk, SEM_risk, group_names = group_names_risk, y_label = 'Risk score') 

In [None]:
rgt.rgt_bar_plot('prem', startsess, endsess, title, mean_scores_risk, SEM_risk, group_names = group_names_risk,y_label = 'Premature responding')

## Test RM anova

In [85]:
import statsmodels.api as sm
import pandas as pd
from patsy import dmatrices

Method: 

Create a new dataframe, with the within-subject factor (session), dependent variable (mean_risk_score), and subject identifier ('Subject') 
Input those columns into the AnovaRM function 

In [19]:
df3 = rgt.get_choices(df2)
df3.head()
df4 = df3[['Subject', 'Session', 'Trial', 'option']]
df4
# df4 = df4[df4['option'] != 0]
# df4.to_excel('df_no_option0.xlsx')
#(option == 1/tr + option == 2/tr) - (option == 3/tr - option == 4/tr)

Unnamed: 0,Subject,Session,Trial,option
0,25,29,1.0,3
1,25,29,2.1,0
2,25,29,2.0,3
3,25,29,3.0,3
4,25,29,4.0,3
...,...,...,...,...
6201,32,30,77.0,0
6202,32,30,78.0,0
6203,32,30,79.0,2
6204,32,30,80.0,4


In [95]:
groupby_trials = df4.groupby(['Subject', 'Session'], as_index = False)['Trial'].max()
groupby_trials['sub_num'] = groupby_trials['Subject']
groupby_trials['sess_num'] = groupby_trials['Session']
df_P1 = df.loc[df['option'] == 1]
df_P2 = df.loc[df['option'] == 2]
df_P3 = df.loc[df['option'] == 3]
df_P4 = df.loc[df['option'] == 4]
count_P1 = df_P1.groupby(['Subject', 'Session'], as_index = False)['option'].count()
count_P2 = df_P2.groupby(['Subject', 'Session'], as_index = False)['option'].count()
count_P3 = df_P3.groupby(['Subject', 'Session'], as_index = False)['option'].count()
count_P4 = df_P4.groupby(['Subject', 'Session'], as_index = False)['option'].count()
groupby_trials = groupby_trials.set_index(['Subject','Session']).join(count_P1.set_index(['Subject','Session']), lsuffix='_P1', rsuffix='_P2')
# groupby_trials = groupby_trials.set_index(['Subject','Session']).join(count_P2.set_index(['Subject','Session']), lsuffix='_P1', rsuffix='_P2')
groupby_trials = groupby_trials.join(count_P2.set_index(['Subject','Session']), lsuffix='_P1', rsuffix='_P2')
groupby_trials = groupby_trials.join(count_P3.set_index(['Subject','Session']), lsuffix='_test', rsuffix='_P2')
groupby_trials = groupby_trials.join(count_P4.set_index(['Subject','Session']), lsuffix='_P3', rsuffix='_P4')
groupby_trials['Trial'] = groupby_trials['Trial'].astype(int)
groupby_trials = groupby_trials.fillna(0)
groupby_trials['risk_score'] = (groupby_trials['option_P1']/groupby_trials['Trial'] + groupby_trials['option_P2']/groupby_trials['Trial'] 
                        - groupby_trials['option_P3']/groupby_trials['Trial'] - groupby_trials['option_P4']/groupby_trials['Trial'])*100
groupby_trials
# groupby_trials.columns

Unnamed: 0_level_0,Unnamed: 1_level_0,Trial,sub_num,sess_num,option_P1,option_P2,option_P3,option_P4,risk_score
Subject,Session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,29,131,1,29,119.0,0.0,11.0,1.0,81.679389
1,30,124,1,30,103.0,0.0,21.0,0.0,66.129032
2,29,76,2,29,7.0,49.0,8.0,11.0,48.684211
2,30,81,2,30,4.0,62.0,0.0,15.0,62.962963
3,29,49,3,29,1.0,4.0,26.0,15.0,-73.469388
...,...,...,...,...,...,...,...,...,...
30,30,129,30,30,1.0,121.0,7.0,0.0,89.147287
31,29,55,31,29,4.0,46.0,2.0,3.0,81.818182
31,30,90,31,30,2.0,78.0,7.0,2.0,78.888889
32,29,75,32,29,0.0,30.0,5.0,28.0,-4.000000


In [None]:
def get_risk_score(df):
    trials_range = range(len(df)) #first index to last index of groupby object
    df_no_prem = df[df['option'] != 0]
    trials = len(df_no_prem) #must change to df where option != 0 
    gb_objects = list(df.groupby(['Subject','Session']))
    count_P1 = 0
    count_P2 = 0
    count_P3 = 0
    count_P4 = 0
    for gb in gb_objects:
        for tr in trials_range:
            if df.at[tr, 'option'] == 1: 
                count_P1 += 1 
            if df.at[tr, 'option'] == 2: 
                count_P2 += 1
            if df.at[tr, 'option'] == 3: 
                count_P3 += 1
            if df.at[tr, 'option'] == 4: 
                count_P4 += 1
    return (count_P1/trials + count_P2/trials - count_P3/trials - count_P4/trials)*100

get_risk_score(df4)
#for gb in groupby objects 
    #get the risk scor

In [49]:
# def get_mean_risk(df):
#     df_no_prem = df[df['option'] != 0]
#     trials = len(df_no_prem) #must change to df where option != 0 
#     df_P1 = df.loc[df['option'] == 1]
#     df_P2 = df.loc[df['option'] == 2]
#     df_P3 = df.loc[df['option'] == 3]    
#     df_P4 = df.loc[df['option'] == 4]   
#     count_P1 = df_P1.groupby(['Subject', 'Session'], as_index = False).count()
#     count_P2 = df_P2.groupby(['Subject', 'Session'], as_index = False).count()
#     count_P3 = df_P3.groupby(['Subject', 'Session'], as_index = False).count()
#     count_P4 = df_P4.groupby(['Subject', 'Session'], as_index = False).count()
#     #groupby subject and session --> get mean risk score 
#     #for df.unique(['Subject', 'Session'])
#     return sum([count_P1/trials, count_P2/trials]) - sum([count_P3/trials + count_P4/trials])

df = df4
df_P1 = df.loc[df['option'] == 1]
df_P2 = df.loc[df['option'] == 2]
df_P3 = df.loc[df['option'] == 3]    
df_P4 = df.loc[df['option'] == 4]   
count_P1 = df_P1.groupby(['Subject', 'Session'], as_index = False).count()
count_P2 = df_P2.groupby(['Subject', 'Session'], as_index = False).count()
count_P3 = df_P3.groupby(['Subject', 'Session'], as_index = False).count()
count_P4 = df_P4.groupby(['Subject', 'Session'], as_index = False).count()
trials = df.groupby(['Subject','Session'],as_index=False)['Trial'].max() #counts non-prem responses in df (doesn't need df option != 0) **max or count
counts = count_P1.join(count_P2['option'], lsuffix='_P1', rsuffix='_P2')
counts = counts.join(count_P3['option'], rsuffix='_P3')
counts = counts.join(count_P4['option'], rsuffix='_P4')
counts = counts.join(trials['Trial'], rsuffix = 's')
counts['risk_score'] = (counts['option_P1']/counts['Trials'] + counts['option_P2']/counts['Trials'] 
                        - counts['option']/counts['Trials'] - counts['option_P4']/counts['Trials'])*100
counts

Unnamed: 0,Subject,Session,Trial,option_P1,option_P2,option,option_P4,Trials,risk_score
0,1,29,119,119,49,11.0,1.0,131.1,118.993135
1,1,30,103,103,62,21.0,11.0,124.0,107.258065
2,2,29,7,7,4,8.0,15.0,76.1,-15.768725
3,2,30,4,4,5,26.0,15.0,81.0,-39.506173
4,3,29,1,1,75,29.0,9.0,49.0,77.551020
...,...,...,...,...,...,...,...,...,...
52,29,30,10,10,124,2.0,27.0,81.0,129.629630
53,30,30,1,1,121,7.0,,91.1,
54,31,29,4,4,46,5.0,,79.0,
55,31,30,2,2,78,2.0,,72.0,


In [None]:
# count_P1_test = df_P1.groupby(['Subject', 'Session'], as_index = True)['option'].count()
# count_P1_test = df_P1.groupby(['Subject', 'Session'], as_index = False)['option'].shape[0]
# count_P1_test
# count_P2 #has no P2 choices in subject 1 session 29 and 30
# df_sums = len(df_P1) + len(df_P2) + len(df_P3) + len(df_P4)
# df_sums #=4964
# df.set_index('key').join(other.set_index('key'))
# counts = df4.groupby(['Subject', 'Session'], as_index = False).agg((['mean', 'count'])) #successfully kept all rows! 
# counts
count_P1

In [None]:
# df6 = df4.groupby(['Subject','Session'])
# gb = df6.apply(lambda x: get_risk_score())
# gb
#keyerror = 0 because the next groupby object does not start from index = 0 

In [None]:
adder = lambda x, y: x + y
print (adder (1, 2))

In [87]:
from statsmodels.stats.anova import AnovaRM

In [98]:
aovrm = AnovaRM(groupby_trials, 'risk_score', 'sub_num', within=['sess_num'])
res = aovrm.fit()

print(res)

                Anova
         F Value Num DF  Den DF Pr > F
--------------------------------------
sess_num  1.5027 1.0000 30.0000 0.2298

