In [1]:
import os
import pandas as pd
import numpy as np
#PLOTTING
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
#STATS
import scipy.stats as stats

#the following line prevents pandas from giving unecessary errors 
pd.options.mode.chained_assignment = None

If you have saved the data file into a different folder than your current working directory (i.e., where you saved this python script), you'll need to change your working directory to where you saved the data 

Using the second line of code, change '..\\PhD_data\\BHO7' to wherever you saved the file (you'll have to change the slashes if you are not using windows)

Or if this python script and the data file are saved in the same location, comment out the second line

In [2]:
#check current working directory
os.getcwd()

#change working directory
# os.chdir('..\\PhD_data\\BH07') 

###we had a lot of trouble changing the working directory 
###skipped bc unnecessary to code
os.chdir('C:\\Users\\dexte\\hathaway_1\\data')

Let's load in our data! 

I've given you a datafile with two sessions (session 29 and 30), so that the code will run relatively quickly.

Normally, we would be loading in a lot more data.

For this experiment, there are two groups - transgene positive rats (experimental) and transgene negative rats (control)


In [3]:
#we need to set a few variables for loading in the data - these will change depending on the dataset

#we are just loading in data from one file, but we could load data in from multiple files and combine them
file_names = ['BH07_raw_free_S29-30.xlsx']
##file_names is a List[str], where the strings are file names
##free indicates free choice (they get to choose from 4 options)
##free is after the forced choice training which forces exploration

group_names = ['Tg negative','Tg positive'] #control and experimental group, respectively

title = 'Nigrostriatal activation during acquisition' #title for figures, describing the experiment
startsess = 29 #first session in this dataset
endsess = 30 #last session in this dataset


#the following two lines of code assign the rat subject numbers to the experimental and control group lists
#you may notice there is no subject 10 - she died earlier in the experiment :( 
exp_group = [1, 2, 7, 8, 11, 12, 16, 19, 20, 21, 22, 25, 26, 29, 32] #Tg positive

control_group = [3, 4, 5, 6, 9, 13, 14, 15, 17, 18, 23, 24, 27, 28, 30, 31] #Tg negative

In [4]:
def load_data(fnames): 
    for i, file in enumerate(fnames): 
        if i == 0:
            df = pd.read_excel(fnames[i])
        else:
            df2 = pd.read_excel(fnames[i])
            df = df.append(df2, ignore_index = True)
        return df

df = load_data(file_names)

In [5]:
#let's check the top few lines of the dataframe

df.head()

#it should look the same as the excel file 
##note, df contains the raw dataframe!
##note the ... (some columns aren't being shown, like "Chosen")

Unnamed: 0,MSN,StartDate,StartTime,Subject,Group,Box,Experiment,Comment,Session,Trial,...,Pun_Persev_H5,Pun_HeadEntry,Pun_Dur,Premature_Resp,Premature_Hole,Rew_Persev_H1,Rew_Persev_H2,Rew_Persev_H3,Rew_Persev_H4,Rew_Persev_H5
0,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,1.0,...,3,3,30,0,0,0,0,0,0,0
1,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,2.1,...,0,0,0,1,5,0,0,0,0,0
2,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,2.0,...,3,2,30,0,0,0,0,0,0,0
3,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,3.0,...,0,0,0,0,0,0,0,0,0,0
4,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,4.0,...,2,2,30,0,0,0,0,0,0,0


Let's examine our data!

The following function will give us a summary of the session numbers +  session dates for each rat (subject), plus the number of trials they did for each session (the rightmost column) 

This gives us a quick way to see if there are any missing or incorrect session numbers

Session numbers are typed in manually by the experimenter each day, so mistakes definitely happen :) 

We can also check to make sure that all the original data we want from MED-PC was exported into the excel file that we loaded in - sometimes one or two will get missed 


In [6]:
def check_sessions(df): 
    pd.set_option('display.max_rows', None)
    print(df.groupby(['Subject', 'StartDate', 'Session'])['Trial'].max())
    pd.set_option('display.max_rows', df.Subject.max())
    
check_sessions(df)

Subject  StartDate   Session
1        2020-10-09  29         131.1
         2020-10-10  30         124.0
2        2020-10-09  29          76.1
         2020-10-10  30          81.0
3        2020-10-09  29          49.0
         2020-10-10  30          45.0
4        2020-10-09  29         103.0
         2020-10-10  30          97.0
5        2020-10-09  29          68.1
         2020-10-10  30          69.0
6        2020-10-09  29          88.0
         2020-10-10  30          75.0
7        2020-10-09  28          53.0
         2020-10-10  29          65.0
         2020-10-13  30          56.1
8        2020-10-09  29         124.0
         2020-10-10  30         121.0
9        2020-10-09  29          62.0
         2020-10-10  30          61.0
11       2020-10-09  29         132.0
         2020-10-10  30         136.1
12       2020-10-09  29          54.0
         2020-10-10  30          72.0
13       2020-10-09  29          67.0
         2020-10-10  30          60.0
14       2020-10-09  

In [7]:
# def drop_sessions(df, session_num):
#     drop_sess = list(df.loc[df['Session'] == session_num].index)
#     df.drop(drop_sess, inplace = True)
#     df.reset_index(inplace = True)

# drop_sessions(df, 28)

def drop_sessions(df, session_nums):
    'Takes in a list of session numbers, and removes the data from specified session numbers'
    for s in session_nums:
        drop_sess = list(df.loc[df['Session'] == s].index)
        df.drop(drop_sess, inplace = True)
        df.reset_index(inplace = True)
    return None ##could replace with check_sessions(df)

drop_sessions(df, [28])

In [8]:
#let's recheck the dataframe session numbers - you will see that each rat now only has two sessions

check_sessions(df)
##check_sessions body indicates it will only print "Subject", "StartDate" and "Session"

Subject  StartDate   Session
1        2020-10-09  29         131.1
         2020-10-10  30         124.0
2        2020-10-09  29          76.1
         2020-10-10  30          81.0
3        2020-10-09  29          49.0
         2020-10-10  30          45.0
4        2020-10-09  29         103.0
         2020-10-10  30          97.0
5        2020-10-09  29          68.1
         2020-10-10  30          69.0
6        2020-10-09  29          88.0
         2020-10-10  30          75.0
7        2020-10-10  29          65.0
         2020-10-13  30          56.1
8        2020-10-09  29         124.0
         2020-10-10  30         121.0
9        2020-10-09  29          62.0
         2020-10-10  30          61.0
11       2020-10-09  29         132.0
         2020-10-10  30         136.1
12       2020-10-09  29          54.0
         2020-10-10  30          72.0
13       2020-10-09  29          67.0
         2020-10-10  30          60.0
14       2020-10-09  29          80.1
         2020-10-10  

Now that we've loaded in our data, the first thing we need to do is create an options column that specifies whether the rats chose P1, P2, P3 or P4, instead of the hole number (which is stored in the 'Chosen' column)

We need to make sure that we account for version A and version B

We can do this by referencing the MSN column in the dataframe. 
MSN stands for 'MED-STATE NOTATION' which is the language that MEDPC programs are written in. 
You can think of MSN as the task name

In [9]:
df['MSN'].unique()

array(['rGT_A-cue', 'rGT_B-cue'], dtype=object)

In [10]:
#this function sets up an option column with correct P1 to P4 configuration for version A and B
def get_choices(df):
    configA = np.array([1, 4, 0, 2, 3]) #this is the order for version A - i.e., hole 1 corresponds to P1
    configB = np.array([4, 1, 0, 3, 2]) #this is the order for version B - i.e., hole 1 corresponds to P4

    df['option'] = df['MSN'].str.contains("B").values*configB[df['Chosen'].astype('int').ravel()-1].astype('int') + \
        df['MSN'].str.contains("A").values*configA[df['Chosen'].astype('int').ravel()-1].astype('int')
    
    for i in range(len(df)):
        if df['Chosen'][i] == 0:
            df['option'][i] = 0
    return df

df = get_choices(df)
df

Unnamed: 0,index,MSN,StartDate,StartTime,Subject,Group,Box,Experiment,Comment,Session,...,Pun_HeadEntry,Pun_Dur,Premature_Resp,Premature_Hole,Rew_Persev_H1,Rew_Persev_H2,Rew_Persev_H3,Rew_Persev_H4,Rew_Persev_H5,option
0,0,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,...,3,30,0,0,0,0,0,0,0,3
1,1,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,...,0,0,1,5,0,0,0,0,0,0
2,2,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,...,2,30,0,0,0,0,0,0,0,3
3,3,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,...,0,0,0,0,0,0,0,0,0,3
4,4,rGT_A-cue,2020-10-09,11:01:00,25,0.0,1,0.0,,29,...,2,30,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6201,6515,rGT_B-cue,2020-10-10,11:29:57,32,,5,,,30,...,0,0,0,0,0,0,0,0,0,0
6202,6516,rGT_B-cue,2020-10-10,11:29:57,32,,5,,,30,...,0,0,0,0,0,0,0,0,0,0
6203,6517,rGT_B-cue,2020-10-10,11:29:57,32,,5,,,30,...,0,10,0,0,0,0,0,0,0,2
6204,6518,rGT_B-cue,2020-10-10,11:29:57,32,,5,,,30,...,0,40,0,0,0,0,0,0,0,4


In [11]:
df['option']

0       3
1       0
2       3
3       3
4       3
       ..
6201    0
6202    0
6203    2
6204    4
6205    4
Name: option, Length: 6206, dtype: int32

Now that we know the P1 to P4 choice of each rat, we need to summarize this choice by converting it to a percentage

I.e., what is the percentage choice of each option for each rat, for each session?

The code for this is two different functions; the first can calculate the percentage choice for P1-P4 for a single session.
The second function uses the first one to calculate the %choice for all sessions

Note: this is by far the slowest function in this script, so I'm hoping to optimize in the future

In [31]:
def get_sum_choice(num, df): 
    df1 = df.loc[df['Session'] == num]
    
    subs = df1.Subject.unique()
    subs.sort()
    percentage = pd.DataFrame(columns = [str(num) + 'P1', str(num) + 'P2', str(num) + 'P3', str(num) + 'P4'])
    
    for sub in subs: 
        for i, column in enumerate(percentage.columns): 
            percentage.at[sub, column] = (len(df1.loc[(df1.option == i + 1) & ##P1 is 1 in the option column
                                            (df1.Subject == sub)]))/(len(df1.loc[(df1['option'] != 0) & 
                                                                                (df.Subject == sub)])) *100
    return percentage
    
    
num = 29
# df1 = df.loc[df['Session'] == 29]
# subs = df1.Subject.unique()
# subs.sort()
# subs
percentage = pd.DataFrame(columns = [str(num) + 'P1', str(num) + 'P2', str(num) + 'P3', str(num) + 'P4'])
percentage

df_summary = get_sum_choice(29, df)
df_summary

Unnamed: 0,29P1,29P2,29P3,29P4
1,90.8397,0.0,8.39695,0.763359
2,9.33333,65.3333,10.6667,14.6667
3,2.17391,8.69565,56.5217,32.6087
4,5.88235,73.5294,4.90196,15.6863
5,0.0,1.53846,98.4615,0.0
6,2.5,70.0,0.0,27.5
7,3.07692,10.7692,64.6154,21.5385
8,2.41935,92.7419,0.806452,4.03226
9,3.27869,3.27869,90.1639,3.27869
11,10.687,88.5496,0.763359,0.0


In [13]:
#now let's do it for all sessions - in this case 29 and 30
def get_sum_choice_all(df):
    #create an empty list to store the sessions individually
    df_sess = []
    for num in np.sort(df['Session'].unique()): ##for each session number in list of session numbers 
        df_sess.append(get_sum_choice(num,df)) ##append the summary info from get_sum_choice to the above list ###appending side-to-side? ####just makes a list
    #then turn that list into a df
    ##recall, get_sum_choice outputs a dataframe

    df1 = pd.concat(df_sess, axis=1) ###appends the list of dataframes side by side
    #let's also calculate the risk score for each session - (P1 + P2) - (P3 + P4)
    for num in np.sort(df['Session'].unique()):
        df1['risk'+ str(num)] = df1[str(num)+'P1'] + df1[str(num)+'P2']- df1[str(num)+'P3'] - df1[str(num)+'P4']
        ##these are all names of columns 
        ##df1[str(num)+'P2'] is 29P2
    return df1

##feedback: make an object for np.sort(df['Session'].unique()) --> nit-picky!
##note: df1["column_name"] gets that column

#let's try it out! 
df_summary = get_sum_choice_all(df)
df_summary

##feedback: strange that the variable df, and the object df are used interchangeably 

ValueError: All objects passed were None

OK! Now that we have our choice data summarized, we can move on to the other variables - thankfully much more straightforward haha

First let's do percentage of premature responses. To calculate this, we need to sum up the total number of premature responses for each rat, and divide by the number of trials initiated (so premature trials and completed trials are both in the denominator).
- ##both in the denominator, therefore, we must count the .1's as well for trials initiated

From this point forward, we will be appending all subsequent variables to the df_summary we created above

So we need to pass both the raw data (df) and the summary data (df_summary) to each function

In [None]:
#extract premature response percentages from raw dataframe, and append it to the summary df
def get_premature(df_raw,df_sum):
    #add up the number of premature responses made by (each subject for each session)--> the group
    #save this information to a dataframe called prem_resp
    prem_resp = df_raw.groupby(['Subject', 'Session'],as_index=False)['Premature_Resp'].sum()

##prem_resp took the raw df, grouped by subject and session, and summed the premature responses --> made a df
    
    #calculate the number of initiated trials for each subject for each session 
    prem_resp['Trials'] = df_raw.groupby(['Subject','Session'],as_index=False)['Trial'].count()['Trial']
    ##makes a new column called 'Trials' which counts the # of trials initiated
    ##'Trial' is an existing column in the raw df (in the .xlsx file)
    ###why is there 2 'Trial'? #### just takes the trials - try a new cell 

    #calculate the premature percent by dividing # of premature responses by # of trials initiated, times 100    
    prem_resp['prem_percent'] = prem_resp['Premature_Resp']/prem_resp['Trials'] * 100
    ###can we say this in English? easy

    #add this information to the summary dataframe
    #the column name will be 'prem' + session number - i.e., prem29 for session 29
    for num in np.sort(df_raw['Session'].unique()): #for each session in the raw dataframe
        #for that session, extract the prem_percent column from prem_resp and add it to the summary dataframe
        #set the index as the subject number, so it matches the summary dataframe
        df_sum['prem' + str(num)] = prem_resp.loc[prem_resp['Session']==num].set_index('Subject')['prem_percent']
        ####locate session == num, set index to Subject (1-32), and call prem_percent column of prem_resp --> assign it to df_sum(yadayada)
    return df_sum

#let's try it out!
df_summary = get_premature(df, df_summary)
df_summary

Next, let's calculate our latencies! This is even simpler than premature responding because we don't need to calculate any additional information - we only need to find the mean value for each rat and session

The following function calculates the mean choice latency and mean collect latency for each rat and each session

In [None]:
def get_latencies(df_raw,df_sum):
    #extract only completed trials (including non-completed trials will skew the mean, as the latency is zero for those trials)
    df_raw = df_raw.loc[df_raw['Chosen'] != 0] ##'Chosen' = 0 indicates a prem_response or omission
    #group by subject and session, then calculate the mean collect latency
    collect_lat = df_raw.groupby(['Subject','Session'],as_index=False)['Collect_Lat'].mean()
    #group by subject and session, then calculate the mean choice latency
    choice_lat = df_raw.groupby(['Subject','Session'],as_index=False)['Choice_Lat'].mean()
    
    #add this information to the summary dataframe - same method as used above for premature responding
    for num in np.sort(df_raw['Session'].unique()):
        df_sum['collect_lat' + str(num)] = collect_lat.loc[collect_lat['Session']==num].set_index('Subject')['Collect_Lat']
    for num in np.sort(df_raw['Session'].unique()):
        df_sum['choice_lat' + str(num)] = choice_lat.loc[choice_lat['Session']==num].set_index('Subject')['Choice_Lat']
        ###we're setting the index to subject, so why is ['Choice_Lat'] here?
    return df_sum

#let's run the function
df_summary = get_latencies(df, df_summary)
df_summary

In [None]:
choice_lat = df.groupby(['Subject','Session'],as_index=False)['Choice_Lat'].mean()
print(choice_lat)

Ok, we're almost done creating our summary dataframe. We only have two variables to go - omissions and trials completed

These ones are straightforward - for omissions, we will simply count them (they're coded as a 1 in the raw dataframe, so we can just sum the column for each rat/session)

For trials, we will take the max number in the trials column of the raw dataframe


I'm going to put these two functions together, since they have the same structure as the functions above

In [None]:
def get_omit(df_raw,df_sum):
    #group by subject and session and sum the 'omit' column
    omit = df_raw.groupby(['Subject','Session'],as_index=False)['Omit'].sum() 
##takes the raw dataframe, groups by subject and session, and takes the sum of the "Omit" column 
    #append this information to the summary dataframe
    for num in np.sort(df_raw['Session'].unique()): #gets all unique numbers in the session column
        df_sum['omit' + str(num)] = omit.loc[omit['Session']==num].set_index('Subject')['Omit']
    return df_sum

def get_trials(df_raw,df_sum):
    #group by subject and session and get the max number in the trial column
    trials = df_raw.groupby(['Subject','Session'],as_index=False)['Trial'].max()
    #append this information to the summary dataframe
    for num in np.sort(df_raw['Session'].unique()):
        df_sum['trial' + str(num)] = trials.loc[trials['Session']==num].set_index('Subject')['Trial']
    return df_sum

df_summary = get_omit(df, df_summary)
df_summary = get_trials(df, df_summary)

In [None]:
#let's look at our completed summary dataframe - yay!

df_summary

In [None]:
#we can also write a function that runs all the above functions to create the summary dataframe
#I don't recommend running this because it will do what we already did, and is kind of slow haha
# so it's just an FYI :)

def get_summary_data(df_raw):
    df_raw = get_choices(df_raw)
    df_sum = get_sum_choice_all(df_raw)
    df_sum = get_latencies(df_raw,df_sum)
    df_sum = get_omit(df_raw,df_sum)
    df_sum = get_trials(df_raw,df_sum)
    df_sum = get_premature(df_raw,df_sum)
    return df_sum

df_summary = get_summary_data(df)
# df_summary
# get_summary_data(df)

##this just does what we did earlier again, in one cell. No real point. 

In [None]:
#one more thing we can do - assign rats as risky or optimal, depending on their risk score

def get_risk_status(df_sum, startsess, endsess):
    #get risk status from specified sessions
    #create lists for indexing based on risk status
    risky = []
    optimal = []
    startsess = 'risk' + str(startsess)
    endsess = 'risk' + str(endsess)
    #calculate the mean risk score from the specified sessions
    df_sum['mean_risk'] = df_sum.loc[:,startsess:endsess].mean(axis=1) ###did this create a 'mean_risk' column?
    for sub in df_sum.index: #for each subject
        if df_sum.at[sub,'mean_risk'] > 0: #if the mean risk for that subject is above zero
            df_sum.at[sub,'risk_status'] = 1 #assign them a risk status of 1
            optimal.append(sub) #and add them to the 'optimal' list
        elif df_sum.at[sub,'mean_risk'] < 0: #if the mean risk for that subject is below zero
            df_sum.at[sub,'risk_status'] = 2 #assign them a risk status of 2
            risky.append(sub) #and append them to the 'risky' list
    return df_sum, risky, optimal

df_summary, risky, optimal = get_risk_status(df_summary, startsess, endsess) 
#remember, startsess and endsess were defined at the beginning of this script
###what does it mean to assign comma separated objects?
#### get_risk_status outputs 2 lists, and a dataframe *has to be same order as the return statement

print(df_summary[['mean_risk','risk_status']]) ##printed 2 of many columns in df_summary
print(risky, optimal)
# print(df_summary) #--> huge table with mean_risk and risk_status at the rightmost edge

In [None]:
one, two, three = 1,2,3
one

Ok so now we have our beautiful summary dataframe! There are two things that we can do with this:

1.) export the summary df to an excel file, which we can import into SPSS to run stats on. For this to be the most useful, we will also create a column that specifies whether the rat is in the control group or experimental group

2.) calculate the means for the experimental group and the control group, to create figures

In [None]:
'''this function will save the dataframe as an excel file
groups is a list of lists - i.e., a list of the control group list, and the experimental group list which we defined
at the start of this script.
groupname is the name of the column that will specify control vs experimental - in this case we want it to be
transgene status (or tg_status for short)
filename is the name of the exported excel sheet.
'''
def export_to_excel(df,groups,groupname,filename):
    dfs = []
    for group in groups: #this splits the dataframe by group
        dfs.append(df.loc[group])
    for i,df in enumerate(dfs): #this assigns a number to the tg_status column - in this case, 0 for control, 1 for experimental
        df[groupname] = i ##i should be 0 and 1
    df_export = pd.concat(dfs) #this recombines the dataframes
    df_export.sort_index(inplace = True) #this sorts the subjects so they're in the right order after combining
    df_export.to_excel(filename, index_label = 'Subject')

#the filename I've chosen is BH07_free_S29-30 - same as the original filename except without 'raw', since it's summary data
export_to_excel(df_summary, [control_group, exp_group], 'tg_status', 'BH07_free_S29-30.xlsx')

#it should now be in your current working directory!
##yes it is! Must view it in Excel
###may want to change argument names 

In [None]:
#ok, let's create a dataframe that has the means for each variable for experimental vs control group
#we'll also create a second dataframe that has the standard error of the means

def get_group_means_sem(df_sum,groups, group_names):
    dfs = []
    #first split the dataframe based on experimental vs control
    for group in groups:
        dfs.append(df_sum.loc[group])
    #create two dataframes - one for the means, one for the SEM
    mean_scores = pd.DataFrame(columns=list(df_sum.columns))
    stderror = pd.DataFrame(columns=mean_scores.columns)
    #calculate the mean and standard errors, and store them in the above dataframes
    for column in mean_scores.columns:
        for i in range(len(groups)):
            mean_scores.at[i,column] = dfs[i][column].mean()
            stderror.at[i,column] = stats.sem(dfs[i][column])
    #rename the rows to be the group_names (i.e., transgene positive and transgene negative)   
    mean_scores.rename(index=group_names,inplace = True)
    stderror.rename(index=group_names, inplace = True)
    return mean_scores, stderror

#for the renaming to work, group_names needs to be a dictionary
group_names = {0: 'tg negative',
              1: 'tg positive'}

mean_scores, stderror = get_group_means_sem(df_summary, [control_group, exp_group], group_names)

##control (tg negative) and experimental group should reflect dict 

mean_scores

In [None]:
#now we can create a figure! 

def rgt_plot(variable,startsess,endsess,group_names,title,scores,sem, highlight = None, var_title = None):
    ##startsess and endsess allow us to clip the session data 
    if var_title == None:
        var_title = variable
    plt.rcParams.update({'font.size': 22})
    fig,ax = plt.subplots(figsize = (20,10))
    ax.set_ylabel(var_title)
    ax.set_xlabel('Session')
    ax.set_xlim(startsess,endsess)
    ax.set_title(title + ': ' + var_title + '\n' + 'Session ' + str(startsess) + '-' + str(endsess))
    ax.spines['right'].set_linewidth(0)
    ax.spines['top'].set_linewidth(0)
    ax.spines['left'].set_linewidth(2)
    ax.spines['bottom'].set_linewidth(2)
    ax.set_xlim(startsess-.1,endsess+.1)
    x=np.arange(startsess,endsess+1)
    ax.xaxis.set_major_locator(MaxNLocator(integer=True))
   
    for i,group in enumerate(group_names):
        y = scores.loc[group,variable+str(startsess):variable+str(endsess)]
        plt.errorbar(x, y,
                     yerr = sem.loc[group,variable+str(startsess):variable+str(endsess)], 
                     label=group,linewidth=4, capsize = 8)
    if highlight != None:
        plt.axvline(highlight, 0, 1, color = 'gray', lw = 1)
        ax.fill_between([highlight,endsess], ax.get_ylim()[0], ax.get_ylim()[1], facecolor='gray', alpha=0.2)
    ax.legend()
    
rgt_plot('risk',startsess,endsess,['tg negative','tg positive'],title,mean_scores,stderror, var_title='risk score')

In [None]:
def choice_bar_plot(startsess, endsess, scores, sem,cmap = 'default'):
    sess = list(range(startsess,endsess + 1))
    labels = ['P1','P2','P3','P4']
    df = pd.DataFrame()
    df1 = pd.DataFrame()
    if cmap == 'Paired':
        colors = [plt.cm.Paired(5),plt.cm.Paired(1),plt.cm.Paired(4),plt.cm.Paired(0)]
    if cmap == 'default':
        colors = [plt.cm.Set1(1),plt.cm.Set1(0)]
    for choice in labels:
        df[choice] = scores.loc[:, [col for col in scores.columns if choice in col 
                                    and int(col[:col.index('P')]) in sess]].mean(axis = 1)
        df1[choice] = sem.loc[:, [col for col in scores.columns if choice in col 
                                    and int(col[:col.index('P')]) in sess]].mean(axis = 1)
    ax = df.transpose().plot.bar(rot = 0, yerr = df1.transpose(), capsize = 8, figsize = (20,8))
    
    # Add some text for labels, title and custom x-axis tick labels, etc.
    plt.rcParams.update({'font.size': 18})
    ax.set_ylabel('% Choice', fontweight = 'bold', fontsize = 18)
    ax.set_title('P1-P4 Choice', fontweight = 'bold', fontsize = 22, pad = 20)
    ax.set_ylim(bottom = 0)
    ax.spines['right'].set_linewidth(0)
    ax.spines['top'].set_linewidth(0)
    ax.spines['left'].set_linewidth(2)
    ax.spines['bottom'].set_linewidth(2)
    ax.legend()
    
choice_bar_plot(startsess,endsess,mean_scores,stderror)