In [1]:
# packages
import os
import pandas as pd
import math
from scipy import io
import numpy as np
from numpy import squeeze
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import zero_one_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot') 

In [2]:
# Load the session/trial table from disk, then show which columns it provides
allData = pd.read_csv('sessionTrialTable.csv')
allData.columns

Index(['animalName', 'sessionDate', 'trialNum', 'totalCellNum', 'gender',
       'genotype', 'weight_g', 'ketamine_day', 'correlationScore',
       'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance',
       'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth',
       'timeSinceKetamine', 'ketamineAdministered'],
      dtype='object')

In [3]:
# Report the table dimensions (rows x columns)
n_rows, n_cols = allData.shape
print("num_cols =", n_cols)
print("num_rows =", n_rows)

# Count rows that are exact duplicates of an earlier row
print("num_dup =", allData.duplicated().sum())

num_cols = 19
num_rows = 5000
num_dup = 0


In [4]:
# Check for NaNs and see where they are coming from.
# Use the DataFrame's own per-column reduction: np.sum(DataFrame) ends up
# calling DataFrame.sum(axis=None), which recent pandas versions deprecate and
# plan to turn into a single scalar total instead of one count per column.
allData.isna().sum()

animalName               0
sessionDate              0
trialNum                 0
totalCellNum             0
gender                   0
genotype                 0
weight_g                 0
ketamine_day             0
correlationScore         0
lickAccuracy             0
lickNumber               0
avgFR                    1
avgSingleCellVariance    1
varianceFR               3
avgTrialSpeed            0
varianceSpeed            0
medianCellDepth          0
timeSinceKetamine        0
ketamineAdministered     0
dtype: int64

In [5]:
# Remove any rows with NaNs.
# `axis` must be passed by keyword: DataFrame.dropna accepts keyword-only
# arguments in pandas >= 2.0, so the positional form raises a TypeError there.
# dropna returns a new frame, so allData itself is left unchanged.
allDataNN = allData.dropna(axis='index')
print("After Drop NaN")
print("num_rows =",len(allDataNN))

After Drop NaN
num_rows = 4997


In [6]:
# Label column; it is passed as the target to train_test_split later on
ketBool = allDataNN['ketamineAdministered']

# Every remaining column is kept as a covariate for now.
# timeSinceKetamine, sessionDate and trialNum are still carried along here and
# should be dropped as covariates when it is time.
covariate_cols = [
    'animalName', 'totalCellNum',
    'gender', 'genotype', 'weight_g',
    'ketamine_day', 'correlationScore', 'lickAccuracy',
    'lickNumber', 'avgFR', 'avgSingleCellVariance',
    'varianceFR', 'avgTrialSpeed', 'varianceSpeed',
    'medianCellDepth', 'timeSinceKetamine', 'sessionDate', 'trialNum',
]
neuralData = allDataNN[covariate_cols]

# Convert the categorical columns to integer codes on a copy.
# One encoder is refit for each column in turn, so after the loop `le` only
# holds the classes of the last column encoded ('genotype').
le = LabelEncoder()
neuralData_LE = neuralData.copy()
for cat_col in ('gender', 'genotype'):
    neuralData_LE[cat_col] = le.fit_transform(neuralData_LE[cat_col])

In [7]:
# Augment the encoded covariates with pairwise interaction (product) columns.
AugData = neuralData_LE.copy()

# Left-hand factors of each product; the outer loop order fixes the column order.
interaction_factors = [
    'totalCellNum', 'gender', 'genotype',
    'weight_g', 'ketamine_day', 'medianCellDepth',
]
# Right-hand measures each factor is multiplied with.
interaction_measures = [
    'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR',
    'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed',
]

# One new column per (factor, measure) pair, named '<factor>x<Measure>' with
# the measure's first letter upper-cased, e.g. 'genderxAvgFR'.
for factor in interaction_factors:
    for measure in interaction_measures:
        interaction_name = factor + 'x' + measure[0].upper() + measure[1:]
        AugData[interaction_name] = AugData[factor] * AugData[measure]

## Split into train and test before standardizing
### Note that we need to save training mean and std dev for each covariate being standardized so that we can standardize the test set properly! 

In [8]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
labels = ketBool.values.ravel()
X_train, X_test, y_train, y_test = train_test_split(
    AugData,
    labels,
    test_size=0.2,
    random_state=2019,
)

In [9]:
# List every column of the augmented table (original covariates + interactions)
AugData.columns

Index(['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g',
       'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber',
       'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed',
       'varianceSpeed', 'medianCellDepth', 'timeSinceKetamine', 'sessionDate',
       'trialNum', 'totalCellNumxCorrelationScore',
       'totalCellNumxLickAccuracy', 'totalCellNumxLickNumber',
       'totalCellNumxAvgFR', 'totalCellNumxAvgSingleCellVariance',
       'totalCellNumxVarianceFR', 'totalCellNumxAvgTrialSpeed',
       'totalCellNumxVarianceSpeed', 'genderxCorrelationScore',
       'genderxLickAccuracy', 'genderxLickNumber', 'genderxAvgFR',
       'genderxAvgSingleCellVariance', 'genderxVarianceFR',
       'genderxAvgTrialSpeed', 'genderxVarianceSpeed',
       'genotypexCorrelationScore', 'genotypexLickAccuracy',
       'genotypexLickNumber', 'genotypexAvgFR',
       'genotypexAvgSingleCellVariance', 'genotypexVarianceFR',
       'genotypexAvgTrialSpeed', 'genoty

In [10]:
# columns to standardize
# Covariates that are standardized as-is.
_plain_cols = [
    'totalCellNum', 'weight_g',
    'correlationScore', 'lickAccuracy', 'lickNumber',
    'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed',
    'varianceSpeed', 'medianCellDepth',
]

# Interaction columns follow the '<factor>x<Measure>' naming scheme, where the
# measure's first letter is upper-cased; they are listed factor by factor.
_factors = [
    'totalCellNum', 'gender', 'genotype',
    'weight_g', 'ketamine_day', 'medianCellDepth',
]
_measures = [
    'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR',
    'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed',
]

cols_for_std = _plain_cols + [
    f + 'x' + m[0].upper() + m[1:]
    for f in _factors
    for m in _measures
]

In [11]:
# Compute mean and stdev of selected columns and save to dicts for easy access
# Standardize while going
# Compute mean and stdev ONLY over training set, then use these vals on both train and test
mean = {}
stdev = {}
X_train_std = X_train.copy()
X_test_std = X_test.copy()
for c in cols_for_std:
    mean[c] = X_train[c].mean()
    # Population standard deviation (ddof=0), the same quantity np.std reports.
    train_sd = X_train[c].std(ddof=0)
    # A column that is constant over the training set has zero spread; dividing
    # by it would fill the column with NaN/inf in both splits. Fall back to a
    # scale of 1.0 so such a column is only centered. stdev[c] stores the
    # divisor that is actually applied.
    stdev[c] = train_sd if train_sd > 0 else 1.0
    X_train_std[c] = (X_train[c]-mean[c])/stdev[c]
    X_test_std[c] = (X_test[c]-mean[c])/stdev[c]

In [12]:
# Attach the labels to the standardized features.
# .assign returns a new frame, so X_train_std / X_test_std are left untouched.
# (pd.DataFrame(frame) does not copy its input, and depending on the pandas
# version the added column could show up on the source frame as well.)
trainC = X_train_std.assign(ketBool=y_train)
testC = X_test_std.assign(ketBool=y_test)

In [18]:
# Write the standardized training table to disk, leaving out the row index
trainC.to_csv('trainC.csv', index=False)

In [17]:
# Write the standardized test table to disk, leaving out the row index
testC.to_csv('testC.csv', index=False)

In [23]:
# Per-column NaN counts for the training table - every entry should be 0
trainC.isna().to_numpy().sum(axis=0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0])

### The rest of this notebook is old code, kept just in case

In [None]:
# Seeded 80/20 split of the standardized, augmented features.
# NOTE(review): `stdNeuralDataAug` is not defined in any cell visible in this
# notebook - confirm where it comes from before running this cell.
# This also rebinds X_test and y_test.
X, X_test, y, y_test = train_test_split(
    stdNeuralDataAug,
    ketBool.values.ravel(),
    test_size=0.2,
    random_state=2019,
)

In [None]:
# Build 10 stratified cross-validation folds from the training portion
num_folds = 10
XA = np.array(X)
yA = np.array(y)
skf = StratifiedKFold(n_splits=num_folds)

# Materialize the (train, test) index pairs once, then slice features and
# labels per fold. X_train / X_test / y_train / y_test become lists with one
# array per fold.
fold_indices = list(skf.split(XA, yA))
X_train = [XA[fit_idx] for fit_idx, _ in fold_indices]
X_test = [XA[eval_idx] for _, eval_idx in fold_indices]
y_train = [yA[fit_idx] for fit_idx, _ in fold_indices]
y_test = [yA[eval_idx] for _, eval_idx in fold_indices]

In [None]:
# Fit a logistic regression (l2 penalty by default) on the training portion for
# a range of C values and print the zero-one loss measured on that same
# training data.
C_grid = [0.01, 0.1, 1, 10, 100, 1000, 5000, 10000, 50000]
for C_param in C_grid:
    model = linear_model.LogisticRegression(
        solver='lbfgs',
        max_iter=10000,
        C=C_param,
    )
    model.fit(X, y)
    y_pred = model.predict(X)
    print(zero_one_loss(y, y_pred))

In [None]:
# Cross-validated evaluation of logistic regression at a fixed C.
# For each fold: fit on that fold's training split, predict its held-out
# split, and record the zero-one loss and accuracy.
C_param = 1000
zo_loss = []
accuracy = []
num_folds=10
for i in range(0,num_folds):
    LRmodel = linear_model.LogisticRegression(solver='lbfgs',max_iter=10000, C = C_param).fit(X_train[i], y_train[i]) 
    # Predict with the model fit on THIS fold. Using the leftover `model` from
    # the C sweep (fit on all of X) would score a model that has already seen
    # the held-out rows.
    y_pred = LRmodel.predict(X_test[i])
    zo_loss.append(zero_one_loss(y_test[i],y_pred))
    accuracy.append(accuracy_score(y_test[i],y_pred))

# Average the per-fold metrics
avg_zo_loss = np.mean(zo_loss)
avg_acc = np.mean(accuracy)
print("Average zero-one loss across folds:",avg_zo_loss)
print("Average accuracy across folds:",avg_acc)

In [None]:
# Display the augmented table (encoded covariates plus the interaction columns)
AugData