## Linear Regression

### Predicting time since ketamine administration

In [1]:
# packages
import os
import pandas as pd
import math
from scipy import io
import numpy as np
from numpy import squeeze
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import zero_one_loss
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot') 

#### Load in data and perform checks

In [2]:
allData = pd.read_csv('postKetamineTable.csv')

In [3]:
allData.keys()

Index(['animalName', 'sessionDate', 'trialNum', 'totalCellNum', 'gender',
       'genotype', 'weight_g', 'ketamine_day', 'correlationScore',
       'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance',
       'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth',
       'timeSinceKetamine', 'ketamineAdministered'],
      dtype='object')

In [4]:
# Check size information
print("num_cols =",len(allData.keys()))
print("num_rows =",len(allData))

# Check for duplicate rows
print("num_dup =",np.sum(pd.DataFrame.duplicated(allData)))

num_cols = 19
num_rows = 5000
num_dup = 0


In [5]:
# Check for NaNs and see where they are coming from
np.sum(pd.isna(allData))

animalName               0
sessionDate              0
trialNum                 0
totalCellNum             0
gender                   0
genotype                 0
weight_g                 0
ketamine_day             0
correlationScore         0
lickAccuracy             0
lickNumber               0
avgFR                    0
avgSingleCellVariance    0
varianceFR               5
avgTrialSpeed            0
varianceSpeed            0
medianCellDepth          0
timeSinceKetamine        0
ketamineAdministered     0
dtype: int64

In [6]:
# Remove any rows with nans (since we have so few)
allDataNN = pd.DataFrame.dropna(allData,'index')
print("After Drop NaN")
print("num_rows =",len(allDataNN))

After Drop NaN
num_rows = 4995


In [7]:
ketBool = allDataNN['ketamineAdministered']
timeSinceKetamine = allDataNN['timeSinceKetamine']
sessionDate = allDataNN['sessionDate']
trialNum = allDataNN['trialNum']
neuralData = allDataNN[['animalName', 'totalCellNum',
       'gender', 'genotype', 'weight_g',
       'ketamine_day', 'correlationScore', 'lickAccuracy',
       'lickNumber', 'avgFR', 'avgSingleCellVariance',
       'varianceFR', 'avgTrialSpeed', 'varianceSpeed',
       'medianCellDepth']]

In [8]:
# Convert categorical columns
le = LabelEncoder()
neuralData_LE = neuralData.copy()
neuralData_LE['animalName'] = le.fit_transform(neuralData_LE['animalName'])
neuralData_LE['gender'] = le.fit_transform(neuralData_LE['gender'])
neuralData_LE['genotype'] = le.fit_transform(neuralData_LE['genotype'])
features = list(neuralData_LE.keys())

In [9]:
# Standardize data (better to do this after train/test split, but leave here for now)
stdNeuralData = StandardScaler().fit_transform(neuralData_LE) 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [10]:
# Split off test set for later
X, X_test, y, y_test = train_test_split(stdNeuralData,timeSinceKetamine.values.ravel(), test_size=0.2)

In [11]:
# Split for cross validation, use 10 folds
num_folds = 10
XA = np.array(X)
yA = np.array(y)
X_train = []
X_test = []
y_train = []
y_test = []
kf = KFold(n_splits=num_folds)
for train_index, test_index in kf.split(XA, yA):
    X_train.append(XA[train_index])
    X_test.append(XA[test_index])
    y_train.append(yA[train_index])
    y_test.append(yA[test_index])

In [12]:
# Run basic linreg model on full train set, check performance against train
model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)

In [13]:
print("Intercept: ",model.intercept_)
print(features,model.coef_)
#print(model.coef_)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
r2 = r2_score(y,y_pred)
print("RMSE: ",rmse)
print("R2:",r2)

Intercept:  1875.5526469826927
['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g', 'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth'] [ 116.39000078 -107.51141321 -385.59068284  255.6088727   593.27675056
 -334.99744817  273.202288    297.93747845   52.95187834 -565.83410337
  246.91131644  158.03965144 -424.51338755  332.49527697 -407.19216994]
RMSE:  1888.750094922188
R2: 0.266587075639684


In [14]:
scaled_RMSE = rmse/(max(y)-min(y))
print(scaled_RMSE)

0.13743040945871798


In [15]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X_train[i],y_train[i])
    y_pred = model.predict(X_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(y_test[i],y_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1896.8814177215384
Average Scaled RMSE across Folds: 0.13802206582700094


##### Reduced model without metadata

In [16]:
neuralDataR = allDataNN[['correlationScore','lickAccuracy',
       'lickNumber', 'avgFR', 'avgSingleCellVariance',
       'varianceFR', 'avgTrialSpeed', 'varianceSpeed',]]
featuresR = list(neuralDataR.keys())
stdNeuralDataR = StandardScaler().fit_transform(neuralDataR) 

# Split off test set for later
XR, XR_test, yR, yR_test = train_test_split(stdNeuralDataR,timeSinceKetamine.values.ravel(), test_size=0.2)

# Split for cross validation, use 10 folds
num_folds = 10
XAR = np.array(XR)
yAR = np.array(yR)
XR_train = []
XR_test = []
yR_train = []
yR_test = []
kfR = KFold(n_splits=num_folds)
for train_index, test_index in kfR.split(XAR, yAR):
    XR_train.append(XAR[train_index])
    XR_test.append(XAR[test_index])
    yR_train.append(yAR[train_index])
    yR_test.append(yAR[test_index])
    
# Run basic linreg model on full train set, check performance against train
modelR = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(XR,yR)
print("Intercept (R): ",modelR.intercept_)
print(featuresR,modelR.coef_)
#print(model.coef_)
yR_pred = modelR.predict(XR)
rmseR = np.sqrt(mean_squared_error(yR,yR_pred))
print("RMSE (R): ",rmseR)
r2 = r2_score(yR,yR_pred)
print("R2:",r2)

Intercept (R):  1866.8551352336037
['correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed'] [ 2.85641140e+02  3.50092949e+02 -9.74898616e-02 -7.38194918e+02
  3.08547617e+02  3.62499593e+02 -4.49027237e+02  1.07440771e+02]
RMSE (R):  1979.1121765677885
R2: 0.1728733008903981


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [17]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(XR_train[i],yR_train[i])
    yR_pred = model.predict(XR_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(yR_test[i],yR_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1980.4627729868957
Average Scaled RMSE across Folds: 0.1441036643974597


##### Now let's try ridge regression

In [18]:
# Run ridge reg model on full train set, check performance against train
model = linear_model.Ridge(alpha=1.0,fit_intercept=True,normalize=False,copy_X=True).fit(X,y)

In [19]:
print("Intercept: ",model.intercept_)
print(features,model.coef_)
#print(model.coef_)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
r2 = r2_score(y,y_pred)
print("RMSE: ",rmse)
print("R2:",r2)

Intercept:  1875.5523803577441
['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g', 'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth'] [ 116.05161068 -107.59518654 -384.96510542  255.55940287  592.87310499
 -335.0552752   273.13536383  297.91140757   53.02568185 -565.07082578
  246.20447515  157.95379817 -424.34788686  332.23274974 -407.01262592]
RMSE:  1888.7501860267903
R2: 0.2665870048867617


In [20]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.Ridge(alpha=1.0,fit_intercept=True,normalize=False,copy_X=True).fit(X,y)
    y_pred = model.predict(X_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(y_test[i],y_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1886.3371253540033
Average Scaled RMSE across Folds: 0.137254835465812


##### Now let's try Lasso

In [21]:
# Run ridge reg model on full train set, check performance against train
model = linear_model.Lasso(alpha=1.0,fit_intercept=True,normalize=False,copy_X=True).fit(X,y)

In [22]:
print("Intercept: ",model.intercept_)
print(features,model.coef_)
#print(model.coef_)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
r2 = r2_score(y,y_pred)
print("RMSE: ",rmse)
print("R2:",r2)

Intercept:  1875.5454711595385
['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g', 'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth'] [ 107.05750859 -105.96339335 -377.42288644  253.98555583  591.82907158
 -335.11019238  272.21286063  297.29931949   52.3577342  -558.60824522
  240.30263364  157.1553639  -423.37343729  330.36717974 -405.44074031]
RMSE:  1888.7617792065917
R2: 0.2665780014570033


In [23]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.Ridge(alpha=1.0,fit_intercept=True,normalize=False,copy_X=True).fit(X,y)
    y_pred = model.predict(X_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(y_test[i],y_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1886.3371253540033
Average Scaled RMSE across Folds: 0.137254835465812


##### Now let's try with some second order interaction terms

In [24]:
AugData = neuralData_LE.copy()
AugData.keys()

Index(['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g',
       'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber',
       'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed',
       'varianceSpeed', 'medianCellDepth'],
      dtype='object')

In [25]:
primaryF = ['correlationScore', 'lickAccuracy', 'lickNumber',
       'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed',
       'varianceSpeed']
secondaryF = ['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g',
       'ketamine_day','medianCellDepth']

In [26]:
AugData['animalNamexCorrelationScore'] = AugData['animalName']*AugData['correlationScore']
AugData['animalNamexLickAccuracy'] = AugData['animalName']*AugData['lickAccuracy']
AugData['animalNamexLickNumber'] = AugData['animalName']*AugData['lickNumber']
AugData['animalNamexAvgFR'] = AugData['animalName']*AugData['avgFR']
AugData['animalNamexAvgSingleCellVariance'] = AugData['animalName']*AugData['avgSingleCellVariance']
AugData['animalNamexVarianceFR'] = AugData['animalName']*AugData['varianceFR']
AugData['animalNamexAvgTrialSpeed'] = AugData['animalName']*AugData['avgTrialSpeed']
AugData['animalNamexVarianceSpeed'] = AugData['animalName']*AugData['varianceSpeed']

In [27]:
AugData['totalCellNumxCorrelationScore'] = AugData['totalCellNum']*AugData['correlationScore']
AugData['totalCellNumxLickAccuracy'] = AugData['totalCellNum']*AugData['lickAccuracy']
AugData['totalCellNumxLickNumber'] = AugData['totalCellNum']*AugData['lickNumber']
AugData['totalCellNumxAvgFR'] = AugData['totalCellNum']*AugData['avgFR']
AugData['totalCellNumxAvgSingleCellVariance'] = AugData['totalCellNum']*AugData['avgSingleCellVariance']
AugData['totalCellNumxVarianceFR'] = AugData['totalCellNum']*AugData['varianceFR']
AugData['totalCellNumxAvgTrialSpeed'] = AugData['totalCellNum']*AugData['avgTrialSpeed']
AugData['totalCellNumxVarianceSpeed'] = AugData['totalCellNum']*AugData['varianceSpeed']

In [28]:
AugData['genderxCorrelationScore'] = AugData['gender']*AugData['correlationScore']
AugData['genderxLickAccuracy'] = AugData['gender']*AugData['lickAccuracy']
AugData['genderxLickNumber'] = AugData['gender']*AugData['lickNumber']
AugData['genderxAvgFR'] = AugData['gender']*AugData['avgFR']
AugData['genderxAvgSingleCellVariance'] = AugData['gender']*AugData['avgSingleCellVariance']
AugData['genderxVarianceFR'] = AugData['gender']*AugData['varianceFR']
AugData['genderxAvgTrialSpeed'] = AugData['gender']*AugData['avgTrialSpeed']
AugData['genderxVarianceSpeed'] = AugData['gender']*AugData['varianceSpeed']

In [29]:
AugData['genotypexCorrelationScore'] = AugData['genotype']*AugData['correlationScore']
AugData['genotypexLickAccuracy'] = AugData['genotype']*AugData['lickAccuracy']
AugData['genotypexLickNumber'] = AugData['genotype']*AugData['lickNumber']
AugData['genotypexAvgFR'] = AugData['genotype']*AugData['avgFR']
AugData['genotypexAvgSingleCellVariance'] = AugData['genotype']*AugData['avgSingleCellVariance']
AugData['genotypexVarianceFR'] = AugData['genotype']*AugData['varianceFR']
AugData['genotypexAvgTrialSpeed'] = AugData['genotype']*AugData['avgTrialSpeed']
AugData['genotypexVarianceSpeed'] = AugData['genotype']*AugData['varianceSpeed']

In [30]:
AugData['weight_gxCorrelationScore'] = AugData['weight_g']*AugData['correlationScore']
AugData['weight_gxLickAccuracy'] = AugData['weight_g']*AugData['lickAccuracy']
AugData['weight_gxLickNumber'] = AugData['weight_g']*AugData['lickNumber']
AugData['weight_gxAvgFR'] = AugData['weight_g']*AugData['avgFR']
AugData['weight_gxAvgSingleCellVariance'] = AugData['weight_g']*AugData['avgSingleCellVariance']
AugData['weight_gxVarianceFR'] = AugData['weight_g']*AugData['varianceFR']
AugData['weight_gxAvgTrialSpeed'] = AugData['weight_g']*AugData['avgTrialSpeed']
AugData['weight_gxVarianceSpeed'] = AugData['weight_g']*AugData['varianceSpeed']

In [31]:
AugData['ketamine_dayxCorrelationScore'] = AugData['ketamine_day']*AugData['correlationScore']
AugData['ketamine_dayxLickAccuracy'] = AugData['ketamine_day']*AugData['lickAccuracy']
AugData['ketamine_dayxLickNumber'] = AugData['ketamine_day']*AugData['lickNumber']
AugData['ketamine_dayxAvgFR'] = AugData['ketamine_day']*AugData['avgFR']
AugData['ketamine_dayxAvgSingleCellVariance'] = AugData['ketamine_day']*AugData['avgSingleCellVariance']
AugData['ketamine_dayxVarianceFR'] = AugData['ketamine_day']*AugData['varianceFR']
AugData['ketamine_dayxAvgTrialSpeed'] = AugData['ketamine_day']*AugData['avgTrialSpeed']
AugData['ketamine_dayxVarianceSpeed'] = AugData['ketamine_day']*AugData['varianceSpeed']

In [32]:
AugData['medianCellDepthxCorrelationScore'] = AugData['medianCellDepth']*AugData['correlationScore']
AugData['medianCellDepthxLickAccuracy'] = AugData['medianCellDepth']*AugData['lickAccuracy']
AugData['medianCellDepthxLickNumber'] = AugData['medianCellDepth']*AugData['lickNumber']
AugData['medianCellDepthxAvgFR'] = AugData['medianCellDepth']*AugData['avgFR']
AugData['medianCellDepthxAvgSingleCellVariance'] = AugData['medianCellDepth']*AugData['avgSingleCellVariance']
AugData['medianCellDepthxVarianceFR'] = AugData['medianCellDepth']*AugData['varianceFR']
AugData['medianCellDepthxAvgTrialSpeed'] = AugData['medianCellDepth']*AugData['avgTrialSpeed']
AugData['medianCellDepthxVarianceSpeed'] = AugData['medianCellDepth']*AugData['varianceSpeed']

In [33]:
# Standardize data
stdNeuralDataAug = StandardScaler().fit_transform(AugData) 

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [34]:
# Split off test set for later
X, X_test, y, y_test = train_test_split(stdNeuralDataAug,timeSinceKetamine.values.ravel(), test_size=0.2)

In [35]:
# Split for cross validation, use 10 folds
num_folds = 10
XA = np.array(X)
yA = np.array(y)
X_train = []
X_test = []
y_train = []
y_test = []
kf = KFold(n_splits=num_folds)
for train_index, test_index in kf.split(XA, yA):
    X_train.append(XA[train_index])
    X_test.append(XA[test_index])
    y_train.append(yA[train_index])
    y_test.append(yA[test_index])

In [37]:
# Run basic linreg model on full train set, check performance against train
model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)

print("Intercept: ",model.intercept_)
print(features,model.coef_)
#print(model.coef_)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
r2 = r2_score(y,y_pred)
print("RMSE: ",rmse)
print("R2:",r2)

scaled_RMSE = rmse/(max(y)-min(y))
print(scaled_RMSE)

Intercept:  1886.4426242624386
['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g', 'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth'] [ 1.37114374e+03 -9.22880395e+02 -2.16761491e+03  6.81732307e+02
  6.94932421e+02 -2.00138064e+03 -7.41255090e+02  9.98187612e+02
  7.73511118e+01 -3.49017908e+03  1.08537237e+03  1.56594626e+03
 -2.02352013e+03  3.38116018e+01 -1.60264196e+03  2.82006203e+02
  2.77357710e+02 -7.95134032e+01 -1.16562069e+03  3.33957317e+02
  2.50406452e+02 -1.02226721e+03 -6.12920023e+02  4.49963248e+02
 -4.48566775e+02 -3.31209912e+02  3.46498599e+02  2.94644264e+02
  2.25828349e+00  4.36554638e+02  5.16016461e+01 -9.96179326e+02
  2.53999519e+02  4.79809858e+02 -1.70155455e+03  3.54354736e+03
 -6.24276994e+02  1.41410517e+03 -3.16568222e+02  3.16164003e+00
  2.24124030e+02  1.16736095e+02 -2.02186504e+03  2.66185980e+03
 -7.22050274e+02  1.58

In [38]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)
    y_pred = model.predict(X_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(y_test[i],y_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1505.5534542864068
Average Scaled RMSE across Folds: 0.10931242380250018


##### Try Ridge reg on the augmented data set

In [39]:
# Run basic linreg model on full train set, check performance against train
model = linear_model.Ridge(alpha=1.0,fit_intercept=True,normalize=False,copy_X=True).fit(X,y)
print("Intercept: ",model.intercept_)
print(features,model.coef_)
#print(model.coef_)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
r2 = r2_score(y,y_pred)
print("RMSE: ",rmse)
print("R2:",r2)

scaled_RMSE = rmse/(max(y)-min(y))
print(scaled_RMSE)

Intercept:  1885.9692921057788
['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g', 'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth'] [ 1.04366004e+03 -8.22468384e+02 -2.08658523e+03  6.12975416e+02
  9.46265334e+02 -1.92337202e+03 -7.10892211e+02  9.69052141e+02
  6.23910437e+01 -2.38972100e+03 -1.53221962e+01  1.44896693e+03
 -1.96195205e+03  1.21175989e+02 -1.59739770e+03  2.73953761e+02
  2.66364744e+02 -7.42714112e+01 -7.22889290e+02  3.21529029e+02
  2.24748624e+02 -9.46583248e+02 -5.88165344e+02  4.32428193e+02
 -4.45393861e+02 -3.37463559e+02  2.48281710e+02  2.93975600e+02
  7.13368365e+00  4.19712852e+02  5.27167173e+01 -9.95114365e+02
  2.58754056e+02  4.76395068e+02 -1.43316063e+03  3.12156043e+03
 -6.10730834e+02  1.35823080e+03 -2.86308944e+02 -7.94780067e-01
  2.21747066e+02  1.22102917e+02 -1.86443366e+03  2.59755069e+03
 -7.15259483e+02  1.51

In [40]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.Ridge(alpha=1.0,fit_intercept=True,normalize=False,copy_X=True).fit(X,y)
    y_pred = model.predict(X_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(y_test[i],y_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1506.7402041759663
Average Scaled RMSE across Folds: 0.10939858913027765


##### Let's try a different augmentation

In [41]:
AugData2 = neuralData_LE.copy()
AugData2.keys()

Index(['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g',
       'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber',
       'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed',
       'varianceSpeed', 'medianCellDepth'],
      dtype='object')

In [42]:
AugData2['corrScoreSq'] = AugData2['correlationScore']*AugData2['correlationScore']
AugData2['avgTrialSpeedSq'] = AugData2['avgTrialSpeed']*AugData2['avgTrialSpeed']
AugData2['avgFRSQ'] = AugData2['avgFR']*AugData2['avgFR']


In [43]:
# Standardize data
stdNeuralDataAug2 = StandardScaler().fit_transform(AugData2) 
# Split off test set for later
X, X_test, y, y_test = train_test_split(stdNeuralDataAug2,timeSinceKetamine.values.ravel(), test_size=0.2)

# Split for cross validation, use 10 folds
num_folds = 10
XA = np.array(X)
yA = np.array(y)
X_train = []
X_test = []
y_train = []
y_test = []
kf = KFold(n_splits=num_folds)
for train_index, test_index in kf.split(XA, yA):
    X_train.append(XA[train_index])
    X_test.append(XA[test_index])
    y_train.append(yA[train_index])
    y_test.append(yA[test_index])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [44]:
# Run basic linreg model on full train set, check performance against train
model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)

print("Intercept: ",model.intercept_)
print(features,model.coef_)
#print(model.coef_)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
r2 = r2_score(y,y_pred)
print("RMSE: ",rmse)
print("R2:",r2)

scaled_RMSE = rmse/(max(y)-min(y))
print(scaled_RMSE)

Intercept:  1888.1720751052355
['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g', 'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth'] [   -8.4902276    -19.45701449  -387.48146005   140.3208147
   600.39566312  -317.84486377   446.11577529   287.33205658
    36.55730237 -1318.87722509   -47.28339046   175.56657687
  -657.38999423   324.11341399  -434.12796164  -255.71402409
   299.65632484  1052.76693917]
RMSE:  1870.780049294536
R2: 0.279570492829024
0.135837020651291


In [45]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)
    y_pred = model.predict(X_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(y_test[i],y_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1863.797827017103
Average Scaled RMSE across Folds: 0.1353300426812997


##### Another augmentation yet

In [46]:
AugData3 = neuralData_LE.copy()

AugData3['corrScoreCu'] = AugData3['correlationScore']*AugData3['correlationScore']*AugData3['correlationScore']
AugData3['avgTrialSpeedCu'] = AugData3['avgTrialSpeed']*AugData3['avgTrialSpeed']*AugData3['avgTrialSpeed']
AugData3['avgFRCu'] = AugData3['avgFR']*AugData3['avgFR']*AugData3['avgFR']

In [47]:
# Standardize data
stdNeuralDataAug3 = StandardScaler().fit_transform(AugData3) 
# Split off test set for later
X, X_test, y, y_test = train_test_split(stdNeuralDataAug3,timeSinceKetamine.values.ravel(), test_size=0.2)

# Split for cross validation, use 10 folds
num_folds = 10
XA = np.array(X)
yA = np.array(y)
X_train = []
X_test = []
y_train = []
y_test = []
kf = KFold(n_splits=num_folds)
for train_index, test_index in kf.split(XA, yA):
    X_train.append(XA[train_index])
    X_test.append(XA[test_index])
    y_train.append(yA[train_index])
    y_test.append(yA[test_index])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [48]:
# Run basic linreg model on full train set, check performance against train
model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)

print("Intercept: ",model.intercept_)
print(features,model.coef_)
#print(model.coef_)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
r2 = r2_score(y,y_pred)
print("RMSE: ",rmse)
print("R2:",r2)

scaled_RMSE = rmse/(max(y)-min(y))
print(scaled_RMSE)

Intercept:  1885.1795660508608
['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g', 'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth'] [-109.64269421    5.56949394 -297.16054155  147.03517754  633.02484052
 -320.68463625  383.75356065  306.21282035   59.10936839 -885.17174324
 -124.44471122  148.60767264 -573.13065192  334.22873075 -435.51857752
 -212.57115878  234.36199972  776.5404991 ]
RMSE:  1856.8250464437979
R2: 0.29486632278414426
0.13481689794944274


In [49]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)
    y_pred = model.predict(X_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(y_test[i],y_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1852.5491690454987
Average Scaled RMSE across Folds: 0.1345064429994975


##### One more augmentation

In [50]:
AugData4 = neuralData_LE.copy()

AugData4['corrScoreInv'] = 1/(1+AugData4['correlationScore'])
AugData4['avgTrialSpeedInv'] = 1/(1+AugData4['avgTrialSpeed'])
AugData4['avgFRInv'] = 1/(1+AugData4['avgFR'])

# Standardize data
stdNeuralDataAug4 = StandardScaler().fit_transform(AugData4) 
# Split off test set for later
X, X_test, y, y_test = train_test_split(AugData4,timeSinceKetamine.values.ravel(), test_size=0.2)

# Split for cross validation, use 10 folds
num_folds = 10
XA = np.array(X)
yA = np.array(y)
X_train = []
X_test = []
y_train = []
y_test = []
kf = KFold(n_splits=num_folds)
for train_index, test_index in kf.split(XA, yA):
    X_train.append(XA[train_index])
    X_test.append(XA[test_index])
    y_train.append(yA[train_index])
    y_test.append(yA[test_index])

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [51]:
# Run basic linreg model on full train set, check performance against train
model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)

print("Intercept: ",model.intercept_)
print(features,model.coef_)
#print(model.coef_)
y_pred = model.predict(X)
rmse = np.sqrt(mean_squared_error(y,y_pred))
r2 = r2_score(y,y_pred)
print("RMSE: ",rmse)
print("R2:",r2)

scaled_RMSE = rmse/(max(y)-min(y))
print(scaled_RMSE)

Intercept:  3723.2940578811526
['animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g', 'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed', 'medianCellDepth'] [ 4.49896054e+01 -2.34591455e+00 -7.26028374e+02  6.13177274e+02
  1.88335768e+02 -2.12580419e+02  1.53203865e+03  1.21257508e+03
  1.53685814e+00 -1.96863846e+02  2.33661963e+00  1.33560998e+02
 -3.69485406e+01  6.52247747e+01 -9.98465322e-01 -2.23107742e+03
  8.34356473e+02 -2.46653535e+03]
RMSE:  1917.019637370434
R2: 0.2774994015017411
0.13918739480244843


In [52]:
rmse_cv = []
for i in range(0,num_folds):
    model = linear_model.LinearRegression(fit_intercept=True,normalize=False,copy_X=True,n_jobs=None).fit(X,y)
    y_pred = model.predict(X_test[i])
    rmse_cv.append(np.sqrt(mean_squared_error(y_test[i],y_pred)))
print("Average RMSE across Folds:",np.mean(rmse_cv))
print("Average Scaled RMSE across Folds:",np.mean(rmse_cv)/(max(y)-min(y)))

Average RMSE across Folds: 1913.6267263538862
Average Scaled RMSE across Folds: 0.13894104863260032
