In [1]:
library(boot)
library(glmnet)

Loading required package: Matrix
Loading required package: foreach
Loaded glmnet 2.0-16



In [2]:
set.seed(2019)

In [3]:
#TRAIN 1: ALL COVARIATES PLUS INTERACTION TERMS
train1 <- read.csv("trainC.csv")
test1 <- read.csv("testC.csv")
train1 <- subset(train1, select = -c(sessionDate, trialNum, timeSinceKetamine, animalName))
test1 <- subset(test1, select = -c(sessionDate, trialNum, timeSinceKetamine, animalName))

#TRAIN 2: ALL COVARIATES NO INTERACTION TERMS
train2 <- subset(train1, select = c(totalCellNum,gender,genotype,weight_g,ketamine_day,
                                    correlationScore,lickAccuracy,lickNumber,avgFR,
                                    avgSingleCellVariance,varianceFR,avgTrialSpeed,
                                    varianceSpeed,medianCellDepth,ketBool))
test2 <- subset(test1, select = c(totalCellNum,gender,genotype,weight_g,ketamine_day,
                                    correlationScore,lickAccuracy,lickNumber,avgFR,
                                    avgSingleCellVariance,varianceFR,avgTrialSpeed,
                                    varianceSpeed,medianCellDepth,ketBool))

# Model Generation and Test Error Estimation

## Basic Logistic Regression Model with Interaction Terms

In [4]:
k = 10
n = length(train1[,1])
fsize = round(n/k)
rmse = rep(0,k)
zoloss = rep(0,k)
for (i in 1:(k-1)){
    # Get train and validation sets
    df_train <- train1[-(((i-1)*fsize+1):(i*fsize)),]
    df_val <- train1[((i-1)*fsize+1):(i*fsize),]
    # Fit model on training and make predictions on validation
    model_cv <- glm(ketBool ~ ., data=df_train, family='binomial')
    lr_pred_lo <- predict(model_cv,df_val) # lo : log odds
    num_val = length(df_val$ketBool)
    lr_pred = rep(0,num_val)
    actual = rep(0,num_val)
    for (j in 1:num_val){
        if (lr_pred_lo[j]>0){
            lr_pred[j]=1
        }
    actual[j] = df_val$ketBool[j]
    }
    # Compute 0-1 loss for each observation
    lr_loss = abs(lr_pred-actual) # loss is 0 if NB_pred=actual, 1 otherwise
    # Compute mean 0-1 loss on the val set
    zoloss[i] = mean(lr_loss)
}
df_train <- train1[-(((k-1)*fsize+1):n),]
df_val <- train1[((k-1)*fsize+1):n,]
# Fit model on training and make predictions on validation
model_cv <- glm(ketBool ~ ., data=df_train, family='binomial')
lr_pred_lo <- predict(model_cv,df_val) # lo : log odds
num_val = length(df_val$ketBool)
lr_pred = rep(0,num_val)
actual = rep(0,num_val)
for (j in 1:num_val){
    if (lr_pred_lo[j]>0){
        lr_pred[j]=1
    }
    actual[j] = df_val$ketBool[j]
}
lr_loss = abs(lr_pred-actual)
zoloss[k] = mean(lr_loss)
test_error_est = mean(zoloss)

cat("=====================================================================\n")
cat("Logistic Regression Model with Interaction Terms\n\n")
cat("Zero-One Loss (10-fold Cross-Validation Average):",test_error_est,"\n")
cat("Accuracy (10-fold Cross-Validation Average):",1-test_error_est,"\n")
cat("=====================================================================\n")

# Train now on entire training set to get model for prediction
model1 <- glm(ketBool ~ ., data=train1, family='binomial')

Logistic Regression Model with Interaction Terms

Zero-One Loss (10-fold Cross-Validation Average): 0.09182746 
Accuracy (10-fold Cross-Validation Average): 0.9081725 


## Basic Logistic Regression without Interaction Terms

In [5]:
k = 10
n = length(train2[,1])
fsize = round(n/k)
rmse = rep(0,k)
zoloss = rep(0,k)
for (i in 1:(k-1)){
    # Get train and validation sets
    df_train <- train2[-(((i-1)*fsize+1):(i*fsize)),]
    df_val <- train2[((i-1)*fsize+1):(i*fsize),]
    # Fit model on training and make predictions on validation
    model_cv <- glm(ketBool ~ ., data=df_train, family='binomial')
    lr_pred_lo <- predict(model_cv,df_val) # lo : log odds
    num_val = length(df_val$ketBool)
    lr_pred = rep(0,num_val)
    actual = rep(0,num_val)
    for (j in 1:num_val){
        if (lr_pred_lo[j]>0){
            lr_pred[j]=1
        }
    actual[j] = df_val$ketBool[j]
    }
    # Compute 0-1 loss for each observation
    lr_loss = abs(lr_pred-actual) # loss is 0 if NB_pred=actual, 1 otherwise
    # Compute mean 0-1 loss on the val set
    zoloss[i] = mean(lr_loss)
}
df_train <- train2[-(((k-1)*fsize+1):n),]
df_val <- train2[((k-1)*fsize+1):n,]
# Fit model on training and make predictions on validation
model_cv <- glm(ketBool ~ ., data=df_train, family='binomial')
lr_pred_lo <- predict(model_cv,df_val) # lo : log odds
num_val = length(df_val$ketBool)
lr_pred = rep(0,num_val)
actual = rep(0,num_val)
for (j in 1:num_val){
    if (lr_pred_lo[j]>0){
        lr_pred[j]=1
    }
    actual[j] = df_val$ketBool[j]
}
lr_loss = abs(lr_pred-actual)
zoloss[k] = mean(lr_loss)
test_error_est = mean(zoloss)

cat("=====================================================================\n")
cat("Logistic Regression Model without Interaction Terms\n\n")
cat("Zero-One Loss (10-fold Cross-Validation Average):",test_error_est,"\n")
cat("Accuracy (10-fold Cross-Validation Average):",1-test_error_est,"\n")
cat("=====================================================================\n")

# Train now on entire training set to get model for prediction
model2 <- glm(ketBool ~ ., data=train2, family='binomial')

Logistic Regression Model without Interaction Terms

Zero-One Loss (10-fold Cross-Validation Average): 0.1413709 
Accuracy (10-fold Cross-Validation Average): 0.8586291 


# Look at Coefficients on TRAIN

## Model 1 (including interaction terms) Summary

In [10]:
summary(model1)


Call:
glm(formula = ketBool ~ ., family = "binomial", data = train1)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.9532  -0.2677   0.0100   0.2355   5.0333  

Coefficients:
                                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)                           -3.07543    0.84032  -3.660 0.000252 ***
totalCellNum                           1.48468    0.36237   4.097 4.18e-05 ***
gender                                 5.27077    0.80163   6.575 4.86e-11 ***
genotype                               4.70327    0.73513   6.398 1.58e-10 ***
weight_g                               0.12584    0.35738   0.352 0.724760    
ketamine_day                          -0.71834    0.25786  -2.786 0.005340 ** 
correlationScore                      -2.30386    0.96069  -2.398 0.016479 *  
lickAccuracy                          -2.59838    0.62801  -4.137 3.51e-05 ***
lickNumber                            -0.51051    0.76272  -0.669 0.503289    
avgFR      

## Model 2 (not including interaction terms) Summary

In [11]:
summary(model2)


Call:
glm(formula = ketBool ~ ., family = "binomial", data = train2)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.1024  -0.4780   0.0845   0.4688   3.8002  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.64251    0.15630  -4.111 3.94e-05 ***
totalCellNum          -0.03728    0.06442  -0.579 0.562759    
gender                 0.46578    0.13882   3.355 0.000793 ***
genotype               0.05546    0.11283   0.492 0.623009    
weight_g              -0.46373    0.06503  -7.131 9.94e-13 ***
ketamine_day           0.16121    0.04486   3.594 0.000326 ***
correlationScore      -1.50123    0.07377 -20.351  < 2e-16 ***
lickAccuracy          -0.80960    0.05888 -13.750  < 2e-16 ***
lickNumber            -0.60947    0.06549  -9.306  < 2e-16 ***
avgFR                  1.77153    0.12885  13.749  < 2e-16 ***
avgSingleCellVariance -1.32595    0.11463 -11.567  < 2e-16 ***
varianceFR            -0.23810    0.05966  -3

# Test Performance

In [6]:
lr_pred_lo <- predict(model1,test1) # lo : log odds
num_val = length(test1$ketBool)
lr_pred = rep(0,num_val)
actual = rep(0,num_val)
for (j in 1:num_val){
    if (lr_pred_lo[j]>0){
        lr_pred[j]=1
    }
    actual[j] = test1$ketBool[j]
}
lr_loss = abs(lr_pred-actual)
zoloss[k] = mean(lr_loss)
test_error_est = mean(zoloss)

cat("=====================================================================\n")
cat("Logistic Regression Model with Interaction Terms\n\n")
cat("Zero-One Loss (Test Set):",test_error_est,"\n")
cat("Accuracy (Test Set):",1-test_error_est,"\n")
cat("=====================================================================\n")

Logistic Regression Model with Interaction Terms

Zero-One Loss (Test Set): 0.13475 
Accuracy (Test Set): 0.86525 


In [8]:
lr_pred_lo <- predict(model2,test2) # lo : log odds
num_val = length(test2$ketBool)
lr_pred = rep(0,num_val)
actual = rep(0,num_val)
for (j in 1:num_val){
    if (lr_pred_lo[j]>0){
        lr_pred[j]=1
    }
    actual[j] = test2$ketBool[j]
}
lr_loss = abs(lr_pred-actual)
zoloss[k] = mean(lr_loss)
test_error_est = mean(zoloss)

cat("=====================================================================\n")
cat("Logistic Regression Model without Interaction Terms\n\n")
cat("Zero-One Loss (Test Set):",test_error_est,"\n")
cat("Accuracy (Test Set):",1-test_error_est,"\n")
cat("=====================================================================\n")

Logistic Regression Model without Interaction Terms

Zero-One Loss (Test Set): 0.14175 
Accuracy (Test Set): 0.85825 


# Look at Coefficients on TEST

## With Interaction Terms

In [15]:
model1_test <- glm(ketBool ~ ., data=test1, family='binomial')
summary(model1_test)

“glm.fit: fitted probabilities numerically 0 or 1 occurred”


Call:
glm(formula = ketBool ~ ., family = "binomial", data = test1)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-3.0289  -0.1986   0.0016   0.1849   2.9694  

Coefficients:
                                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)                           -3.65227    2.05234  -1.780 0.075147 .  
totalCellNum                           0.26300    0.88535   0.297 0.766426    
gender                                 7.81598    1.83003   4.271 1.95e-05 ***
genotype                               7.42112    1.86461   3.980 6.89e-05 ***
weight_g                               1.15238    0.83398   1.382 0.167037    
ketamine_day                          -1.58289    0.63604  -2.489 0.012823 *  
correlationScore                      -2.20880    2.19214  -1.008 0.313647    
lickAccuracy                          -2.24522    1.44600  -1.553 0.120494    
lickNumber                             0.33636    1.85504   0.181 0.856112    
avgFR       

## Without Interaction Terms

In [16]:
model2_test <- glm(ketBool ~ ., data=test2, family='binomial')
summary(model2_test)


Call:
glm(formula = ketBool ~ ., family = "binomial", data = test2)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-2.52524  -0.51562   0.05817   0.49094   3.03239  

Coefficients:
                      Estimate Std. Error z value Pr(>|z|)    
(Intercept)           -0.11054    0.29360  -0.377 0.706540    
totalCellNum          -0.20103    0.12503  -1.608 0.107873    
gender                 0.38001    0.26237   1.448 0.147507    
genotype               0.28291    0.21514   1.315 0.188518    
weight_g              -0.21089    0.13190  -1.599 0.109851    
ketamine_day          -0.07733    0.08284  -0.934 0.350543    
correlationScore      -1.27870    0.13062  -9.789  < 2e-16 ***
lickAccuracy          -0.77094    0.10958  -7.035 1.99e-12 ***
lickNumber            -0.69481    0.14704  -4.725 2.30e-06 ***
avgFR                  1.96396    0.25970   7.562 3.96e-14 ***
avgSingleCellVariance -1.21420    0.22392  -5.423 5.88e-08 ***
varianceFR            -0.54385    0.