In [44]:
library(boot)
library(glmnet)

In [2]:
# Fix the RNG seed so the random steps in later cells (e.g. cv.glmnet fold
# assignment and bootstrap resampling) are repeatable when cells run in order.
set.seed(2019)

In [3]:
# Load the train/test splits and drop the bookkeeping columns that are not used
# as covariates.
# stringsAsFactors = TRUE is set explicitly: on R >= 4.0 the default is FALSE,
# which would leave string columns such as animalName as character, so every
# row subset would build its own set of dummy columns in model.matrix().
# NOTE(review): train and test factor levels still come from their own files;
# confirm both files contain the same set of animals.
train <- read.csv("trainC.csv", stringsAsFactors = TRUE)
test <- read.csv("testC.csv", stringsAsFactors = TRUE)
train <- subset(train, select = -c(sessionDate, trialNum, timeSinceKetamine))
test <- subset(test, select = -c(sessionDate, trialNum, timeSinceKetamine))

# TRAIN 1: ALL COVARIATES PLUS INTERACTION TERMS
# Same contents as train/test, so copy instead of re-reading the CSV files.
# NOTE(review): the interaction terms themselves are only added by the model
# formula in later cells, which use `train`, not `train1`.
train1 <- train
test1 <- test

# TRAIN 2: ALL COVARIATES NO INTERACTION TERMS
# One shared column list keeps train2 and test2 in sync.
no_interaction_cols <- c(
  "totalCellNum", "gender", "genotype", "weight_g", "ketamine_day",
  "correlationScore", "lickAccuracy", "lickNumber", "avgFR",
  "avgSingleCellVariance", "varianceFR", "avgTrialSpeed",
  "varianceSpeed", "medianCellDepth", "ketBool"
)
train2 <- train1[, no_interaction_cols, drop = FALSE]
test2 <- test1[, no_interaction_cols, drop = FALSE]

In [4]:
# Split the training data 50/50 by row order: half A is used to choose lambda
# by cross-validation, half B to fit the final penalised models.
# NOTE(review): the split is sequential, not shuffled; if rows are ordered
# (e.g. by animal or session) the two halves may differ systematically.
n <- nrow(train)
n50 <- round(n / 2)
train50A <- train[seq_len(n50), ]
train50B <- train[seq_len(n - n50) + n50, ]

# All main effects plus per-animal slopes for the behavioural / firing-rate
# covariates. "- 1" removes the intercept column from the design matrix
# (glmnet adds its own intercept).
interaction_formula <- ketBool ~ . +
  animalName:correlationScore +
  animalName:lickAccuracy +
  animalName:lickNumber +
  animalName:avgFR +
  animalName:avgSingleCellVariance +
  animalName:varianceFR +
  animalName:avgTrialSpeed +
  animalName:varianceSpeed - 1

# Design matrix and response for each half, built from the same formula.
xA <- model.matrix(interaction_formula, data = train50A)
yA <- train50A$ketBool

xB <- model.matrix(interaction_formula, data = train50B)
yB <- train50B$ketBool

In [5]:
# Lasso (alpha = 1): cross-validate over half A to pick the penalty strength.
model_lasso <- cv.glmnet(x = xA, y = yA, family = "binomial", alpha = 1)

lambda_min_lasso <- model_lasso[["lambda.min"]]
cat("lambda_min_lasso = ", lambda_min_lasso, "\n")

#lambda_1se_lasso = model_lasso$lambda.1se
#cat("lambda_1se_lasso = ",lambda_1se_lasso,"\n")

# Refit on half B at the selected lambda and keep the fitted coefficients.
best_lasso_model <- glmnet(
  x = xB,
  y = yB,
  family = "binomial",
  alpha = 1,
  lambda = lambda_min_lasso
)
fitted_coef_lasso <- coef(best_lasso_model)

lambda_min_lasso =  0.001487149 
lambda_1se_lasso =  0.003770465 


In [8]:
# Ridge (alpha = 0): select the regularization parameter by cross-validation
# over trainA (50% of training data).
# The CV object gets its own name so it no longer overwrites the lasso CV fit
# stored in model_lasso.
model_ridge <- cv.glmnet(xA, yA, family = "binomial", alpha = 0)
lambda_min_ridge <- model_ridge$lambda.min
cat("lambda_min_ridge = ",lambda_min_ridge,"\n")

#lambda_1se_ridge = model_ridge$lambda.1se
#cat("lambda_1se_ridge = ",lambda_1se_ridge,"\n")

# Refit on trainB (the other 50%) at the selected lambda and keep the
# fitted coefficients.
best_ridge_model <- glmnet(xB, yB, family = "binomial", alpha = 0,
                           lambda = lambda_min_ridge)
fitted_coef_ridge <- coef(best_ridge_model)

lambda_min_ridge =  0.02988029 


## Performance on Test Set

In [45]:
# Test-set design matrix: every main effect plus the animalName interactions,
# with the intercept column removed ("- 1").
x <- model.matrix(
  ketBool ~ . +
    animalName:correlationScore +
    animalName:lickAccuracy +
    animalName:lickNumber +
    animalName:avgFR +
    animalName:avgSingleCellVariance +
    animalName:varianceFR +
    animalName:avgTrialSpeed +
    animalName:varianceSpeed - 1,
  data = test
)
# Observed response for the test set.
y <- test[["ketBool"]]


In [None]:
# Score the test set with the lasso fit. predict() on a glmnet fit returns the
# linear predictor (log-odds) by default, so a positive value means a
# predicted probability above 0.5.
pred_lo <- predict(best_lasso_model, newx = x)
num_val <- length(y)

# Predicted class: 1 when the log-odds are positive, 0 otherwise.
lr_pred <- as.numeric(pred_lo[seq_len(num_val)] > 0)
actual <- as.numeric(y)

# Compute 0-1 loss for each observation:
# loss is 0 if lr_pred == actual, 1 otherwise.
lr_loss <- abs(lr_pred - actual)

# Mean 0-1 loss on the test set. The earlier `zoloss[k]` indirection was left
# over from a cross-validation loop; neither `zoloss` nor `k` is defined in
# the visible cells, so the estimate is computed directly.
test_error_est <- mean(lr_loss)

## Bootstrap with Lasso for confidence intervals

In [47]:
# Data that is resampled by the bootstrap: the half used to fit the model.
df = train50B

# Bootstrap statistic for boot::boot(): refit the lasso logistic regression on
# one resample and return its coefficients.
#
# data     Data frame with the response `ketBool`, `animalName` and the
#          covariates referenced by the model formula.
# indices  Row indices of the bootstrap resample (supplied by boot()).
# lambda   Lasso penalty; defaults to the CV-selected `lambda_min_lasso`.
#
# Returns a plain numeric vector (intercept first, then one entry per design
# matrix column), because boot() expects the statistic as a vector rather
# than the sparse matrix that coef() returns for a glmnet fit.
coef.bootL <- function(data, indices, lambda = lambda_min_lasso) {
  # Fix factor levels on the full data before resampling, so a resample that
  # happens to miss a level still yields the same design-matrix columns and
  # therefore a coefficient vector of constant length.
  data[] <- lapply(data, function(col) {
    if (is.character(col)) factor(col) else col
  })
  resampled <- data[indices, , drop = FALSE]

  x_boot <- model.matrix(
    ketBool ~ . +
      animalName:correlationScore +
      animalName:lickAccuracy +
      animalName:lickNumber +
      animalName:avgFR +
      animalName:avgSingleCellVariance +
      animalName:varianceFR +
      animalName:avgTrialSpeed +
      animalName:varianceSpeed - 1,
    data = resampled
  )
  y_boot <- resampled$ketBool

  fit <- glmnet(x_boot, y_boot, family = "binomial", alpha = 1,
                lambda = lambda)

  # Densify once; the per-replicate debug print of the length was removed.
  as.numeric(as.matrix(coef(fit)))
}

In [None]:
# Run 5000 bootstrap replicates of the lasso coefficient statistic over df.
boot.outL <- boot(data = df, statistic = coef.bootL, R = 5000)

#boot.out.lasso.5000 = boot(df, coef.boot.lasso, 5000)
#boot.out.lasso.5000

In [39]:
# Coefficients of the lasso model refit on half B, kept for inspection.
cf <- coef(best_lasso_model)

In [42]:
# Show the third coefficient (the intercept is entry 1).
cf[3]

In [41]:
# Print the full coefficient matrix; "." marks coefficients shrunk to zero.
cf

181 x 1 sparse Matrix of class "dgCMatrix"
                                               s0
(Intercept)                            0.43258927
animalNameG1                          -0.43788900
animalNameG2                           .         
animalNameG3                           0.60268131
animalNameG4                           1.15443858
animalNameG5                          -0.88261012
animalNameHCN1                         3.95489335
animalNameHCNb2                       -2.20361742
animalNameHCNb4                        .         
animalNameHCNd1                        .         
animalNameHCNd2                       -0.53378286
animalNameHCNe1                        .         
animalNameHCNe2                        .         
animalNameHCNe3                        .         
animalNamenpI1                         .         
totalCellNum                           0.08646754
gender                                 .         
genotype                               .         
weight_