In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
# https://www.kaggle.com/ruslankl/mice-protein-expression/data
# Load the raw table and add a combined label: the Treatment string
# concatenated with the Genotype string.
df = pd.read_csv('Data_Cortex_Nuclear.csv').assign(
    Treated_Geno=lambda frame: frame['Treatment'] + frame['Genotype']
)

# Rows removed by index label before modelling.
# NOTE(review): presumably these rows are too sparse to keep (cf. the name
# `remove_sparse`) -- confirm against the raw file.
rows_to_drop = [987, 988, 989]
remove_sparse = df.drop(index=rows_to_drop)
target = remove_sparse['Treated_Geno']

# Columns without the `_N` suffix are not used as predictors.
non_feature_columns = ['MouseID', 'Behavior', 'class', 'Treated_Geno', 'Genotype', 'Treatment']
# `_N` columns that are also left out of the feature set.
# NOTE(review): the reason for excluding these is not shown here -- confirm.
excluded_measurements = ['BCL2_N', 'pCFOS_N', 'H3MeK4_N', 'EGR1_N', 'BAD_N', 'H3AcK18_N',
                         'ELK_N', 'MEK_N', 'Bcatenin_N', 'pCAMKII_N', 'CAMKII_N', 'pS6_N']
features = remove_sparse.drop(columns=non_feature_columns + excluded_measurements)

# Z-score every remaining column: subtract its mean, then divide by its
# standard deviation (pandas' default `std`).
norm_features = features.apply(lambda col: (col - col.mean()) / col.std())

# Standardized features with the label column appended on the right.
model_df = norm_features.join(target)
feature_columns = norm_features.columns

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): model_df was standardized with whole-dataset means/stds before
# this split, so the test rows influence the scaling of the training features.
train, test = train_test_split(
    model_df,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Split each partition into its predictor columns (X) and the label column (Y).
X_train, Y_train = train[feature_columns], train['Treated_Geno']
X_test, Y_test = test[feature_columns], test['Treated_Geno']

In [6]:
# baseline: share of each class among the test labels. The largest share is
# the accuracy of always predicting the most common test class.
Y_test.value_counts().div(len(Y_test))

SalineControl       0.277778
MemantineControl    0.273148
MemantineTs65Dn     0.236111
SalineTs65Dn        0.212963
Name: Treated_Geno, dtype: float64

In [13]:
# "Vanilla" logistic regression: C is the inverse regularization strength, so
# a huge C makes the L2 penalty negligible (effectively unregularized).
# The solver is pinned to liblinear -- the one shown in this cell's recorded
# output -- because the library's default solver differs between
# scikit-learn versions and would otherwise change the fitted model.
vanilla = linear_model.LogisticRegression(
    C=10000000000000,
    solver='liblinear',
)
vanilla.fit(X_train, Y_train)

LogisticRegression(C=10000000000000, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [14]:
# Mean accuracy of the unregularized model on the held-out test split.
vanilla.score(X=X_test, y=Y_test)

0.9305555555555556

In [35]:
# Fitted coefficients of the unregularized model, shown for comparison with
# the L1 and L2 fits displayed in later cells.
vanilla.coef_

array([[-4.37301200e+01, -2.95648822e+01, -4.30958287e+00,
        -4.10509721e+01, -2.40346784e+01, -4.32374182e+01,
         6.05601892e+00,  3.18746299e+00,  3.70370126e+01,
         1.77678295e+01,  5.65785790e+01,  5.82726136e+01,
        -1.05152249e+01,  8.14762171e+01,  5.15054963e+01,
        -1.39516631e+02, -9.26036814e+01,  3.44092075e+01,
        -1.10974085e+01, -4.36847931e+01, -1.04953390e+01,
         8.44093045e+01,  1.52519045e+02, -2.88638995e+01,
         3.36222231e+01, -1.26257172e+01, -9.29216859e+01,
        -1.57442109e+02, -4.17995230e+01,  4.71178283e+01,
         3.73960074e+00,  1.20931007e+01,  3.82191125e+01,
         1.96523556e+01,  1.62576922e+01,  4.89701429e+01,
        -5.15651676e+01, -3.12359096e+01, -4.33386802e+01,
        -2.28686096e+01, -4.60521957e+00, -1.81864926e+01,
         4.59230470e+00, -2.99705176e+00, -6.73714770e+00,
         5.15721792e+01,  3.63670749e+01, -3.55556221e+01,
         1.66486578e+01, -1.90284908e+00,  6.43393645e+0

In [19]:
# Candidate values of C (inverse regularization strength) tried in the sweeps
# below: a few fractional values, then powers of two from 1 to 64.
c_values = [.01, .1, .25, .5, .75] + [2 ** exponent for exponent in range(7)]

In [21]:
# Sweep C for L1-penalized ("lasso") logistic regression, recording the mean
# 5-fold cross-validated score on the training split for each value.
# solver='liblinear' is spelled out because penalty='l1' is not supported by
# every solver and the library's default solver has changed across
# scikit-learn releases (the recorded fit output shows liblinear).
# random_state makes liblinear's data shuffling repeatable between runs.
c_value_scores = []
for c in c_values:
    lasso = linear_model.LogisticRegression(
        penalty='l1',
        C=c,
        solver='liblinear',
        random_state=42,
    )
    score = cross_val_score(lasso, X_train, Y_train, cv=5).mean()
    c_value_scores.append((c, score))

In [22]:
# (C, mean cross-validation score) pairs from the most recent sweep; when the
# notebook is run top to bottom, that is the L1 sweep in the previous cell.
c_value_scores

[(0.01, 0.3415265643392094),
 (0.1, 0.8293600547747342),
 (0.25, 0.9024634561544197),
 (0.5, 0.9233071652668157),
 (0.75, 0.9314673344682207),
 (1, 0.9361320971501194),
 (2, 0.9419261197634556),
 (4, 0.939580607493071),
 (8, 0.9384175823145139),
 (16, 0.9337999502900528),
 (32, 0.9315078996609406),
 (64, 0.9233746957610546)]

In [29]:
# Refit the L1 model on the whole training split with C=2, the value with the
# highest mean cross-validation score in the sweep output above (0.9419).
# solver='liblinear' is explicit because penalty='l1' is rejected by the
# default solver of newer scikit-learn releases; random_state makes
# liblinear's data shuffling repeatable.
lasso = linear_model.LogisticRegression(
    penalty='l1',
    C=2,
    solver='liblinear',
    random_state=42,
)
lasso.fit(X_train, Y_train)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
# Mean accuracy of the L1-penalized model on the held-out test split.
lasso.score(X=X_test, y=Y_test)

0.9259259259259259

In [37]:
# Fitted coefficients of the L1-penalized model; the recorded output below
# shows several entries that are exactly zero.
lasso.coef_

array([[ 0.00000000e+00, -2.36460414e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -1.77457112e+00,
         3.03342908e-01, -2.26640753e-01,  8.26724529e-01,
         0.00000000e+00,  3.46611720e+00,  1.78360490e+00,
         0.00000000e+00,  1.14930567e+00,  1.86505556e+00,
        -3.93993635e+00, -2.65335744e+00,  7.53357456e-01,
        -2.46140427e-01, -1.32100269e+00,  0.00000000e+00,
         2.54759814e+00,  6.08130989e+00, -1.59275642e+00,
        -5.11546352e-01, -4.45811562e-01, -3.80916152e+00,
        -5.07127306e+00, -1.09255885e-01,  8.86198359e-01,
        -4.78639857e-02,  0.00000000e+00,  1.79201624e+00,
        -1.23203140e-02,  6.94480926e-01,  1.57370635e+00,
        -2.04563701e+00, -1.05515446e+00, -2.24527540e+00,
        -1.17979494e-01,  0.00000000e+00,  0.00000000e+00,
         6.01645732e-02,  3.42375829e-01, -6.46231538e-02,
         2.82333761e-01,  1.52722854e+00, -1.92278769e+00,
         0.00000000e+00, -3.01095679e-01,  2.30179238e+0

In [31]:
# Sweep C for L2-penalized ("ridge") logistic regression, recording the mean
# 5-fold cross-validated score on the training split for each value.
# The solver is pinned to liblinear -- the one shown in the recorded fit
# output -- because the library's default solver differs between
# scikit-learn versions and would otherwise change the scores.
# NOTE: this rebinds c_value_scores, replacing the scores of the L1 sweep.
c_value_scores = []
for c in c_values:
    ridge = linear_model.LogisticRegression(
        penalty='l2',
        C=c,
        solver='liblinear',
    )
    score = cross_val_score(ridge, X_train, Y_train, cv=5).mean()
    c_value_scores.append((c, score))

In [32]:
# (C, mean cross-validation score) pairs from the most recent sweep; when the
# notebook is run top to bottom, that is the L2 sweep in the previous cell.
c_value_scores

[(0.01, 0.8152391470523408),
 (0.1, 0.9024099945131473),
 (0.25, 0.9256327466621646),
 (0.5, 0.9337324197958139),
 (0.75, 0.9360716010823638),
 (1, 0.9383971824777125),
 (2, 0.9430687451052115),
 (4, 0.9430757795316946),
 (8, 0.9442184048734508),
 (16, 0.9384311822057148),
 (32, 0.9395939729033891),
 (64, 0.9384581475072336)]

In [33]:
# Refit the L2 model on the whole training split with C=8, the value with the
# highest mean cross-validation score in the sweep output above (0.9442).
# The solver is pinned to liblinear -- the one shown in this cell's recorded
# output -- because the library's default solver differs between
# scikit-learn versions.
ridge = linear_model.LogisticRegression(
    penalty='l2',
    C=8,
    solver='liblinear',
)
ridge.fit(X_train, Y_train)

LogisticRegression(C=8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [34]:
# Mean accuracy of the L2-penalized model on the held-out test split.
ridge.score(X=X_test, y=Y_test)

0.9259259259259259

In [38]:
# Fitted coefficients of the L2-penalized model, for comparison with the
# unregularized and L1 coefficients displayed in earlier cells.
ridge.coef_

array([[-1.05794812e+00, -2.25395031e+00,  8.21547676e-02,
        -7.64625346e-01, -4.39594503e-02, -2.02554403e+00,
         5.45163574e-01, -3.43633789e-01,  1.03763903e+00,
         8.05998268e-01,  3.77916752e+00,  2.31000941e+00,
        -9.94049723e-02,  1.65954040e+00,  2.02267941e+00,
        -4.06506960e+00, -3.05994188e+00,  1.19612501e+00,
        -3.90438220e-01, -1.50211952e+00, -1.89008418e-01,
         3.44345751e+00,  6.57568275e+00, -1.88105637e+00,
        -1.02031074e+00, -3.55156073e-01, -4.12607646e+00,
        -5.46655600e+00, -3.24589609e-01,  1.25699572e+00,
        -5.74244173e-01,  1.09404735e-01,  2.13862212e+00,
        -2.33508671e-01,  9.39001079e-01,  1.83350824e+00,
        -2.25601143e+00, -1.16950577e+00, -2.43940628e+00,
        -3.54391965e-01, -7.48124896e-02, -3.99440229e-01,
         1.52244320e-01,  3.66599085e-01,  2.36373567e-04,
         1.07192727e+00,  1.72193947e+00, -2.23515958e+00,
         1.31173464e-01, -3.61742696e-01,  2.69875489e+0

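The effectively unregularized ("vanilla") logistic regression has the highest test accuracy on this dataset (0.931, versus 0.926 for both Lasso and Ridge), although that gap amounts to a single test sample out of 216. Lasso (L1) regression forces a number of coefficients to exactly zero while maintaining essentially the same accuracy, so we would expect the Lasso model to generalize more readily. Ridge (L2) regression behaves as expected, with all coefficients kept to a moderate size. Note that the unregularized coefficients are considerably larger: in the rows displayed above their magnitudes reach about 157, versus about 6.6 for Ridge and 6.1 for Lasso.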