In [7]:
## Download data from 
## https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression
## change the header of the last column 'class' to 'GBT'
## save the file in csv format
import pandas as pd
df = pd.read_csv('../data/Data_Cortex_Nuclear.csv')
df.head()

Unnamed: 0,MouseID,DYRK1A_N,ITSN1_N,BDNF_N,NR1_N,NR2A_N,pAKT_N,pBRAF_N,pCAMKII_N,pCREB_N,...,pCFOS_N,SYP_N,H3AcK18_N,EGR1_N,H3MeK4_N,CaNA_N,Genotype,Treatment,Behavior,GBT
0,309_1,0.503644,0.747193,0.430175,2.816329,5.990152,0.21883,0.177565,2.373744,0.232224,...,0.108336,0.427099,0.114783,0.13179,0.128186,1.675652,Control,Memantine,C/S,c-CS-m
1,309_2,0.514617,0.689064,0.41177,2.789514,5.685038,0.211636,0.172817,2.29215,0.226972,...,0.104315,0.441581,0.111974,0.135103,0.131119,1.74361,Control,Memantine,C/S,c-CS-m
2,309_3,0.509183,0.730247,0.418309,2.687201,5.622059,0.209011,0.175722,2.283337,0.230247,...,0.106219,0.435777,0.111883,0.133362,0.127431,1.926427,Control,Memantine,C/S,c-CS-m
3,309_4,0.442107,0.617076,0.358626,2.466947,4.979503,0.222886,0.176463,2.152301,0.207004,...,0.111262,0.391691,0.130405,0.147444,0.146901,1.700563,Control,Memantine,C/S,c-CS-m
4,309_5,0.43494,0.61743,0.358802,2.365785,4.718679,0.213106,0.173627,2.134014,0.192158,...,0.110694,0.434154,0.118481,0.140314,0.14838,1.83973,Control,Memantine,C/S,c-CS-m


In [12]:
## Generate input (X) and output (y)
## Use Genotype as class label (79th column, index=78)
import numpy as np
X = df.iloc[:, 1:78].values
y = df.iloc[:, 78].values
X[0:5, 0:5]
y[0:5]

array([[0.50364388, 0.74719322, 0.4301753 , 2.81632854, 5.99015166],
       [0.51461708, 0.68906355, 0.41177034, 2.78951404, 5.68503786],
       [0.50918309, 0.7302468 , 0.41830878, 2.68720107, 5.62205854],
       [0.44210669, 0.61707615, 0.35862631, 2.4669472 , 4.97950319],
       [0.43494024, 0.61742984, 0.3588022 , 2.36578488, 4.71867866]])

In [22]:
## Preprocessing: missing values and normalization
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X)
X_imp = imp.transform(X)
print(X[738:750, 68:70])
print(X_imp[738:750, 68:70])
X_norm=StandardScaler().fit_transform(X_imp)
print(X_norm[738:750, 68:70])

[[nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]
 [nan nan]]
[[0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]
 [0.15791424 0.13476175]]
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [24]:
## Use LabelEncoder for numerical encoding of class labels
from sklearn import preprocessing  
label_encoder = preprocessing.LabelEncoder()
y_enc = label_encoder.fit_transform(y)
y_enc

array([0, 0, 0, ..., 1, 1, 1])

In [27]:
## Naive Bayes classifier on the transformed data
## performance evaluation: 5-fold cross validation 
## metric: auc
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB
k=5
gnb = GaussianNB()
skf = StratifiedKFold(n_splits=k, shuffle=True)
sum_auc = 0
for train_index, test_index in skf.split(X_norm, y_enc):
    gnb = gnb.fit(X_norm[train_index], y_enc[train_index])
    pred = gnb.predict(X_norm[test_index])
    auc = roc_auc_score(y_enc[test_index], pred)
    sum_auc += auc
avg_auc = sum_auc / k
avg_auc

0.7575335397316821

In [26]:
?StratifiedKFold

[0;31mInit signature:[0m [0mStratifiedKFold[0m[0;34m([0m[0mn_splits[0m[0;34m=[0m[0;34m'warn'[0m[0;34m,[0m [0mshuffle[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Stratified K-Folds cross-validator

Provides train/test indices to split data in train/test sets.

This cross-validation object is a variation of KFold that returns
stratified folds. The folds are made by preserving the percentage of
samples for each class.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
n_splits : int, default=3
    Number of folds. Must be at least 2.

    .. versionchanged:: 0.20
        ``n_splits`` default value will change from 3 to 5 in v0.22.

shuffle : boolean, optional
    Whether to shuffle each class's samples before splitting into batches.

random_state : int, RandomState instance or None, optional, default=None
    If int, random_state is th