# Classification



In [1]:
# Reload edited project modules automatically and render figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
# Move to the parent directory so the relative paths used below
# (e.g. "output/instances.h5") resolve.
# NOTE(review): not idempotent — re-running this cell moves up one more level.
os.chdir("..")

In [2]:
from IPython.core.debugger import Tracer
from IPython.display import display
import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
from scipy.signal import welch
from IPython.core.debugger import Tracer
from sklearn_pandas import DataFrameMapper, cross_val_score
from p300.feature_extraction import BaseTransformer

# Allow up to 500 rows to be shown when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)


In [3]:
# Open the per-subject store read-only. The default mode ("a") would silently
# create an empty file if the path were wrong and would leave the store writable,
# while this notebook only ever reads from it.
hdf = pd.HDFStore("output/instances.h5", mode="r")


# One key per subject (e.g. '/subjects/s10229001').
hdf.keys()

['/subjects/s10229001',
 '/subjects/s10444001',
 '/subjects/s10729001',
 '/subjects/s10882001',
 '/subjects/s10924001',
 '/subjects/s11551001',
 '/subjects/s11627001',
 '/subjects/s11632001',
 '/subjects/s11693001',
 '/subjects/s12137001',
 '/subjects/s12168001',
 '/subjects/s12521001',
 '/subjects/s12702001',
 '/subjects/s12900001',
 '/subjects/s13235001',
 '/subjects/s13252001',
 '/subjects/s13431001',
 '/subjects/s13640002',
 '/subjects/s13863001',
 '/subjects/s14023001',
 '/subjects/s1414001',
 '/subjects/s1491001',
 '/subjects/s14998001',
 '/subjects/s15362001',
 '/subjects/s15424001',
 '/subjects/s15641001',
 '/subjects/s16003001',
 '/subjects/s1609001',
 '/subjects/s16266001',
 '/subjects/s1635001',
 '/subjects/s16637001',
 '/subjects/s16683001',
 '/subjects/s16779001',
 '/subjects/s16893001',
 '/subjects/s16943001',
 '/subjects/s17005001',
 '/subjects/s17435001',
 '/subjects/s17436001',
 '/subjects/s17576001',
 '/subjects/s17674001',
 '/subjects/s17962001',
 '/subjects/s1804600

Veamos cuántos sujetos tenemos

Tomemos uno: s5224001

In [4]:
from sklearn.model_selection import cross_val_score

# Columns excluded from the feature matrix (identifiers, metadata and the label).
non_features = ['id', 'array_path', 'ch_names', 'event_time', 'event_type', 'index',
       'sfreq', 'subject_id', 'target']

def get_data_from_subject(hdf, key):
    """Load one subject's table and split it into features and labels.

    Parameters
    ----------
    hdf : mapping-like
        Object that returns a DataFrame when indexed with ``key``
        (in this notebook, the opened ``pd.HDFStore``).
    key : str
        Key of the subject's table, e.g. ``'/subjects/s10229001'``.

    Returns
    -------
    X : numpy.ndarray
        Values of every column not listed in ``non_features``; one row per
        row of the table, columns in the order given by ``columns``.
    y : numpy.ndarray
        Values of the ``target`` column.
    columns : pandas.Index
        Names of the feature columns (sorted, as produced by
        ``Index.difference``).
    """
    df = hdf[key]
    X = df[df.columns.difference(non_features)]
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0;
    # it returns the same ndarray and is available in older releases too.
    y = df.target.values

    return X.values, y, X.columns

# Load a single subject to experiment with before looping over all of them.
X, y, names = get_data_from_subject(hdf, '/subjects/s5224001')

# (number of rows, number of feature columns)
X.shape

(1980, 112)

# SVM

In [5]:
from sklearn.svm import LinearSVC, SVC

# Linear SVM with default hyper-parameters.
# NOTE(review): the features are passed unscaled — confirm whether standardising first is intended.
clf = LinearSVC()

In [6]:
# Mean ROC AUC of the linear SVM over 10 cross-validation folds.
cross_val_score(clf, X, y, scoring='roc_auc', cv=10).mean()

0.52387511478420568

In [7]:
# RBF-kernel SVM with C=0.5, for comparison with the linear one.
clf = SVC(kernel='rbf', C=0.5)

In [8]:
# Mean ROC AUC of the RBF SVM over 10 cross-validation folds.
cross_val_score(clf, X, y, scoring='roc_auc', cv=10).mean()

0.52330578512396697

# LDA

In [9]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear Discriminant Analysis with default settings.
clf = LinearDiscriminantAnalysis()

In [10]:
# Mean ROC AUC of LDA over 10 cross-validation folds.
cross_val_score(clf, X, y, scoring='roc_auc', cv=10).mean()

0.7092194674012855

# LDA + RFE


In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE, RFECV

# Recursive feature elimination around LDA: drop one feature per step and let
# 10-fold stratified cross-validation (scored by ROC AUC) pick how many to keep.
clf = LinearDiscriminantAnalysis()
rfecv = RFECV(
    estimator=clf,
    step=1,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=10),
)
rfecv.fit(X, y)

RFECV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
   estimator=LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001),
   n_jobs=1, scoring='roc_auc', step=1, verbose=0)

In [12]:
# Number of features kept by the fitted RFECV selector.
rfecv.get_support().sum()

86

In [13]:
# Keep only the columns that RFECV selected.
X_rfe = X[:, rfecv.support_]


# NOTE(review): rfecv was fitted on this same X/y, so the features were chosen
# using the samples they are scored on here — this AUC is likely optimistic.
cross_val_score(clf, X_rfe, y, scoring='roc_auc', cv=10).mean()

0.73166207529843885

# Balance de las clases

In [14]:
# Class balance of the currently loaded subject: how many samples are targets.
total = y.shape[0]
no_targets = y.sum()
print("Total samples: {} - Total positives {}".format(total, no_targets))
print("Ratio : {}".format(no_targets / total))

Total samples: 1980 - Total positives 330
Ratio : 0.16666666666666666


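Como vemos, el balance es de 1 target cada 6 instancias. Eso es porque la matriz del P300 tiene 6 filas y 6 columnas: de las 12 intensificaciones de cada ronda, sólo 2 (la fila y la columna del carácter objetivo) son target, es decir 2/12 = 1/6.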

# LDA con todos los sujetos

In [15]:
# Cross-validated LDA metrics, computed separately for every subject in the store.
results = []
# cross_val_score fits clones of the estimator, so a single unfitted instance
# can be shared by every subject and every metric.
clf = LinearDiscriminantAnalysis()

for key in hdf.keys():
    X, y, _ = get_data_from_subject(hdf, key)

    # One cross-validation run per metric, each averaged over the folds.
    auc, precision, recall, accuracy = (
        cross_val_score(clf, X, y, scoring=metric).mean()
        for metric in ('roc_auc', 'precision', 'recall', 'accuracy')
    )

    results.append(dict(
        subject_id=key,
        auc=auc,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
    ))

# One row per subject.
results_df = pd.DataFrame(results)

In [16]:

# Per-subject metrics, followed by their averages across subjects.
display(results_df)
# numeric_only=True skips the string `subject_id` column explicitly; without it
# recent pandas versions raise a TypeError instead of silently dropping it.
display(results_df.mean(numeric_only=True))

Unnamed: 0,accuracy,auc,precision,recall,subject_id
0,0.802222,0.643967,0.374225,0.226667,/subjects/s10229001
1,0.811616,0.668518,0.34827,0.133333,/subjects/s10444001
2,0.822593,0.638785,0.404832,0.091111,/subjects/s10729001
3,0.824747,0.738496,0.465224,0.254545,/subjects/s10882001
4,0.824206,0.678772,0.428338,0.197619,/subjects/s10924001
5,0.823333,0.664313,0.450487,0.2,/subjects/s11551001
6,0.812778,0.592347,0.210445,0.05,/subjects/s11627001
7,0.838889,0.770187,0.558276,0.278788,/subjects/s11632001
8,0.813636,0.656242,0.378002,0.169697,/subjects/s11693001
9,0.822619,0.718673,0.438704,0.228571,/subjects/s12137001


accuracy     0.819373
auc          0.654100
precision    0.360827
recall       0.169186
dtype: float64

# LDA + RFE con todos los sujetos


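RFE stands for Recursive Feature Elimination, a form of backward feature elimination: the estimator is fitted on the current set of features, the least important ones (ranked by the model's coefficients) are dropped, and the process is repeated on the remaining features. `RFECV` additionally uses cross-validation to choose how many features to keep.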

In [17]:
from sklearn.feature_selection import RFE, RFECV

# For every subject: select features with RFECV around LDA, then cross-validate
# LDA on the selected columns and record the metrics plus the chosen columns.
# NOTE(review): the features are selected on all of X/y and the reduced X is then
# cross-validated on those same samples, so these scores are likely optimistic.
results = []

for key in hdf.keys():
    X, y, column_names = get_data_from_subject(hdf, key)

    clf = LinearDiscriminantAnalysis()
    rfecv = RFECV(estimator=clf, step=1, scoring='roc_auc')
    rfecv.fit(X, y)

    # Restrict both the data and the column names to the selected features.
    selected_columns = column_names[rfecv.support_]
    X = X[:, rfecv.support_]

    # One cross-validation run per metric, each averaged over the folds.
    auc, precision, recall, accuracy = (
        cross_val_score(clf, X, y, scoring=metric).mean()
        for metric in ('roc_auc', 'precision', 'recall', 'accuracy')
    )

    results.append(dict(
        subject_id=key,
        auc=auc,
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        no_columns=sum(rfecv.support_),
        column_names=",".join(selected_columns),
    ))
    



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [19]:
# One row per subject; the long `column_names` string is hidden from the table view.
results_df = pd.DataFrame(results)
display(results_df[results_df.columns.difference(["column_names"])])
# numeric_only=True skips the string columns (`subject_id`, `column_names`) explicitly;
# without it recent pandas versions raise a TypeError instead of silently dropping them.
display(results_df.mean(numeric_only=True))

Unnamed: 0,accuracy,auc,no_columns,precision,recall,subject_id
0,0.826111,0.67644,76,0.438804,0.153333,/subjects/s10229001
1,0.830808,0.698099,61,0.481249,0.157576,/subjects/s10444001
2,0.828148,0.659964,14,0.25,0.006667,/subjects/s10729001
3,0.842929,0.757636,41,0.580648,0.209091,/subjects/s10882001
4,0.837698,0.703463,81,0.535745,0.195238,/subjects/s10924001
5,0.830556,0.707347,64,0.50539,0.166667,/subjects/s11551001
6,0.813889,0.595653,109,0.220873,0.053333,/subjects/s11627001
7,0.851515,0.797934,45,0.62201,0.29697,/subjects/s11632001
8,0.832828,0.709185,38,0.50857,0.10303,/subjects/s11693001
9,0.83373,0.752316,30,0.515789,0.130952,/subjects/s12137001


accuracy       0.832455
auc            0.688476
no_columns    60.581560
precision      0.430735
recall         0.154268
dtype: float64