# Draft of classification module

In [71]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import scipy.io as sio
import os
import ieeg_funcs as ief
import dgFuncs as dg
from sklearn import svm
from sklearn.externals import joblib

In [2]:
# Re-import the local helper libraries so that edits made to them on disk take
# effect without restarting the kernel.
# The `imp` module is deprecated (and removed in Python 3.12);
# importlib.reload is the supported equivalent.
import importlib
importlib.reload(ief)
importlib.reload(dg)

<module 'dgFuncs' from '/Users/davidgroppe/PycharmProjects/DG_LIBRARY/dgFuncs.py'>

In [7]:
# Build the list of training subjects: every subject listed in use_subs.txt
# that is not held out for testing.
path_dict = ief.get_path_dict()
subs_fname = os.path.join(path_dict['szr_ant_root'], 'use_subs.txt')
# na_filter=False keeps entries such as 'NA' as literal strings instead of NaN
use_subs_df = pd.read_csv(subs_fname, header=None, na_filter=False)
test_sub_list = ['NA']
train_subs_list = [sub for sub in use_subs_df.iloc[:, 0] if sub not in test_sub_list]

print('Training subs: {}'.format(train_subs_list))

Training subs: ['CC', 'CJ', 'CO', 'CT', 'IB', 'JW', 'RB', 'TF']


In [15]:
# Figure out how much data there is so that memory can be preallocated.
#   n_ftrs: number of rows of 'db_pwr', taken from the first file that is read
#   n_wind: total number of windows, over all training subjects, whose
#           'peri_ictal' value is >= 0
n_ftrs=0
n_wind=0
for sub in train_subs_list:
    ftr_path=os.path.join(path_dict['ftrs_root'],'PWR',sub)
    for f in os.listdir(ftr_path):
        # np.load on an archive keeps the underlying file open until it is
        # closed; the context manager avoids leaking one handle per file.
        with np.load(os.path.join(ftr_path,f)) as ftr_dict:
            if n_ftrs==0:
                n_ftrs=ftr_dict['db_pwr'].shape[0]
            n_wind+=np.sum(ftr_dict['peri_ictal']>=0)
print('n_ftrs=%d' % n_ftrs)
print('n_wind=%d' % n_wind)

n_ftrs=6
n_wind=95615


In [16]:
# List the names of the arrays stored in the most recently loaded feature file
ftr_dict.keys()

['time_wind_sec', 'db_pwr', 'peri_ictal']

In [43]:
# Load every usable window of all training subjects into one giant matrix.
#   ftrs      : (n_wind, n_ftrs) feature matrix, one row per window
#   szr_class : 'peri_ictal' value of each window
#   sub_id    : index (into train_subs_list) of the subject each window came from
ftrs=np.zeros((n_wind,n_ftrs))
szr_class=np.zeros(n_wind)
sub_id=np.zeros(n_wind)
wind_ct=0
sub_ct=0
for sub in train_subs_list:
    ftr_path=os.path.join(path_dict['ftrs_root'],'PWR',sub)
    # sorted() makes the row order reproducible; os.listdir order is arbitrary
    for f in sorted(os.listdir(ftr_path)):
        # Read both arrays inside the context manager so the archive's file
        # handle is closed again instead of leaking one handle per file.
        with np.load(os.path.join(ftr_path,f)) as ftr_dict:
            peri_ictal=ftr_dict['peri_ictal']
            db_pwr=ftr_dict['db_pwr']
        # Select exactly the windows that were counted (peri_ictal >= 0).
        # Slicing the first neo_wind windows instead would pick the wrong ones
        # whenever a negative entry is not at the very end of the file.
        # NOTE(review): assumes peri_ictal has one entry per column of db_pwr - confirm
        keep=peri_ictal>=0
        neo_wind=int(np.sum(keep))
        ftrs[wind_ct:wind_ct+neo_wind,:]=db_pwr[:,keep].T
        szr_class[wind_ct:wind_ct+neo_wind]=peri_ictal[keep]
        sub_id[wind_ct:wind_ct+neo_wind]=sub_ct
        wind_ct+=neo_wind
    sub_ct+=1
# Every preallocated row must have been filled; otherwise n_wind is stale
assert wind_ct==n_wind, 'loaded %d windows but preallocated %d' % (wind_ct,n_wind)

In [38]:
# Sanity check: dimensions of the label vector and of the feature matrix
for arr in (szr_class, ftrs):
    print(arr.shape)

(95615,)
(95615, 6)


In [45]:
# Sanity check: labels of all windows that do not belong to subject index 0
bro = szr_class[np.not_equal(sub_id, 0)]
bro.shape

(90170,)

In [None]:
# TODO: scale the features before fitting the SVM
# TODO: try class_weight='balanced' and/or per-sample weights

#gamma defines how much influence a single training example has. The larger gamma is, the closer other examples must be to be affected.
# Proper choice of C and gamma is critical to the SVM’s performance. One is advised to 
# use sklearn.model_selection.GridSearchCV with C and gamma spaced exponentially far apart 
# to choose good values.

In [46]:
# Leave-one-out cross-validation on the training data: fit on every subject
# except the one whose index in train_subs_list equals left_out_id.
left_out_id=0
C = 1.0  # SVM regularization parameter, the smaller it is, the stronger the regularization
# Pass C explicitly so that editing the value above actually changes the model
# (it was previously defined but never handed to SVC). No kernel is given, so
# SVC falls back to its default kernel.
rbf_svc = svm.SVC(C=C, class_weight='balanced')
# NOTE: fit() also accepts sample_weight, which could be used to weight each
# subject equally.
rbf_svc.fit(ftrs[sub_id!=left_out_id,:], szr_class[sub_id!=left_out_id])


SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [52]:
# Boolean masks over windows: held-out subject vs. all remaining (training) subjects
valid_bool = sub_id == left_out_id
train_bool = np.logical_not(valid_bool)

In [63]:
# Predict a class for every row of ftrs, i.e. for the windows the model was fit
# on as well as for those of the left-out subject. Despite its name,
# training_class_hat therefore has one entry per window of the whole data set.
training_class_hat=rbf_svc.predict(ftrs)

In [64]:
# Data-split masks: held-out subject vs. the remaining (training) subjects
valid_bool = np.equal(sub_id, left_out_id)
train_bool = np.logical_not(valid_bool)
# True-class masks: label 1 (called ictal here) and label 0 (called preictal here)
preictal_bool = np.equal(szr_class, 0)
ictal_bool = np.equal(szr_class, 1)
# True wherever the predicted class agrees with the true label
jive = szr_class == training_class_hat

In [68]:
# Training-data results: fraction of correctly classified windows overall,
# among label-1 windows (sensitivity) and among label-0 windows (specificity)
train_acc = jive[train_bool].mean()
print('Training accuracy: %f' % train_acc)

use_ids = train_bool & ictal_bool
train_sens = jive[use_ids].mean()
print('Training sensitivity: %f' % train_sens)

use_ids = train_bool & preictal_bool
train_spec = jive[use_ids].mean()
print('Training specificity: %f' % train_spec)

Training accuracy: 0.975491
Training sensitivity: 0.974138
Training specificity: 0.975614


In [69]:
# Held-out-subject results: fraction of correctly classified windows overall,
# among label-1 windows (sensitivity) and among label-0 windows (specificity)
valid_acc = jive[valid_bool].mean()
print('Validation accuracy: %f' % valid_acc)

use_ids = np.logical_and(valid_bool, ictal_bool)
valid_sens = jive[use_ids].mean()
print('Validation sensitivity: %f' % valid_sens)

use_ids = np.logical_and(valid_bool, preictal_bool)
valid_spec = jive[use_ids].mean()
print('Validation specificity: %f' % valid_spec)

Validation accuracy: 0.882645
Validation sensitivity: 0.017949
Validation specificity: 0.949357


In [57]:
# Re-compute training accuracy and sensitivity directly from the predictions.
# training_class_hat holds one prediction per row of ftrs (training and
# held-out windows alike), so it must be masked in the same way as szr_class
# before the two are compared. Comparing it with szr_class[train_bool]
# unmasked mixes arrays of different lengths and overwrites train_acc and
# train_sens with meaningless values.
perionset_bool=szr_class==1
use_ids=np.multiply(perionset_bool,train_bool)
train_acc=np.mean(training_class_hat[train_bool]==szr_class[train_bool])
train_sens=np.mean(training_class_hat[use_ids]==szr_class[use_ids])
print('Training accuracy: %f' % train_acc) #0.975892903833 acc all training

Training accuracy: 0.975491


In [None]:
# Save model
# out_fname=??

In [None]:
# Load validation data and calculate false positive rate, and peri-onset latency


In [None]:
# clf = svm.SVC()
# >>> clf.fit(X, y)  
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#     decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
#     max_iter=-1, probability=False, random_state=None, shrinking=True,
#     tol=0.001, verbose=False)
# >>> clf.predict([[2., 2.]])
# array([1])

In [72]:
# Persist the summary metrics of this run, together with the subjects used and
# the index of the left-out subject, in a single .npz archive.
metrics = dict(
    valid_sens=valid_sens,
    valid_spec=valid_spec,
    train_sens=train_sens,
    train_spec=train_spec,
    train_subs_list=train_subs_list,
    left_out_id=left_out_id,
)
np.savez('classification_metrics.npz', **metrics)