In [110]:
from sktime.classification.all import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import get_data
from sktime.transformations.panel.rocket import MiniRocket, MiniRocketMultivariate 
from sktime.transformations.panel.compose import ColumnConcatenator
from sktime.classification.shapelet_based import MrSEQLClassifier
from sklearn.linear_model import RidgeClassifierCV, LogisticRegressionCV, SGDClassifier, RidgeClassifier, LogisticRegression 
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

## data wrangling
---
sktime expects a nested DataFrame in which every cell holds one time series as a pandas Series:
- entry $x_j^i$ is the $j$-th feature (a time series stored as a pandas Series) of data point $x^i$

In [83]:
def matrix_2_df(matrix, column_prefix='lead_'):
    '''
    Build the nested DataFrame that sktime models take from a 3-D numpy array.

    Each entry along the first axis of `matrix` becomes one row. Each index `c`
    along the last axis becomes one column named `column_prefix + str(c + 1)`
    (numbering starts at 1). Every cell holds a pandas Series with the values of
    that row's middle axis for that column.

    Parameters
    ----------
    matrix : numpy.ndarray
        Array indexed as matrix[row, position, column].
    column_prefix : str, default 'lead_'
        Text placed in front of the 1-based column number.

    Returns
    -------
    pandas.DataFrame
        One row per first-axis entry, one column per last-axis index.
        An array with no rows yields a DataFrame with no columns.
    '''
    from pandas import DataFrame, Series

    # The explicit three-axis slice is kept on purpose: it fails early for
    # arrays with fewer than three dimensions.
    samples = matrix[:, :, :]
    n_columns = matrix.shape[-1]

    nested = {}
    for sample in samples:
        for position in range(n_columns):
            name = column_prefix + str(position + 1)
            values = sample[:, position:position + 1].flatten()
            nested.setdefault(name, []).append(Series(values))

    return DataFrame(data=nested)

In [21]:
from collections import defaultdict


def _leads_to_nested_frame(panel, n_leads=12):
    '''
    Convert an array indexed as panel[sample, time, lead] into a nested DataFrame.

    Columns are named 'lead_0' ... 'lead_{n_leads - 1}' (0-based, which is what the
    cells below select with X_train[['lead_0']]). Each cell holds one lead of one
    sample as a pandas Series.
    '''
    columns = defaultdict(list)
    for sample in panel[:, :, :]:
        for lead in range(n_leads):
            # .flatten() returns a copy, so the Series never aliases the source array.
            columns['lead_{}'.format(lead)].append(pd.Series(sample[:, lead:lead + 1].flatten()))
    return pd.DataFrame(data=columns)


# Same conversion for both splits; previously this loop was copy-pasted twice.
X_train = _leads_to_nested_frame(get_data.X_train)
X_test = _leads_to_nested_frame(get_data.X_test)

## Time series forest
---
Test a time series forest on a single lead of the raw data.

In [22]:
# Baseline: a TimeSeriesForest with default settings, trained on the single
# column 'lead_0' of the nested training frame.
classifier = TimeSeriesForest()
classifier.fit(X_train[['lead_0']], get_data.y_train)
# Predictions are kept in `y_pred`; the classification report cell reuses them.
y_pred = classifier.predict(X_test[['lead_0']])
# Last expression: test-set accuracy, shown as the cell output.
accuracy_score(get_data.y_test, y_pred)

0.6810933940774487

In [24]:
# Per-class precision / recall / f1 / support for the predictions in `y_pred`.
# output_dict=True returns a dict instead of a formatted string so it can be tabulated.
report_dict=classification_report(get_data.y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(data = report_dict)
# Bare expression so the notebook renders the table.
report_df

Unnamed: 0,False,True,accuracy,macro avg,weighted avg
precision,0.701754,0.642857,0.681093,0.672306,0.677069
recall,0.784314,0.538043,0.681093,0.661179,0.681093
f1-score,0.740741,0.585799,0.681093,0.66327,0.675799
support,255.0,184.0,0.681093,439.0,439.0


### Multivariate Minirocket
---

In [4]:
# Fit the MiniRocketMultivariate transformer on the training frame only,
# so nothing from the test split is seen during fitting.
mini_mv = MiniRocketMultivariate() 
mini_mv.fit(X_train)

# Apply the fitted transformer to both splits; the results feed the ridge classifier cell.
X_train_mvtrans = mini_mv.transform(X_train)
X_test_mvtrans = mini_mv.transform(X_test)

In [6]:
# Ridge classifier on the MiniRocket features, choosing alpha by built-in
# cross-validation over 10 log-spaced values between 1e-3 and 1e3.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 -- confirm the pinned scikit-learn version before re-running.
ridge_clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True).fit(X_train_mvtrans, get_data.y_train)
# Last expression: score on the transformed test split, shown as the cell output.
ridge_clf.score(X_test_mvtrans, get_data.y_test)

0.8428246013667426

In [80]:
# Two-step pipeline over all lead columns: ColumnConcatenator first, then a
# time series forest with 100 estimators.
# NOTE(review): ColumnConcatenator presumably joins the per-lead series of each
# sample into one long series -- confirm against the installed sktime version.
steps = [
    ("concatenate", ColumnConcatenator()),
    ("classify", TimeSeriesForestClassifier(n_estimators=100)),
]
clf = Pipeline(steps)
clf.fit(X_train, get_data.y_train)
# Last expression: score on the test split, shown as the cell output.
clf.score(X_test, get_data.y_test)

0.7471526195899773

In [8]:
# Shapelet-transform classifier trained on the single column 'lead_0'.
# NOTE(review): time_contract_in_mins=60 presumably caps the shapelet search at
# about an hour -- confirm against the installed sktime version.
shape_clf = ShapeletTransformClassifier(time_contract_in_mins=60).fit(X_train[['lead_0']], get_data.y_train)
# Last expression: score on the test split for the same single lead.
shape_clf.score(X_test[['lead_0']], get_data.y_test)

42.886274337768555
69.9529538154602
72.27595257759094
76.29668188095093
84.84292721748352
86.56846189498901


0.6583143507972665

In [54]:
# Tabulate the cross-validation results of the grid search, one row per parameter combination.
# NOTE(review): `Gridcv` is not defined in any cell visible in this notebook --
# confirm the cell that builds and fits it still exists and runs before this one.
pd.DataFrame(data = Gridcv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_dim_reduce,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,137.011038,14.425885,25.513326,4.316217,0.01,passthrough,"{'clf__C': 0.01, 'dim_reduce': 'passthrough'}",0.837607,0.863248,0.851852,0.834758,0.837143,0.844921,0.010965,1
1,124.816299,12.467988,28.630904,6.206224,0.01,PCA(n_components=5),"{'clf__C': 0.01, 'dim_reduce': PCA(n_component...",0.757835,0.814815,0.746439,0.717949,0.765714,0.76055,0.0316,9
2,144.005225,10.016895,30.134759,2.743409,0.01,PCA(n_components=60),"{'clf__C': 0.01, 'dim_reduce': PCA(n_component...",0.840456,0.846154,0.82906,0.826211,0.837143,0.835805,0.007322,4
3,180.55772,7.424022,30.733291,1.79862,0.1,passthrough,"{'clf__C': 0.1, 'dim_reduce': 'passthrough'}",0.834758,0.854701,0.843305,0.820513,0.848571,0.84037,0.011892,2
4,135.790104,11.202529,33.517472,3.685712,0.1,PCA(n_components=5),"{'clf__C': 0.1, 'dim_reduce': PCA(n_components...",0.774929,0.803419,0.752137,0.723647,0.76,0.762826,0.026274,8
5,153.642905,6.945541,30.936403,4.726232,0.1,PCA(n_components=60),"{'clf__C': 0.1, 'dim_reduce': PCA(n_components...",0.826211,0.851852,0.840456,0.817664,0.817143,0.830665,0.013539,5
6,184.236702,6.068779,34.425767,3.328447,1.0,passthrough,"{'clf__C': 1.0, 'dim_reduce': 'passthrough'}",0.840456,0.854701,0.843305,0.834758,0.828571,0.840358,0.008771,3
7,149.436369,5.992271,32.437749,4.00303,1.0,PCA(n_components=5),"{'clf__C': 1.0, 'dim_reduce': PCA(n_components...",0.766382,0.814815,0.749288,0.735043,0.762857,0.765677,0.026945,7
8,71.735637,13.374527,8.13212,1.938121,1.0,PCA(n_components=60),"{'clf__C': 1.0, 'dim_reduce': PCA(n_components...",0.814815,0.85755,0.831909,0.826211,0.817143,0.829525,0.015309,6


In [15]:
# Raw classifier testing: flatten every recording into one feature vector and
# fit a linear SGD classifier directly on the raw values.
# The row count is read from each array and -1 lets NumPy infer the flattened
# width, replacing the hard-coded (1754, 12000) that only matched one dataset size.
# NOTE: the name `knn_raw` is kept for compatibility although the model is an SGDClassifier.
knn_raw = SGDClassifier().fit(
    get_data.X_train.reshape((get_data.X_train.shape[0], -1)),
    get_data.y_train,
)
# Last expression: score on the flattened test split, shown as the cell output.
knn_raw.score(
    get_data.X_test.reshape((get_data.X_test.shape[0], -1)),
    get_data.y_test,
)

0.5239179954441914

In [55]:
from wavelet_features import get_ecg_features

In [60]:
# Build feature matrices for both splits by passing the raw arrays from
# get_data through wavelet_features.get_ecg_features.
X_train_wav = get_ecg_features(get_data.X_train)
X_test_wav = get_ecg_features(get_data.X_test)

In [64]:
# Show the best pipeline found by the grid search.
# NOTE(review): `Gridcv_wav` is not created in any cell visible in this notebook --
# confirm its defining cell runs before this one.
Gridcv_wav.best_estimator_

Pipeline(steps=[('scale', StandardScaler()), ('dim_reduce', 'passthrough'),
                ('clf', LogisticRegression(C=0.01, max_iter=400))])

In [67]:
# Folder for persisted models: an 'Output' directory one level above the
# current working directory. os.pardir + os.path.join builds the path portably;
# the earlier literal r'..\Output' only names a parent-directory path on Windows.
output_folder = os.path.abspath(os.path.join(os.pardir, 'Output'))
# Make sure the target directory exists before writing into it.
os.makedirs(output_folder, exist_ok=True)
# Persist the fitted grid search; `output_folder` is reused by a later dump cell.
dump(Gridcv_wav, os.path.join(output_folder, 'logit_gridcv_02.joblib'))

['C:\\Users\\Eric\\Desktop\\Classes\\GT_ISYE6740\\Project\\ISYE6740_ECG_Proj\\Output\\logit_gridcv_02.joblib']

In [68]:
# Tabulate the cross-validation results of `Gridcv_wav`, one row per parameter combination.
pd.DataFrame(Gridcv_wav.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__C,param_dim_reduce,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.237316,0.079031,0.020252,0.003619,0.01,passthrough,"{'clf__C': 0.01, 'dim_reduce': 'passthrough'}",0.786325,0.823362,0.780627,0.792023,0.774286,0.791324,0.017068,1
1,0.590352,0.113516,0.015395,0.005555,0.01,PCA(n_components=5),"{'clf__C': 0.01, 'dim_reduce': PCA(n_component...",0.698006,0.626781,0.68661,0.62963,0.665714,0.661348,0.02899,7
2,0.808606,0.093471,0.020211,0.006204,0.01,PCA(n_components=60),"{'clf__C': 0.01, 'dim_reduce': PCA(n_component...",0.780627,0.77208,0.766382,0.789174,0.785714,0.778795,0.008463,2
3,2.903848,0.352175,0.018748,0.006247,0.1,passthrough,"{'clf__C': 0.1, 'dim_reduce': 'passthrough'}",0.760684,0.789174,0.774929,0.777778,0.757143,0.771941,0.01171,5
4,0.567362,0.071796,0.030784,0.015925,0.1,PCA(n_components=5),"{'clf__C': 0.1, 'dim_reduce': PCA(n_components...",0.698006,0.626781,0.683761,0.62963,0.662857,0.660207,0.028435,8
5,0.766599,0.0363,0.03353,0.004307,0.1,PCA(n_components=60),"{'clf__C': 0.1, 'dim_reduce': PCA(n_components...",0.777778,0.763533,0.77208,0.780627,0.782857,0.775375,0.006934,4
6,4.834807,0.127594,0.007181,0.00342,1.0,passthrough,"{'clf__C': 1.0, 'dim_reduce': 'passthrough'}",0.706553,0.749288,0.735043,0.760684,0.751429,0.740599,0.018901,6
7,0.620551,0.021959,0.026134,0.01178,1.0,PCA(n_components=5),"{'clf__C': 1.0, 'dim_reduce': PCA(n_components...",0.698006,0.626781,0.683761,0.62963,0.662857,0.660207,0.028435,8
8,0.799541,0.034296,0.026362,0.006762,1.0,PCA(n_components=60),"{'clf__C': 1.0, 'dim_reduce': PCA(n_components...",0.777778,0.760684,0.769231,0.789174,0.785714,0.776516,0.01049,3


In [72]:
# Score the fitted `SGD_gridcv` search on the nested test frame.
# NOTE(review): `SGD_gridcv` is not defined in any cell visible in this notebook --
# confirm its defining cell runs before this one.
SGD_gridcv.score(X_test, get_data.y_test)

0.8428246013667426

In [73]:
# Persist `SGD_gridcv` into `output_folder`, which is set in the earlier cell that saves `Gridcv_wav`.
dump(SGD_gridcv, os.path.join(output_folder, 'SGD_gridcv_01.joblib'))

['C:\\Users\\Eric\\Desktop\\Classes\\GT_ISYE6740\\Project\\ISYE6740_ECG_Proj\\Output\\SGD_gridcv_01.joblib']

In [143]:
# list of models, feel free to add more
models = [SVC(), LogisticRegression(max_iter=400), GaussianNB(), KNeighborsClassifier(),
          RandomForestClassifier(), RidgeClassifier(), AdaBoostClassifier()]

# Dictionary of per-model parameter grids (follow this formatting when editing).
# Outer keys are the step names make_pipeline assigns to each classifier
# (the lowercase class name); inner keys use '<step name>__<parameter>'.
model_params = {
    # 'scale' / 'auto' are values of SVC's `gamma` parameter, not of `kernel`.
    'svc': {'svc__C': np.logspace(-3, 1, 5), 'svc__gamma': ['scale', 'auto']},
    'logisticregression': {'logisticregression__C': np.logspace(-3, 1, 5)},
    # No model-specific grid for GaussianNB; the empty entry keeps lookups by step name from failing.
    'gaussiannb': {},
    # KNeighborsClassifier names its neighbour count `n_neighbors` (there is no `k` parameter).
    'kneighborsclassifier': {'kneighborsclassifier__n_neighbors': [1, 5, 7]},
    'randomforestclassifier': {'randomforestclassifier__n_estimators': [50, 100, 200]},
    'ridgeclassifier': {'ridgeclassifier__alpha': np.logspace(-2, 2, 5)},
    'adaboostclassifier': {'adaboostclassifier__n_estimators': [50, 100]},
}

def make_gridcv(classifier):
    '''
    Build a GridSearchCV around a scale -> PCA -> classifier pipeline.

    Given an sklearn classifier (e.g. SVC()), builds the pipeline with
    make_pipeline and a parameter grid made of the default PCA options plus the
    classifier's own entry in the model_params dictionary. A classifier with no
    entry in model_params is searched over the PCA options only.

    Parameters
    ----------
    classifier : sklearn estimator
        Unfitted classifier instance placed as the last pipeline step.

    Returns
    -------
    GridSearchCV
        An unfitted search object ready to be fit to data.
    '''
    #==================================================================================
    # We will need to edit the evaluation criterion of grid cv to a different metric
    # for multilabel classification. See:
    # https://scikit-learn.org/stable/modules/model_evaluation.html#multimetric-scoring
    #
    # Multimetric is also possible:
    # https://scikit-learn.org/stable/modules/grid_search.html#multimetric-grid-search
    #==================================================================================
    pipe = make_pipeline(StandardScaler(), PCA(), classifier)

    # The 'pca' step is swapped as a whole: skipped ('passthrough') or replaced by a
    # PCA whose fractional n_components is 0.80 / 0.90 / 0.95.
    param_grid = {
        'pca': ['passthrough',
                PCA(.80, svd_solver='full'),
                PCA(.90, svd_solver='full'),
                PCA(.95, svd_solver='full')],
    }

    # make_pipeline names steps after the lowercase class name; the classifier is last.
    clf_step_name = pipe.steps[-1][0]

    # Merge in the model-specific grid; .get avoids a KeyError for classifiers
    # that have no entry in model_params.
    param_grid.update(model_params.get(clf_step_name, {}))

    return GridSearchCV(pipe, param_grid, n_jobs=-1)
    

In [144]:
# models[-2] is the second-to-last entry of `models`, which is RidgeClassifier()
# in the list as defined in this notebook; the positional index must be updated
# if that list is reordered or extended.
ridge_grid = make_gridcv(models[-2])

In [145]:
# Run the grid search on the features returned by get_ecg_features for the training split.
ridge_grid.fit(X_train_wav, get_data.y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('pca', PCA()),
                                       ('ridgeclassifier', RidgeClassifier())]),
             n_jobs=-1,
             param_grid={'pca': ['passthrough',
                                 PCA(n_components=0.8, svd_solver='full'),
                                 PCA(n_components=0.9, svd_solver='full'),
                                 PCA(n_components=0.95, svd_solver='full')],
                         'ridgeclassifier__alpha': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])})

In [146]:
# Score the fitted grid search on the held-out test features.
ridge_grid.score(X_test_wav, get_data.y_test)

0.7790432801822323

In [147]:
# Show the pipeline (PCA choice and ridge alpha) selected by the grid search.
ridge_grid.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=0.95, svd_solver='full')),
                ('ridgeclassifier', RidgeClassifier(alpha=100.0))])