In [33]:
from experiments.scp_experiment import SCP_Experiment
from models.base_model import ClassificationModel
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier

class LogisticRegressionModel(ClassificationModel):
    """
    Multi-label linear baseline: one independent logistic regression per label,
    wrapped in scikit-learn's MultiOutputClassifier.

    Every input batch is indexed as ``X[:, ::downsample, channels]`` and then
    flattened to one feature vector per sample before it reaches scikit-learn.

    Reference: 
    ------------------
    https://scikit-learn.org/stable/modules/multiclass.html#multioutputclassifier
    """
    def __init__(self, name, n_classes,  sampling_frequency, outputfolder, input_shape, 
                 downsample=1, channels=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)):
        """
        Store the configuration; no model is built until fit() is called.

        Parameters
        ----------
        name, n_classes, sampling_frequency, outputfolder, input_shape :
            Stored on the instance unchanged; none of them is read by this class.
        downsample : int
            Step used when slicing the second axis of the inputs (1 keeps every step).
        channels : sequence of int
            Indices selected on the last axis of the inputs.
        """
        self.name = name
        self.n_classes = n_classes
        self.sampling_frequency = sampling_frequency
        self.outputfolder = outputfolder
        self.input_shape = input_shape
        self.downsample = downsample
        self.channels = channels

    def _flatten_inputs(self, X):
        """
        Apply the downsample/channel selection and flatten each sample to 1-D.

        Returns a 2-D array with ``X.shape[0]`` rows. Shared by fit() and predict()
        so that training and inference always see identically built features.
        """
        return X[:, ::self.downsample, self.channels].reshape(X.shape[0], -1)

    def fit(self, X_train, y_train, X_val, y_val):
        """
        Train one classifier per hyperparameter candidate and keep the best one.

        Parameters
        ----------
        X_train, X_val : array indexable as ``X[:, ::downsample, channels]``
        y_train, y_val : label matrices passed unchanged to MultiOutputClassifier

        Side effects
        ------------
        Sets ``self.model_candidates`` (one result dict per candidate) and
        ``self.classifier`` (the fitted model with the highest validation score).
        """
        import datetime
        print(datetime.datetime.now(), " - Training start")
        
        # Hyperparameters
        # The candidate with the highest validation score() is selected. For a
        # MultiOutputClassifier that score is exact-match accuracy over all labels,
        # which is very strict when there are many labels.
        # NOTE(review): the experiment results are reported as macro_auc; a selection
        # metric closer to that may be preferable - confirm before changing.
        # NOTE(review): scikit-learn documents that the 'sag' solver only converges
        # quickly on features of similar scale; inputs are not standardized here.
        hyperparameter_candidates = [
            {'C': 1.0, 'solver': 'sag', 'max_iter': 100},
            # {'C': 10.0, 'solver': 'sag', 'max_iter': 100},
            # {'C': 0.1, 'solver': 'sag', 'max_iter': 100}, # The regularization seems to give identical results...
        ]
        model_candidates = []

        # Flattening copies the data and does not depend on the hyperparameters,
        # so do it once instead of on every loop iteration.
        X_train_flat = self._flatten_inputs(X_train)
        X_val_flat = self._flatten_inputs(X_val)
        
        for hp in hyperparameter_candidates:
            print(datetime.datetime.now(), " - Training with ", hp)
            
            clf = MultiOutputClassifier(LogisticRegression(**hp)).fit(X_train_flat, y_train)
            score_train = clf.score(X_train_flat, y_train)
            score_val = clf.score(X_val_flat, y_val)
            
            result = {"score_train": score_train, "score_val": score_val, "model": clf, "hyperparameters": hp}
            model_candidates.append(result)
            
            print(datetime.datetime.now(), " - Result: ", result)
        
        # On ties max() keeps the earliest candidate in the list.
        best_model = max(model_candidates, key=lambda x: x["score_val"])
        print(datetime.datetime.now(), " - Best model: ", best_model)
        
        self.model_candidates = model_candidates
        self.classifier = best_model["model"]
        print(datetime.datetime.now(), " - Training complete")

    def predict(self, X):
        """
        Generate predictions based on X at test time.

        Returns an array of shape (n_samples, n_labels) holding, for every label,
        the predicted probability of the positive class.

        Raises
        ------
        AttributeError
            If called before fit().
        """
        if not hasattr(self, "classifier"):
            raise AttributeError(
                "LogisticRegressionModel has no fitted classifier; call fit() before predict()")
        # predict_proba returns one (n_samples, 2) array per label.
        proba = self.classifier.predict_proba(self._flatten_inputs(X))
        ret = np.array(proba)[..., 1].T # Only keep the probability of the positive class
        return ret


In [34]:
# Config for our modified SCP_Experiment class, which accepts a "modelclass"
# entry so that any custom model defined in a notebook can be tested easily.
datafolder = '../data/ptbxl/'
outputfolder = '../output/'

config = dict(
    modelname='LogisticRegression',
    modelclass=LogisticRegressionModel,
    modeltype=None,
    parameters=dict(downsample=10, channels=(0, 1, 2)),
)
# experiments = [
#     ('exp0', 'all'),
#     ('exp1', 'diagnostic'),
#     ('exp1.1', 'subdiagnostic'),
#     ('exp1.1.1', 'superdiagnostic'),
#     ('exp2', 'form'),
#     ('exp3', 'rhythm')
#     ]



In [35]:
# Build the experiment with the logistic-regression config as its only model.
experiment = SCP_Experiment(
    experiment_name="logistic_regression_exp",
    task="all",  # alternative: "rhythm"
    datafolder=datafolder,
    outputfolder=outputfolder,
    models=[config],
)

In [36]:
# Slow step: time experiment.prepare() with a monotonic high-resolution clock.
import time

tic = time.perf_counter()
experiment.prepare()
toc = time.perf_counter()
print(f"expriment.prepare() finished in {toc - tic} sec")

expriment.prepare() finished in 12.73757969999997 sec


In [37]:
# Re-assign the model list so this cell always runs with the current `config`
# (the constructor call already received models=[config]).
# NOTE(review): presumably perform() trains every configured model - the log
# below is the output of LogisticRegressionModel.fit; confirm against SCP_Experiment.
experiment.models = [config]
experiment.perform()

2021-03-24 20:58:50.274391  - Training start
2021-03-24 20:58:50.274391  - Training with  {'C': 1.0, 'solver': 'sag', 'max_iter': 100}




2021-03-24 21:04:25.628038  - Result:  {'score_train': 0.02826672782523938, 'score_val': 0.024623803009575923, 'model': MultiOutputClassifier(estimator=LogisticRegression(solver='sag')), 'hyperparameters': {'C': 1.0, 'solver': 'sag', 'max_iter': 100}}
2021-03-24 21:04:25.628038  - Best model:  {'score_train': 0.02826672782523938, 'score_val': 0.024623803009575923, 'model': MultiOutputClassifier(estimator=LogisticRegression(solver='sag')), 'hyperparameters': {'C': 1.0, 'solver': 'sag', 'max_iter': 100}}
2021-03-24 21:04:25.628038  - Training complete


In [38]:
# Evaluate with bootstrapping switched off.
# NOTE(review): presumably this is why point/mean/lower/upper are identical in
# the results tables printed further down - confirm against SCP_Experiment.evaluate.
experiment.evaluate(bootstrap_eval=False)

LogisticRegression
ensemble
naive


In [39]:
import pandas as pd
import os

# Print the te_results.csv table (presumably the test-split results) of every
# entry under <outputfolder>/<experiment_name>/models, in alphabetical order.
# The directory is built once with os.path.join so it no longer depends on
# outputfolder ending with a path separator.
models_dir = os.path.join(experiment.outputfolder, experiment.experiment_name, 'models')
for m in sorted(os.listdir(models_dir)):
    print(m)
    # The trailing '' keeps a separator at the end of rpath, as before.
    rpath = os.path.join(models_dir, m, 'results', '')
    print(pd.read_csv(os.path.join(rpath, 'te_results.csv'), index_col=0))


LogisticRegression
       macro_auc
point   0.498172
mean    0.498172
lower   0.498172
upper   0.498172
ensemble
       macro_auc
point   0.498172
mean    0.498172
lower   0.498172
upper   0.498172
naive
       macro_auc
point        0.5
mean         0.5
lower        0.5
upper        0.5


In [40]:
# Sanity-check the dimensions of the training inputs
# (presumably populated by experiment.prepare() - confirm).
experiment.X_train.shape

(17441, 1000, 12)

In [41]:
# Single-label sanity check: fit a plain logistic regression on last-axis
# index 0 only, against label column 0, then report the training accuracy.
n_train = experiment.X_train.shape[0]
X_train_ch0 = experiment.X_train[:, :, 0].reshape(n_train, -1)
y_train_label0 = experiment.y_train[:, 0]

clf = LogisticRegression(solver='sag', max_iter=500).fit(X_train_ch0, y_train_label0)
clf.score(X_train_ch0, y_train_label0)



0.9638208818301703

In [13]:
# NOTE(review): [:, 0] indexes the second axis, not the last one, so on a
# (samples, 1000, 12) array it yields (samples, 12). The AttributeError recorded
# below comes from an earlier run (In [13]) in which `experiment` had no X_train
# yet - presumably prepare() had not been called; re-run to refresh the output.
experiment.X_train[:, 0].shape

AttributeError: 'SCP_Experiment' object has no attribute 'X_train'

In [48]:
# Accuracy of the single-label classifier on the test split (last-axis index 0 only).
X_test_ch0 = experiment.X_test[:, :, 0].reshape(experiment.X_test.shape[0], -1)
clf.score(X_test_ch0, experiment.y_test[:, 0])

0.562358276643991

In [53]:
# Hard class predictions of the single-label classifier on the test split.
X_test_ch0 = experiment.X_test[:, :, 0].reshape(experiment.X_test.shape[0], -1)
clf.predict(X_test_ch0)

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,

In [54]:
# Per-class probabilities of the single-label classifier on the test split
# (one column per class).
X_test_ch0 = experiment.X_test[:, :, 0].reshape(experiment.X_test.shape[0], -1)
clf.predict_proba(X_test_ch0)

array([[0.7327405 , 0.2672595 ],
       [0.74475102, 0.25524898],
       [0.55615577, 0.44384423],
       ...,
       [0.71191941, 0.28808059],
       [0.67495533, 0.32504467],
       [0.69550277, 0.30449723]])

In [57]:
# Two-label check of the multi-output wrapper: last-axis indices 0-1 as
# features, label columns 0-1 as targets, and a deliberately short training run.
n_train = experiment.X_train.shape[0]
X_train = experiment.X_train[:, :, 0:2].reshape(n_train, -1)
y_train = experiment.y_train[:, 0:2]

base_estimator = LogisticRegression(solver='sag', max_iter=10)
multi_clf = MultiOutputClassifier(base_estimator).fit(X_train, y_train)



In [59]:
# For a MultiOutputClassifier, score() is exact-match accuracy: a sample counts
# as correct only when every one of its labels is predicted correctly.
multi_clf.score(X_train, y_train)

0.6388503193557346

In [68]:
# predict_proba returns one (n_samples, 2) array per label; take column 1
# (probability of class 1) from each and lay them out as (n_samples, n_labels).
np.stack([label_proba[:, 1] for label_proba in multi_clf.predict_proba(X_train)], axis=1)

array([[0.368526  , 0.23780569],
       [0.5157469 , 0.12042826],
       [0.3686238 , 0.0859225 ],
       ...,
       [0.4687357 , 0.1359982 ],
       [0.37877181, 0.09416581],
       [0.40798742, 0.14934758]])

In [66]:
# Hard 0/1 predictions, one column per label column of y_train
# (two here, because y_train was sliced as [:, 0:2]).
multi_clf.predict(X_train)

array([[0, 0],
       [1, 0],
       [0, 0],
       ...,
       [0, 0],
       [0, 0],
       [0, 0]])

In [83]:
# Display the two-label training targets (the y_train[:, 0:2] slice defined
# earlier) for comparison with the predictions.
y_train

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [1, 0],
       [0, 0],
       [1, 0]])

In [77]:
# Scratch check of max() tie-breaking: both entries have the same key value,
# and max() returns the first maximal item it encounters.
d = {key: {0: 0} for key in (0, 1)}
max(d.items(), key=lambda pair: pair[1][0])

(0, {0: 0})

In [74]:
# Same check with string values: the key compares the first character of each
# value, so the entry (1, '1') is the maximum.
# NOTE(review): the recorded output below (`int`) does not match this
# expression and looks stale - re-run to refresh it.
d = {number: str(number) for number in (0, 1)}
max(d.items(), key=lambda pair: pair[1][0])

int

In [16]:
# NOTE(review): duplicate of the earlier shape check. The execution count
# (In [16]) and the different sample count in the recorded output indicate it
# was run in another session - re-run or remove.
experiment.X_train.shape

(16854, 1000, 12)