# Notebook 1 - Classification

## Section 0 - Get signal features

In [None]:
from multiprocessing import Pool, cpu_count
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import periodogram
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import Imputer
import wfdb
from wfdb import processing

from vt.records import get_alarms, data_dir
from vt.features import calc_moments, calc_spectral_ratios

In [None]:
# Load the alarm table and the record name lists from the project helper.
# `alarms` is later indexed by record name to read its 'result' column, and
# `record_names` is the list of records that features are computed for.
# NOTE(review): the true/false lists are presumably the record names split by
# alarm result - confirm against vt.records.get_alarms.
alarms, record_names, record_names_true, record_names_false = get_alarms()

In [None]:
def calc_features(record_name):
    """
    Build the single-row feature table for one record.

    Only the window from second 290 to second 300 of the record is read,
    using a hard-coded sampling frequency of 250, from channels 0, 1 and 2.
    For each channel the following values are appended, in this order:

    - the values returned by `calc_moments` (labelled mean, std, skew, kurt)
    - n_beats: the number of RR intervals returned by `processing.calc_rr`
    - hr: the mean heart rate from `processing.calc_mean_hr`
    - the values returned by `calc_spectral_ratios` (labelled lfp, mfp, hfp)

    Beat locations for channels 0 and 1 come from running
    `processing.xqrs_detect` on the signal itself; for channel 2 they are
    read from the record's 'wabp2' annotation file.

    Parameters
    ----------
    record_name : str
        The record name, relative to `data_dir`. It must also be a key of
        the module-level `alarms` table.

    Returns
    -------
    features : pandas dataframe
        One row indexed by `record_name`, with one column per
        feature/channel pair (e.g. 'mean_0') plus a final 'result' column
        holding the alarm label.

    """
    fs = 250
    start_sec = 290
    stop_sec = 300
    first_samp = start_sec * fs
    last_samp = stop_sec * fs
    record_path = os.path.join(data_dir, record_name)

    # Column labels: every per-channel feature name suffixed with the channel
    # number, channel by channel, followed by the alarm label
    per_channel_names = ('mean', 'std', 'skew', 'kurt', 'n_beats', 'hr',
                         'lfp', 'mfp', 'hfp')
    feature_labels = ['_'.join((name, str(ch)))
                      for ch in range(3)
                      for name in per_channel_names]
    feature_labels.append('result')

    # Read the analysis window of the record
    signal, fields = wfdb.rdsamp(record_path, sampfrom=first_samp,
                                 sampto=last_samp, channels=[0, 1, 2])

    # Beat locations, one entry per channel
    qrs_0 = processing.xqrs_detect(signal[:, 0], fs=fs, verbose=False)
    qrs_1 = processing.xqrs_detect(signal[:, 1], fs=fs, verbose=False)
    pulse_2 = wfdb.rdann(record_path, 'wabp2', sampfrom=first_samp,
                         sampto=last_samp, shift_samps=True).sample
    beat_inds = [qrs_0, qrs_1, pulse_2]

    # Accumulate the feature values channel by channel
    row = []
    for ch, beat_locs in enumerate(beat_inds):
        channel_signal = signal[:, ch]

        # Moments
        row.extend(calc_moments(channel_signal))

        # Beat information
        rr = processing.calc_rr(qrs_locs=beat_locs)
        row.append(len(rr))
        row.append(processing.calc_mean_hr(rr=rr, fs=fs))

        # Frequency information
        row.extend(calc_spectral_ratios(channel_signal, fs=fs))

    # The alarm label goes in the last column
    row.append(alarms.loc[record_name]['result'])

    return pd.DataFrame([row], columns=feature_labels, index=[record_name])

In [None]:
# Calculate features for all records using multiple cpus. One core is left
# free, but at least one worker is always requested: Pool raises ValueError
# for processes=0, which is what cpu_count() - 1 gives on a single-core host.
pool = Pool(processes=max(1, cpu_count() - 1))
try:
    features = pool.map(calc_features, record_names)
finally:
    # Always release the worker processes, even if a record fails
    pool.close()
    pool.join()

# Combine the per-record single-row frames into a single data frame
features = pd.concat(features)

# Impute the missing nans with the column mean. The transformed result is
# indexed positionally further down, i.e. it is treated as a plain array
# rather than a labelled data frame.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22 in favour of sklearn.impute.SimpleImputer - confirm the pinned version.
imp = Imputer(missing_values='NaN', strategy='mean')
imp.fit(features)
features = imp.transform(features)

print('Finished calculating features')

## Section 1 - Training and Testing Data

- We take a subset of our data as the training set. Supervised classifiers can use this labelled data to learn how to discern between the two outcome categories.
- We take the remaining data as the testing set, which we use to evaluate our algorithms/models. This is analogous to new data we have not previously encountered.


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a quarter of the records for testing. The last column of the
# feature array is the alarm label; every other column is a model input.
x_train, x_test, y_train, y_test = train_test_split(
    features[:, :-1],
    features[:, -1],
    train_size=0.75,
    test_size=0.25,
    random_state=0,
)
print('Number of training records: %d' % x_train.shape[0])
print('Number of testing records: %d' % x_test.shape[0])

## Section 2 - Supervised Classifiers

A supervised classifier learns parameters from labeled training data (the alarm results). After being trained, it can be used to classify new unlabelled data.

Examples:
- Logistic regression (LR) http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- K nearest neighbors (KNN) http://scikit-learn.org/stable/modules/neighbors.html
- Support vector machine (SVM) http://scikit-learn.org/stable/modules/svm.html
- Gradient Boosting (GB) http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

We are using hard classifiers, as opposed to fuzzy/soft classifiers. Each decision falls firmly into one category, rather than outputting a probability.

On top of these, we can combine them with an ensemble method, e.g. a Voting Classifier http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html


*We can also use an unsupervised rule-based classifier, leveraging expert knowledge, rather than relying on a model that is limited by its training data.

In [None]:
from sklearn import svm, neighbors
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Train each classifier with its default settings on the training set, then
# record its predictions for the testing set.

# Logistic regression
clf_lr = LogisticRegression().fit(x_train, y_train)
y_predict_lr = clf_lr.predict(x_test)

# K nearest neighbors
clf_knn = neighbors.KNeighborsClassifier().fit(x_train, y_train)
y_predict_knn = clf_knn.predict(x_test)

# Support vector machine
clf_svm = svm.SVC().fit(x_train, y_train)
y_predict_svm = clf_svm.predict(x_test)

# Gradient boosting
clf_gb = GradientBoostingClassifier().fit(x_train, y_train)
y_predict_gb = clf_gb.predict(x_test)

## Section 3 - Evaluating Performance

In order to determine how well our system performs, we need an objective evaluation function.

### The confusion matrix
![Confusion Matrix](http://www.dataschool.io/content/images/2015/01/confusion_matrix2.png)

### The cost matrix

The cost matrix is the confusion matrix weighted by the penalty of each decision result. In our challenge, we assign zero cost to correct predictions (as is the usual case), and the cost of *False Negatives* is 5x as great as the cost of *False Positives*.

**`Score = ( TP + TN ) / ( TP + TN + FP + 5*FN )`**

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
def calc_results(y_true, y_pred):
    """
    Summarise how well a set of predictions matches the true outcomes.

    Parameters
    ----------
    y_true : array-like
        The true outcomes.
    y_pred : array-like
        The predicted outcomes, in the same order as `y_true`.

    Returns
    -------
    cm : pandas dataframe
        The confusion matrix, with rows labelled 'Actual 0'/'Actual 1' and
        columns labelled 'Predict 0'/'Predict 1'.
    p_correct : float
        The proportion of predictions that are correct.
    score : float
        The final score, as computed by `calc_final_score`::

            Score = ( TP + TN ) / ( TP + TN + FP + 5*FN )

    """
    # Label the raw confusion matrix so it displays readably
    table = pd.DataFrame(confusion_matrix(y_true, y_pred),
                         columns=['Predict 0', 'Predict 1'],
                         index=['Actual 0', 'Actual 1'])

    # Correct predictions sit on the diagonal
    n_correct = table.iloc[0, 0] + table.iloc[1, 1]

    return table, n_correct / len(y_pred), calc_final_score(table)


def calc_final_score(cm):
    """
    Calculate final score from a binary confusion matrix. False negatives
    are penalized 5x as much as false positives::

        Score = ( TP + TN ) / ( TP + TN + FP + 5*FN )

    Parameters
    ----------
    cm : pandas dataframe or numpy array
        2x2 confusion matrix with the actual classes on the rows and the
        predicted classes on the columns, class 0 first::

            [[TN, FP],
             [FN, TP]]

        Any other 2x2 array-like is accepted as well.

    Returns
    -------
    score : float
        The final score.

    Raises
    ------
    ValueError
        If `cm` is not a 2x2 matrix.

    """
    # Work on a plain array so data frames and arrays share a single formula
    # and cannot drift apart.
    counts = np.asarray(cm)
    if counts.shape != (2, 2):
        raise ValueError('cm must be a 2x2 confusion matrix, got shape %s'
                         % (counts.shape,))

    tn, fp = counts[0]
    fn, tp = counts[1]
    n_correct = tp + tn

    return n_correct / (n_correct + fp + 5*fn)

def print_results(cm, pcorrect, score, classifier_name=''):
    """
    Display the performance results for one classifier.

    Prints the classifier name, renders the confusion matrix with
    `display`, then prints the proportion correct and the final score,
    followed by blank lines to separate consecutive reports.

    """
    print('Classifier: %s' % classifier_name)

    print('Confusion Matrix:')
    display(cm)

    # The two scalar metrics share the same "label value" layout
    for label, value in (('Proportion Correct:', pcorrect),
                         ('Final Score:', score)):
        print(label, value)

    print('\n\n')


In [None]:
# Evaluate and report each classifier's test-set predictions in turn
for clf_label, clf_predictions in (('KNN', y_predict_knn),
                                   ('SVM', y_predict_svm),
                                   ('LR', y_predict_lr),
                                   ('GB', y_predict_gb)):
    cm, p_correct, score = calc_results(y_test, clf_predictions)
    print_results(cm, p_correct, score, clf_label)

*Most classifications are correct, yet the score is low because of the disproportionate penalty of false negatives. Because there are many more cases of false alarms, the trained models may favor outputting 'False'.

In [None]:
# Show how the two alarm classes (0 = false alarm, 1 = true alarm) are
# distributed within the training and the testing split
print('%d training records - %d false alarms, %d true alarms'
      % (len(y_train),
         np.count_nonzero(y_train == 0),
         np.count_nonzero(y_train == 1)))

print('%d testing records - %d false alarms, %d true alarms'
      % (len(y_test),
         np.count_nonzero(y_test == 0),
         np.count_nonzero(y_test == 1)))