In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import general_features
import dissimilarity

#loading dataset
import data

#Feature selection
from sklearn.feature_selection import SelectKBest, f_classif, chi2

# Train/Test splitting
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier

# Class Imbalance
from sklearn.utils import resample

# Classification
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC

# Error
from sklearn.metrics import f1_score


import warnings
warnings.filterwarnings('ignore')

# Loading Data

In [2]:
X_raw = data.load_X_train()
y_raw = data.load_labels()
HB_raw = data.load_heartbeats_from_file()

In [9]:
HB_raw[3].shape
HB_var.shape

(5117, 185)

# Feature Extraction

In [3]:
# Generating general normalized features for the individual heartbeats
HB_features = general_features.get_heartbeat_features(HB_raw)


In [None]:
# get_heartbeat_features - function
HB_features = pd.DataFrame(0, index=np.arange(len(HB_raw)), columns=['HB_mean', 'HB_median', 'HB_std', 'HB_max',
                                                                   'HB_min', 'rpeak1', 'rpeak2', 'rpeak3'])

for i in range(0, len(HB_raw)):
    HB_features.set_value(i, 'HB_mean', np.median(HB_raw[i].mean(axis=0)))
    HB_features.set_value(i, 'HB_median', np.median(np.median(HB_raw[i], axis=0)))
    HB_features.set_value(i, 'HB_std', np.median(HB_raw[i].std(axis=0)))
    HB_features.set_value(i, 'HB_max', np.median(HB_raw[i].max(axis=0)))
    HB_features.set_value(i, 'HB_min', np.median(HB_raw[i].min(axis=0)))
    HB_features.set_value(i, 'rpeak1', np.median(HB_raw[i][:, 59]))
    HB_features.set_value(i, 'rpeak2', np.median(HB_raw[i][:, 60]))
    HB_features.set_value(i, 'rpeak3', np.median(HB_raw[i][:, 61]))

HB_features = (HB_features - HB_features.mean(axis=0)) / HB_features.std(axis=0)

In [4]:
# Generating general features for the variance of the heartbeats
var = pd.read_csv('out/HB_Variance_train.csv', float_precision='high').drop('id', axis=1)
var['varmean'] = var.mean(axis=1)
var['varmed'] = var.median(axis=1)
var['varmax'] = var.max(axis=1)
var['varmin'] = var.min(axis=1)
var['varstd'] = var.std(axis=1)

# Normalizing the features
HB_var = (var - var.mean(axis=0))/var.std(axis=0)

In [5]:
pulse = pd.DataFrame(0, index=np.arange(len(HB_raw)), columns=['pulse'])
for i in range(0, len(HB_raw)):
    pulse.set_value(i, 'pulse', (len(X_raw[i])/HB_raw[i].shape[0]))
pulse = (pulse - pulse.mean(axis=0))/pulse.std(axis=0)

In [6]:
# Extracting general normalized features for the time series X
X = general_features.get_general_features(X_raw)
print(X.shape)
X = pd.concat([X, HB_features, HB_var, pulse], axis=1)
print(X.shape)
y = pd.DataFrame(y_raw, columns=['y'])

(5117, 5)
(5117, 199)


## Class imbalance

In [109]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train.head()

Unnamed: 0,mean,median,std,max,min,HB_mean,HB_median,HB_std,HB_max,HB_min,...,t176,t177,t178,t179,varmean,varmed,varmax,varmin,varstd,pulse
3105,0.533278,0.317583,0.837692,0.861036,-0.166542,-0.97977,-0.747997,1.210417,0.483153,-1.002452,...,-0.104546,-0.112233,-0.119485,-0.130771,0.306279,0.205114,0.084082,-0.000751,0.196666,-0.058731
250,0.133807,-0.597822,-0.547622,-0.510912,0.386613,-1.113899,-1.014295,-0.442357,-0.36443,0.287588,...,-0.143455,-0.152907,-0.160915,-0.175378,-0.183358,-0.131085,-0.236653,-0.125242,-0.246337,-0.397544
2195,0.832881,0.409124,-0.060197,0.14367,0.638893,-0.443253,-0.34855,-0.52106,-0.650489,0.525749,...,-0.141343,-0.150752,-0.158791,-0.173186,-0.18644,-0.135324,-0.248641,-0.134166,-0.252282,2.580444
55,-0.365531,-0.27743,0.478536,0.132043,-1.026776,0.227393,0.317195,-0.442357,-0.523351,0.297512,...,-0.139988,-0.148625,-0.156415,-0.170664,-0.171101,-0.131324,-0.209199,-0.129162,-0.21726,3.561218
3378,-0.6152,-0.048579,-1.035047,-1.093408,0.446581,0.629781,0.583493,-0.573529,-0.533946,0.605136,...,-0.145247,-0.155274,-0.163776,-0.178827,-0.190024,-0.138513,-0.241713,-0.132026,-0.2498,0.458404


In [44]:
# Resample dataset
print(y_train['y'].value_counts())
X_resampled, y_resampled = general_features.resample_set(X_train, y_train, y_train['y'].value_counts()[2], 27)
print(y_resampled.value_counts())

0    2433
2    1161
1     365
3     134
Name: y, dtype: int64
3    1161
2    1161
1    1161
0    1161
Name: y, dtype: int64


## Feature Selection

In [56]:
# Apply SelectKBest class to extract top best features
bestfeatures = SelectKBest(score_func=f_classif, k=20)
fit = bestfeatures.fit(X_train,y_train.to_numpy().reshape(-1, ))

X_train = bestfeatures.transform(X_train)
X_test = bestfeatures.transform(X_test)

# Classification Models

In [19]:
BBC = BalancedBaggingClassifier(base_estimator=SVC(gamma='auto'), random_state=42, max_samples=y_train['y'].value_counts()[1], sampling_strategy="not majority")
BBC.fit(X_resampled, y_resampled) 
y_pred = BBC.predict(X_test)

In [9]:
X_final = data.load_X_test()



In [10]:
clf = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0, shrinking=True, tol=0.001, cache_size=200, max_iter=-1, random_state=None)
clf.fit(X, y) 
y_pred = clf.predict(X_final)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
F1 = f1_score(y_test, y_pred, average='micro')

In [None]:
F1

In [None]:
y_final_pred = pd.read_csv('sample.csv')
y_final_pred['y'] = pd.DataFrame(y_pred) 
y_final_pred.to_csv('general_features.csv')

In [47]:
# Truncate the data into 10sec timeseries
X_raw = data.load_X_train()
y = data.load_labels()


In [48]:
# Truncate the data into ~10sec timeseries
X_10s = pd.DataFrame(np.zeros((len(X),2400)))
for i in range(0,len(X)):
    X_10s[i] = pd.DataFrame(X_raw[i][:2400])

X_10s = (X_10s - X_10s.mean(axis=0).mean())/X_10s.std(axis=0).mean()
                    
    


In [None]:
# Normalize the data

In [17]:
# Implementation of a RNN - ResNet


Using cache found in /Users/gyrireiersen/.cache/torch/hub/pytorch_vision_v0.4.2


RuntimeError: Expected 4-dimensional input for 4-dimensional weight 64 3 7, but got 3-dimensional input of size [1, 5117, 5117] instead

In [3]:
import keras 
import numpy as np 
import pandas as pd 
import time

X_raw = data.load_X_train()
y = data.load_labels()

# Truncate the data into ~10sec timeseries
X_10s = pd.DataFrame(np.zeros((len(X_raw),2400)))
for i in range(0,len(X_raw)):
    X_10s[i] = pd.DataFrame(X_raw[i][:2400])

X_10s = (X_10s - X_10s.mean(axis=0).mean())/X_10s.std(axis=0).mean()

print("Data loaded.")
                    

np.random.seed(813306)

def build_resnet(input_shape, n_feature_maps, nb_classes):
    print ('build conv_x')
    x = keras.layers.Input(shape=(input_shape))
    conv_x = keras.layers.BatchNormalization()(x)
    conv_x = keras.layers.Conv2D(n_feature_maps, 8, strides=1, padding='same')(conv_x)
    conv_x = keras.layers.BatchNormalization()(conv_x)
    conv_x = keras.layers.Activation('relu')(conv_x)
     
    print ('build conv_y')
    conv_y = keras.layers.Conv2D(n_feature_maps, 5, strides=1, padding='same')(conv_x)
    conv_y = keras.layers.BatchNormalization()(conv_y)
    conv_y = keras.layers.Activation('relu')(conv_y)
     
    print ('build conv_z')
    conv_z = keras.layers.Conv2D(n_feature_maps, 3, strides=1, padding='same')(conv_y)
    conv_z = keras.layers.BatchNormalization()(conv_z)
     
    is_expand_channels = not (input_shape[-1] == n_feature_maps)
    if is_expand_channels:
        shortcut_y = keras.layers.Conv2D(n_feature_maps, 1, strides=1,padding='same')(x)
        shortcut_y = keras.layers.BatchNormalization()(shortcut_y)
    else:
        shortcut_y = keras.layers.BatchNormalization()(x)
    print ('Merging skip connection')
    y = keras.layers.Add()([shortcut_y, conv_z])
    y = keras.layers.Activation('relu')(y)
     
    print ('build conv_x')
    x1 = y
    conv_x = keras.layers.Conv2D(n_feature_maps*2, 8, strides=1, padding='same')(x1)
    conv_x = keras.layers.BatchNormalization()(conv_x)
    conv_x = keras.layers.Activation('relu')(conv_x)
     
    print ('build conv_y')
    conv_y = keras.layers.Conv2D(n_feature_maps*2, 5, strides=1, padding='same')(conv_x)
    conv_y = keras.layers.BatchNormalization()(conv_y)
    conv_y = keras.layers.Activation('relu')(conv_y)
     
    print ('build conv_z')
    conv_z = keras.layers.Conv2D(n_feature_maps*2, 3, strides=1, padding='same')(conv_y)
    conv_z = keras.layers.BatchNormalization()(conv_z)
     
    is_expand_channels = not (input_shape[-1] == n_feature_maps*2)
    if is_expand_channels:
        shortcut_y = keras.layers.Conv2D(n_feature_maps*2, 1, strides=1,padding='same')(x1)
        shortcut_y = keras.layers.BatchNormalization()(shortcut_y)
    else:
        shortcut_y = keras.layers.BatchNormalization()(x1)
    print ('Merging skip connection')
    y = keras.layers.Add()([shortcut_y, conv_z])
    y = keras.layers.Activation('relu')(y)
     
    print ('build conv_x')
    x1 = y
    conv_x = keras.layers.Conv2D(n_feature_maps*2, 8, strides=1, padding='same')(x1)
    conv_x = keras.layers.BatchNormalization()(conv_x)
    conv_x = keras.layers.Activation('relu')(conv_x)
     
    print ('build conv_y')
    conv_y = keras.layers.Conv2D(n_feature_maps*2, 5, strides=1, padding='same')(conv_x)
    conv_y = keras.layers.BatchNormalization()(conv_y)
    conv_y = keras.layers.Activation('relu')(conv_y)
     
    print ('build conv_z')
    conv_z = keras.layers.Conv2D(n_feature_maps*2, 3, strides=1, padding='same')(conv_y)
    conv_z = keras.layers.BatchNormalization()(conv_z)

    is_expand_channels = not (input_shape[-1] == n_feature_maps*2)
    if is_expand_channels:
        shortcut_y = keras.layers.Conv2D(n_feature_maps*2, 1, strides=1,padding='same')(x1)
        shortcut_y = keras.layers.BatchNormalization()(shortcut_y)
    else:
        shortcut_y = keras.layers.BatchNormalization()(x1)
    print ('Merging skip connection')
    y = keras.layers.Add()([shortcut_y, conv_z])
    y = keras.layers.Activation('relu')(y)
     
    full = keras.layers.GlobalAveragePooling2D()(y)
    out = keras.layers.Dense(nb_classes, activation='softmax')(full)
    print ('        -- model was built.')
    return x, out
 
nb_epochs = 100
nb_classes = 4
batch_size = 1

X_train, X_test, y_train, y_test = train_test_split(X_10s, y, test_size=0.2)
X_train = X_train.values.reshape(X_train.shape + (1,1,))
X_test = X_test.values.reshape(X_test.shape + (1,1,))

Y_train = keras.utils.to_categorical(y_train, nb_classes)
Y_test = keras.utils.to_categorical(y_test, nb_classes)
   

x , y = build_resnet(X_train.shape[1:], 64, nb_classes)
model = keras.models.Model(inputs=x, outputs=y)
optimizer = keras.optimizers.Adam()
model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
      
reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5,
                      patience=50, min_lr=0.0001) 
hist = model.fit(X_train, Y_train, batch_size=batch_size, nb_epoch=nb_epochs,
              verbose=1, validation_data=(X_test, Y_test), callbacks = [reduce_lr])
log = pd.DataFrame(hist.history)
print(log.loc[log['loss'].idxmin]['loss'], log.loc[log['loss'].idxmin]['val_acc'])

Data loaded.
build conv_x
build conv_y
build conv_z
Merging skip connection
build conv_x
build conv_y
build conv_z
Merging skip connection
build conv_x
build conv_y
build conv_z
Merging skip connection
        -- model was built.
Train on 4093 samples, validate on 1024 samples
Epoch 1/100
Epoch 2/100

KeyboardInterrupt: 