In [25]:

#basics (standard library)
import csv
import math
import random as rn
import sys
from itertools import product

#basics (third party)
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from matplotlib import pyplot as plt

#fourier transform
from scipy.fftpack import fft, ifft

#scaling methods / preprocessing
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

#neurokit
#import neurokit as nk

#metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#model selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split, KFold

#models
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, GradientBoostingClassifier )
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier

#decomposition
from sklearn.decomposition import (PCA, LatentDirichletAllocation)

#feature selection
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest, f_regression

#biosppy / wavelets / progress bar
from biosppy.signals import (ecg, tools)
import pywt
from tqdm import tqdm_notebook as tqdm
# ============= CONSTS =============
#TRAIN_FILE_PATH = "/X_train.csv"
#TARGET_FILE_PATH =  "/y_train.csv"
#TEST_FILE_PATH = "/X_test.csv"

# Single seed shared by the random number generators used in this notebook.
SEED=42
# Number of signal columns (x0 .. x18153) used to build `my_cols`.
NUM_MAX_POINTS = 18154

# Sampling rate handed to the ECG feature extraction
# (presumably in Hz, the unit biosppy expects -- TODO confirm against the data).
SAMPLING_RATE=300
USE_WAVE_LETS = False
my_cols = ["id"] + ["x" + str(i) for i in range(NUM_MAX_POINTS)]
# ============= CONSTS =============

# Seed the global NumPy and stdlib generators with the SEED constant; a
# lowercase `seed` name is never defined and would raise NameError here.
np.random.seed(SEED)
rn.seed(SEED)


# svc parameters
# NOTE(review): the svc() model builder visible in this notebook constructs
# SVC with default hyper-parameters and does not read any SVC_* value --
# confirm whether these constants are still needed.
SVC_C = 1.0
SVC_KERNEL = 'rbf'
SVC_K = 1000
def SVC_GAMMA(X, f, k):
    """Return 1 / (k * f(X)).

    X -- value passed unchanged to `f`
    f -- callable applied to `X`; its result is scaled by `k`
    k -- multiplicative factor in the denominator
    """
    denominator = k * f(X)
    return 1 / denominator

# gradient boost parameters
# NOTE(review): GradBoost() as defined in this notebook builds
# GradientBoostingClassifier with defaults and does not read the GBC_* values.
# NOTE(review): recent scikit-learn releases appear to have replaced the
# 'deviance' loss name with 'log_loss' -- verify before wiring GBC_LOSS in.
GBC_LOSS = 'deviance'
GBC_L_RATE = 0.098
GBC_N_ESTIMATORS = 150
GBC_MAX_DEPTH = 4

# xgb parameters (all of them are passed to XGBClassifier inside xgbc())
XGB_OBJECTIVE = 'multi:softprob'
XGB_L_RATE = 0.2
XGB_N_ESTIMATORS = 150
XGB_BOOSTER = 'gbtree'
XGB_MAX_DEPTH = 6
XGB_LAMBDA = 0.0   # passed as reg_lambda
XGB_ALPHA = 1.0    # passed as reg_alpha



# LOADING DATA

In [2]:
# Directory holding the three task CSV files. It is the only machine-specific
# value in this cell: edit this one line to run the notebook elsewhere.
DATA_DIR = '/Users/charlotteout/Documents/AML/task3'

# The first column of every file is used as the DataFrame index
# (presumably the sample id -- verify against the CSV header).
X_train_data = pd.read_csv(DATA_DIR + '/X_train.csv', index_col=0)
Y_train = pd.read_csv(DATA_DIR + '/y_train.csv', index_col=0)
X_test_data = pd.read_csv(DATA_DIR + '/X_test.csv', index_col=0)



# FEATURE EXTRACTION

In [51]:
def _ordered_stats(values, order):
    """Return the statistics named in `order` for `values` (all zeros when `values` is empty)."""
    arr = np.asarray(values, dtype=float)
    if arr.size == 0:
        # np.min / np.max raise on empty input; a zero placeholder keeps the
        # feature vector at a fixed length.
        return [0.0] * len(order)
    reducers = {'mean': np.mean, 'std': np.std, 'min': np.min,
                'max': np.max, 'median': np.median}
    return [reducers[name](arr) for name in order]


def get_features(signal, sampling_rate):
    """Build a fixed-length feature vector from one ECG recording.

    Parameters
    ----------
    signal : 1-D array
        Raw ECG samples (NaN padding already removed by the caller).
    sampling_rate : float
        Sampling rate forwarded to biosppy.

    Returns
    -------
    numpy.ndarray
        Concatenation, in this order, of
          * 5 statistics of the filtered signal at the R-peaks,
          * 5 statistics of their first differences,
          * 5 statistics of the heart rate,
          * 5 statistics of the heart-rate first differences,
          * 5 statistics of the R-R intervals,
          * 1 rate value derived from the mean R-R interval,
          * mean / min / max / std / median of the heartbeat templates,
            taken sample-wise across beats.
        NaN entries are replaced by 0.

    Notes
    -----
    Statistic groups that have no data (for example no R-R interval because
    fewer than two beats were found) are emitted as zeros, so every recording
    yields a vector of the same length. At least one extracted heartbeat
    template is still required for the template statistics.
    """
    ts, filtered, rpeaks, templates_ts, templates, heart_rate_ts, heart_rate = ecg.ecg(signal, sampling_rate, show=False)
    templates1, rpeaks1 = ecg.extract_heartbeats(filtered, rpeaks, sampling_rate)

    # amplitude of the filtered signal at every detected R-peak
    rpeaksamps = np.take(filtered, rpeaks)

    # with fewer than two heart-rate samples fall back to a dummy series
    if len(heart_rate) < 2:
        heart_rate = [0, 1]

    # The two orderings below reproduce the established column layout of the
    # feature matrix; do not reorder them.
    spread_last = ('mean', 'min', 'max', 'std', 'median')
    spread_second = ('mean', 'std', 'min', 'max', 'median')

    X = []
    X += _ordered_stats(rpeaksamps, spread_last)
    X += _ordered_stats(np.diff(rpeaksamps), spread_last)
    X += _ordered_stats(heart_rate, spread_second)
    X += _ordered_stats(np.diff(heart_rate), spread_last)

    # R-R intervals: time between consecutive R-peaks, limited to as many
    # peaks as extract_heartbeats kept.
    n_beats = len(rpeaks1)
    peak_times = np.asarray(ts)[np.asarray(rpeaks, dtype=int)[:n_beats]]
    RR_int = np.diff(peak_times)
    X += _ordered_stats(RR_int, spread_second)
    # NOTE(review): 60000 / mean(RR) is a beats-per-minute formula for RR in
    # milliseconds; `ts` looks like seconds here -- confirm the intended unit.
    X.append(60000 / np.mean(RR_int) if RR_int.size else 0.0)

    # sample-wise statistics over all heartbeat templates
    for reducer in (np.mean, np.min, np.max, np.std, np.median):
        X += list(reducer(templates1, axis=0))

    X = np.array(X)
    X[np.isnan(X)] = 0
    return X


In [52]:
sampling_rate = float(SAMPLING_RATE)

# Extract one feature vector per training recording. The loop variable is
# named row_idx so that the builtin `id` is not shadowed at notebook scope.
features_train = []
for row_idx in tqdm(range(X_train_data.shape[0])):
    # drop the NaN entries of the row (presumably padding of shorter recordings)
    signal = np.array(pd.to_numeric(X_train_data.iloc[row_idx].dropna()))
    features_train.append(get_features(signal, sampling_rate))

X_train = np.array(features_train)
print(X_train.shape)

HBox(children=(IntProgress(value=0, max=5117), HTML(value='')))


(5117, 926)


In [53]:
# Same extraction as for the training set; `sampling_rate` is defined in the
# training-feature cell. row_idx avoids shadowing the builtin `id`.
features_test = []
for row_idx in tqdm(range(X_test_data.shape[0])):
    signal = np.array(pd.to_numeric(X_test_data.iloc[row_idx].dropna()))
    features_test.append(get_features(signal, sampling_rate))

X_test = np.array(features_test)
print(X_test.shape)

HBox(children=(IntProgress(value=0, max=3411), HTML(value='')))


(3411, 926)


In [20]:
#scaling methods 

def Standardscaler(data):
    """Fit a StandardScaler on `data` and return the result of `fit`."""
    return StandardScaler().fit(data)

def PowTrans(data):
    """Fit a standardizing Yeo-Johnson PowerTransformer on `data` and return the result of `fit`."""
    transformer = PowerTransformer(method='yeo-johnson', standardize=True)
    return transformer.fit(data)

def MinMax(data):
    """Fit a MinMaxScaler on `data` and return the result of `fit`."""
    return MinMaxScaler().fit(data)

def Quantile(data):
    """Fit a QuantileTransformer (default settings) on `data` and return the result of `fit`."""
    return QuantileTransformer().fit(data)


def Robust(data):
    """Fit a RobustScaler on `data` and return the result of `fit`."""
    return RobustScaler().fit(data)

In [28]:
#MODELS

#naive bayes
def NB(data_set_X, data_set_y):
    """Train a Gaussian naive Bayes classifier.

    data_set_X -- feature matrix passed to `fit`
    data_set_y -- target labels passed to `fit`

    Returns the fitted GaussianNB instance. GaussianNB lives in
    sklearn.naive_bayes and has to be imported in the notebook's import cell.
    """
    classifier = GaussianNB()
    classifier.fit(data_set_X, data_set_y)
    return classifier

# svm classifier model
def svc(data_set_X, data_set_y):
    """Train a support vector classifier with default hyper-parameters.

    data_set_X -- feature matrix passed to `fit`
    data_set_y -- target labels passed to `fit`

    The estimator is seeded with the notebook-wide SEED constant, printed once
    after fitting, and returned.
    """
    # SEED is the constant defined in the configuration cell; a lowercase
    # `seed` name does not exist in this notebook.
    model = SVC(random_state=SEED)
    model.fit(data_set_X, data_set_y)
    print(model)
    return model

# mlp classifier model
def mlp(data_set_X, data_set_y):
    """Fit an MLPClassifier (random_state fixed to 0) on the given data and return it."""
    network = MLPClassifier(random_state=0)
    network.fit(data_set_X, data_set_y)
    return network

# ada boost model
def ada_boost(data_set_X, data_set_y):
    """Train an AdaBoost classifier seeded with SEED.

    data_set_X -- feature matrix passed to `fit`
    data_set_y -- target labels passed to `fit`

    Returns the fitted AdaBoostClassifier. The class lives in sklearn.ensemble
    and has to be imported in the notebook's import cell.
    """
    # Explicit seed so the result does not depend on how much of the global
    # NumPy random stream earlier cells have consumed.
    model = AdaBoostClassifier(random_state=SEED)
    model.fit(data_set_X, data_set_y)
    return model

# random forest classifier model
def rfc(data_set_X, data_set_y):
    """Train a random forest classifier seeded with SEED.

    data_set_X -- feature matrix passed to `fit`
    data_set_y -- target labels passed to `fit`

    Returns the fitted RandomForestClassifier.
    """
    # Explicit seed (as in xgbc) so repeated runs build the same forest
    # regardless of the state of the global NumPy random stream.
    model = RandomForestClassifier(random_state=SEED)
    model.fit(data_set_X, data_set_y)
    return model

# quadratic discriminant analysis model
def qda(data_set_X, data_set_y):
    """Train a quadratic discriminant analysis classifier.

    data_set_X -- feature matrix passed to `fit`
    data_set_y -- target labels passed to `fit`

    Returns the fitted QuadraticDiscriminantAnalysis instance. The class lives
    in sklearn.discriminant_analysis and has to be imported in the notebook's
    import cell.
    """
    classifier = QuadraticDiscriminantAnalysis()
    classifier.fit(data_set_X, data_set_y)
    return classifier

def GradBoost(data_set_X, data_set_y):
    """Train a gradient boosting classifier seeded with SEED.

    data_set_X -- feature matrix passed to `fit`
    data_set_y -- target labels passed to `fit`

    Returns the fitted GradientBoostingClassifier (default hyper-parameters).
    """
    # Explicit seed (as in xgbc) so cross-validation scores are repeatable
    # regardless of the state of the global NumPy random stream.
    model = GradientBoostingClassifier(random_state=SEED)
    model.fit(data_set_X, data_set_y)
    return model

def xgbc(data_set_X, data_set_y):
    """Fit an XGBClassifier configured from the XGB_* constants and SEED, and return it."""
    settings = dict(
        objective=XGB_OBJECTIVE,
        learning_rate=XGB_L_RATE,
        n_estimators=XGB_N_ESTIMATORS,
        booster=XGB_BOOSTER,
        max_depth=XGB_MAX_DEPTH,
        reg_lambda=XGB_LAMBDA,
        reg_alpha=XGB_ALPHA,
        random_state=SEED,
    )
    booster_model = XGBClassifier(**settings)
    booster_model.fit(data_set_X, data_set_y)
    return booster_model

In [69]:
def do_kfold(classifier, X, y, k, scale=False):
    """Estimate the micro-averaged F1 score of a model builder with k-fold CV.

    Parameters
    ----------
    classifier : callable
        Called as ``classifier(X_fold_train, y_fold_train)``; must return an
        object with a ``predict`` method (e.g. ``GradBoost`` or ``xgbc``).
    X : numpy.ndarray
        Feature matrix, indexed with the integer arrays produced by KFold.
    y : numpy.ndarray
        Target labels, indexed the same way.
    k : int
        Number of folds.
    scale : bool, default False
        When True, a RobustScaler is fitted on the training part of each fold
        and applied to both parts before training. The default keeps the
        features unscaled and no longer fits a scaler whose result is unused.

    Returns
    -------
    (float, float)
        Mean and standard deviation of the per-fold scores.

    The running list of fold scores is printed after every fold.
    """
    kf = KFold(n_splits=k)
    fold_scores = []

    for train_index, test_index in kf.split(X):
        X_fold_train, X_fold_test = X[train_index], X[test_index]
        y_fold_train = y[train_index]

        if scale:
            # fit on the training part only so the held-out part stays unseen
            scaler = Robust(X_fold_train)
            X_fold_train = scaler.transform(X_fold_train)
            X_fold_test = scaler.transform(X_fold_test)

        model = classifier(X_fold_train, y_fold_train)
        y_pred_test = model.predict(X_fold_test)
        fold_scores.append(f1_score(y[test_index], y_pred_test, average="micro"))

        print(fold_scores)

    return np.mean(fold_scores), np.std(fold_scores)

In [70]:
# flatten the label frame into a 1-D array for the sklearn-style estimators
y = np.array(Y_train.values).ravel()

In [71]:
# 5-fold cross-validation of the gradient boosting model on the training features
mean, std1 = do_kfold(GradBoost, X_train, y, k=5)

[0.8037109375]
[0.8037109375, 0.810546875]
[0.8037109375, 0.810546875, 0.8240469208211144]
[0.8037109375, 0.810546875, 0.8240469208211144, 0.7869012707722385]
[0.8037109375, 0.810546875, 0.8240469208211144, 0.7869012707722385, 0.8035190615835777]


In [72]:
# mean and standard deviation of the fold scores, one value per line
print(mean, std1, sep="\n")

0.8057450131353863
0.01201829149716305


In [66]:
# Hold out 20 % of the training features as a validation set (SEED is 42).
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_train, y, test_size=0.2, random_state=SEED
)

# Fit the scaler on the training part only, then apply it to both parts.
myScale = Robust(X_train1)
X_train1, X_test1 = myScale.transform(X_train1), myScale.transform(X_test1)

# Train XGBoost and report the micro-averaged F1 score on the held-out part.
model = xgbc(X_train1, y_train1)
y_pred_test = model.predict(X_test1)

print(f1_score(y_test1, y_pred_test, average="micro"))

0.8125


In [67]:

# function to write csv file
def csv_write(prediction, path='prediction3_sixth.csv'):
    '''
    Write predictions into csv file

    prediction -- 1-D sequence/array of predicted labels; entry i is written
                  on the row whose id is i
    path       -- output file name (defaults to the established submission
                  file 'prediction3_sixth.csv')

    The file gets an 'id,y' header followed by one row per prediction. Ids are
    written as floats (0.0, 1.0, ...).
    '''
    labels = np.ravel(prediction)

    # newline='' is what the csv module requires of its file object; without
    # it every row ends in '\r\r\n' on Windows.
    with open(path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['id', 'y'])
        writer.writeheader()
        for i, label in enumerate(labels):
            writer.writerow({'id': float(i), 'y': label})



In [68]:
# Final run: scale with statistics of the complete training set, train
# XGBoost on all labelled data and write the test-set predictions to disk.
scaler = Robust(X_train)
X_train_data_1, X_test_data_1 = scaler.transform(X_train), scaler.transform(X_test)

model = xgbc(X_train_data_1, y)
y_pred_test = model.predict(X_test_data_1)

csv_write(y_pred_test)