In [None]:
import os
import math
import numpy as np
import pandas as pd
import hrvanalysis as hrv

from datetime import datetime

In [None]:
# Directory whose entries are listed and parsed as raw recording files (relative path).
data_dir = "../../data/age_decades/"

In [None]:
def raw_recording_to_dict(filepath, rawPythonic=True):
    """Parse one raw RR-interval recording file into a dictionary.

    Metadata is read from fixed character positions of the file name:
    character 0 is stored as ``"Gender"``, characters 1-2 as ``"AgeDecade"``
    and characters 4-8 are parsed as an ``%H.%M`` start time.

    Every non-blank line of the file must hold at least two
    whitespace-separated integers: the RR interval first and the contraction
    number second. Where consecutive contraction numbers jump forward, one
    row per skipped number is inserted with ``math.nan`` as its RR interval.

    Args:
        filepath: Path of the recording file.
        rawPythonic: If true, ``"Series"`` is a dict of plain Python lists and
            ``"RecordingStartTime"`` is converted to a string; otherwise
            ``"Series"`` is a ``pandas.DataFrame`` and the start time stays a
            ``datetime.time``.

    Returns:
        dict with the keys ``"Gender"``, ``"AgeDecade"``,
        ``"RecordingStartTime"`` and ``"Series"``; the series has the columns
        ``"ContractionNo"``, ``"ContractionNoNorm"`` (contraction number minus
        the first contraction number of the file) and ``"RrInterval"``.

    Raises:
        ValueError: If the file name holds no valid ``%H.%M`` time at
            positions 4-8, or a data field is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    recording = {}

    with open(filepath) as raw_recording:

        # basename honours the platform's path separators, not only "/".
        filename = os.path.basename(filepath)

        recording["Gender"] = filename[0:1]
        recording["AgeDecade"] = filename[1:3]
        recording["RecordingStartTime"] = datetime.strptime(filename[4:9], '%H.%M').time()

        series = {"ContractionNo": [], "ContractionNoNorm": [], "RrInterval": []}

        first_index = None
        previous_contraction_no = None

        for line in raw_recording:

            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a stray newline at the end of the file).
                continue

            rr_interval = int(fields[0])
            contraction_no = int(fields[1])

            # Contraction numbers need not start at zero; remember the offset.
            if first_index is None:
                first_index = contraction_no

            if previous_contraction_no is not None:
                # Only a forward jump leaves numbers to fill in. The range is
                # empty for consecutive or backward-moving numbers, so all
                # three columns always grow by the same amount.
                missing = range(previous_contraction_no + 1, contraction_no)

                # Plain ints (not numpy scalars) keep the rawPythonic output pure Python.
                series["ContractionNo"].extend(missing)
                series["ContractionNoNorm"].extend(number - first_index for number in missing)
                series["RrInterval"].extend([math.nan] * len(missing))

            series["ContractionNo"].append(contraction_no)
            series["ContractionNoNorm"].append(contraction_no - first_index)
            series["RrInterval"].append(rr_interval)

            previous_contraction_no = contraction_no

        if rawPythonic:
            recording["Series"] = series
            recording["RecordingStartTime"] = str(recording["RecordingStartTime"])
        else:
            recording["Series"] = pd.DataFrame(series)

        return recording

In [None]:
%%time
recordings = []
for filename in os.listdir(data_dir):
    recordings.append(raw_recording_to_dict(data_dir + filename, False))

In [None]:
# Inspect the RR-interval column of the first parsed recording.
recordings[0]["Series"]["RrInterval"]

In [None]:
%%time
# Run hrvanalysis preprocessing on the first recording only; the result,
# `interpolated_example`, is the input of the exploratory feature cells below.
#interpolated_example = hrv.interpolate_nan_values(recordings[0]["Series"]["RrInterval"][:10_000])
print(len(recordings[0]["Series"]["RrInterval"]))
interpolated_example = hrv.preprocessing.get_nn_intervals(recordings[0]["Series"]["RrInterval"], interpolation_method='linear')
#interpolated_example = pd.Series(interpolated_example).dropna().tolist()
#, interpolation_method="cubic")
# NOTE(review): if any NaN survives preprocessing this mean is NaN — confirm.
np.mean(interpolated_example)

In [None]:
%%time
# Time-domain features of the example recording (timed).
hrv.get_time_domain_features(interpolated_example)

In [None]:
# Geometrical features of the example recording.
hrv.get_geometrical_features(interpolated_example)

In [None]:
# Frequency-domain features of the example recording.
hrv.get_frequency_domain_features(interpolated_example)

In [None]:
# Scratch check: converting a pandas Series to a plain list.
list(pd.Series([1,2,3,4,5]))

In [None]:
# Scratch check: list() applied to a list literal.
list([1,2,3,4,5])

In [None]:
# Frequency-domain features again (same call as an earlier cell).
hrv.get_frequency_domain_features(interpolated_example)

In [None]:
# Same features, with the intervals wrapped in a pandas Series instead of being passed directly.
hrv.get_frequency_domain_features(pd.Series(interpolated_example))

In [None]:
# CSI / CVI features of the example recording.
hrv.get_csi_cvi_features(interpolated_example)

In [None]:
# Call every feature extractor once on the example recording. The results are
# not stored, and a notebook cell displays only the value of its last
# expression, so only the get_sampen result is shown.
hrv.get_poincare_plot_features(interpolated_example)

hrv.get_geometrical_features(interpolated_example)
hrv.get_frequency_domain_features(interpolated_example)
hrv.get_csi_cvi_features(interpolated_example)
hrv.get_poincare_plot_features(interpolated_example)
hrv.get_sampen(interpolated_example)

In [None]:
# Values of the time-domain feature dictionary, as a list.
list(hrv.get_time_domain_features(interpolated_example).values())

In [None]:
# Keys of the time-domain feature dictionary, as a list.
feature_names = list(hrv.get_time_domain_features(interpolated_example).keys())
feature_names

In [None]:
# Same list of time-domain feature keys, stored under `names`.
names = list(hrv.get_time_domain_features(interpolated_example).keys())
names

In [None]:
def decade_to_label(decade):
    """Map an age value (int or numeric string) to a class label.

    The value is divided by ten, truncated to an integer, and shifted down by
    two, so e.g. "20" becomes 0 and "35" becomes 1.
    """
    tens = int(decade) / 10
    label = int(tens) - 2
    return label

In [None]:
def split_data(data, splits=[0.6, 0.2, 0.2]):
    """Cut `data` into consecutive parts whose sizes follow `splits`.

    `splits` holds one fraction per part. Only the leading fractions are used:
    their running sum, scaled by ``len(data)`` and truncated to int, gives the
    cut positions, and the last part receives whatever remains. The order of
    `data` is kept (no shuffling).

    Returns the list produced by ``np.array_split``.
    """
    fractions = np.array(splits)
    leading_cumulative = fractions[:-1].cumsum()
    cut_points = (leading_cumulative * len(data)).astype(int)
    return np.array_split(data, cut_points)

In [None]:
# Scratch check: pandas interpolation on a Series whose last value is missing.
pd.Series([1,2,3,4,5,6,None]).interpolate()

In [None]:
def recordings_to_dataframe(recordings, interpolation_method='linear'):
    """Build a feature table with one row per recording.

    For each recording the ``"RrInterval"`` column of its ``"Series"`` is
    passed through ``hrv.preprocessing.get_nn_intervals`` and the time-domain,
    geometrical, frequency-domain, CSI/CVI and Poincare-plot feature
    dictionaries are computed and merged into a single row. The row's
    ``"label"`` is ``decade_to_label(recording["AgeDecade"])``.

    Args:
        recordings: Iterable of recording dicts providing ``"Series"`` (with
            an ``"RrInterval"`` entry) and ``"AgeDecade"``.
        interpolation_method: Forwarded to ``get_nn_intervals``.

    Returns:
        pandas.DataFrame whose first column is ``"label"`` followed by one
        column per feature. For empty input a zero-row frame with only the
        ``"label"`` column is returned.
    """
    rows = []

    for recording in recordings:

        nn_intervals = hrv.preprocessing.get_nn_intervals(recording["Series"]["RrInterval"],
                                                          interpolation_method=interpolation_method,
                                                          verbose=False)

        # "label" is inserted first so it stays the leading column.
        rows.append({
            "label": decade_to_label(recording["AgeDecade"]),
            **hrv.get_time_domain_features(nn_intervals),
            **hrv.get_geometrical_features(nn_intervals),
            **hrv.get_frequency_domain_features(nn_intervals),
            **hrv.get_csi_cvi_features(nn_intervals),
            **hrv.get_poincare_plot_features(nn_intervals),
        })

    if not rows:
        return pd.DataFrame(columns=["label"])

    # One construction from all rows: avoids growing a frame row by row
    # (DataFrame.append no longer exists in pandas 2.x) and lets pandas align
    # the columns by feature name.
    return pd.DataFrame(rows)
        

In [None]:
%%time
# Positional 80/20 split of the parsed recordings into training and validation
# parts, then feature extraction for each part.
# NOTE(review): split_data does not shuffle, so the split follows the order of
# `recordings` — confirm that order is acceptable.
#train, val, test = split_data(recordings)
train, val = split_data(recordings, splits=[0.8, 0.2])
df_train = recordings_to_dataframe(train)
df_val = recordings_to_dataframe(val)
#df_test = recordings_to_dataframe(test)

In [None]:
# (rows, columns) of the training feature table.
df_train.shape

In [None]:
# Display the training feature table.
df_train

In [None]:
# Training feature matrix: every column except the target, with 'tinn'
# removed as well (noted by the author as being all None).
X_train = df_train.loc[:, df_train.columns != 'label'].drop(columns=['tinn'])
X_train

In [None]:
# Validation feature matrix: every column except the target, with 'tinn'
# removed as well (noted by the author as being all None).
X_val = df_val.loc[:, df_val.columns != 'label'].drop(columns=['tinn'])
X_val.columns

In [None]:
# Look at a subset of the validation feature columns.
X_val[['total_power', 'vlf', 'csi', 'cvi',
       'Modified_csi', 'sd1', 'sd2', 'ratio_sd2_sd1']]

In [None]:
# Training targets: the "label" column of the training feature table.
Y_train = df_train["label"]
Y_train

In [None]:
# Validation targets: the "label" column of the validation feature table.
Y_val = df_val["label"]
Y_val

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
import sklearn
from sklearn import pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Create a linear classifier trained with stochastic gradient descent
# (default settings apart from n_jobs=-1).
# NOTE(review): no random_state is passed, so results may vary between runs — confirm.
clf = sklearn.linear_model.SGDClassifier(n_jobs=-1)

# Train the model on the training set
clf.fit(X_train, Y_train)

# Predict labels for the training and validation sets
Y_train_pred = clf.predict(X_train)
Y_val_pred = clf.predict(X_val)

In [None]:
# Accuracy of the current Y_train_pred against the training labels.
print("Accuracy:", sklearn.metrics.accuracy_score(Y_train, Y_train_pred))

In [None]:
# Accuracy of the current Y_val_pred against the validation labels.
print("Accuracy:", sklearn.metrics.accuracy_score(Y_val, Y_val_pred))

In [None]:
# Bar chart of how often each true label occurs in the training set.
Y_train.value_counts().plot(kind='bar');

In [None]:
# Bar chart of how often each label occurs in the current training predictions.
pd.Series(Y_train_pred).value_counts().plot(kind='bar');

In [None]:
%%time
parameters = {'classifier__alpha': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)}

nb_classifier_pipe = pipeline.Pipeline(steps = [
    ('classifier', MultinomialNB())
])

nb_classifier = GridSearchCV(nb_classifier_pipe, parameters, cv = 2, n_jobs = -1, verbose = 10)

nb_classifier.fit(X_train, Y_train)

#Predict the response for test dataset
Y_train_pred = clf.predict(X_train)
Y_val_pred = clf.predict(X_val)

In [None]:
# Accuracy of the current Y_train_pred against the training labels.
print("Accuracy:", sklearn.metrics.accuracy_score(Y_train, Y_train_pred))

In [None]:
# Accuracy of the current Y_val_pred against the validation labels.
print("Accuracy:", sklearn.metrics.accuracy_score(Y_val, Y_val_pred))

In [None]:
# Bar chart of how often each true label occurs in the training set.
Y_train.value_counts().plot(kind='bar');

In [None]:
# Bar chart of how often each label occurs in the current training predictions.
pd.Series(Y_train_pred).value_counts().plot(kind='bar');

In [None]:
# Create a logistic-regression classifier (not a linear regression)
clf = LogisticRegression(n_jobs=-1)

# Train the model on the training set
clf.fit(X_train, Y_train)

# Predict labels for the training and validation sets
Y_train_pred = clf.predict(X_train)
Y_val_pred = clf.predict(X_val)

In [None]:
# Accuracy of the current Y_train_pred against the training labels.
print("Accuracy:", sklearn.metrics.accuracy_score(Y_train, Y_train_pred))

In [None]:
# Accuracy of the current Y_val_pred against the validation labels.
print("Accuracy:", sklearn.metrics.accuracy_score(Y_val, Y_val_pred))

In [None]:
# Bar chart of how often each true label occurs in the training set.
Y_train.value_counts().plot(kind='bar');

In [None]:
# Bar chart of how often each label occurs in the current training predictions.
pd.Series(Y_train_pred).value_counts().plot(kind='bar');

In [None]:
%%time
# Grid-search a support vector classifier over C and the kernel type
# (10 x 4 combinations, 3-fold cross-validation).

parameters = {
                'classifier__C': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
                'classifier__kernel': ('linear', 'poly', 'rbf', 'sigmoid')
             }

# C and kernel given here are replaced by the grid values during the search.
svc_classifier_pipe = pipeline.Pipeline(steps = [
    ('classifier', SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True,
          probability=False, tol=0.001, cache_size=200, class_weight=None,
          verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False,
          random_state=None))
])

clf = GridSearchCV(svc_classifier_pipe, parameters, cv = 3, n_jobs = -1, verbose = 10)

# Train the grid search on the training set
clf.fit(X_train, Y_train)

# Predict labels for the training and validation sets
Y_train_pred = clf.predict(X_train)
Y_val_pred = clf.predict(X_val)

In [None]:
# Accuracy of the current Y_train_pred against the training labels.
print("Accuracy:", sklearn.metrics.accuracy_score(Y_train, Y_train_pred))

In [None]:
# Accuracy of the current Y_val_pred against the validation labels.
print("Accuracy:", sklearn.metrics.accuracy_score(Y_val, Y_val_pred))

In [None]:
# Bar chart of how often each true label occurs in the training set.
Y_train.value_counts().plot(kind='bar');

In [None]:
# Bar chart of how often each label occurs in the current training predictions.
pd.Series(Y_train_pred).value_counts().plot(kind='bar');

In [None]:
# Bar chart of how often each true label occurs in the validation set.
Y_val.value_counts().plot(kind='bar');

In [None]:
# Bar chart of how often each label occurs in the current validation predictions.
pd.Series(Y_val_pred).value_counts().plot(kind='bar');