In [None]:
import os
import subprocess
import numpy as np
import pandas as pd

In [None]:
!ls /home/flennic/Downloads/physionet.org/files/crisdb/1.0.0/ | head

In [None]:
# Hyper Parameters
N = 400
SIZE = 5_000
LR = 0.0001
EPOCHS = 3
splits = [0.8, 0.1, 0.1]
SEED = 21
TAKE = 0.5
sigma_outlier = 2
data_dir_out = "../data/age_decades/"
data_dir = "/home/flennic/Downloads/physionet.org/files/crisdb/1.0.0/"
subdirs = ["e/", "f/", "m/"]

In [None]:
def extract_header_information(path):
    with open(path, "r") as file:
        
        content = file.read()
        start, end = 0, 0
        
        while content.find(">", end) != -1:

            start = content.find(">", end)
            end = content.find("<", start)
            value = content[start+2:end-2].strip()
            
            if value[-1] == ",":
                value = value[0:-1]
            
            if value.find("#") != -1:
                value = value.split("#")[0]
            
            yield value

In [None]:
path = data_dir + "e/" + "e238a.hea"
for value in extract_header_information(path):
    print(value)

In [None]:
%%time

all_recordings = []

for medicationdir in subdirs:
    
    print(f"Parsing {medicationdir} directory...")
    
    basedir = data_dir + medicationdir
    recordings = list(set([filename.split(".")[0] for filename in os.listdir(basedir) if len(filename.split(".")) == 2 and filename.find("index") == -1]))
    
    for i, recording in enumerate(recordings):
        
        # Status
        print(str(round(i/len(recordings)*100, 2)) + "%", end="\r")
        
        entry = {}
        
        # Filename
        entry["name"] = recording[1:4]
        entry["medication"] = recording[0]
        entry["treatment"] = recording[4] == "b"
        
        # Header
        header_information = [value for value in extract_header_information(basedir + recording + ".hea")]
        entry["age"] = header_information[0]
        entry["gender"] = header_information[1]
        
        try:
            entry["rrlow"] = int(float(header_information[3]) * 1000)
        except ValueError:
            entry["rrlow"] = None
        
        try:
            entry["rrhigh"] = int(float(header_information[4]) * 1000)
        except ValueError:
            entry["rrhigh"] = None
        
        # Recording
        rrintervals = subprocess.check_output(["ann2rr", "-r", recording, "-a", "atr", "-i", "s"], cwd=basedir).splitlines()
        rrintervals = np.array([int(float(value)*1000) for value in rrintervals], dtype=object)
        rrintervals = rrintervals[1:]

        try:
            rrintervals[(rrintervals > entry["rrhigh"]) | (rrintervals < entry["rrlow"])] = np.nan
        except TypeError:
            rrintervals[abs(rrintervals - np.mean(rrintervals)) < sigma_outlier * np.std(rrintervals)] = np.nan

        #print("Parsing")
        entry["Series"] = pd.DataFrame({"ContractionNoNorm": list(range(len(rrintervals))), "RrInterval": rrintervals})

        all_recordings.append(entry)

In [None]:
len(all_recordings)

In [None]:
all_recordings[0]["Series"]

In [None]:
def splice_random(series, n=100, size=500, sigma=50):
    cap = len(series) - size
    starts = np.random.randint(cap, size=n)
    ends = [max(min(round(np.random.normal(start+size, sigma)), cap+size), start+1) for start in starts]
    splices = [series.iloc[start:end,] for start, end in zip(starts, ends)]
    return splices

In [None]:
def splice_constant(series, n=48):
    return np.array_split(series, n)

In [None]:
def splice_rr_intervals_constant(rr_intervals, n=48):
    
    spliced_recordings = []
    
    for recording in rr_intervals:
        # List of data frames
        splices = splice_constant(recording["Series"], n=n)
        
        for splice in splices:
            recording_copy = recording.copy()
            recording_copy["Series"] = splice
            spliced_recordings.append(recording_copy)
            
    return spliced_recordings  

In [None]:
def splice_rr_intervals_random(rr_intervals, n=48, size=500, sigma=50):
    
    spliced_recordings = []
    
    for recording in rr_intervals:
        # List of data frames
        splices = splice_random(recording["Series"], n=n, size=500, sigma=50)
        
        for splice in splices:
            recording_copy = recording.copy()
            recording_copy["Series"] = splice
            spliced_recordings.append(recording_copy)
            
    return spliced_recordings  

In [None]:
def decade_to_label(decade):
    return(int(int(decade[0:2])/10) - 2)

In [None]:
def pad(l, size, padding):
    l = list(l)
    l = l[0:min(len(l), size)]
    return l + [padding] * abs((len(l)-size))

In [None]:
def normalise(l):
    scaled = l - np.mean(l)
    return scaled / np.std(scaled)

In [None]:
def rr_interval_dict_to_matrix(rr_interval_dict, pad_length):
    data = []
    labels = []
    for recording in rr_interval_dict:
        label = decade_to_label(recording["age"])
        
        # Interpolate
        series = recording["Series"]["RrInterval"].interpolate(method='linear', axis=0, limit_direction='both')
        
        # Normalise
        series = normalise(series)
        
        # Pad
        series = pad(series, pad_length, 0)
        
        #series = pad(normalise(recording["Series"]["RrInterval"]), pad_length, 0)
        labels.append(label)
        data.append(series)
    data = pd.DataFrame(data)
    
    return labels, data

In [None]:
len(all_recordings)

In [None]:
len(all_recordings[0:round(len(all_recordings) * TAKE)])

In [None]:
splits

In [None]:
len(all_recordings[0:round(len(all_recordings) * TAKE)])

In [None]:
%%time
subset = all_recordings[0:round(len(all_recordings) * TAKE)]
train_orig, val_orig, test_orig = np.array_split(subset, (np.array(splits)[:-1].cumsum() * len(subset)).astype(int))
#train = splice_rr_intervals_constant(train_orig, n=N)#, size=SIZE)
#val = splice_rr_intervals_constant(val_orig, n=N)#, size=SIZE)
#test = splice_rr_intervals_constant(test_orig, n=N)#, size=SIZE)
train = train_orig
val = val_orig
test = test_orig

In [None]:
del subset, train_orig, val_orig, test_orig, all_recordings

In [None]:
len(train) #493600

In [None]:
len(val) #61600

In [None]:
len(test) #62000

In [None]:
%%time
max_length_rr = max([recording["Series"].shape[0] for recording in train])
colnames = ["label"] + ["rr" + str(i+1) for i in range(max_length_rr)]

In [None]:
%%time

# Finding missing values

series = None
indeces = None

for i in range(train.shape[0]):
    try:
        indeces = []
        for index, value in enumerate(train):
            if np.isnan(value):
                indeces.append(index)

        series = train.iloc[i, indeces[0]-10:indeces[0]+30]
        break
    except IndexError:
        continue
series

In [None]:
%%time
train_labels, train = rr_interval_dict_to_matrix(train, max_length_rr)
val_labels, val = rr_interval_dict_to_matrix(val, max_length_rr)
test_labels, test = rr_interval_dict_to_matrix(test, max_length_rr)
train

In [None]:
%%time
train.insert(0, "label", train_labels)
train.columns = colnames
train["label"] = train["label"].astype('int32')

val.insert(0, "label", val_labels)
val.columns = colnames
val["label"] = val["label"].astype('int32')

test.insert(0, "label", test_labels)
test.columns = colnames
test["label"] = test["label"].astype('int32')

In [None]:
%%time
save_path_train = "{}/preprocessed/PHYSIO_train_spliced_complete.csv".format("/".join(data_dir_out.split("/")[:-2]))
save_path_val = "{}/preprocessed/PHYSIO_val_spliced_complete.csv".format("/".join(data_dir_out.split("/")[:-2]))
save_path_test = "{}/preprocessed/PHYSIO_test_spliced_complete.csv".format("/".join(data_dir_out.split("/")[:-2]))
train.to_csv(save_path_train)
val.to_csv(save_path_val)
test.to_csv(save_path_test)

In [None]:
train["label"].value_counts()

In [None]:
train["label"].count()

In [None]:
train["label"].value_counts()[4]/train["label"].count()

## DeepSleepNet

In [None]:
# Hyper Parameters
LR = 0.00001
EPOCHS = 10
#splits = [0.8, 0.1, 0.1]
SEED = 21
#data_dir = "../data/age_decades/"
data_dir_out = "../data/age_decades/"

In [None]:
save_path_train = "{}/preprocessed/PHYSIO_train_spliced_complete.csv".format("/".join(data_dir_out.split("/")[:-2]))
save_path_val = "{}/preprocessed/PHYSIO_val_spliced_complete.csv".format("/".join(data_dir_out.split("/")[:-2]))
save_path_test = "{}/preprocessed/PHYSIO_test_spliced_complete.csv".format("/".join(data_dir_out.split("/")[:-2]))

In [None]:
from torch.utils.data import Dataset
import os

class RrIntervalDataset(Dataset):
    def __init__(self, path, header=True):
        
        self.samples = []
        
        with open(path, 'r') as file:
            if header:
                next(file)
            for line in file:
                self.samples.append(list(map(lambda x: float(x), line.split(",")[1:])))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

In [None]:
%%time
train_data = RrIntervalDataset(save_path_train)
val_data = RrIntervalDataset(save_path_val)
#test_data = RrIntervalDataset(save_path_test)

### RrInteral DataLoader

In [None]:
import torch
def __batch2tensor__(batch):
    """
    Takes a batch and transforms it in such a way that it can directly be fed to the network.
    @param batch: List of x and y labels.
    @return: Two tensors, one for x and one for y.
    """
    #print(batch)
    x, y = [None] * len(batch), [None] * len(batch)
    for i, row in enumerate(batch):
        #print(row)
        y[i] = int(row[0])
        x[i] = row[1:]

    return torch.FloatTensor(x), torch.LongTensor(y)

In [None]:
%%time
from torch.utils.data import DataLoader

# Batch size is one patient size!
rrIntervalDataLoaderTrain = DataLoader(train_data, batch_size=1, num_workers=1, collate_fn=__batch2tensor__, shuffle=True)
rrIntervalDataLoaderVal = DataLoader(val_data, batch_size=1, num_workers=1, collate_fn=__batch2tensor__)
#rrIntervalDataLoaderTest = DataLoader(test_data, batch_size=2, num_workers=1, collate_fn=__batch2tensor__)

## DeepSleepNet Model

In [None]:
import torch.nn as nn

class DeepSleepNet(nn.Module):
    def __init__(self, no_classes, dropout=0.5, lstm_dropout=0.5, lstm_layers=2, lstm_hidden=256):
        
        super().__init__()
        
        # General Information
        self.lstm_layers = lstm_layers
        self.lstm_hidden = lstm_hidden
        self.lstm_hidden_states = None
        
        # Dropout
        # Maybe for CNN, check literature
        self.dropout = nn.Dropout(p=dropout)
        
        # Head 1
        self.conv1dh1l1 = nn.Conv1d(1, 128, 8, stride=1)
        self.conv1dh1l2 = nn.Conv1d(128, 128, 8, stride=1)
        self.conv1dh1l3 = nn.Conv1d(128, 128, 8, stride=1)

        # Head 2
        self.conv1dh2l1 = nn.Conv1d(1, 128, 4, stride=1)
        self.conv1dh2l2 = nn.Conv1d(128, 128, 4, stride=1)
        self.conv1dh2l3 = nn.Conv1d(128, 128, 4, stride=1)
        
        # Pooling
        ## MaxPool
        #self.maxpool = nn.AdaptiveMaxPool1d(avg_sequence_length//10)
        self.maxpoolConvH1 = nn.MaxPool1d(8)
        self.maxpoolConvH2 = nn.MaxPool1d(4)
        
        # LSTM
        # Input diemension will be hardcoded, very hard the obtain that through calculations (but possible...)
        self.lstm = nn.LSTM(58248, lstm_hidden, lstm_layers,
                            batch_first=True, dropout=lstm_dropout,
                            bidirectional=True)
        
        # Linear
        self.l1 = nn.Linear(65536, no_classes)
            
        # Dropout
        self.dropout = nn.Dropout(p=dropout)
        
        # Batch Normalisation
        self.batchnorm1dH1 = nn.BatchNorm1d(128)
        self.batchnorm1dH2 = nn.BatchNorm1d(128)
        self.batchnormLstm = nn.BatchNorm1d(128)
        
        # Activation
        #self.act = nn.Sigmoid()
        self.act = nn.ReLU()
        
        # Log Softmax
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x):
        
        # CNN
        x = x.view(x.shape[0], -1, x.shape[1])
        
        x1 = self.conv1dh1l1(x)
        x1 = self.act(x1)
        x1 = self.conv1dh1l2(x1)
        x1 = self.act(x1)
        x1 = self.conv1dh1l3(x1)
        x1 = self.batchnorm1dH1(x1)
        x1 = self.maxpoolConvH1(x1)
        
        x2 = self.conv1dh2l1(x)
        x2 = self.act(x2)
        x2 = self.conv1dh2l2(x2)
        x2 = self.act(x2)
        x2 = self.conv1dh2l3(x2)
        x2 = self.batchnorm1dH2(x2)
        x2 = self.maxpoolConvH2(x2)
        
        x = torch.cat((x1, x2), 2)
        x = self.batchnormLstm(x)
        x = self.dropout(x)
        
        # LSTM
        x, self.lstm_hidden_states = self.lstm(x, self.lstm_hidden_states)
        # Decouple from training history
        self.lstm_hidden_states = tuple([each.data for each in self.lstm_hidden_states])
        
        x = self.dropout(x)
        
        # Linear
        x = x.reshape(x.shape[0], -1)
        
        x = self.l1(x)
        #x = self.sigmoid(x)
        x = self.softmax(x)
        
        return x
    
    def init_hidden(self, batch_size):
        # Create two new tensors with sizes lstm_layers x batch_size x lstm_hidden,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data

        self.lstm_hidden_states = (weight.new(self.lstm_layers * 2, batch_size, self.lstm_hidden).zero_().cuda(),
                                   weight.new(self.lstm_layers * 2, batch_size, self.lstm_hidden).zero_().cuda())

In [None]:
#del model
#del X, Y
#torch.cuda.empty_cache()

model = DeepSleepNet(6).cuda()
# Initialize hidden states
model.init_hidden(1)
X, Y = next(iter(rrIntervalDataLoaderTrain))
X

model(X.cuda()).shape

model(X.cuda())

## Training

In [None]:
model = DeepSleepNet(6).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
criterion = torch.nn.NLLLoss()

In [1]:
%%time

training_loss_storage = []
training_accuracy_storage = []
validation_loss_storage = []
validation_accuracy_storage = []

for i in range(EPOCHS):
    
    print("Epoch: {}".format(i+1))
    
    training_loss = 0
    training_accuracy = 0
    training_processed_data = 0
    
    for x, y in rrIntervalDataLoaderTrain:
        
        # Initialize hidden states in each epoch
        model.init_hidden(x.shape[0])
        
        x = x.cuda()
        y = y.cuda()
        
        # Reset Gradients
        optimizer.zero_grad()

        # Forward, Loss, Backwards, Update
        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()
        
        training_processed_data += x.shape[0]

        # Calculate Metrics
        training_loss += loss.item()
        training_accuracy += torch.sum(torch.exp(output).topk(1)[1].view(-1) == y).item()
        
        print(f"{training_processed_data}/{len(train_data)} ({round(training_processed_data/len(train_data)*100, 2)}%)", end="\r")
    
    else:
        print("Training Loss: {}".format(training_loss/training_processed_data))
        print("Training Accuracy: {}".format(training_accuracy/training_processed_data))
        
        training_loss_storage.append(training_loss/training_processed_data)
        training_accuracy_storage.append(training_accuracy/training_processed_data)
        
        validation_loss = 0
        validation_accuracy = 0
        validation_processed_data = 0

        model.eval()

        with torch.no_grad():
            for x, y in rrIntervalDataLoaderVal:
                x = x.cuda()
                y = y.cuda()

                output_validation = model(x)
                loss_val = criterion(output_validation, y)
                
                validation_processed_data += x.shape[0]
                
                validation_loss += loss_val.item()
                validation_accuracy += torch.sum(
                    torch.exp(output_validation).topk(1, dim=1)[1].view(-1) == y).item()
            else:
                print("Validation Loss: {}".format(validation_loss/validation_processed_data))
                print("Validation Accuracy: {}".format(validation_accuracy/validation_processed_data))
                
                validation_loss_storage.append(validation_loss/validation_processed_data)
                validation_accuracy_storage.append(validation_accuracy/validation_processed_data)
                
                model.train()
                

NameError: name 'EPOCHS' is not defined

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
X, Y = next(iter(rrIntervalDataLoaderVal))
res = model(X.cuda()).cpu()

In [None]:
index = 0
true_label = int(Y)
posterior = np.array(torch.exp(res).detach().cpu())
plt.bar(range(6), posterior.tolist()[0]);
plt.title(f'True Label: {true_label}');

In [None]:
list(range(6))

In [None]:
posterior.tolist()[0]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.dpi']= 160

In [None]:
plt.plot(range(len(training_loss_storage)), training_loss_storage, label="Training Loss");
plt.plot(range(len(validation_loss_storage)), validation_loss_storage, label="Validation Loss");
plt.legend()
plt.ylabel('Loss');
plt.xlabel('Epoch');
plt.title('Loss during training');

In [None]:
plt.plot(range(len(training_accuracy_storage)), training_accuracy_storage, label="Training Accuracy");
plt.plot(range(len(validation_accuracy_storage)), validation_accuracy_storage, label="Validation Accuracy");
plt.legend()
plt.ylabel('Accuracy');
plt.xlabel('Epoch');
plt.title('Accuracy during training');