In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from time import time
import pickle
import lightgbm as lgb
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import Embedding
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()
        self.rnn = nn.GRU(input_size=1, hidden_size=64,
                          num_layers=2, batch_first=True, dropout=0.5)
        self.fc = nn.Linear(in_features=64, out_features=2)

    def forward(self, x):
        x, _ = self.rnn(x)
        x = self.fc(x[:, -1, :])
        return x


model = RNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import mne
from mne.io import read_raw_edf
import os
import re
import scipy
from scipy.signal import welch
from scipy.integrate import simps
from time import time
import pickle
import matplotlib.cm as cm

In [4]:
eeg_info_df = pd.read_csv('../data/eeg_info_df.csv')
eeg_info_df['n_channels'] = eeg_info_df.channels.map(len)
eeg_info_df['label'] = np.where(eeg_info_df.path.map(
    lambda x: x.split('/')[0]) == 'sleep-telemetry', 1, 0)
eeg_info_df.head()

Unnamed: 0,path,channels,n_records,n_channels,label
0,sleep-cassette/SC4001E0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",7950000,111,0
1,sleep-cassette/SC4002E0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",8490000,111,0
2,sleep-cassette/SC4011E0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",8406000,111,0
3,sleep-cassette/SC4012E0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",8550000,111,0
4,sleep-cassette/SC4021E0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",8412000,111,0


In [5]:
eeg_info_df.tail()

Unnamed: 0,path,channels,n_records,n_channels,label
192,sleep-telemetry/ST7212J0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",3162000,72,1
193,sleep-telemetry/ST7221J0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",3296000,72,1
194,sleep-telemetry/ST7222J0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",2874000,72,1
195,sleep-telemetry/ST7241J0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",3261000,72,1
196,sleep-telemetry/ST7242J0-PSG.edf,"['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal', ...",2948000,72,1


In [9]:
eeg_info_df['label'].value_counts()

0    153
1     44
Name: label, dtype: int64

In [10]:
np.unique(eeg_info_df[['label', 'n_channels']].values, axis=0)

array([[  0, 111],
       [  1,  72]])

# Training inline with reading data

In [12]:
def make_training_set(df, label):
    features_list = []
    label_list = []

    sf = 100
    time_interval = 3600
    n = edf_df.shape[0]
    n_hours = n//(time_interval*sf)
    for j in range(n_hours):
        temp_df = edf_df.iloc[(j*time_interval*sf):((j+1)*time_interval*sf), :]
        features_list.append(temp_df[['EEG Fpz-Cz']].values)
        label_list.append(label)

    X = np.array(features_list)
    y = np.array([label] * n_hours)
    print(X.shape)
    print(y.shape)
    return X, y

In [None]:
## This loads each psg data file INLINE with training.. I can't load all the training data at once :(
model.train()
for i, path in enumerate(eeg_info_df.path.values):
    print(f'Training from data: {path}...')
    tic = time()
    raw_edf = read_raw_edf(os.path.join('../data', path))
    edf_df = raw_edf.to_data_frame()
    edf_df = edf_df[edf_df['EEG Fpz-Cz'] > -1000]  # remove outliers
    label = eeg_info_df.label.iloc[i]
    toc = time()
    print(f'Time Taken (Loading Data): {round(toc-tic)}')
    
    tic = time()
    X, y = make_training_set(edf_df, label)
    trainset = TensorDataset(torch.from_numpy(X.astype('float32')), torch.from_numpy(y.astype('long')))
    trainloader = DataLoader(trainset, batch_size=10, shuffle=True, num_workers=2)
    toc = time()
    print(f'Time Taken (Preprocessing Data): {round(toc-tic)}')
    
    total_training_tic = time()
    print(f'Training... X: {X.shape}, y: {y.shape}')
    for i, data in enumerate(trainloader, 0):
        inputs, targets = data    
        optimizer.zero_grad()

        tic = time()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        toc = time()
        print(f'Time Taken (Training Step {i}): {round(toc-tic)}')
    total_training_toc = time()
    tic = total_training_tic
    toc = total_training_toc
    print(f'Time Taken (Training Complete): {round(toc-tic)}')
        
    data = next(iter(trainloader))
    inputs, targets = data
    outputs = model(inputs)
    probability, predicted = torch.max(outputs.data, 1)
    print((targets == predicted).squeeze())
    print(f'Training from data: {path}, is complete.')

Training from data: sleep-cassette/SC4001E0-PSG.edf...
Extracting EDF parameters from /Users/oulrich/Projects/MastersDegree/Semesters/2019_Fall/CSE-6250-HealthData/Project/CSE6250-project/data/sleep-cassette/SC4001E0-PSG.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Converting "time" to "<class 'numpy.int64'>"...
Time Taken (Loading Data): 7
(22, 360000, 1)
(22,)
Time Taken (Preprocessing Data): 0
Training... X: (22, 360000, 1), y: (22,)
