# Preprocessing for ECG Classification

> Copyright 2019 Dave Fernandes. All Rights Reserved.
> 
> Licensed under the Apache License, Version 2.0 (the "License");
> you may not use this file except in compliance with the License.
> You may obtain a copy of the License at
>
> http://www.apache.org/licenses/LICENSE-2.0
>  
> Unless required by applicable law or agreed to in writing, software
> distributed under the License is distributed on an "AS IS" BASIS,
> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> See the License for the specific language governing permissions and
> limitations under the License.

Data can be downloaded from: https://www.kaggle.com/coni57/model-from-arxiv-1805-00794

- Randomly sample 10% of data for the test set.
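- The remaining 90% is saved as the training set. A balanced copy of it is also saved, made by upsampling under-represented classes: each class is repeated a whole number of times, so the balance is approximate.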

In [None]:
import numpy as np
import pandas as pd
import pickle

# Fraction of the shuffled rows that is held out as the test set.
TEST_FRACTION = 0.1

# Input CSVs. Each row is one sample; its last column is the class label.
CSV_1 = './Data/mitbih_train.csv'
CSV_2 = './Data/mitbih_test.csv'

# Output pickles, each holding a dict with the keys 'x' and 'y'.
TRAIN_SET = './Data/train_set.pickle'
TEST_SET = './Data/test_set.pickle'

# Stack both CSVs row-wise, shuffle all rows, and keep only the raw array.
# The shuffle is unseeded, so every run produces a different split.
values = pd.concat(
    [pd.read_csv(csv_path, header=None) for csv_path in (CSV_1, CSV_2)],
    axis=0,
).sample(frac=1, axis=0).values

# Every column but the last is a feature; the last column is the label.
x = values[:, :-1]
y = values[:, -1].astype(int)
del values

# The first n shuffled rows form the test set, the rest the training set.
n = int(len(x) * TEST_FRACTION)

for out_path, rows in ((TEST_SET, slice(None, n)), (TRAIN_SET, slice(n, None))):
    with open(out_path, 'wb') as file:
        pickle.dump({'x': x[rows, :], 'y': y[rows]}, file)

In [None]:
# Output pickle for the balanced training set (dict with keys 'x' and 'y').
TRAIN_BALANCED = './Data/train_balanced.pickle'

# Class labels are the integers 0 .. NUM_CLASSES - 1.
NUM_CLASSES = 5

# Training portion of the shuffled data: everything after the first n rows.
xt = x[n:, :]
yt = y[n:]

# Samples of each class, and how many there are of each.
class_x = [xt[yt == label] for label in range(NUM_CLASSES)]
class_count = [len(x_i) for x_i in class_x]

# Whole number of copies of each class that brings it as close as possible to
# (without exceeding) the size of the largest class. A class with no samples
# gets zero copies instead of triggering a division by zero.
largest = max(class_count)
counts = np.array(
    [largest // size if size else 0 for size in class_count],
    dtype=int,
)

# Build each class's replicated slab once and join them in a single
# concatenate; growing the array inside a loop would re-copy all previously
# accumulated rows on every repetition.
x_bal = np.concatenate(
    [np.tile(x_i, (int(reps), 1)) for x_i, reps in zip(class_x, counts)],
    axis=0,
)
y_bal = np.concatenate(
    [
        np.full(int(reps) * size, label, dtype=int)
        for label, (size, reps) in enumerate(zip(class_count, counts))
    ]
)

print(np.shape(x_bal), np.shape(y_bal))

with open(TRAIN_BALANCED, 'wb') as file:
    pickle.dump({'x': x_bal, 'y': y_bal}, file)

## Next
Run the `ClassifyECG.ipynb` notebook next...