In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import BatchNormalization
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import ConvLSTM2D
from keras.utils import to_categorical
from matplotlib import pyplot
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import math
print(os.listdir("."))

# Any results you write to the current directory are saved as output.

['.ipynb_checkpoints', 'detect_peaks.py', 'evaluators.py', 'kernel1b66f5b77c (1).ipynb', 'kernel1b66f5b77c.ipynb', 'load_data.py', 'logistic_regression.py', 'ml-unibuc-2019-23.zip', 'naive_bayes.py', 'perceptron.py', 'sample_submission.csv', 'signal_analysis_utils.py', 'siml', 'sk_utils.py', 'test', 'test_labels.csv', 'train', 'train_labels.csv', '__init__.py', '__pycache__']


In [178]:
# constants
# Dataset locations, relative to the notebook's working directory.
train_root_path = './train'
train_labels_path = './train_labels.csv'
test_root_path = './test'

# Expected number of recordings in the train / test folders.
train_samples = 9000
test_samples = 5000
# Number of columns kept from every recording (see load_file).
n_features = 3
# Number of rows every recording is resized to (see load_file).
bucket_size = 150

In [179]:
# load a single file as a numpy array
def load_file(filepath):
    """Read one headerless CSV and return it as a (bucket_size, n_features) array.

    np.resize truncates recordings that hold more values than the target shape
    and repeats the data from the start when they hold fewer.
    """
    raw_values = pd.read_csv(filepath, header=None).values
    target_shape = (bucket_size, n_features)
    return np.resize(raw_values, target_shape)

# Smoke test: load one known training recording and print the resulting array shape.
print(load_file(os.path.join(train_root_path, '16763.csv')).shape)

# load a list of files and return as a 3d numpy array
def load_group(root_path, limit = None):
    """Load up to `limit` files from `root_path` and stack them into one array.

    Files are taken in the order returned by os.listdir; `limit=None` loads
    every directory entry. The result has one entry along axis 0 per loaded
    file, each entry being whatever load_file returns for that file.
    """
    filenames = os.listdir(root_path)
    max_files = len(filenames) if limit is None else limit
    samples = [
        load_file(os.path.join(root_path, name))
        for position, name in enumerate(filenames)
        if position < max_files
    ]
    # axis 0 indexes the files, so the feature columns end up on the last axis
    return np.stack(samples, axis=0)

# Smoke test: stack the first 100 training recordings and print the resulting shape.
print(load_group(train_root_path, limit=100).shape)

# load train labels
def load_train_labels(labels_path, train_data_root_path, limit = train_samples):
    """Return the class label of each training file, in os.listdir order.

    The labels CSV is read once and turned into an id -> class lookup table,
    so every file costs one dictionary lookup instead of a scan of the whole
    frame. The files are visited in the same os.listdir order that load_group
    uses, which keeps label i aligned with sample i.

    Parameters
    ----------
    labels_path : path of a CSV with the columns 'id' and 'class'.
    train_data_root_path : directory whose file names are '<id>.<extension>'.
    limit : number of label slots to fill / length of the returned array.

    Returns
    -------
    1-D float array of length `limit`. Slots beyond the number of files in the
    directory stay 0. If an id occurs more than once in the CSV, its last row
    is used.

    Raises
    ------
    KeyError if a file's id has no row in the labels CSV.
    """
    labels_frame = pd.read_csv(labels_path)
    class_by_id = dict(zip(labels_frame['id'], labels_frame['class']))

    train_labels = np.zeros(limit)
    for i, filename in enumerate(os.listdir(train_data_root_path)):
        if i >= limit: break
        file_id = int(filename.split('.')[0], 10)
        if file_id not in class_by_id:
            raise KeyError(f"no label found for training file id {file_id} in {labels_path}")
        train_labels[i] = int(class_by_id[file_id])

    return train_labels

# Smoke test: read the labels of the first 100 training files and print the array shape.
print(load_train_labels(train_labels_path, train_root_path, limit = 100).shape)

# load test ids
def load_test_ids(test_root_path):
    """Return the numeric ids encoded in the file names of `test_root_path`.

    Every file name is expected to look like '<id>.<extension>'. The ids are
    returned in os.listdir order (the order load_group uses for the same
    directory), one entry per file, so the result always lines up with the
    loaded data regardless of how many files the directory holds.

    Returns
    -------
    1-D float64 array with one id per directory entry (empty for an empty
    directory). Float is kept for backward compatibility; callers convert
    with int().

    Raises
    ------
    ValueError if a file name does not start with a base-10 integer.
    """
    file_ids = [int(filename.split('.')[0], 10) for filename in os.listdir(test_root_path)]
    return np.array(file_ids, dtype=float)

# Smoke test: print the ids parsed from the test file names.
print(load_test_ids(test_root_path))

(150, 3)
(100, 150, 3)
(100,)
[10001. 10002. 10004. ... 23992. 23998. 24000.]


In [180]:
# Load every recording and its metadata from disk.
train_data = load_group(train_root_path)
# Shift the labels down by one so they start at 0 (presumably the CSV classes are
# 1-based; the submission cell adds the 1 back).
train_labels = load_train_labels(train_labels_path, train_root_path) - 1
test_data = load_group(test_root_path)
test_ids = load_test_ids(test_root_path)

# One shape per line: train data, train labels, test data, test ids.
print(*(loaded.shape for loaded in (train_data, train_labels, test_data, test_ids)), sep='\n')

(9000, 150, 3)
(9000,)
(5000, 150, 3)
(5000,)


In [169]:
# Split the labelled data into a training part (80%) and a held-out part (20%).
# A fixed random_state makes the split identical on every kernel restart, so
# scores obtained on the held-out part stay comparable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    train_data, train_labels, test_size=0.20, random_state=42
)

In [181]:
def _with_channel_mean(data):
    """Return `data` with the mean over its last axis appended as one extra channel."""
    channel_mean = np.mean(data, axis=2, keepdims=True)
    return np.append(data, channel_mean, axis=2)

# Only augment arrays that still hold the raw n_features channels, so that
# re-running this cell does not keep stacking additional mean channels.
if test_data.shape[2] == n_features:
    test_data = _with_channel_mean(test_data)
if train_data.shape[2] == n_features:
    train_data = _with_channel_mean(train_data)

print(train_data.shape)
print(test_data.shape)

(9000, 150, 4)
(5000, 150, 4)


In [182]:
def encode_labels(labels, num_classes=None):
    """One-hot encode integer class labels and print the encoded shape.

    Parameters
    ----------
    labels : array of integer class indices starting at 0.
    num_classes : optional total number of classes. When None (the default,
        same as before) to_categorical infers it from the largest label
        present, which can give different column counts for different subsets
        of the data; pass it explicitly to keep them consistent.

    Returns
    -------
    The one-hot encoded label matrix produced by to_categorical.
    """
    encoded = to_categorical(labels, num_classes=num_classes)
    print(encoded.shape)

    return encoded

# one hot encode train_labels y
# Guarded on the number of dimensions so that re-running this cell does not
# one-hot encode the already encoded label matrix a second time.
if train_labels.ndim == 1:
    train_labels = encode_labels(train_labels)
else:
    print(train_labels.shape)

(9000, 20)


In [183]:
# standardize data
def standardize_data(data, mean=None, std=None):
    """Return a standardized copy of `data`: (data - mean) / std.

    The input array is left untouched; a new array is returned.

    Parameters
    ----------
    data : numeric array.
    mean, std : optional statistics to standardize with. When omitted they are
        computed over all values of `data` (one global mean / std, not one per
        feature). Pass the statistics of the training data to put another
        data set on the same scale.

    Returns
    -------
    Array of the same shape as `data`. If the standard deviation is 0
    (constant input) the values are only centred, avoiding a division by zero.
    """
    values = np.asarray(data)
    if mean is None:
        mean = np.mean(values)
    if std is None:
        std = np.std(values)
    if std == 0:
        std = 1.0

    return (values - mean) / std

# Standardize both sets with the mean / std of the training data, so that the
# model sees test inputs on exactly the scale it was trained on.
# The statistics are captured first because they describe the unscaled data.
train_mean, train_std = np.mean(train_data), np.std(train_data)
train_data = standardize_data(train_data)
test_data = (test_data - train_mean) / train_std

In [186]:
# fit and evaluate a model
def evaluate_model_ConvLSTM2D(trainX, trainy, testX, testy,
                              epochs=100, batch_size=128, verbose=1, n_steps=5):
    """Train a ConvLSTM2D classifier on (trainX, trainy) and print its accuracy on (testX, testy).

    Every sample of shape (timesteps, features) is cut into `n_steps`
    consecutive sub-sequences of equal length, giving the 5-D layout
    (samples, n_steps, 1, timesteps // n_steps, features) fed to ConvLSTM2D.

    Parameters
    ----------
    trainX, testX : arrays of shape (samples, timesteps, features).
    trainy, testy : one-hot label arrays of shape (samples, n_classes).
    epochs, batch_size, verbose : passed to model.fit (batch_size is also used
        for model.evaluate). The defaults are the previously hard-coded values.
    n_steps : number of sub-sequences per sample; must divide the number of
        timesteps. The default 5 reproduces the former 5 x 30 split for
        150-timestep samples.

    Returns
    -------
    The fitted Keras model.

    Raises
    ------
    ValueError if `n_steps` does not divide the number of timesteps, or if
    testX and trainX disagree on (timesteps, features).
    """
    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainy.shape[1]

    if n_steps <= 0 or n_timesteps % n_steps != 0:
        raise ValueError(
            "n_steps=%r must be a positive divisor of the number of timesteps (%d)"
            % (n_steps, n_timesteps)
        )
    if tuple(testX.shape[1:]) != tuple(trainX.shape[1:]):
        raise ValueError(
            "testX has per-sample shape %r but trainX has %r"
            % (tuple(testX.shape[1:]), tuple(trainX.shape[1:]))
        )

    # reshape into subsequences (samples, time steps, rows, cols, channels)
    n_length = n_timesteps // n_steps
    trainX = trainX.reshape((trainX.shape[0], n_steps, 1, n_length, n_features))
    testX = testX.reshape((testX.shape[0], n_steps, 1, n_length, n_features))

    # define model
    model = Sequential()
    model.add(ConvLSTM2D(filters=64, kernel_size=(1,3), activation='tanh', input_shape=(n_steps, 1, n_length, n_features), ))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(100, activation='tanh'))
    model.add(Dense(n_outputs, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # fit network
    model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, verbose=verbose)

    # evaluate model
    _, accuracy = model.evaluate(testX, testy, batch_size=batch_size, verbose=0)
    print(accuracy)

    return model

In [None]:
# Fit the model on the complete training set. The same arrays are passed as the
# evaluation set, so the accuracy printed at the end is a training accuracy.
model = evaluate_model_ConvLSTM2D(
    trainX=train_data, trainy=train_labels,
    testX=train_data, testy=train_labels,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
1536/9000 [====>.........................] - ETA: 20s - loss: 0.2134 - acc: 0.9245

In [None]:
# make predictions for test_data
def predictConvLSTM(model, testX, n_steps=5):
    """Run `model.predict` on `testX` after cutting every sample into sub-sequences.

    Parameters
    ----------
    model : object with a `predict` method accepting the 5-D layout
        (samples, n_steps, 1, timesteps // n_steps, features).
    testX : array of shape (samples, timesteps, features).
    n_steps : number of sub-sequences per sample; must divide the number of
        timesteps and match the value the model was built with. The default 5
        reproduces the former 5 x 30 split for 150-timestep samples.

    Returns
    -------
    Whatever `model.predict` returns for the reshaped input.

    Raises
    ------
    ValueError if testX is not 3-D or `n_steps` does not divide its timesteps.
    """
    n_samples, n_timesteps, n_features = testX.shape
    if n_steps <= 0 or n_timesteps % n_steps != 0:
        raise ValueError(
            "n_steps=%r must be a positive divisor of the number of timesteps (%d)"
            % (n_steps, n_timesteps)
        )

    # reshape into subsequences (samples, time steps, rows, cols, channels)
    n_length = n_timesteps // n_steps
    batches = testX.reshape((n_samples, n_steps, 1, n_length, n_features))

    return model.predict(batches)

In [None]:
# Predict class scores for the whole test set, then show the zero-based class
# index with the highest score for the first test sample.
predictions = predictConvLSTM(model, test_data)
predictions[0].argmax()

In [None]:
# Build the submission columns in one vectorized step: the id of each test file
# and its predicted class. The "+ 1" undoes the "- 1" applied to the labels
# when they were loaded.
predicted_classes = np.argmax(predictions, axis=1) + 1
d = {
    'id': [int(test_id) for test_id in test_ids[:len(predictions)]],
    'class': predicted_classes.tolist(),
}

# Show a short preview instead of dumping every prediction into the cell output.
print({column: values[:10] for column, values in d.items()})
print(len(d['id']), 'rows')

In [None]:
# Persist the predictions as the submission file: one row per test sample,
# columns in the order of the dict keys, without the DataFrame index.
dataframe = pd.DataFrame(d)
dataframe.to_csv('test_labels.csv', index=False)