In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import ConvLSTM2D
from keras.utils import to_categorical
from matplotlib import pyplot
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import math
# Sanity check: list the working directory so the expected competition files
# (train/, test/, train_labels.csv, ...) can be confirmed before loading.
print(os.listdir("./"))

# Any results you write to the current directory are saved as output.

['.ipynb_checkpoints', 'kernel1b66f5b77c.ipynb', 'ml-unibuc-2019-23.zip', 'sample_submission.csv', 'test', 'train', 'train_labels.csv']


In [4]:
# constants
# Locations of the training recordings, the training label table and the
# test recordings, relative to the working directory.
train_root_path = './train'
train_labels_path = './train_labels.csv'
test_root_path = './test'

# NOTE(review): N, f_s, t_n and T are not referenced anywhere else in the
# visible notebook. Presumably they are samples per recording, sampling
# frequency, recording duration and sampling period (T = t_n / N) --
# TODO confirm, or remove if unused.
N = 135
f_s = 100
t_n = 1.35
T = t_n / N

# train_samples is the default label limit of load_train_labels and the base
# of the hold-out split index (train_samples - 1000) in split_train_data.
train_samples = 9000
# Expected number of test recordings (the loaded test set has 5000 samples).
test_samples = 5000
# Every loaded file is resized to (bucket_size, n_features) by load_file.
n_features = 3
bucket_size = 145

In [5]:
# load a single file as a numpy array
def load_file(filepath):
    """Read one headerless CSV file and force it to a fixed 2-D shape.

    The file's values are passed through ``np.resize`` with the target shape
    ``(bucket_size, n_features)``: surplus values are dropped, and a file with
    too few values is filled by repeating its data from the start.

    Args:
        filepath: path of the CSV file to read (no header row).

    Returns:
        Array of shape ``(bucket_size, n_features)``.
    """
    raw_values = pd.read_csv(filepath, header=None).values
    target_shape = (bucket_size, n_features)
    return np.resize(raw_values, target_shape)

# Smoke test for load_file: read one hard-coded training file and one
# hard-coded test file and print their shapes; both are expected to be
# (bucket_size, n_features).
print(load_file(os.path.join(train_root_path, '16763.csv')).shape)
print(load_file(os.path.join(test_root_path, '17928.csv')).shape)

(145, 3)
(145, 3)


In [6]:
# load a list of files and return as a 3d numpy array
def load_group(root_path, limit = None):
    """Load the files of a directory into one stacked array.

    Files are visited in the order returned by ``os.listdir``. Only the first
    ``limit`` entries are read (all of them when ``limit`` is None). Each file
    is read with ``load_file`` and the results are stacked along a new leading
    axis, so the per-file features end up on the last axis.

    Args:
        root_path: directory containing the files to load.
        limit: maximum number of directory entries to load, or None for all.

    Returns:
        Array whose first axis indexes the loaded files.
    """
    names = os.listdir(root_path)
    max_files = len(names) if limit is None else limit
    arrays = [
        load_file(os.path.join(root_path, name))
        for position, name in enumerate(names)
        if position < max_files
    ]
    # stack so that the file index becomes the first dimension
    return np.stack(arrays, axis=0)

# Smoke test for load_group: load only the first 100 training files and print
# the stacked shape, expected to be (100, bucket_size, n_features).
print(load_group(train_root_path, limit=100).shape)

(100, 145, 3)


In [7]:
# load train labels
def load_train_labels(labels_path, train_data_root_path, limit = train_samples):
    """Return the class label of each of the first ``limit`` training files.

    The directory is walked in ``os.listdir`` order, the same order
    ``load_group`` uses, so row ``i`` of the result belongs to the ``i``-th
    listed file.

    Args:
        labels_path: CSV file with an ``id`` column and a ``class`` column.
        train_data_root_path: directory whose file names start with the
            integer id, followed by a dot (for example ``16763.csv``).
        limit: number of directory entries to label.

    Returns:
        Float array of shape ``(limit, 1)``. If the directory holds fewer than
        ``limit`` files, the remaining rows stay 0.

    Raises:
        KeyError: if a file's id has no row in the label table.
        ValueError: if a file name does not start with an integer id.
    """
    label_table = pd.read_csv(labels_path)
    # Build the id -> class lookup once instead of scanning the whole table
    # with a boolean mask for every file. This also avoids int() on a
    # one-element Series, which pandas 2.x deprecates.
    class_by_id = dict(zip(label_table['id'], label_table['class']))

    train_labels = np.zeros(limit)
    for i, filename in enumerate(os.listdir(train_data_root_path)):
        if i >= limit: break
        file_id = int(filename.split('.')[0], 10)
        if file_id not in class_by_id:
            raise KeyError(
                "no label found for training file id {}".format(file_id))
        train_labels[i] = int(class_by_id[file_id])

    return train_labels.reshape(limit, 1)

# Smoke test for load_train_labels: label the first 100 training files and
# print the shape, expected to be (100, 1).
print(load_train_labels(train_labels_path, train_root_path, limit = 100).shape)

(100, 1)


In [8]:
# load test ids
def load_test_ids(test_root_path):
    """Return the numeric id of every file in the test directory.

    The id is the part of the file name before the first dot, parsed as a
    base-10 integer. Ids are returned in ``os.listdir`` order, the same order
    ``load_group`` uses, so position ``i`` matches the ``i``-th loaded sample.

    The result is sized from the directory listing rather than from a fixed
    constant. A directory with more files than expected no longer raises
    ``IndexError``, and one with fewer files no longer yields trailing 0 ids.

    Args:
        test_root_path: directory containing the test files.

    Returns:
        1-D float64 array with one id per directory entry. The float dtype is
        kept for compatibility with the previous ``np.zeros``-based result.

    Raises:
        ValueError: if a file name does not start with an integer id.
    """
    filenames = os.listdir(test_root_path)
    file_ids = [int(name.split('.')[0], 10) for name in filenames]
    return np.array(file_ids, dtype=np.float64)

# Smoke test for load_test_ids: print the ids parsed from the test directory's
# file names.
print(load_test_ids(test_root_path))

[10001. 10002. 10004. ... 23992. 23998. 24000.]


In [21]:
# load data from input
# Only the first 100 training files are loaded here. The two limit values
# must stay equal so that data rows and label rows line up.
train_data = load_group(train_root_path, limit=100)
# Subtracting 1 shifts the labels down by one (the submission code adds 1
# back); presumably the CSV classes are 1-based -- TODO confirm.
train_labels = load_train_labels(train_labels_path, train_root_path, limit=100) - 1
test_data = load_group(test_root_path)
test_ids = load_test_ids(test_root_path)

# Shape check of everything that was loaded.
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(test_ids.shape)

(100, 145, 3)
(100, 1)
(5000, 145, 3)
(5000,)


In [22]:
# standardize data
def standardize_data(data):
    """Return a copy of ``data`` shifted to zero mean and scaled to unit std.

    Mean and standard deviation are computed over the whole array (all axes
    at once), not per feature.

    The input is no longer modified in place, so re-running a cell cannot
    silently alter an array that other code still refers to. Integer input is
    converted to float64 instead of raising a casting error. Constant input
    (std == 0) is only centred, which avoids a division by zero.

    Args:
        data: array-like of numbers.

    Returns:
        New array with the same shape as ``data``. Floating-point and complex
        dtypes are preserved; any other dtype becomes float64.
    """
    # Work on a private copy so the caller's array is left untouched.
    result = np.array(data, copy=True)
    if not np.issubdtype(result.dtype, np.inexact):
        result = result.astype(np.float64)

    result -= np.mean(result)
    spread = np.std(result)
    # Skip the scaling step for constant data instead of producing NaN/inf.
    if spread > 0:
        result /= spread

    return result

# Each array is standardized independently, using its own overall mean and
# standard deviation.
# NOTE(review): the test set is therefore scaled with its own statistics
# rather than the training set's -- confirm this is intended.
train_data = standardize_data(train_data)
test_data = standardize_data(test_data)

# ---- SPLIT TRAIN_DATA ----

In [23]:
# hold out the tail of the training arrays as a local evaluation split
def split_train_data():
    """Split the global training arrays at index ``train_samples - 1000``.

    Rows before the split index stay in the globals ``train_data`` and
    ``train_labels``. The remaining rows are stored in the globals
    ``test_data`` and ``test_labels``, replacing whatever ``test_data`` held
    before. The shapes of the four resulting arrays are printed.
    """
    global train_data, test_data
    global train_labels, test_labels

    split_at = train_samples - 1000
    train_data, test_data = np.array_split(train_data, [split_at])
    train_labels, test_labels = np.array_split(train_labels, [split_at])

    for part in (train_data, test_data, train_labels, test_labels):
        print(part.shape)
    
# split_train_data()

In [24]:
def _append_magnitude(data):
    """Return ``data`` with sqrt(x^2 + y^2 + z^2) inserted as channel 3.

    ``data`` must be 3-D with at least three channels on its last axis; the
    first three channels are taken as x, y and z.
    """
    x, y, z = data[..., 0], data[..., 1], data[..., 2]
    magnitude = np.sqrt(x ** 2 + y ** 2 + z ** 2)
    # Insert at position 3 on the feature axis; any further channels are kept
    # after the new one, matching np.insert(data, 3, ..., axis=2).
    return np.concatenate(
        (data[..., :3], magnitude[..., np.newaxis], data[..., 3:]), axis=2)


def add_gravitation_orientation():
    """Add a magnitude channel to the global ``train_data`` and ``test_data``.

    Despite the name, the added feature is the Euclidean norm of the first
    three channels of every time step. Both globals are replaced by new arrays
    with one extra channel at index 3, and their new shapes are printed.

    The computation is vectorised instead of looping over every row in
    Python. Integer input is promoted to float, so the magnitude is no longer
    truncated.

    Calling this twice adds a second magnitude channel; run it once per load.
    """
    global train_data, test_data

    train_data = _append_magnitude(train_data)
    print(train_data.shape)

    test_data = _append_magnitude(test_data)
    print(test_data.shape)

# add_gravitation_orientation()

In [25]:
def encode_labels(labels, num_classes=None):
    """One-hot encode integer class labels and print the encoded shape.

    Args:
        labels: integer class indices, as accepted by ``to_categorical``.
        num_classes: width of the one-hot encoding. With the default ``None``
            the width is inferred by ``to_categorical`` from the labels, as
            before. Pass it explicitly when encoding a subset that may not
            contain the highest class, so every split gets the same width.

    Returns:
        The one-hot encoded label array returned by ``to_categorical``.
    """
    one_hot = to_categorical(labels, num_classes=num_classes)
    print(one_hot.shape)

    return one_hot

# one hot encode train_labels y
# NOTE(review): this overwrites train_labels, so re-running the cell would
# encode the already one-hot array a second time.
train_labels = encode_labels(train_labels)

# one hot encode test_labels y
# (left disabled: test_labels only exists after split_train_data() has run)
# test_labels = encode_labels(test_labels)

(100, 20)


In [26]:
# fit a model
def evaluate_model_LSTM(trainX, trainY):
    """Train an LSTM classifier and print its accuracy on the training data.

    The network is LSTM(100) -> Dropout(0.5) -> Dense(100, relu) ->
    Dense(n_outputs, softmax), trained for 15 epochs with batch size 64.
    The printed accuracy is measured on the same data the model was fitted
    on; no held-out data is involved.

    Args:
        trainX: input array; axis 1 is used as time steps and axis 2 as
            features.
        trainY: target array; axis 1 gives the number of outputs.

    Returns:
        The fitted model.
    """
    epoch_count = 15
    samples_per_batch = 64
    sequence_shape = (trainX.shape[1], trainX.shape[2])
    class_count = trainY.shape[1]

    network = Sequential()
    layer_stack = (
        LSTM(100, input_shape=sequence_shape),
        Dropout(0.5),
        Dense(100, activation='relu'),
        Dense(class_count, activation='softmax'),
    )
    for layer in layer_stack:
        network.add(layer)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # train silently
    network.fit(trainX, trainY, epochs=epoch_count, batch_size=samples_per_batch, verbose=0)

    # report accuracy on the training data itself
    _loss, train_accuracy = network.evaluate(trainX, trainY, batch_size=samples_per_batch, verbose=0)
    print(train_accuracy)

    return network

In [27]:
# fit and evaluate a model
def evaluate_model_ConvLSTM2D(trainX, trainy, testX, testy):
    """Train a ConvLSTM2D classifier and print its accuracy on ``testX``.

    Both inputs are reshaped to
    ``(samples, 5, 1, 29, features)``, so each sample must hold exactly
    5 * 29 = 145 time steps. The feature count is taken from ``trainX`` and
    is also used when reshaping ``testX``. The network is
    ConvLSTM2D(64, kernel (1, 3), relu) -> Dropout(0.5) -> Flatten ->
    Dense(100, relu) -> Dense(n_outputs, softmax), trained for 150 epochs
    with batch size 128.

    Args:
        trainX: training input; axis 2 is used as features.
        trainy: training targets; axis 1 gives the number of outputs.
        testX: evaluation input with the same per-sample layout as ``trainX``.
        testy: evaluation targets.

    Returns:
        The fitted model.
    """
    epoch_count = 150
    samples_per_batch = 128
    class_count = trainy.shape[1]

    # per-sample layout: (time steps, rows, cols, channels)
    window_shape = (5, 1, 29, trainX.shape[2])
    train_windows = trainX.reshape((trainX.shape[0],) + window_shape)
    test_windows = testX.reshape((testX.shape[0],) + window_shape)

    network = Sequential()
    layer_stack = (
        ConvLSTM2D(filters=64, kernel_size=(1,3), activation='relu', input_shape=window_shape),
        Dropout(0.5),
        Flatten(),
        Dense(100, activation='relu'),
        Dense(class_count, activation='softmax'),
    )
    for layer in layer_stack:
        network.add(layer)
    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # train silently
    network.fit(train_windows, trainy, epochs=epoch_count, batch_size=samples_per_batch, verbose=0)

    # report accuracy on the evaluation arrays
    _loss, test_accuracy = network.evaluate(test_windows, testy, batch_size=samples_per_batch, verbose=0)
    print(test_accuracy)

    return network

In [19]:
# fit and evaluate a random forest model
def evaluate_model_RandomForrest(trainX, trainY, testX, testY):
    """Fit a 1000-tree random forest and print train and test accuracy.

    Args:
        trainX: training inputs passed to ``fit`` and ``score``.
        trainY: training targets.
        testX: evaluation inputs passed to ``score``.
        testY: evaluation targets.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    forest = RandomForestClassifier(n_estimators=1000)

    # Train on the training split only.
    forest.fit(trainX, trainY)

    # Score the training split first, then the evaluation split.
    train_accuracy = forest.score(trainX, trainY)
    print("Accuracy on training set is : {}".format(train_accuracy))
    test_accuracy = forest.score(testX, testY)
    print("Accuracy on test set is : {}".format(test_accuracy))

    return forest

In [34]:
# evaluate a model
# model = evaluate_model_RandomForrest(X_train, Y_train.ravel(), X_test, Y_test.ravel())
# model = evaluate_model_ConvLSTM2D(train_data, train_labels, test_data, test_labels)
# NOTE(review): `predictions` is only assigned in a later cell, and that cell
# itself needs `model`. On a fresh kernel (Restart & Run All) this line raises
# NameError; it only works with state left over from an earlier run.
# The accuracy printed here compares the new model against those earlier
# predictions, not against ground-truth test labels.
model = evaluate_model_ConvLSTM2D(train_data, train_labels, test_data, predictions)

0.744


#### Test with different configs
4 features 
verbose, epochs, batch_size = 0, 25, 64      -> 0.847


3 features 
verbose, epochs, batch_size = 0, 25, 64      -> 0.8795


3 features 
verbose, epochs, batch_size = 0, 35, 128     -> 0.8845


3 features
verbose, epochs, batch_size = 1, 50, 128     -> 0.8984


3 features
verbose, epochs, batch_size = 1, 100, 128    -> 0.9115

3 features + std
verbose, epochs, batch_size = 0, 100, 128    -> 0.9044

3 features + std
verbose, epochs, batch_size = 0, 150, 128    -> 0.9194

3 features + std
bucket size = 145
verbose, epochs, batch_size = 0, 150, 128    -> 0.9439



In [None]:
def evaluate_including_test_data():
    """Retrain on the training data plus the test data labelled by ``model``.

    The current global ``model`` predicts outputs for ``test_data``; these raw
    prediction rows are used as labels. The test samples and their predicted
    labels are appended to the globals ``train_data`` and ``train_labels``,
    and a new ConvLSTM2D model is trained and evaluated on the enlarged
    arrays. The result replaces the global ``model``.

    The shapes of the arrays are printed before they are concatenated.
    """
    global model, train_data, test_data, train_labels

    predicted_labels = predictConvLSTM(model, test_data)
    for array in (train_data, test_data, train_labels, predicted_labels):
        print(array.shape)

    # grow the training arrays with the test samples and their predictions
    train_data = np.concatenate((train_data, test_data))
    train_labels = np.concatenate((train_labels, predicted_labels))

    # train and evaluate on the same enlarged arrays
    model = evaluate_model_ConvLSTM2D(train_data, train_labels, train_data, train_labels)
    
# NOTE(review): every call appends test_data and its predicted labels to the
# global training arrays, so re-running this cell grows them again. It also
# needs predictConvLSTM, which is defined in a later cell.
evaluate_including_test_data()

(100, 145, 3)
(5000, 145, 3)
(100, 20)
(5000, 20)


In [29]:
# make predictions for test_data
def predictConvLSTM(model, testX):
    """Run ``model.predict`` on ``testX`` after windowing it for ConvLSTM2D.

    ``testX`` is reshaped to ``(samples, 5, 1, 29, features)``, so each
    sample must hold exactly 5 * 29 = 145 time steps.

    Args:
        model: object with a ``predict`` method.
        testX: input array; axis 0 is samples and axis 2 is features.

    Returns:
        Whatever ``model.predict`` returns for the reshaped input.
    """
    # layout: (samples, time steps, rows, cols, channels)
    window_shape = (testX.shape[0], 5, 1, 29, testX.shape[2])
    return model.predict(testX.reshape(window_shape))

# test predict
# Predict for the whole test set, then display the 0-based index of the
# highest-scoring output for the first sample as a quick check.
predictions = predictConvLSTM(model, test_data)
np.argmax(predictions[0])

2

In [None]:
# Build the submission columns: one row per prediction, pairing the file id
# at the same position in test_ids with the predicted class.
d = {'id' : [], 'class' : []}
for i in range(len(predictions)):
    # test_ids holds floats, so cast back to int for the output file
    d['id'].append(int(test_ids[i]))
    # argmax is 0-based; +1 undoes the "- 1" applied to the labels at load time
    d['class'].append(np.argmax(predictions[i]) + 1)

# NOTE(review): this prints every row of the dict; consider showing only a
# few entries.
print(d)

In [None]:
# Write test_labels csv file
# The dict built above becomes a two-column table (id, class) and is saved to
# test_labels.csv in the working directory without the pandas index column.

dataframe = pd.DataFrame(data=d)
dataframe.to_csv('test_labels.csv', index=False)