In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers import ConvLSTM2D
from keras.utils import to_categorical
from matplotlib import pyplot
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import math
print(os.listdir("./"))

# Any results you write to the current directory are saved as output.

Using TensorFlow backend.


In [None]:
# constants
train_root_path = './train'
train_labels_path = './train_labels.csv'
test_root_path = './test'

N = 135
f_s = 100
t_n = 1.35
T = t_n / N

train_samples = 9000
test_samples = 5000
n_features = 3
bucket_size = 145

In [None]:
# load a single file as a numpy array
def load_file(filepath):
    dataframe = pd.read_csv(filepath, header=None)
    return np.resize(dataframe.values, (bucket_size, n_features))

# test load_file
print(load_file(os.path.join(train_root_path, '16763.csv')).shape)

# load a list of files and return as a 3d numpy array
def load_group(root_path, limit = None):
    loaded = list()
    filenames = os.listdir(root_path)
    if limit is None: 
        limit = len(filenames)
    for i, name in enumerate(filenames):
        if i >= limit: break
        data = load_file(os.path.join(root_path, name))
        loaded.append(data)
    # stack group so that features are the 3rd dimension
    loaded = np.stack(loaded, axis=0)
    return loaded

# test load_group
print(load_group(train_root_path, limit=100).shape)

# load train labels
def load_train_labels(labels_path, train_data_root_path, limit = train_samples):
    train_labels = np.zeros(limit)
    loaded = pd.read_csv(labels_path)
    
    for i, filename in enumerate(os.listdir(train_data_root_path)):
        if i >= limit: break
        file_id = int(filename.split('.')[0], 10)
        train_labels[i] = int(loaded[loaded['id'] == file_id]['class'])
        
    return train_labels.reshape(limit, )

# test load_train_labels
print(load_train_labels(train_labels_path, train_root_path, limit = 100).shape)

# load test ids
def load_test_ids(test_root_path):
    test_ids = np.zeros(test_samples)
    
    for i, filename in enumerate(os.listdir(test_root_path)):
        file_id = int(filename.split('.')[0], 10)
        test_ids[i] = file_id
        
    return test_ids

def normalize_data(train_data, test_data, type=None):
    if type == None:
        return train_data, test_data
    
    if type == 'standard':
        scaler = preprocessing.StandardScaler()
        scaler.fit(train_data)
    
    if type == 'minmax':
        scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)) # (0, 1) default
        scaler.fit(train_data)
        
    if type == 'l1':
        train_data_l1 = preprocessing.normalize(train_data, norm='l1', axis=1)
        test_data_l1 = preprocessing.normalize(test_data, norm='l1', axis=1)
        
        return train_data_l1, test_data_l1
    
    if type == 'l2':
        train_data_l2 = preprocessing.normalize(train_data, norm='l2', axis=1)
        test_data_l2 = preprocessing.normalize(test_data, norm='l2', axis=1)
        
        return train_data_l2, test_data_l2
    
    train_data_scaled = scaler.transform(train_data)
    test_data_scaled = scaler.transform(test_data)
        
    return train_data_scaled, test_data_scaled

def svm_classifier(train_data, train_labels, test_data, C, algo):
    modelSVM = svm.SVC(C, algo)
    modelSVM.fit(train_data, train_labels)
    
    train_labels_predicted = modelSVM.predict(train_data)
    test_labels_predicted = modelSVM.predict(test_data)
    
    return train_labels_predicted, test_labels_predicted

def compute_accuracy(true_labels, predicted_labels):
    return (true_labels == predicted_labels).mean()

# test load_test_ids
print(load_test_ids(test_root_path))

In [4]:
# load data from input
X_train = train_data = load_group(train_root_path)
Y_train = Y_test = test_labels = train_labels = load_train_labels(train_labels_path, train_root_path) - 1
X_test = test_data = load_group(test_root_path)
test_ids = load_test_ids(test_root_path)

print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(test_ids.shape)

(9000, 150, 3)
(9000,)
(5000, 150, 3)
(5000,)


In [44]:
# Split train_data
X_train = train_data
Y_train = train_labels
X_test = test_data
X_train, X_test, Y_train, Y_test = train_test_split(train_data, train_labels, test_size=0.20)

In [45]:
# from siml.sk_utils import *
# from siml.signal_analysis_utils import *

# denominator = 10
# X_train, Y_train = extract_features_labels(train_data, train_labels, T, N, f_s, denominator)
# X_test, Y_test = extract_features_labels(test_data, test_labels, T, N, f_s, denominator)


In [49]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(7200, 150, 3)
(7200,)
(1800, 150, 3)
(1800,)


In [47]:
def extract_features(data):
    feature_engin = []
    feature_engin.append(np.mean(data, axis=2))
#     feature_engin.append(np.var(data, axis=1))
#     feature_engin.append(np.std(data, axis=1))
#     feature_engin.append(np.median(data, axis=1))
#     feature_engin.append(np.amin(data, axis=1))
#     feature_engin.append(np.amax(data, axis=1))
#     feature_engin.append(np.average(data, axis=1))
#     feature_engin.append(np.quantile(data, 0.5, axis=1))
#     feature_engin.append(np.argmax(data, axis=1))
#     feature_engin.append(np.prod(data**2, axis=1))


#     feature_engin.append(np.sum(data**2, axis=1))

#     feature_engin.append(np.sqrt(np.sum(data**2, axis=1)))

    new_data = []
    for feature in feature_engin:
        new_data = np.append(new_data, feature)
    
    size_f = np.sum([x.shape[1] for x in feature_engin])
    return new_data.reshape(len(data), size_f)

# train_features = np.reshape(X_train, (X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
# test_features = np.reshape(X_test, (X_test.shape[0], X_test.shape[1]*X_test.shape[2]))
# train_features = np.append(train_features, extract_features(X_train), axis = 1)
# test_features = np.append(test_features, extract_features(X_test), axis = 1)

# train_features = extract_features(X_train)
# test_features = extract_features(X_test)

print(train_features.shape)
print(test_features.shape)

(7200, 600)
(1800, 600)


In [143]:
# clf = RandomForestClassifier(n_estimators=1000)
# clf.fit(X_train, Y_train)
# print("Accuracy on training set is : {}".format(clf.score(X_train, Y_train)))
# print("Accuracy on test set is : {}".format(clf.score(X_test, Y_test)))


In [144]:
# dict_results = batch_classify(train_features, train_labels, test_features, test_labels)
# display_dict_models(dict_results)

In [36]:
def evaluateSVM(train_data, train_labels, test_data, test_labels):
    C = [1, 10, 50, 100]
    norm_types = ['standard']
    algo = ['rbf']
    accuracy = np.zeros((len(norm_types), 2, len(C)))

    for j in range(len(norm_types)):
        for i in range(len(C)):
            for a in range(len(algo)):
                train_data, test_data = normalize_data(train_data, test_data, type=norm_types[j])
                train_labels_predicted, test_labels_predicted = svm_classifier(train_data, train_labels, test_data, C[i], algo[a])

                print("Trainaccuracy (" + norm_types[j] + "," + str(C[i]) + "," + algo[a] + "): " + str(compute_accuracy(train_labels, train_labels_predicted)))
                print("Test accuracy (" + norm_types[j] + "," + str(C[i]) + "," + algo[a] + "): " + str(compute_accuracy(test_labels, test_labels_predicted)))

                accuracy[j, 0, i] = compute_accuracy(train_labels, train_labels_predicted)
                accuracy[j, 1, i] = compute_accuracy(test_labels, test_labels_predicted)
    
    print(accuracy)
    
# evaluateSVM(X_train, Y_train, X_test, Y_test)
evaluateSVM(train_features[:1000], Y_train[:1000], test_features[:1000], Y_test[:1000])
# X_train, X_test = normalize_data(train_features, test_features, type='standard')
# _, test_labels_predicted = svm_classifier(X_train, Y_train, X_test, 100, 'rbf')

Trainaccuracy (standard,1,rbf): 0.936
Test accuracy (standard,1,rbf): 0.752
Trainaccuracy (standard,10,rbf): 0.998
Test accuracy (standard,10,rbf): 0.807
Trainaccuracy (standard,50,rbf): 1.0
Test accuracy (standard,50,rbf): 0.801
Trainaccuracy (standard,100,rbf): 1.0
Test accuracy (standard,100,rbf): 0.801
[[[0.936 0.998 1.    1.   ]
  [0.752 0.807 0.801 0.801]]]


In [60]:
def evaluate_RandomForest(train_data, train_labels, test_data, test_labels):
    rf = RandomForestClassifier(n_estimators = 1000)
#     train_data, test_data = normalize_data(train_data, test_data, type="standard")
    rf.fit(train_data, train_labels)
    
    train_labels_predicted = rf.predict(train_data)
    test_labels_predicted = rf.predict(test_data)
    print("Train accuracy: ", compute_accuracy(train_labels, train_labels_predicted))
    print("Test accuracy: ", compute_accuracy(test_labels, test_labels_predicted))
    
    return test_labels_predicted

evaluate_RandomForest(train_features[:5000], Y_train[:5000], test_features[:1000], Y_test[:1000])
# test_labels_predicted = evaluate_RandomForest(train_features, train_labels, test_features, test_labels)

Train accuracy:  1.0
Test accuracy:  0.831


array([13.,  6., 14.,  4.,  0., 19., 12.,  4., 11., 17., 19.,  2., 13.,
       14., 18., 10., 13.,  1.,  0.,  4., 15., 10., 15.,  5., 18., 14.,
       13.,  2.,  2., 13., 18.,  4., 11.,  0.,  2.,  6.,  9., 14.,  3.,
       19., 13., 16., 12., 10.,  1., 10., 13.,  3., 13.,  1., 17., 12.,
       16., 16., 15., 16.,  8., 10., 14., 12., 13.,  2., 11.,  1.,  5.,
        6., 11., 16.,  8., 18., 19., 10., 11., 13., 11., 17., 10., 18.,
        5.,  0.,  8., 13.,  2.,  7., 18., 18., 15.,  0., 17., 12.,  2.,
        4.,  2., 14., 10., 12., 11., 18.,  8.,  2., 19.,  9., 16.,  6.,
       15.,  9.,  1., 19.,  4.,  6.,  7.,  9., 12.,  3.,  6., 10.,  4.,
        7.,  1.,  5.,  7.,  8., 13., 19., 13.,  0., 10.,  9.,  1.,  6.,
       17.,  2., 12.,  8., 14., 16., 16.,  3., 18.,  8.,  8., 15., 10.,
        9.,  8., 15., 11.,  3.,  7., 13.,  1., 16.,  4.,  3., 11., 16.,
        1.,  9., 11.,  4., 17.,  7.,  3., 13.,  6.,  6., 10.,  0., 16.,
       13., 13., 15.,  2., 13.,  7., 12., 12.,  4.,  4., 18., 15

In [None]:
# fit a model
def evaluate_model_LSTM(trainX, trainY):
    verbose, epochs, batch_size = 0, 15, 64
    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainY.shape[1]
    model = Sequential()
    model.add(LSTM(100, input_shape=(n_timesteps, n_features)))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # fit network
    model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=verbose)
    
    # evaluate model
    _, accuracy = model.evaluate(trainX, trainY, batch_size=batch_size, verbose=0)
    print(accuracy)
    
    return model

In [38]:
# fit and evaluate a model
def evaluate_model_ConvLSTM2D(trainX, trainy, testX, testy):
    # define model
    verbose, epochs, batch_size = 0, 150, 128
    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainy.shape[1]
    
    # reshape into subsequences (samples, time steps, rows, cols, channels)
    n_steps, n_length = 4, 37
    trainX = trainX.reshape((trainX.shape[0], n_steps, 1, n_length, n_features))
    testX = testX.reshape((testX.shape[0], n_steps, 1, n_length, n_features))

    # define model
    model = Sequential()
    model.add(ConvLSTM2D(filters=64, kernel_size=(1,3), activation='relu', input_shape=(n_steps, 1, n_length, n_features)))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # fit network
    model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, verbose=verbose)
    
    # evaluate model
    _, accuracy = model.evaluate(testX, testy, batch_size=batch_size, verbose=0)
    print(accuracy)
    
    return model

In [None]:
# fit and evaluate random forrest model
def evaluate_model_RandomForrest(trainX, trainY, testX, testY):
    # Create a Gaussian Classifier
    model = RandomForestClassifier(n_estimators=1000)

    # Train the model using the training sets y_pred=clf.predict(X_test)
    model.fit(trainX, trainY)
    
    # Evaluate model
    print("Accuracy on training set is : {}".format(model.score(trainX, trainY)))
    print("Accuracy on test set is : {}".format(model.score(testX, testY)))
    
    return model

In [None]:
r = X_train
for i in range(X_train.shape[0]):
    for el in np.mean(X_train[i], axis=1):
        print(r.shape)
        print(el)
        
r

In [48]:
# evaluate a model


model = evaluate_model_ConvLSTM2D(X_train, Y_train.ravel(), X_test, Y_test.ravel())
# model = evaluate_model_ConvLSTM2D(train_data, train_labels, test_data, test_labels)
# model = evaluate_model_ConvLSTM2D(train_data, train_labels, train_data, train_labels)

ValueError: cannot reshape array of size 3240000 into shape (7200,150,4)

#### Test with different configs
4 features 
verbose, epochs, batch_size = 0, 25, 64      -> 0.847


3 features 
verbose, epochs, batch_size = 0, 25, 64      -> 0.8795


3 features 
verbose, epochs, batch_size = 0, 35, 128     -> 0.8845


3 features
verbose, epochs, batch_size = 1, 50, 128     -> 0.8984


3 features
verbose, epochs, batch_size = 1, 100, 128    -> 0.9115

3 features + std
verbose, epochs, batch_size = 0, 100, 128    -> 0.9044

3 features + std
verbose, epochs, batch_size = 0, 150, 128    -> 0.9194

3 features + std
bucket size = 145
verbose, epochs, batch_size = 0, 150, 128    -> 0.9439



In [None]:
# make predictions for test_data
def predictConvLSTM(model, testX):
    # reshape into subsequences (samples, time steps, rows, cols, channels)
    n_steps, n_length, n_features = 4, 37, testX.shape[2]
    testX = testX.reshape((testX.shape[0], n_steps, 1, n_length, n_features))
    predictions = model.predict(testX)
    
    return predictions

In [None]:
def evaluate_including_test_data():
    global model, train_data, test_data, train_labels
    
    test_labels = predictConvLSTM(model, test_data)
    print(train_data.shape)
    print(test_data.shape)
    
    print(train_labels.shape)
    print(test_labels.shape)

    train_data = np.concatenate((train_data, test_data))
    train_labels = np.concatenate((train_labels, test_labels))

    model = evaluate_model_ConvLSTM2D(train_data, train_labels, train_data, train_labels)    
    
evaluate_including_test_data()

In [None]:
# test predict
predictions = predictConvLSTM(model, test_data)
np.argmax(predictions[0])

In [161]:
print(test_labels_predicted)

[2. 1. 4. ... 3. 4. 0.]


In [162]:
d = {'id' : [], 'class' : []}
for i in range(len(test_labels_predicted)):
    d['id'].append(int(test_ids[i]))
    d['class'].append(int(test_labels_predicted[i]) + 1)

print(d)

{'id': [10001, 10002, 10004, 10008, 10009, 10010, 10011, 10015, 10023, 10027, 10032, 10034, 10035, 10040, 10041, 10043, 10044, 10045, 10048, 10051, 10057, 10060, 10061, 10063, 10068, 10069, 10073, 10080, 10083, 10084, 10089, 10093, 10095, 10097, 10099, 10102, 10104, 10105, 10107, 10115, 10116, 10118, 10123, 10126, 10127, 10129, 10130, 10131, 10134, 10136, 10137, 10139, 10141, 10147, 10148, 10149, 10151, 10152, 10156, 10157, 10158, 10160, 10162, 10167, 10171, 10175, 10177, 10180, 10182, 10184, 10190, 10194, 10195, 10197, 10198, 10202, 10203, 10204, 10209, 10210, 10223, 10226, 10232, 10235, 10236, 10237, 10241, 10242, 10244, 10246, 10247, 10248, 10250, 10251, 10256, 10259, 10260, 10261, 10263, 10264, 10265, 10267, 10269, 10273, 10276, 10285, 10287, 10291, 10296, 10300, 10301, 10304, 10305, 10309, 10310, 10320, 10322, 10323, 10324, 10325, 10329, 10336, 10337, 10338, 10341, 10351, 10352, 10354, 10357, 10358, 10359, 10361, 10364, 10366, 10369, 10370, 10374, 10375, 10377, 10381, 10382, 10384

In [163]:
# Write test_labels csv file

dataframe = pd.DataFrame(data=d)
dataframe.to_csv('test_labels.csv', index=False)