In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Helping Robots

In [None]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

In [None]:
# Read the two sensor tables, the training labels and the submission template,
# in that order, and bind them to the names used throughout the notebook.
train, test, target, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/X_train.csv',
        '../input/X_test.csv',
        '../input/y_train.csv',
        '../input/sample_submission.csv',
    )
)

In [None]:
train.head()

In [None]:
# No null values
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

There are no null values in the dataset

In [None]:
train.isnull().sum()

In [None]:
test.head()

In [None]:
target.head()

In [None]:
len(target.series_id.unique())

In [None]:
# Report the dimensions of each loaded table. The third table is the target
# (labels) frame, so it is labelled as such rather than as a second "test set".
for set_name, frame in (('training', train), ('test', test), ('target', target)):
    print('There are {} rows and {} columns for {} set'.format(frame.shape[0], frame.shape[1], set_name))

In [None]:
len(train['series_id'].value_counts())

The input data covers 10 sensor channels, with 128 measurements per time series.

The orientation channels encode the robot's current orientation as a quaternion (see Wikipedia). The angular velocity channels describe the direction and speed of rotation, and the linear acceleration components describe how the velocity is changing at different times.

In [None]:
len(test.series_id.unique())

In [None]:
# remove the row_id
# train = X_train.iloc[:,1:]
# test = X_test.iloc[:,1:]

In [None]:
train.describe().T

In [None]:
test.describe().T

In [None]:
# Compare the number of distinct series in the test set with the training set.
# Counting series_id values directly avoids assuming a fixed number of rows
# per series and yields an integer count instead of a float.
diff = test['series_id'].nunique() - train['series_id'].nunique()
print('Test has',diff,' extra series')

In [None]:
train.columns

In [None]:
#features = data.columns.values[2:]
#plot_feature_distribution(data, test, 'train', 'test', features)

# Overlay the train and test densities of orientation_X to compare them.
# The bandwidth is passed as a number: the string '0.5' is neither a numeric
# bandwidth nor the name of a bandwidth rule.
sns.kdeplot(train['orientation_X'], bw=0.5, label='train')
sns.kdeplot(test['orientation_X'], bw=0.5, label='test')
plt.legend();

In [None]:
# Correlation heatmap
# Only numeric columns can be correlated, so select them explicitly instead of
# relying on corr() to cope with any non-numeric column in the frame.
plt.figure(figsize=(15,10))
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True, cmap='coolwarm')

In [None]:
# https://stackoverflow.com/questions/53033620/how-to-convert-euler-angles-to-quaternions-and-get-the-same-euler-angles-back-fr?rq=1
def quaternion_to_euler(x, y, z, w):
    """Convert a single quaternion (x, y, z, w) to three angles in radians.

    Returns a tuple ``(X, Y, Z)``. X and Z are produced by ``atan2``; Y is
    produced by ``asin`` after its argument has been clamped to [-1, 1].
    """
    from math import asin, atan2

    # First angle: atan2 of the two terms built from (w, x) and (y, z).
    x_num = 2.0 * (w * x + y * z)
    x_den = 1.0 - 2.0 * (x * x + y * y)
    angle_x = atan2(x_num, x_den)

    # Second angle: clamp before asin so values slightly outside [-1, 1]
    # do not raise a domain error.
    y_arg = 2.0 * (w * y - z * x)
    if y_arg > 1.0:
        y_arg = 1.0
    elif y_arg < -1.0:
        y_arg = -1.0
    angle_y = asin(y_arg)

    # Third angle: same construction as the first, with the roles of x and z swapped.
    z_num = 2.0 * (w * z + x * y)
    z_den = 1.0 - 2.0 * (y * y + z * z)
    angle_z = atan2(z_num, z_den)

    return angle_x, angle_y, angle_z

In [None]:
def fe_step0(actual):
    """Add quaternion norm, modulus and normalised components to ``actual``.

    The frame is modified in place and also returned. Columns added, in order:
    ``norm_quat`` (sum of the squared orientation components), ``mod_quat``
    (square root of ``norm_quat``) and ``norm_X``, ``norm_Y``, ``norm_Z``,
    ``norm_W`` (each orientation component divided by ``mod_quat``).

    References:
    https://www.mathworks.com/help/aeroblks/quaternionnorm.html
    https://www.mathworks.com/help/aeroblks/quaternionmodulus.html
    https://www.mathworks.com/help/aeroblks/quaternionnormalize.html
    """
    source_cols = ('orientation_X', 'orientation_Y', 'orientation_Z', 'orientation_W')
    normalised_cols = ('norm_X', 'norm_Y', 'norm_Z', 'norm_W')

    # Accumulate the squared components in X, Y, Z, W order.
    squared_total = actual[source_cols[0]] ** 2
    for col in source_cols[1:]:
        squared_total = squared_total + actual[col] ** 2

    actual['norm_quat'] = squared_total
    actual['mod_quat'] = actual['norm_quat'] ** 0.5

    for src, dst in zip(source_cols, normalised_cols):
        actual[dst] = actual[src] / actual['mod_quat']

    return actual

In [None]:
%%time
train = fe_step0(train)
test = fe_step0(test)

In [None]:
print(train.shape)
train.head()

In [None]:
test.head()

In [None]:
def fe_step1(actual):
    """Add Euler-angle columns derived from the normalised quaternion columns.

    Reads ``norm_X``, ``norm_Y``, ``norm_Z`` and ``norm_W`` row by row, passes
    each row to ``quaternion_to_euler`` and stores the three results in
    ``euler_x``, ``euler_y`` and ``euler_z``. The frame is modified in place
    and returned.
    """
    quaternion_rows = zip(
        actual['norm_X'].tolist(),
        actual['norm_Y'].tolist(),
        actual['norm_Z'].tolist(),
        actual['norm_W'].tolist(),
    )
    angles = [quaternion_to_euler(qx, qy, qz, qw) for qx, qy, qz, qw in quaternion_rows]

    actual['euler_x'] = [triple[0] for triple in angles]
    actual['euler_y'] = [triple[1] for triple in angles]
    actual['euler_z'] = [triple[2] for triple in angles]
    return actual

In [None]:
train = fe_step1(train)
test = fe_step1(test)
print(train.shape)
train.head()
#test.head()

In [None]:
# train_with_label = pd.merge(train,y_train)

In [None]:
#final_test = test

In [None]:
len(test['series_id'].unique())

In [None]:
sns.countplot(y='surface',data=target)

In [None]:
def total_values_fe(data):
    """Add vector-magnitude features and an acceleration/velocity ratio.

    The frame is modified in place and also returned. Columns added:

    - ``totl_anglr_vel``: Euclidean norm of ``angular_velocity_X/Y/Z``
    - ``totl_linr_acc``: Euclidean norm of ``linear_acceleration_X/Y/Z``
    - ``totl_xyz``: Euclidean norm of ``orientation_X/Y/Z``
    - ``acc_vs_vel``: ``totl_linr_acc / totl_anglr_vel``

    All three components are squared before summing. Previously the Z
    component was added unsquared, which is not a magnitude and gives NaN
    whenever the sum under the square root is negative.
    """
    def _magnitude(prefix):
        # sqrt(X^2 + Y^2 + Z^2) for the three columns sharing this prefix.
        return (data[prefix + '_X']**2 + data[prefix + '_Y']**2 +
                data[prefix + '_Z']**2)**0.5

    data['totl_anglr_vel'] = _magnitude('angular_velocity')
    data['totl_linr_acc'] = _magnitude('linear_acceleration')
    data['totl_xyz'] = _magnitude('orientation')
    # Ratio of the two magnitudes. The original author chose a/v rather than
    # v/a because the acceleration values are the larger of the two.
    # A zero velocity magnitude yields inf/NaN here.
    data['acc_vs_vel'] = data['totl_linr_acc'] / data['totl_anglr_vel']
    return data

In [None]:
data = total_values_fe(train)
test = total_values_fe(test)
print(data.shape)
data.head()

In [None]:
test.head()

In [None]:
len(test['series_id'].unique())

In [None]:
# Remove the per-row bookkeeping columns from both frames; series_id is kept
# because the aggregation below groups on it.
data, test = (
    frame.drop(columns=['row_id', 'measurement_number'])
    for frame in (data, test)
)

In [None]:
train_data = pd.DataFrame()
test_data = pd.DataFrame()

In [None]:
%%time
# Per-series summary statistics for every feature column:
# max, min, mean, median, quartiles (25% / 50% / 75%), abs_max, std,
# range (max - min) and max/min ratio.
#
# Both the training and the test statistics are computed from the frames that
# had the bookkeeping columns dropped (`data` and `test`), and each frame is
# grouped only once instead of once per statistic.
id_columns = ('row_id', 'series_id', 'measurement_number')
train_groups = data.groupby(by='series_id')
test_groups = test.groupby(by='series_id')

for i in data.columns:
    if i in id_columns:
        continue
    print(i)
    for groups, summary in ((train_groups, train_data), (test_groups, test_data)):
        feature = groups[i]
        feature_max = feature.max()
        feature_min = feature.min()

        summary[i + '_max'] = feature_max
        summary[i + '_min'] = feature_min
        summary[i + '_mean'] = feature.mean()
        summary[i + '_median'] = feature.median()
        summary[i + '_quantile_25'] = feature.quantile(0.25)
        summary[i + '_quantile_50'] = feature.quantile(0.5)
        summary[i + '_quantile_75'] = feature.quantile(0.75)
        # The largest absolute value in a group is either |max| or |min|,
        # so no per-group Python callback is needed.
        summary[i + '_abs_max'] = np.maximum(feature_max.abs(), feature_min.abs())
        summary[i + '_std'] = feature.std()
        summary[i + '_range'] = feature_max - feature_min
        # A zero minimum makes this ratio inf/NaN.
        summary[i + '_maxtoMin'] = feature_max / feature_min


In [None]:
print(train_data.shape)
train_data.head()

In [None]:
# Check whether the aggregated training features contain any NaN
# (evaluates to True if at least one value is missing).
train_data.isnull().values.any()

In [None]:
# Replace every unusable value (NaN, +inf, -inf) with zero, in place,
# in both the training and the test feature tables.
for feature_table in (train_data, test_data):
    feature_table.fillna(0, inplace=True)
    feature_table.replace([-np.inf, np.inf], 0, inplace=True)

In [None]:
train_data.isnull().values.any()

In [None]:
# Label encoding: replace the surface names in `target` with integer class codes.
# The fitted encoder `le` is kept so that predicted codes can be mapped back to
# surface names with le.inverse_transform when the submission is built.
# NOTE(review): target['surface'] is overwritten in place, so re-running this
# cell would refit the encoder on the integer codes rather than on the names.
le = LabelEncoder()
target['surface'] = le.fit_transform(target['surface'])

In [None]:
target['surface'].value_counts()

In [None]:
# Using RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Cross-validation setup.
# folds:     stratified 10-fold splitter with a fixed seed
# predicted: accumulated class probabilities for the test series, one column
#            per encoded class (taken from the fitted label encoder instead of
#            a hard-coded 9)
# measured:  out-of-fold predicted class for every training series
# score:     running sum of the per-fold validation accuracies
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=20)
predicted = np.zeros((test_data.shape[0], len(le.classes_)))
measured = np.zeros((train_data.shape[0]))
score = 0

In [None]:
# Stratified K-fold training loop.
# For every fold: fit a random forest on the training part, store the
# out-of-fold predictions, add this fold's share of the test probabilities,
# and accumulate the validation accuracy.
labels = target['surface'].values  # plain array, so fold indices are applied positionally
for times, (trn_idx, val_idx) in enumerate(folds.split(train_data.values, labels)):
    # Fixed seed so that repeated runs give the same forests.
    rf = RandomForestClassifier(n_estimators=500, random_state=20, n_jobs=-1)
    rf.fit(train_data.iloc[trn_idx], labels[trn_idx])
    measured[val_idx] = rf.predict(train_data.iloc[val_idx])
    # predict_proba columns follow rf.classes_; index by them so the
    # probabilities land in the right columns even if a fold lacks a class.
    predicted[:, rf.classes_] += rf.predict_proba(test_data) / folds.n_splits
    fold_score = rf.score(train_data.iloc[val_idx], labels[val_idx])
    score += fold_score
    print("Fold: {} score: {}".format(times, fold_score))
    gc.collect()
# Only meaningful once every fold has contributed, so report it after the loop.
print('Avg. accuracy', score / folds.n_splits)

In [None]:
 # Mean validation accuracy: the summed per-fold scores divided by the number of folds.
 print('Avg. accuracy',score /folds.n_splits)

In [None]:
# Take the most probable class for each test series, translate the class codes
# back to surface names, write the submission file and preview it.
best_class_codes = np.argmax(predicted, axis=1)
submission['surface'] = le.inverse_transform(best_class_codes)
submission.to_csv('submission_stratified_kfold.csv', index=False)
submission.head()

In [None]:
submission.head(100)

In [None]:
# import time
# params = {
#     'num_leaves': 18,
#     'min_data_in_leaf': 40,
#     'objective': 'multiclass',
#     'metric': 'multi_error',
#     'max_depth': 8,
#     'learning_rate': 0.01,
#     "boosting": "gbdt",
#     "bagging_freq": 5,
#     "bagging_fraction": 0.8126672064208567,
#     "bagging_seed": 11,
#     "verbosity": -1,
#     'reg_alpha': 0.1,
#     'reg_lambda': 0,
#     "num_class": 9,
#     'nthread': -1
# }

# t0 = time.time()
# train_set = lgb.Dataset(train_df, label=target)
# eval_hist = lgb.cv(params, train_set, nfold=10, num_boost_round=9999,
#                    early_stopping_rounds=100, seed=19)
# num_rounds = len(eval_hist['multi_error-mean'])
# # retrain the model and make predictions for test set
# clf = lgb.train(params, train_set, num_boost_round=num_rounds)
# predictions = clf.predict(test_df, num_iteration=None)
# print("Timer: {:.1f}s".format(time.time() - t0))

In [None]:
# Predictions on test set
# y_pred = rfc.predict(test_data)

In [None]:
# Number of test series that received a prediction. `y_pred` is never
# assigned (its assignment is commented out), so use the probability matrix
# filled by the cross-validation loop instead.
len(predicted)

In [None]:
# target = pd.DataFrame(y_pred,index=test_data.index,columns=['surface'])

In [None]:
# len(target['surface'])

In [None]:
# target.to_csv('rf.csv')

In [None]:
# more work to come