In [1]:
import numpy as np 
import pandas as pd 
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_digits
from sklearn import metrics
%matplotlib inline

import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
train=pd.read_csv('X_train.csv')
y=pd.read_csv('y_train.csv')
test=pd.read_csv('X_test.csv')
sub = pd.read_csv('sample_submission.csv')
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3816 entries, 0 to 3815
Data columns (total 2 columns):
series_id    3816 non-null int64
surface      3816 non-null object
dtypes: int64(1), object(1)
memory usage: 59.7+ KB


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487680 entries, 0 to 487679
Data columns (total 13 columns):
row_id                   487680 non-null object
series_id                487680 non-null int64
measurement_number       487680 non-null int64
orientation_X            487680 non-null float64
orientation_Y            487680 non-null float64
orientation_Z            487680 non-null float64
orientation_W            487680 non-null float64
angular_velocity_X       487680 non-null float64
angular_velocity_Y       487680 non-null float64
angular_velocity_Z       487680 non-null float64
linear_acceleration_X    487680 non-null float64
linear_acceleration_Y    487680 non-null float64
linear_acceleration_Z    487680 non-null float64
dtypes: float64(10), int64(2), object(1)
memory usage: 48.4+ MB


In [4]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487680 entries, 0 to 487679
Data columns (total 13 columns):
row_id                   487680 non-null object
series_id                487680 non-null int64
measurement_number       487680 non-null int64
orientation_X            487680 non-null float64
orientation_Y            487680 non-null float64
orientation_Z            487680 non-null float64
orientation_W            487680 non-null float64
angular_velocity_X       487680 non-null float64
angular_velocity_Y       487680 non-null float64
angular_velocity_Z       487680 non-null float64
linear_acceleration_X    487680 non-null float64
linear_acceleration_Y    487680 non-null float64
linear_acceleration_Z    487680 non-null float64
dtypes: float64(10), int64(2), object(1)
memory usage: 48.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488448 entries, 0 to 488447
Data columns (total 13 columns):
row_id                   488448 non-null object
series_id                488448 no

In [6]:
list(pd.unique(y['surface']))

['fine_concrete',
 'concrete',
 'soft_tiles',
 'tiled',
 'soft_pvc',
 'hard_tiles_large_space',
 'carpet',
 'hard_tiles',
 'wood']

In [7]:
train.describe()

Unnamed: 0,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z
count,487680.0,487680.0,487680.0,487680.0,487680.0,487680.0,487680.0,487680.0,487680.0,487680.0,487680.0,487680.0
mean,1904.5,63.5,-0.01805,0.075062,0.012458,-0.003804,0.000178,0.008338,-0.019184,0.129281,2.886468,-9.364886
std,1099.853353,36.949327,0.685696,0.708226,0.105972,0.104299,0.117764,0.088677,0.229153,1.8706,2.140067,2.845341
min,0.0,0.0,-0.9891,-0.98965,-0.16283,-0.15662,-2.371,-0.92786,-1.2688,-36.067,-121.49,-75.386
25%,952.0,31.75,-0.70512,-0.68898,-0.089466,-0.10606,-0.040752,-0.033191,-0.090743,-0.530833,1.9579,-10.193
50%,1904.5,63.5,-0.10596,0.237855,0.031949,-0.018704,8.4e-05,0.005412,-0.005335,0.12498,2.8796,-9.3653
75%,2857.0,95.25,0.651803,0.80955,0.12287,0.097215,0.040527,0.048068,0.064604,0.792263,3.7988,-8.5227
max,3809.0,127.0,0.9891,0.98898,0.15571,0.15477,2.2822,1.0791,1.3873,36.797,73.008,65.839


In [8]:
def quaternion_to_euler(x, y, z, w):
    import math
    t0 = +2.0 * (w * x + y * z)
    t1 = +1.0 - 2.0 * (x * x + y * y)
    X = math.atan2(t0, t1)

    t2 = +2.0 * (w * y - z * x)
    t2 = +1.0 if t2 > +1.0 else t2
    t2 = -1.0 if t2 < -1.0 else t2
    Y = math.asin(t2)

    t3 = +2.0 * (w * z + x * y)
    t4 = +1.0 - 2.0 * (y * y + z * z)
    Z = math.atan2(t3, t4)

    return X, Y, Z

def feature_extraction(actual):
    new = pd.DataFrame()
    actual['total_angular_velocity'] = actual['angular_velocity_X'] + actual['angular_velocity_Y'] + actual['angular_velocity_Z']
    actual['total_linear_acceleration'] = actual['linear_acceleration_X'] + actual['linear_acceleration_Y'] + actual['linear_acceleration_Z']
    
    actual['acc_vs_vel'] = actual['total_linear_acceleration'] / actual['total_angular_velocity']
    
    x, y, z, w = actual['orientation_X'].tolist(), actual['orientation_Y'].tolist(), actual['orientation_Z'].tolist(), actual['orientation_W'].tolist()
    nx, ny, nz = [], [], []
    for i in range(len(x)):
        xx, yy, zz = quaternion_to_euler(x[i], y[i], z[i], w[i])
        nx.append(xx)
        ny.append(yy)
        nz.append(zz)
    
    actual['euler_x'] = nx
    actual['euler_y'] = ny
    actual['euler_z'] = nz
    
    def f1(x):
        return np.mean(np.diff(np.abs(np.diff(x))))
    
    def f2(x):
        return np.mean(np.abs(np.diff(x)))
    
    for col in actual.columns:
        if col in ['row_id', 'series_id', 'measurement_number']:
            continue
        new[col + '_mean'] = actual.groupby(['series_id'])[col].mean()
        new[col + '_min'] = actual.groupby(['series_id'])[col].min()
        new[col + '_max'] = actual.groupby(['series_id'])[col].max()
        new[col + '_std'] = actual.groupby(['series_id'])[col].std()
        new[col + '_max_to_min'] = new[col + '_max'] / new[col + '_min']
        
        # Change. 1st order.
        new[col + '_mean_abs_change'] = actual.groupby('series_id')[col].apply(f2)
        
        # Change of Change. 2nd order.
        new[col + '_mean_change_of_abs_change'] = actual.groupby('series_id')[col].apply(f1)
        
        new[col + '_abs_max'] = actual.groupby('series_id')[col].apply(lambda x: np.max(np.abs(x)))
        new[col + '_abs_min'] = actual.groupby('series_id')[col].apply(lambda x: np.min(np.abs(x)))

    return newdef missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        
        return mis_val_table_ren_columns

In [10]:
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        
        return mis_val_table_ren_columns

In [11]:
train_df = feature_extraction(train)
test_df = feature_extraction(test)

In [12]:
train_df['acc_vs_vel_std']=train_df['acc_vs_vel_std'].fillna(0)
train_df['acc_vs_vel_mean_change_of_abs_change']=train_df['acc_vs_vel_mean_change_of_abs_change'].fillna(0)

In [13]:
train_df=train_df.astype('float32')
test_df=test_df.astype('float32')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3810 entries, 0 to 3809
Columns: 144 entries, orientation_X_mean to euler_z_abs_min
dtypes: float32(144)
memory usage: 2.1 MB


In [14]:
le = LabelEncoder()
target = le.fit_transform(y['surface'])    #label Encoding is required to convert names to num

In [15]:
target

array([2, 1, 1, ..., 2, 7, 5])

In [16]:
# Importing random forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

# Running the random forest with default parameters.
rfc = RandomForestClassifier()

In [21]:
missing_values_table(train_df) 

Your selected dataframe has 144 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [22]:
train_df.fillna(0, inplace = True)
test_df.fillna(0, inplace = True)
train_df.replace(-np.inf, 0, inplace = True)
train_df.replace(np.inf, 0, inplace = True)
test_df.replace(-np.inf, 0, inplace = True)
test_df.replace(np.inf, 0, inplace = True)

In [23]:
# fit
rfc.fit(train_df,target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
# Making predictions
predictions = rfc.predict(test_df)

In [28]:
sub['surface'] = le.inverse_transform(predictions)
sub.to_csv('sample2.csv', index=False)

In [31]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
     'bootstrap': [False],
     'max_depth': [40, 50, 60, 80],
     'max_features': ['auto','sqrt'],
     'min_samples_leaf': [1, 2, 3],
     'min_samples_split': [4, 3, 2],
     'n_estimators': [500, 900, 1000, 1200]
 }
 # Create a based model
rf = RandomForestClassifier()
 # Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                           cv = 3, n_jobs = -1, verbose = 3)
 # Fit the grid search to the data
grid_search.fit(train_df, target)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 62.8min
[Parallel(n_jobs=-1)]: Done 504 tasks      | elapsed: 111.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 173.2min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed: 188.6min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'bootstrap': [False], 'max_depth': [40, 50, 60, 80], 'max_features': ['auto', 'sqrt'], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [4, 3, 2], 'n_estimators': [500, 900, 1000, 1200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [32]:
best_model=grid_search.best_estimator_
x=grid_search.best_params_
best_model.set_params()
best_model.get_params()

{'bootstrap': False,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 40,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [33]:
best_model.fit(train_df,target)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=40, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [34]:
pred2=best_model.predict(test_df)
pred2

array([4, 1, 7, ..., 4, 1, 8])

In [35]:
sub['surface'] = le.inverse_transform(pred2)
sub.to_csv('sample-7.csv', index=False)
sub.head(3)

Unnamed: 0,series_id,surface
0,0,hard_tiles_large_space
1,1,concrete
2,2,tiled
