In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy import stats
import math
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, VotingClassifier
import lightgbm as lgb
import xgboost as xgb
import gc
import itertools
from tsfresh.feature_extraction.feature_calculators import kurtosis, skewness, mean_abs_change, mean_change, sample_entropy, abs_energy, absolute_sum_of_changes, quantile
from tsfresh.feature_selection.selection import select_features
from sklearn.cluster import KMeans
from scipy.spatial.distance import euclidean
from tqdm import tqdm

In [2]:
train = pd.read_csv("X_train.csv")
test = pd.read_csv("X_test.csv")
label = pd.read_csv("y_train.csv")
sub = pd.read_csv("sample_submission.csv")

In [3]:
def reduce_mem_usage(df):
    # iterate through all the columns of a dataframe and modify the data type
    #   to reduce memory usage.        
    
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [4]:
def plot_confusion_matrix(truth, pred, classes, normalize=False, title=''):
    cm = confusion_matrix(truth, pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    plt.figure(figsize=(10, 10))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix', size=15)
    plt.colorbar(fraction=0.046, pad=0.04)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.grid(False)
    plt.tight_layout()

In [5]:
def create_valid_set(label):
    # Lets try creating a validation set of 10% of the total size.
    ldict = {
        'concrete': 0.16,
        'soft_pvc': 0.18,
        'wood': 0.06,
        'tiled': 0.03,
        'fine_concrete': 0.10,
        'hard_tiles_large_space': 0.12,
        'soft_tiles': 0.23,
        'carpet': 0.05,
        'hard_tiles': 0.07,
    }
    score = 0
    print("Required count of target classes for the Valid Set :: ")
    for key, value in ldict.items():
        score += value
        print(key, int(value * 380)) # Multiplying by 380 i.e 10% of 3810 for our validation size of 10%.
        ldict[key] = int(value * 380)
    print("\nTotal Weights of class :: ", score)
    
    # Grouping surface with group_id and the count attached to each surface.
    ser = label.groupby(['surface'])['group_id'].value_counts()
    ser = pd.DataFrame(ser)
    ser.columns = ['count']
    
    # Maually creating the valid set using the counts using the required count and the count we have in the train set.
    # This dictionary consists of the group_id for the required valid set. 
    cv_set = {
        'concrete': [0],
        'soft_pvc': [69],
        'wood': [2],
        'tiled': [28],
        'fine_concrete': [36],
        'hard_tiles_large_space': [16],
        'soft_tiles': [4, 17],
        'carpet': [52],
        'hard_tiles': [27],
    }

    cv_size = 0
    for key, value in cv_set.items():
        print(key)
        for i in value:
            cv_size += label[label['group_id'] == i].shape[0]
            print("\nGot shape :: ", label[label['group_id'] == i].shape[0])
        print("Expected shape :: ", ldict[key])
    
    val_df = pd.DataFrame()
    for key, value in cv_set.items():
        for i in value:
            val_df = pd.concat([val_df, label[label['group_id'] == i]])
    print("Valid Set Size :: ", val_df.shape[0])
    
    # We have only 1 group_id for the hard_tiles and it consists of only 21 records.
    # So we have added the same group_id in the train as well as valid set. GROUP_ID = 27(for "hard_tiles")
    hard_tiles_index = label[(label['surface'] == 'hard_tiles') & (label['group_id'] == 27)].index
    
    # Therefore train set = Total Set series_id - Valid Set series_id + Hard_Tiles.index
    trn_series_id_list = list(set(label.series_id.unique()) - set(val_df.series_id.unique())) + hard_tiles_index.tolist()
    
    print("Train Set Distribution")
    print(label['surface'].iloc[trn_series_id_list].value_counts())
    
    print("Valid Set Distribution")
    print(label['surface'].iloc[val_df.index].value_counts())
    
    trn_df = label.iloc[trn_series_id_list]
    
    trn_df.set_index(['series_id'], inplace=True)
    val_df.set_index(['series_id'], inplace=True)
    
    return trn_df, val_df

In [6]:
def FE(data):
    df = pd.DataFrame()
    data['totl_anglr_vel'] = (data['angular_velocity_X']**2 + data['angular_velocity_Y']**2 +
                             data['angular_velocity_Z']**2)** 0.5
    data['totl_linr_acc'] = (data['linear_acceleration_X']**2 + data['linear_acceleration_Y']**2 +
                             data['linear_acceleration_Z']**2)**0.5
    # data['totl_xyz'] = (data['orientation_X']**2 + data['orientation_Y']**2 +
    #                     data['orientation_Z']**2)**0.5
    # data['z_planar_anglr_vel'] = (data['angular_velocity_X']**2 + data['angular_velocity_Y']**2)**0.5
    # data['z_planar_linr_acc'] = (data['linear_acceleration_X']**2 + data['linear_acceleration_Y']**2)**0.5
    #Lets derive one more column since there is a relationship in velocity and acceleration
    # v = u + a*t , u is initial velocty. if u = 0, then v = at means t = v/a
    # but value of acceleration is more and value of velocity is less, lets do a/v relation
    # data['acc_vs_vel'] = data['totl_linr_acc'] / data['totl_anglr_vel']
    
    # Deriving more feature, since we are reducing rows now, we should know min,max,mean values
    for col in data.columns:
        if col in ['row_id','series_id','measurement_number']:
            continue
        df[col + '_mean'] = data.groupby(['series_id'])[col].mean()
        df[col + '_median'] = data.groupby(['series_id'])[col].median()
        # df[col + '_max'] = data.groupby(['series_id'])[col].max()
        df[col + '_min'] = data.groupby(['series_id'])[col].min()
        df[col + '_std'] = data.groupby(['series_id'])[col].std()
        df[col + '_maxtoMin'] = data.groupby(['series_id'])[col].max() / data.groupby(['series_id'])[col].min()
        df[col + '_mean_abs_chg'] = data.groupby(['series_id'])[col].apply(lambda x: mean_abs_change(x))
        df[col + '_abs_min'] = data.groupby(['series_id'])[col].apply(lambda x: np.min(np.abs(x)))
        # df[col + '_abs_max'] = data.groupby(['series_id'])[col].apply(lambda x: np.max(np.abs(x)))
        df[col + '_kurtosis'] = data.groupby(['series_id'])[col].apply(lambda x: kurtosis(x))
        df[col + '_skewness'] = data.groupby(['series_id'])[col].apply(lambda x: skewness(x))
        # df[col + '_abs_avg'] = (df[col + '_abs_min'] + df[col + '_abs_max'])/2
        # df["diff_"+col] = df[col]-df[col].shift(1)
        # df["ma_"+col] = np.square(abs(df[col]-df[col].rolling(8).mean()))
    return df

In [7]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 48.37 MB
Memory usage after optimization is: 14.88 MB
Decreased by 69.2%
Memory usage of dataframe is 48.45 MB
Memory usage after optimization is: 14.91 MB
Decreased by 69.2%


In [8]:
pmg_df = pd.read_csv('pmg_df.csv')
pmg_ts_df = pd.read_csv('pmg_ts_df.csv')

In [9]:
trn_df, val_df = create_valid_set(label)
train = FE(train)
test = FE(test)

Required count of target classes for the Valid Set :: 
concrete 60
soft_pvc 68
wood 22
tiled 11
fine_concrete 38
hard_tiles_large_space 45
soft_tiles 87
carpet 19
hard_tiles 26

Total Weights of class ::  1.0
concrete

Got shape ::  57
Expected shape ::  60
soft_pvc

Got shape ::  70
Expected shape ::  68
wood

Got shape ::  18
Expected shape ::  22
tiled

Got shape ::  36
Expected shape ::  11
fine_concrete

Got shape ::  36
Expected shape ::  38
hard_tiles_large_space

Got shape ::  45
Expected shape ::  45
soft_tiles

Got shape ::  57

Got shape ::  12
Expected shape ::  87
carpet

Got shape ::  11
Expected shape ::  19
hard_tiles

Got shape ::  21
Expected shape ::  26
Valid Set Size ::  363
Train Set Distribution
concrete                  722
soft_pvc                  662
wood                      589
tiled                     478
fine_concrete             327
hard_tiles_large_space    263
soft_tiles                228
carpet                    178
hard_tiles                 21
Na

In [10]:
le = LabelEncoder()
label['surface'] = le.fit_transform(label['surface'])

In [11]:
train.fillna(0,inplace=True)
train.replace(-np.inf,0,inplace=True)
train.replace(np.inf,0,inplace=True)
test.fillna(0,inplace=True)
test.replace(-np.inf,0,inplace=True)
test.replace(np.inf,0,inplace=True)

In [12]:
train_groups = pd.DataFrame(pmg_df['group_id'])
x_train = train.iloc[trn_df.index]
y_train = label['surface'].iloc[trn_df.index]

x_val = train.iloc[val_df.index]
y_val = label['surface'].iloc[val_df.index]

# print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

In [13]:
train_groups['group_id'].nunique()

359

In [14]:
rand = RandomForestClassifier(n_estimators=200, random_state=1337)

In [15]:
rand.fit(x_train, y_train)
train_probs = rand.predict_proba(train)
print(train_probs.shape)
print(train_groups.shape)
preds_groups_df = pd.DataFrame(train_probs).join(train_groups)

(3810, 9)
(3810, 1)


In [16]:
full_train_predictions = np.zeros((train_probs.shape[0]), dtype='int32')
for g_id in range(preds_groups_df['group_id'].nunique()):
    group_indices = preds_groups_df[preds_groups_df['group_id']==g_id].index
    predicted_value = preds_groups_df[preds_groups_df['group_id']==g_id].mean()[[0,1,2,3,4,5,6,7,8]].idxmax()
    full_train_predictions[group_indices] = int(predicted_value)

print("Grouped CV Acc:", accuracy_score(le.inverse_transform(full_train_predictions[val_df.index]),le.inverse_transform(label['surface'].iloc[val_df.index])))
print("Standard CV Acc:", accuracy_score(le.inverse_transform(label['surface'].iloc[val_df.index]), le.inverse_transform(rand.predict(train.iloc[val_df.index]))))

ValueError: cannot convert float NaN to integer

In [None]:
g_id = 22
full_train_predictions = np.zeros((train_probs.shape[0]))
# print("Indices", preds_groups_df[preds_groups_df['group_id']==g_id].index)
# print("Prediction",preds_groups_df[preds_groups_df['group_id']==g_id].mean()[[1,2,3,4,5,6,7,8]].idxmax())
# full_train_predictions[preds_groups_df[preds_groups_df['group_id']==g_id].index] = preds_groups_df[preds_groups_df['group_id']==g_id].mean()[[1,2,3,4,5,6,7,8]].idxmax()
print(preds_groups_df[preds_groups_df['group_id']==g_id].mean()[[0,1,2,3,4,5,6,7,8]])

In [None]:
# TODO
# Implement a surface based success metric rather than group based.
# If the grouping is successful, then the feature engineering part causes the error. Try signal processing.
# Some groups need to be further split, looking at the probabilities. Use this information.
# Implement the same for test set.
# Analyze which classes are getting misclassified.

In [None]:
# indices = train_groups[train_groups['group_id']==741].index

In [None]:
train_groups.head()

In [None]:
correctly_grouped_count = 0
total_count = 0
singular_groups = 0
for c in range(train_groups['group_id'].nunique()):
    indices = train_groups[train_groups['group_id']==c].index
    correctly_grouped_count += label['group_id'].iloc[indices].value_counts().iloc[0]
    total_count += label['group_id'].iloc[indices].value_counts().sum()
    if train_groups[train_groups['group_id']==c].index.size == 1 or 0: singular_groups+=1

print(correctly_grouped_count/total_count)

In [None]:
singular_groups

In [None]:
'''
correctly_grouped_count = 0
total_count = 0
singular_groups = 0
for c in range(train_groups['group_id'].nunique()):
    indices = train_groups[train_groups['group_id']==c].index
    if train_groups[train_groups['group_id']==c].index.size == 1 or 0: singular_groups+=1

print(singular_groups)
'''

In [2]:
pmg_df = pd.read_csv("pmg_df.csv")
x_train = pd.read_csv("x_train.csv")
y_train = pd.read_csv("y_train.csv")


In [10]:
x_train_g = x_train.groupby(['series_id'])
y_train_g = y_train.groupby(['series_id'])
print(pmg_df.iloc[0])

series_id          0.000000
group_id           0.000000
distance           0.000488
nearest_point    475.000000
Name: 0, dtype: float64


In [28]:
x = 996
print(x_train.groupby(['series_id']).last().iloc[x])

row_id                      996_127
measurement_number              127
orientation_X              -0.75996
orientation_Y               -0.6326
orientation_Z              -0.10481
orientation_W              -0.10628
angular_velocity_X        0.0051752
angular_velocity_Y       -0.0088096
angular_velocity_Z       0.00087172
linear_acceleration_X      -0.44258
linear_acceleration_Y        4.1739
linear_acceleration_Z       -9.6836
Name: 996, dtype: object


In [29]:
x = 1116
print(x_train.groupby(['series_id']).first().iloc[x])

row_id                     1116_0
measurement_number              0
orientation_X            -0.92772
orientation_Y             0.34254
orientation_Z            0.045284
orientation_W            -0.14123
angular_velocity_X      -0.058909
angular_velocity_Y       0.078028
angular_velocity_Z       -0.11527
linear_acceleration_X     -2.4032
linear_acceleration_Y      5.5525
linear_acceleration_Z     -6.6973
Name: 1116, dtype: object


In [31]:
a=1166
print(pmg_df['nearest_point'].iloc[a])
print(y_train['surface'].iloc[a])

0.0
fine_concrete


In [None]:
# TODO fix mislabeling due to group number increase