In [1]:
import pandas as pd
import numpy as np
np.random.seed(2018)

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [35]:
def map_to_int(x):
    return int(x.split(' ')[1])

train = pd.read_csv('../input/train.csv.zip', compression='zip', index_col='id', converters={'location': map_to_int})
test = pd.read_csv('../input/test.csv.zip', compression='zip', index_col='id', converters={'location': map_to_int})

event_type = pd.read_csv('../input/event_type.csv.zip', compression='zip', index_col='id', converters={'event_type': map_to_int})
resource_type = pd.read_csv('../input/resource_type.csv.zip', compression='zip', index_col='id', converters={'resource_type': map_to_int})
severity_type = pd.read_csv('../input/severity_type.csv.zip', compression='zip', index_col='id', converters={'severity_type': map_to_int})
log_feature = pd.read_csv('../input/log_feature.csv.zip', compression='zip', index_col='id', converters={'log_feature': map_to_int})


train_test = pd.concat([train, test],ignore_index=True)

## XGBoost

In [36]:
labels = train['fault_severity'].values

xgtrain = xgb.DMatrix(train[ ['location'] ].values, labels)
xgtest = xgb.DMatrix(test.values)

# XGBoost params
xgboost_params = {
    'objective': 'multi:softprob',
    'booster': 'gbtree',
    'eval_metric': 'mlogloss',
    'max_depth': 5,
    'eta': 0.3,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'num_class': 3
}


num_rounds = 1000
cv = xgb.cv(xgboost_params, xgtrain, metrics='mlogloss', num_boost_round=num_rounds, nfold=5, stratified=True, seed=2018, early_stopping_rounds=25, verbose_eval=True, show_stdv=False)

best_nrounds = cv.shape[0] - 1

[0]	train-mlogloss:0.962883	test-mlogloss:0.966358
[1]	train-mlogloss:0.88163	test-mlogloss:0.887452
[2]	train-mlogloss:0.827919	test-mlogloss:0.836546
[3]	train-mlogloss:0.790972	test-mlogloss:0.80216
[4]	train-mlogloss:0.763975	test-mlogloss:0.776642
[5]	train-mlogloss:0.744879	test-mlogloss:0.759878
[6]	train-mlogloss:0.729028	test-mlogloss:0.745822
[7]	train-mlogloss:0.71712	test-mlogloss:0.73543
[8]	train-mlogloss:0.707613	test-mlogloss:0.728393
[9]	train-mlogloss:0.699187	test-mlogloss:0.722283
[10]	train-mlogloss:0.691682	test-mlogloss:0.716367
[11]	train-mlogloss:0.686542	test-mlogloss:0.712781
[12]	train-mlogloss:0.681315	test-mlogloss:0.709341
[13]	train-mlogloss:0.677186	test-mlogloss:0.706351
[14]	train-mlogloss:0.673443	test-mlogloss:0.704122
[15]	train-mlogloss:0.670181	test-mlogloss:0.702141
[16]	train-mlogloss:0.667122	test-mlogloss:0.700511
[17]	train-mlogloss:0.663991	test-mlogloss:0.698987
[18]	train-mlogloss:0.661342	test-mlogloss:0.698255
[19]	train-mlogloss:0.6588

## Feature Engineering

In [106]:
def feature_engineering(df):
    combined = pd.merge(severity_type, df.copy(), left_index=True, right_index=True)
    
    ## Severity_type
    combined['time'] = combined.groupby('location')['severity_type'].transform(lambda x: np.arange(x.shape[0])+1)
    combined['reverse_time'] = combined.groupby('location')['severity_type'].transform(lambda x: np.arange(x.shape[0])[::-1]+1)
    
    location_count = combined[ ['location', 'time'] ].groupby('location').agg(len).to_dict()['time']
    combined['location_count'] = combined['location'].map(lambda x: location_count[x])
    
    ### Dummies for severity_type
    dummy_severity_type = pd.get_dummies( severity_type, columns=['severity_type'] )
    combined = pd.merge(dummy_severity_type, combined, left_index=True, right_index=True)

    ## Event type
    agg_et = pd.merge(combined, event_type, left_index=True, right_index=True)
    
    agg_funcs = [np.median, np.mean, np.max, np.min, len, np.sum]
    
    for agg_func in agg_funcs:
        dict_et_by_location = agg_et.groupby('location')['event_type'].agg(agg_func).to_dict()
        combined['{0}_et_by_location'.format(agg_func.__name__)] = combined.location.map(lambda x: dict_et_by_location[x])
        
    ### Dummies for Event_type
    dummy_event_type = pd.get_dummies(event_type, columns=['event_type'])
    combined = pd.merge(dummy_event_type, combined, left_index=True, right_index=True)
        
    ## Log feature
    log_feature['volume_log'] = np.log2( log_feature['volume'] + 1 )
    
    agg_lf = pd.merge(combined, log_feature, left_index=True, right_index=True)

    agg_funcs = [np.median, np.mean, np.max, np.min, len, np.sum]
    
    for agg_func in agg_funcs:
        dict_volume_by_id = log_feature.groupby('id')['volume'].agg(agg_func)
        combined['{0}_lf_volume_by_id'.format(agg_func.__name__)] = combined.index.map(lambda x: dict_volume_by_id[x])
        
    for agg_func in agg_funcs:
        agg_dict_by_location = agg_lf.groupby('location')['log_feature'].agg(agg_func).to_dict()
        combined['{0}_lf_by_location'.format(agg_func.__name__)] = combined.location.map(lambda x: agg_dict_by_location[x])
        
    for agg_func in agg_funcs:
        agg_dict_by_location = agg_lf.groupby('location')['volume'].agg(agg_func).to_dict()
        combined['{0}_lf_volume_by_location'.format(agg_func.__name__)] = combined.location.map(lambda x: agg_dict_by_location[x])

    for agg_func in agg_funcs:
        agg_dict_by_location = agg_lf.groupby('location')['volume_log'].agg(agg_func).to_dict()
        combined['{0}_lf_volume_log_by_location'.format(agg_func.__name__)] = combined.location.map(lambda x: agg_dict_by_location[x])

    ##dummies log feature
#     log_feature_dummy = pd.get_dummies( log_feature, columns=['log_feature'])
#     log_feats = [c for c in log_feature_dummy.columns if 'log_feature_' in c]

#     log_feature_dummy[ log_feats ]
        
    #Resource type
    resource_type_len = resource_type.groupby('id').agg(len).to_dict()['resource_type']
    combined['resource_type_len'] = combined.index.map(lambda x: resource_type_len[x])
    

    agg_rt = pd.merge(combined, resource_type, left_index=True, right_index=True)

    agg_funcs = [np.median, np.mean, np.max, np.min, len, np.sum]
    
    for agg_func in agg_funcs:
        agg_dict_by_location = agg_rt.groupby('location')['resource_type'].agg(agg_func).to_dict()
        combined['{0}_rt_by_location'.format(agg_func.__name__)] = combined.location.map(lambda x: agg_dict_by_location[x])
       
    ### Dummies Resource_type
    dummy_resource_type = pd.get_dummies(resource_type, columns=['resource_type'])
    combined = pd.merge(dummy_resource_type, combined, left_index=True, right_index=True)

    
    return combined

combined_train = feature_engineering(train)

In [107]:
print(len(combined_train.columns.values))
print(combined_train.columns.values)

111
['resource_type_1' 'resource_type_2' 'resource_type_3' 'resource_type_4'
 'resource_type_5' 'resource_type_6' 'resource_type_7' 'resource_type_8'
 'resource_type_9' 'resource_type_10' 'event_type_1' 'event_type_2'
 'event_type_3' 'event_type_4' 'event_type_5' 'event_type_6' 'event_type_7'
 'event_type_8' 'event_type_9' 'event_type_10' 'event_type_11'
 'event_type_12' 'event_type_13' 'event_type_14' 'event_type_15'
 'event_type_17' 'event_type_18' 'event_type_19' 'event_type_20'
 'event_type_21' 'event_type_22' 'event_type_23' 'event_type_24'
 'event_type_25' 'event_type_26' 'event_type_27' 'event_type_28'
 'event_type_29' 'event_type_30' 'event_type_31' 'event_type_32'
 'event_type_33' 'event_type_34' 'event_type_35' 'event_type_36'
 'event_type_37' 'event_type_38' 'event_type_39' 'event_type_40'
 'event_type_41' 'event_type_42' 'event_type_43' 'event_type_44'
 'event_type_45' 'event_type_46' 'event_type_47' 'event_type_48'
 'event_type_49' 'event_type_50' 'event_type_51' 'event_ty

In [108]:
def get_feats():
    feats = ['location', 'severity_type', 'time', 'reverse_time', 'location_count']
    feats += ['severity_type_1', 'severity_type_2', 'severity_type_3', 'severity_type_4', 'severity_type_5' ]
    
    feats += ['event_type_1', 'event_type_2', 'event_type_3', 'event_type_4', 'event_type_5',
     'event_type_6', 'event_type_7', 'event_type_8', 'event_type_9',
     'event_type_10', 'event_type_11', 'event_type_12', 'event_type_13',
     'event_type_14', 'event_type_15', 'event_type_17', 'event_type_18',
     'event_type_19', 'event_type_20', 'event_type_21', 'event_type_22',
     'event_type_23', 'event_type_24', 'event_type_25', 'event_type_26',
     'event_type_27', 'event_type_28', 'event_type_29', 'event_type_30',
     'event_type_31', 'event_type_32', 'event_type_33', 'event_type_34',
     'event_type_35', 'event_type_36', 'event_type_37', 'event_type_38',
     'event_type_39', 'event_type_40', 'event_type_41', 'event_type_42',
     'event_type_43', 'event_type_44', 'event_type_45', 'event_type_46',
     'event_type_47', 'event_type_48', 'event_type_49', 'event_type_50',
     'event_type_51', 'event_type_52', 'event_type_53', 'event_type_54']
    
    feats += ['resource_type_1', 'resource_type_2', 'resource_type_3', 'resource_type_4',
     'resource_type_5', 'resource_type_6', 'resource_type_7', 'resource_type_8',
     'resource_type_9', 'resource_type_10']
    
    feats += ['median_et_by_location', 'mean_et_by_location', 'amax_et_by_location', 'amin_et_by_location', 'len_et_by_location', 'sum_et_by_location', ]


    feats += [ 'median_lf_by_location', 'mean_lf_by_location',
       'amax_lf_by_location', 'amin_lf_by_location', 'len_lf_by_location',
       'sum_lf_by_location', 'amax_lf_by_location', 'amin_lf_by_location' ]

    feats += ['median_lf_volume_by_location', 'mean_lf_volume_by_location',
           'amax_lf_volume_by_location', 'amin_lf_volume_by_location',
           'len_lf_volume_by_location', 'sum_lf_volume_by_location']
    
    feats += ['resource_type_len']
    feats += ['median_rt_by_location', 'mean_rt_by_location',
           'amax_rt_by_location', 'amin_rt_by_location', 'len_rt_by_location',
           'sum_rt_by_location']

    return feats

In [109]:
labels = combined_train['fault_severity'].values

xgtrain = xgb.DMatrix(combined_train[ get_feats() ].values, labels)

# XGBoost params
xgboost_params = {
    'objective': 'multi:softprob',
    'booster': 'gbtree',
    'eval_metric': 'mlogloss',
    'max_depth': 5,
    'eta': 0.3,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'num_class': 3
}


num_rounds = 1000
cv = xgb.cv(xgboost_params, xgtrain, metrics='mlogloss', num_boost_round=num_rounds, nfold=20, stratified=True, seed=2018, early_stopping_rounds=25, verbose_eval=20, show_stdv=False)

best_nrounds = cv.shape[0] - 1

[0]	train-mlogloss:0.942892	test-mlogloss:0.945727
[20]	train-mlogloss:0.481841	test-mlogloss:0.513889
[40]	train-mlogloss:0.411784	test-mlogloss:0.462284
[60]	train-mlogloss:0.364859	test-mlogloss:0.430579
[80]	train-mlogloss:0.327569	test-mlogloss:0.406969
[100]	train-mlogloss:0.296586	test-mlogloss:0.389011
[120]	train-mlogloss:0.270747	test-mlogloss:0.374188
[140]	train-mlogloss:0.247913	test-mlogloss:0.361728
[160]	train-mlogloss:0.228272	test-mlogloss:0.350516
[180]	train-mlogloss:0.211497	test-mlogloss:0.341805
[200]	train-mlogloss:0.195788	test-mlogloss:0.333234
[220]	train-mlogloss:0.182161	test-mlogloss:0.326305
[240]	train-mlogloss:0.170082	test-mlogloss:0.320976
[260]	train-mlogloss:0.158875	test-mlogloss:0.316283
[280]	train-mlogloss:0.148936	test-mlogloss:0.312363
[300]	train-mlogloss:0.139536	test-mlogloss:0.309069
[320]	train-mlogloss:0.131174	test-mlogloss:0.304584
[340]	train-mlogloss:0.123359	test-mlogloss:0.300863
[360]	train-mlogloss:0.116265	test-mlogloss:0.298893