In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/allstate-claims-severity/train.csv
/kaggle/input/allstate-claims-severity/train.csv.zip
/kaggle/input/allstate-claims-severity/test.csv.zip
/kaggle/input/allstate-claims-severity/sample_submission.csv.zip
/kaggle/input/allstate-claims-severity/test.csv
/kaggle/input/allstate-claims-severity/sample_submission.csv


In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder


data_dir = '/kaggle/input/allstate-claims-severity/'

train_data_path = data_dir+'train.csv'
test_data_path = data_dir+'test.csv'
submission_csv_path = data_dir+'sample_submission.csv'

In [3]:

# Label Encode all categorical features
def get_labelEncoded_dataframes(data_dir):
    '''
    creates a label encoded dataframe out of the categorical features using sklearns's LabelEncoder
    saves new dataframe in object_dir
    skips creating new dataframe if already exists
    '''
    print('Label Encoding categorical features . . .')
    train_data = pd.read_csv(data_dir+'train.csv')
    test_data = pd.read_csv(data_dir+'test.csv')
    cat_cols = [x for x in train_data.columns if x.startswith('cat')]

    for col in cat_cols:
        le = LabelEncoder()
        train_data[col] = le.fit_transform(train_data[col])
        # update::
        # Test data had some values in some cateogorical features that were unseen in train data
        # the next 2 lines fix that :|
        test_data[col] = test_data[col].map(lambda s: 'UNK' if s not in le.classes_ else s)
        le.classes_ = np.append(le.classes_, 'UNK')
        test_data[col] = le.transform(test_data[col])
    return train_data, test_data


train_data, test_data = get_labelEncoded_dataframes(data_dir)
submission = pd.read_csv(submission_csv_path)

Label Encoding categorical features . . .


In [4]:
X = train_data.iloc[:,1:-1]
Y = train_data.iloc[:,-1]


# get categorical and continuous features names
cat_cols = [x for x in train_data.columns if x.startswith('cat')]
cont_cols = [x for x in train_data.columns if x.startswith('cont')]

In [5]:
    
from sklearn.feature_selection     import    f_regression, mutual_info_regression

# f_regression
##############
f_reg_res = {}
fval, pval = f_regression(X, Y)
for i,c in enumerate(X.columns):
  f_reg_res[c] = fval[i]

# sort the features according to f_regression scores
sorted_res = [[k,v] for k, v in sorted(f_reg_res.items(), key=lambda item: item[1])]
sorted(sorted_res, key = lambda x: x[1])

# remove features that scored too low
high_score_features_F = [x[0] for x in list(filter(lambda x: x[1]>100, sorted_res))]
print("features with f_regression score > 100")
print(high_score_features_F)



# mutual_information
####################
# sampling a subset of data, as mutual_info calculation is intensive
sample = train_data.sample(10000)
x = sample.iloc[:,:-1]
y = sample.iloc[:,-1]

mutinf_res = {}
mi = mutual_info_regression(x, y)
for i,c in enumerate(X.columns):
  mutinf_res[c] = mi[i]

# sort the features according to mutual_information scores
sorted_res = [[k,v] for k, v in sorted(mutinf_res.items(), key=lambda item: item[1])]
sorted(sorted_res, key = lambda x: x[1])

# remove features that scored too low
high_score_features_MI = [x[0] for x in list(filter(lambda x: x[1]>0.001, sorted_res))]
print("features with mutual_information score > 100")
print(high_score_features_MI)

# get intersection of features which score high on both of these tests
# i.e. we are discarding features that did not do well in both the tests
common_features_union = list(set(high_score_features_F).union(set(high_score_features_MI)))
print("# feautres selected: ", common_features_union.__len__())

features with f_regression score > 100
['cat32', 'cat49', 'cat114', 'cat112', 'cat61', 'cont8', 'cat20', 'cat34', 'cat52', 'cat104', 'cat83', 'cat116', 'cat99', 'cat51', 'cat19', 'cat47', 'cont4', 'cat58', 'cat67', 'cont6', 'cat18', 'cat84', 'cat59', 'cat33', 'cat95', 'cat46', 'cat43', 'cat44', 'cat30', 'cat53', 'cat26', 'cat78', 'cat66', 'cat100', 'cat65', 'cat71', 'cat106', 'cat45', 'cat75', 'cat17', 'cat85', 'cat29', 'cat102', 'cat8', 'cat41', 'cat76', 'cat25', 'cat24', 'cat94', 'cat38', 'cont12', 'cont11', 'cat14', 'cat82', 'cat4', 'cat5', 'cat50', 'cont3', 'cat105', 'cat6', 'cont7', 'cat28', 'cat40', 'cont2', 'cat111', 'cat103', 'cat73', 'cat36', 'cat23', 'cat90', 'cat16', 'cat3', 'cat9', 'cat13', 'cat1', 'cat11', 'cat72', 'cat2', 'cat81', 'cat89', 'cat7', 'cat10', 'cat12', 'cat57', 'cat87', 'cat101', 'cat79', 'cat80']
features with mutual_information score > 100
['cat32', 'cat93', 'cat68', 'cat9', 'cat53', 'cat108', 'cat52', 'cont8', 'cat85', 'cat66', 'cat48', 'cat54', 'cat110', 

In [6]:
print("Features that would be used: ", common_features_union)
print("# features: ", common_features_union.__len__())


Features that would be used:  ['cat105', 'cat19', 'cat28', 'cat99', 'cat4', 'cat113', 'cont7', 'cont12', 'cont3', 'cat78', 'cat39', 'cat58', 'cont6', 'cont4', 'cat7', 'cat81', 'cat9', 'cat110', 'cat109', 'cont13', 'cat116', 'cat20', 'cat52', 'cat67', 'cat2', 'cat115', 'cat10', 'cat51', 'cat86', 'cat112', 'cont11', 'cat71', 'cat11', 'cat12', 'cat13', 'cont2', 'cat17', 'cat40', 'cat98', 'cat77', 'cat29', 'cat61', 'cat90', 'cat114', 'cat33', 'cat26', 'cat3', 'cat6', 'cat111', 'cat83', 'cat37', 'cat75', 'cat106', 'cat91', 'cat66', 'cat30', 'cat41', 'cat25', 'cat24', 'cat47', 'cont14', 'cat16', 'cat15', 'cat23', 'cat36', 'cat95', 'cat76', 'cont5', 'cat100', 'cat49', 'cat57', 'cat87', 'cat94', 'cat54', 'cat45', 'cat44', 'cat88', 'cat43', 'cat59', 'cat50', 'cat92', 'cat65', 'cont8', 'cont9', 'cont10', 'cat42', 'cat38', 'cat101', 'cat93', 'cat74', 'cat80', 'cat14', 'cat82', 'cat32', 'cat84', 'cat8', 'cat79', 'cat18', 'cat72', 'cat102', 'cat68', 'cat34', 'cat5', 'cat53', 'cat73', 'cat89', 'cat1

In [7]:

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LeakyReLU
from keras.preprocessing import text
from keras import utils

# set hyperparameters for MLP
class NN:
    def __init__(self):
        self.in_shape = common_features_union.__len__()
        self.num_layers = 3
        self.nodes = [2048,1024, 1]
        self.activations = ['relu', 'relu', 'relu']
        self.dropouts = [0.2,0.15,0]
        self.loss = 'mean_squared_logarithmic_error'
        self.optimizer = keras.optimizers.RMSprop(5e-4)



def sequential_MLP(nn):
    model = Sequential()
    for i in range(nn.num_layers):
        if i==0: # add input shape if first layer
            model.add(Dense(nn.nodes[i], activation=nn.activations[i], input_shape=(nn.in_shape,) ))
        else:
            model.add(Dense(nn.nodes[i], activation=nn.activations[i]))
        if(nn.dropouts[i] != 0): # skip adding dropout if dropout == 0
            model.add(Dropout(rate=nn.dropouts[i]))            
    model.compile(optimizer=nn.optimizer, loss=nn.loss, metrics=['mae'])

    return model

Using TensorFlow backend.


In [8]:


nn = NN()
model = sequential_MLP(nn)


for i in range(45):
  if i%5 == 0: verbose=True
  else: verbose = False
  model.fit(X[common_features_union], Y, epochs=1, batch_size=512, validation_split=0.25, verbose=verbose)

Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1
Train on 141238 samples, validate on 47080 samples
Epoch 1/1


In [9]:
test_predictions = model.predict(test_data[common_features_union])


submission = pd.read_csv('/kaggle/input/allstate-claims-severity/sample_submission.csv')
submission['loss'] = test_predictions
submission.to_csv('submission.csv', index=False)