In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, log_loss
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import spearmanr
from scipy.stats import chi2_contingency
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
import catboost as cat_
import seaborn as sns
import lightgbm as lgb
from sklearn.impute import SimpleImputer as Imputer
from sklearn import preprocessing
import re
import timeit
import random
random.seed(3)

In [2]:
# Importing data sets
train_data = pd.read_csv("datasets/train_data.csv")
test_data = pd.read_csv("datasets/test_data.csv")

### Merging the train and test data for cleaning and analysis

In [3]:
ntrain = train_data.shape[0] 
ntest = test_data.shape[0]
dataset = pd.concat((train_data, test_data), sort=False).reset_index(drop=True)

In [4]:
dataset.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0.0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0.0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0.0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0.0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0.0


In [11]:
dataset.columns

Index(['Customer Id', 'YearOfObservation', 'Insured_Period', 'Residential',
       'Building_Painted', 'Building_Fenced', 'Garden', 'Settlement',
       'Building Dimension', 'Building_Type', 'Date_of_Occupancy',
       'NumberOfWindows', 'Geo_Code', 'Claim'],
      dtype='object')

In [19]:
dataset.Date_of_Occupancy.min()

1545.0

In [6]:
y_train = train_data['Claim']

In [7]:
y_train.unique()

array([0, 1], dtype=int64)

In [8]:
dataset.isnull().sum()

Customer Id              0
YearOfObservation        0
Insured_Period           0
Residential              0
Building_Painted         0
Building_Fenced          0
Garden                  11
Settlement               0
Building Dimension     119
Building_Type            0
Date_of_Occupancy     1236
NumberOfWindows          0
Geo_Code               115
Claim                 3069
dtype: int64

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10229 entries, 0 to 10228
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Customer Id         10229 non-null  object 
 1   YearOfObservation   10229 non-null  int64  
 2   Insured_Period      10229 non-null  float64
 3   Residential         10229 non-null  int64  
 4   Building_Painted    10229 non-null  object 
 5   Building_Fenced     10229 non-null  object 
 6   Garden              10218 non-null  object 
 7   Settlement          10229 non-null  object 
 8   Building Dimension  10110 non-null  float64
 9   Building_Type       10229 non-null  int64  
 10  Date_of_Occupancy   8993 non-null   float64
 11  NumberOfWindows     10229 non-null  object 
 12  Geo_Code            10114 non-null  object 
 13  Claim               7160 non-null   float64
dtypes: float64(4), int64(3), object(7)
memory usage: 1.1+ MB


In [10]:
dataset.NumberOfWindows.unique()

array(['   .', '4', '3', '2', '5', '>=10', '6', '7', '9', '8', '1'],
      dtype=object)

In [11]:
dataset = dataset.drop(['Customer Id','Geo_Code','Claim'],axis=1)

In [12]:
dataset.head()

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows
0,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.
1,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4
2,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.
3,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.
4,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3


In [13]:
dataset.Building_Painted.unique()

array(['N', 'V'], dtype=object)

In [14]:
dataset.Building_Fenced.unique()

array(['V', 'N'], dtype=object)

In [15]:
dataset.Garden.unique()

array(['V', 'O', nan], dtype=object)

In [16]:
dataset.Settlement.unique()

array(['U', 'R'], dtype=object)

In [17]:
dataset['Building_Painted'] = dataset['Building_Painted'].replace({'N':1, 'V':0})
dataset['Building_Fenced'] = dataset['Building_Fenced'].replace({'V':0, 'N':1})
dataset['Garden'] = dataset['Garden'].replace({'V':1, 'O':0})
dataset['Settlement'] = dataset['Settlement'].replace({'U':0, 'R':1})
dataset['NumberOfWindows'] =dataset['NumberOfWindows'].replace({'   .':0, '4':4, '3':3, '2':2, '5':5, '>=10':10, '6':6, '7':7, '9':9, '8':8, '1':1})

In [18]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10229 entries, 0 to 10228
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearOfObservation   10229 non-null  int64  
 1   Insured_Period      10229 non-null  float64
 2   Residential         10229 non-null  int64  
 3   Building_Painted    10229 non-null  int64  
 4   Building_Fenced     10229 non-null  int64  
 5   Garden              10218 non-null  float64
 6   Settlement          10229 non-null  int64  
 7   Building Dimension  10110 non-null  float64
 8   Building_Type       10229 non-null  int64  
 9   Date_of_Occupancy   8993 non-null   float64
 10  NumberOfWindows     10229 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 879.2 KB


### Missing Valuse

In [19]:
le = preprocessing.LabelEncoder()
for col in dataset.columns:
    dataset[col] = le.fit_transform(dataset[col])

In [20]:
def fill_nulls(value):
    cols_fill = ['Garden','Building Dimension', 'Date_of_Occupancy']
    
    if value == -9999:
        for col in cols_fill:
            dataset.loc[dataset[col].isnull(), col] = -9999
    else : 
        for col in cols_fill:
            dataset.loc[dataset[col].isnull(), col] = dataset[col].median()

In [21]:
fill_nulls(-9999)

In [22]:
dataset.isnull().sum()

YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
dtype: int64

In [23]:
columns = ['YearOfObservation', 'Insured_Period', 'Residential',
       'Building_Painted', 'Building_Fenced', 'Garden', 'Settlement',
       'Building Dimension', 'Building_Type', 'Date_of_Occupancy','NumberOfWindows']

### Scaling the dataset

In [24]:
# Use 3 features
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer
from sklearn.feature_selection import SelectKBest, f_classif
df = SelectKBest(f_classif, k=3)
df1 = StandardScaler()
data_scale = df1.fit_transform(dataset) 
data = data_scale

In [25]:
data_df = pd.DataFrame(data, columns=columns)
data = data_df
data.head()

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows
0,-0.468486,0.374577,-0.625255,1.494642,-0.875247,0.867799,-0.875595,-1.104061,-1.287566,-0.457683,-0.766203
1,0.980167,0.374577,-0.625255,-0.669056,1.142535,-1.142327,1.14208,-0.86757,-1.287566,-2.74286,0.865757
2,0.25584,0.374577,-0.625255,1.494642,-0.875247,0.867799,-0.875595,-0.75309,-1.287566,-0.457683,-0.766203
3,-0.468486,0.374577,-0.625255,-0.669056,-0.875247,0.867799,-0.875595,0.986702,-1.287566,-0.457683,-0.766203
4,0.25584,0.374577,-0.625255,-0.669056,1.142535,-1.142327,1.14208,-0.656686,-1.287566,-3.019851,0.457767


In [26]:
#Train and test datasets
train = data[:ntrain].copy()
test = data[ntrain:].copy()
test = test.reset_index(drop=True)

In [27]:
train.isnull().sum()

YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
dtype: int64

In [28]:
y_train.unique()

array([0, 1], dtype=int64)

In [29]:
# lightgbm for classifier
start = timeit.default_timer()
from lightgbm import LGBMClassifier
from matplotlib import pyplot

# evaluate the model
#model = LGBMClassifier()
# fit the model on the whole dataset
model_lgb = LGBMClassifier()
model_lgb.fit(train, y_train)
y_pred_lgb = model_lgb.predict(test)
y_pred_prob_lgb = model_lgb.predict_proba(test.values)[:,1]
stop = timeit.default_timer()
print(stop - start)

0.17964300000000044


In [30]:
# catboost for classification
start = timeit.default_timer()
from catboost import CatBoostClassifier
from matplotlib import pyplot
# evaluate the model
#cat_features_index = [0,1,2,3,4,5,6]
#,cat_features= cat_features_index
#model = CatBoostClassifier(verbose=0, n_estimators=100)
# fit the model on the whole dataset
model_cat = CatBoostClassifier(verbose=0, n_estimators=100)
model_cat.fit(train, y_train)
y_pred_cat = model_cat.predict(test)
y_pred_prob_cat = model_cat.predict_proba(test.values)[:,1]
stop = timeit.default_timer()
print(stop - start)

0.5884003


In [31]:
# xgboost for regression
start = timeit.default_timer()
from xgboost import XGBClassifier
# fit the model on the whole dataset
model_xgb = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='mlogloss')
model_xgb.fit(train, y_train)
y_pred_xgb = model_xgb.predict(test)
y_pred_prob_xgb = model_xgb.predict_proba(test.values)[:,1]
stop = timeit.default_timer()
print(stop - start)

0.5418469999999997


In [32]:
# make a prediction with a stacking mlxten
start = timeit.default_timer()

from sklearn.linear_model import LinearRegression
import mlxtend
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression

# define meta learner model
lr = LogisticRegression()
# define the stacking ensemble
model = StackingClassifier(classifiers=[model_lgb, model_cat, model_xgb], meta_classifier=lr)
# fit the model on all available data
model = model.fit(train, y_train)
#pred = list(pred.ravel())
#stack_result = list(stack_result.ravel())

stop = timeit.default_timer()
print(stop - start)

1.3872240000000007


In [33]:
train.shape

(7160, 11)

## Artificial Nueral Network ANN

In [34]:
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 5,  activation = 'relu', input_dim = 11))

# Adding the second hidden layer
classifier.add(Dense(units = 5, activation = 'relu'))

# Adding the output layer
classifier.add(Dense(units = 1, activation = 'sigmoid'))

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(train, y_train, batch_size = 11, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1f62f80cd60>

In [35]:
y_pred_ann = classifier.predict(test)

In [36]:
y_pred_ann

array([[0.0872795 ],
       [0.10897222],
       [0.05219954],
       ...,
       [0.45526335],
       [0.5323517 ],
       [0.5070087 ]], dtype=float32)

In [37]:
stack_result = model.predict(test)
stack_result_prob = model.predict_proba(test.values)[:,1]

In [38]:
stack_result_prob

array([0.13927139, 0.13927139, 0.13927139, ..., 0.13927139, 0.13927139,
       0.13927139])

In [39]:
# Using Joblib
import joblib

model_file = open("models/lgb_model_2.pkl","wb")
joblib.dump(model_lgb,model_file)
model_file.close()

model_file = open("models/cat_model_2.pkl","wb")
joblib.dump(model_cat,model_file)
model_file.close()

model_file = open("models/xgb_model_2.pkl","wb")
joblib.dump(model_xgb,model_file)
model_file.close()

model_file = open("models/stack_model_2.pkl","wb")
joblib.dump(model,model_file)
model_file.close()

In [40]:
test_data.columns

Index(['Customer Id', 'YearOfObservation', 'Insured_Period', 'Residential',
       'Building_Painted', 'Building_Fenced', 'Garden', 'Settlement',
       'Building Dimension', 'Building_Type', 'Date_of_Occupancy',
       'NumberOfWindows', 'Geo_Code'],
      dtype='object')

In [41]:
index = test_data['Customer Id']

In [42]:
#result = pd.concat((index, stack_result),sort=False).reset_index(drop=False)

In [43]:
Submission1 = pd.DataFrame(stack_result_prob, columns=['Claim']).to_csv('datasets/Submission_Claim.csv')

In [44]:
pd.DataFrame(y_pred_ann, columns=['Claim']).to_csv('datasets/Submission_Claim_ann.csv')