In [2]:
#Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

#Seaborn visualization set up
%matplotlib inline
sns.set_style('darkgrid')

In [3]:
#Reading the dataset

train = pd.read_csv('train_data.csv')
train.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0


In [4]:
train.shape

(7160, 14)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 14 columns):
Customer Id           7160 non-null object
YearOfObservation     7160 non-null int64
Insured_Period        7160 non-null float64
Residential           7160 non-null int64
Building_Painted      7160 non-null object
Building_Fenced       7160 non-null object
Garden                7153 non-null object
Settlement            7160 non-null object
Building Dimension    7054 non-null float64
Building_Type         7160 non-null int64
Date_of_Occupancy     6652 non-null float64
NumberOfWindows       7160 non-null object
Geo_Code              7058 non-null object
Claim                 7160 non-null int64
dtypes: float64(3), int64(4), object(7)
memory usage: 783.2+ KB


In [6]:
#Reading test data

test = pd.read_csv('test_data_original.csv')
test.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310
3,H7493,2014,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321
4,H7494,2016,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321


In [7]:
test.shape

(3069, 13)

In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3069 entries, 0 to 3068
Data columns (total 13 columns):
Customer Id           3069 non-null object
YearOfObservation     3069 non-null int64
Insured_Period        3069 non-null float64
Residential           3069 non-null int64
Building_Painted      3069 non-null object
Building_Fenced       3069 non-null object
Garden                3065 non-null object
Settlement            3069 non-null object
Building Dimension    3056 non-null float64
Building_Type         3069 non-null int64
Date_of_Occupancy     2341 non-null float64
NumberOfWindows       3069 non-null object
Geo_Code              3056 non-null object
dtypes: float64(3), int64(3), object(7)
memory usage: 311.8+ KB


# Feature Engineering

In [9]:
#Checking for null values

train.isnull().sum()      #for train data

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  7
Settlement              0
Building Dimension    106
Building_Type           0
Date_of_Occupancy     508
NumberOfWindows         0
Geo_Code              102
Claim                   0
dtype: int64

In [10]:
test.isnull().sum()        #for test data

Customer Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  4
Settlement              0
Building Dimension     13
Building_Type           0
Date_of_Occupancy     728
NumberOfWindows         0
Geo_Code               13
dtype: int64

In [11]:
#Handling missing values for train data

#Impute values for Garden using mode
train['Garden'] = train['Garden'].fillna(train['Garden'].mode()[0])

#Impute values for Building Dimension using mean
train['Building Dimension'] = train['Building Dimension'].fillna(train['Building Dimension'].mean())

#Impute values for Date_of_Occupancy using mean
train['Date_of_Occupancy'] = train['Date_of_Occupancy'].fillna(train['Date_of_Occupancy'].mean())

#Impute values for Geo_Code using mode
train['Geo_Code'] = train['Geo_Code'].fillna(train['Geo_Code'].mode()[0])

In [12]:
#Confirming that the missing values have been treated for train data
train.isnull().sum()

Customer Id           0
YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
Geo_Code              0
Claim                 0
dtype: int64

In [13]:
#Handling Missing values for test data

#Impute values for Garden using mode
test['Garden'] = test['Garden'].fillna(test['Garden'].mode()[0])

#Impute values for Building Dimension using mean
test['Building Dimension'] = test['Building Dimension'].fillna(test['Building Dimension'].mean())

#Impute values for Date_of_Occupancy using mean
test['Date_of_Occupancy'] = test['Date_of_Occupancy'].fillna(test['Date_of_Occupancy'].mean())

#Impute values for Geo_Code using mode
test['Geo_Code'] = test['Geo_Code'].fillna(test['Geo_Code'].mode()[0])

In [14]:
#Confirming that the missing vales have been treated for test data
test.isnull().sum()

Customer Id           0
YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
Geo_Code              0
dtype: int64

In [15]:
#Drop Customer Id column for train data
train.drop(['Customer Id'], axis=1, inplace=True)

In [16]:
#Drop Customer Id column for test data
test.drop(['Customer Id'], axis=1, inplace=True)

In [17]:
train.shape

(7160, 13)

In [18]:
test.shape

(3069, 12)

In [19]:
#Saving clean test data
test.to_csv('clean_test_data.csv', index=False)

In [85]:
#Concatenating the two data sets (train data and test data)
final_df = pd.concat([train,test], axis=0)

In [86]:
final_df.shape

(10229, 13)

In [87]:
#Creating dummies for categorical variables(columns)
final_df = pd.get_dummies(final_df, columns=['Building_Painted','Building_Fenced','Garden','Settlement','NumberOfWindows','Geo_Code'])

In [88]:
final_df.shape

(10229, 1551)

In [89]:
#Removing duplicate columns in df
final_df=final_df.loc[:,~final_df.columns.duplicated()]

In [90]:
final_df.shape

(10229, 1551)

In [91]:
final_df.head()

Unnamed: 0,Building Dimension,Building_Type,Claim,Date_of_Occupancy,Insured_Period,Residential,YearOfObservation,Building_Painted_N,Building_Painted_V,Building_Fenced_N,...,Geo_Code_95428,Geo_Code_95488,Geo_Code_95500,Geo_Code_95539,Geo_Code_95555,Geo_Code_95563,Geo_Code_95582,Geo_Code_95585,Geo_Code_95598,Geo_Code_95607
0,290.0,1,0.0,1960.0,1.0,0,2013,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,490.0,1,0.0,1850.0,1.0,0,2015,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,595.0,1,0.0,1960.0,1.0,0,2014,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2840.0,1,0.0,1960.0,1.0,0,2013,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,680.0,1,0.0,1800.0,1.0,0,2014,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [92]:
#Splitiing train data and test data from final_df
train_data = final_df.iloc[:7160,:]
test_data = final_df.iloc[7160:,:]

In [93]:
X_train = train_data.drop(['Claim'], axis=1)
y_train = train_data['Claim']

In [94]:
test_data.drop(['Claim'], axis=1, inplace=True)

# Hyperparameter Optimization for XGBoost

In [95]:
#Creating parameters
params = {
    'learning_rate': [0.05,0.10,0.15,0.20,0.25,0.30],
    'max_depth': [3,4,5,6,8,10,12,15],
    'min_child_weight': [1,3,5,7],
    'gamma': [0.0,0.1,0.2,0.3,0.4],
    'colsample_bytree': [0.3,0.4,0.5,0.7] 
}

In [96]:
def timer (start_time=None):
    if not start_time:
        start_time=datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now()-start_time).total_seconds(),3600)
        tmin,tsec=divmod(temp_sec,60)
        print('\n Time Taken: %i hours %i minutes and %s seconds.' %(thour, tmin, round(tsec,2)))

In [97]:
#Building model using XGBoost
import xgboost
classifier = xgboost.XGBClassifier()    

In [98]:
#Using RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(classifier, param_distributions=params, 
                                   n_iter = 5, scoring='roc_auc', n_jobs=-1, 
                                   cv=5, verbose=3)

In [99]:
from datetime import datetime

start_time=timer(None)
random_search.fit(X_train,y_train)
timer(start_time)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  7.6min finished



 Time Taken: 0 hours 8 minutes and 8.76 seconds.


In [100]:
#Finding the best estimator
random_search.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [101]:
#Finding the best parameters for the classification
random_search.best_params_

{'min_child_weight': 5,
 'max_depth': 3,
 'learning_rate': 0.1,
 'gamma': 0.0,
 'colsample_bytree': 0.7}

In [102]:
#Fitting the classifier
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [103]:
#Creating an instance for the classifier using the best parameters
classifier = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
       learning_rate=0.15, max_delta_step=0, max_depth=3,
       min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1) 

In [104]:
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0.1,
       learning_rate=0.15, max_delta_step=0, max_depth=3,
       min_child_weight=7, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=None, subsample=1, verbosity=1)

In [105]:
predictions = classifier.predict(test_data)

In [106]:
predictions

array([0., 0., 0., ..., 0., 0., 0.])

In [107]:
pred=pd.DataFrame(predictions)
sub_df=pd.read_csv('test_data_original.csv')
datasets=pd.concat([sub_df['Customer Id'],pred],axis=1)
datasets.columns=['Customer Id','Claim']
datasets.to_csv('submission_10.csv', index=False)