In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 

warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [3]:
train_data_original = train_data.copy()
test_data_original = test_data.copy()

### Data Exploration

In [4]:
train_data.head()

Unnamed: 0,Customer_Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building_Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053.0,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053.0,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053.0,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053.0,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053.0,0


In [5]:
train_data.shape

(7160, 14)

In [6]:
test_data.head()

Unnamed: 0,Customer_Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building_Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310.0
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310.0
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310.0
3,H7493,2014,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321.0
4,H7494,2016,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321.0


In [7]:
test_data.shape

(3069, 13)

In [8]:
train_data.describe()

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building_Dimension,Building_Type,Date_of_Occupancy,Geo_Code,Claim
count,7160.0,7160.0,7160.0,7054.0,7160.0,6652.0,7058.0,7160.0
mean,2013.669553,0.909758,0.305447,1883.72753,2.186034,1964.456404,50394.259139,0.228212
std,1.383769,0.239756,0.460629,2278.157745,0.940632,36.002014,30206.394653,0.419709
min,2012.0,0.0,0.0,1.0,1.0,1545.0,1053.0,0.0
25%,2012.0,0.997268,0.0,528.0,2.0,1960.0,19031.0,0.0
50%,2013.0,1.0,0.0,1083.0,2.0,1970.0,57215.0,0.0
75%,2015.0,1.0,1.0,2289.75,3.0,1980.0,75118.0,0.0
max,2016.0,1.0,1.0,20940.0,4.0,2016.0,95607.0,1.0


In [9]:
train_data.isnull().sum()

Customer_Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  7
Settlement              0
Building_Dimension    106
Building_Type           0
Date_of_Occupancy     508
NumberOfWindows         0
Geo_Code              102
Claim                   0
dtype: int64

In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7160 entries, 0 to 7159
Data columns (total 14 columns):
Customer_Id           7160 non-null object
YearOfObservation     7160 non-null int64
Insured_Period        7160 non-null float64
Residential           7160 non-null int64
Building_Painted      7160 non-null object
Building_Fenced       7160 non-null object
Garden                7153 non-null object
Settlement            7160 non-null object
Building_Dimension    7054 non-null float64
Building_Type         7160 non-null int64
Date_of_Occupancy     6652 non-null float64
NumberOfWindows       7160 non-null object
Geo_Code              7058 non-null float64
Claim                 7160 non-null int64
dtypes: float64(4), int64(4), object(6)
memory usage: 783.2+ KB


In [11]:
train_data.dtypes

Customer_Id            object
YearOfObservation       int64
Insured_Period        float64
Residential             int64
Building_Painted       object
Building_Fenced        object
Garden                 object
Settlement             object
Building_Dimension    float64
Building_Type           int64
Date_of_Occupancy     float64
NumberOfWindows        object
Geo_Code              float64
Claim                   int64
dtype: object

In [12]:
# converting datatypes 
train_data = train_data.infer_objects() 
print(train_data.dtypes) 

Customer_Id            object
YearOfObservation       int64
Insured_Period        float64
Residential             int64
Building_Painted       object
Building_Fenced        object
Garden                 object
Settlement             object
Building_Dimension    float64
Building_Type           int64
Date_of_Occupancy     float64
NumberOfWindows        object
Geo_Code              float64
Claim                   int64
dtype: object


In [13]:
train_data.Claim.value_counts(normalize=True)

0    0.771788
1    0.228212
Name: Claim, dtype: float64

In [14]:
train_data['Garden'].fillna(train_data['Garden'].mode()[0], inplace=True) 
train_data['Building_Dimension'].fillna(train_data['Building_Dimension'].mode()[0], inplace=True) 
train_data['Date_of_Occupancy'].fillna(train_data['Date_of_Occupancy'].mode()[0], inplace=True) 
train_data['Geo_Code'].fillna(train_data['Geo_Code'].mode()[0], inplace=True) 


In [15]:
train_data.isnull().sum()

Customer_Id           0
YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building_Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
Geo_Code              0
Claim                 0
dtype: int64

In [16]:
test_data.isnull().sum()

Customer_Id             0
YearOfObservation       0
Insured_Period          0
Residential             0
Building_Painted        0
Building_Fenced         0
Garden                  4
Settlement              0
Building_Dimension     13
Building_Type           0
Date_of_Occupancy     728
NumberOfWindows         0
Geo_Code               13
dtype: int64

In [17]:
test_data['Garden'].fillna(test_data['Garden'].mode()[0], inplace=True) 
test_data['Building_Dimension'].fillna(test_data['Building_Dimension'].mode()[0], inplace=True) 
test_data['Date_of_Occupancy'].fillna(test_data['Date_of_Occupancy'].mode()[0], inplace=True) 
test_data['Geo_Code'].fillna(test_data['Geo_Code'].mode()[0], inplace=True) 


In [18]:
test_data.isnull().sum()

Customer_Id           0
YearOfObservation     0
Insured_Period        0
Residential           0
Building_Painted      0
Building_Fenced       0
Garden                0
Settlement            0
Building_Dimension    0
Building_Type         0
Date_of_Occupancy     0
NumberOfWindows       0
Geo_Code              0
dtype: int64

In [19]:

X = train_data.drop(['Customer_Id','Claim','NumberOfWindows'],axis=1)
y = train_data['Claim']
test_data = test_data.drop(['Customer_Id','NumberOfWindows'],axis=1)

from sklearn.model_selection import train_test_split



In [20]:
X = pd.get_dummies(X)
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

In [21]:
train_data.shape, test_data.shape

((7160, 7187), (3069, 15))

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [23]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5728, 15), (1432, 15), (5728,), (1432,))

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [25]:
logit_model=LogisticRegression().fit(X_train, y_train)

In [26]:
print('Accuracy of training: {:.4f}'.format(logit_model.score(X_train,y_train)))
print('Accuracy of testing: {:.4f}'.format(logit_model.score(X_test,y_test)))

Accuracy of training: 0.7854
Accuracy of testing: 0.7758


In [27]:
prediction = logit_model.predict(X_test)

confusion_matrix(prediction, y_test)

array([[1073,  302],
       [  19,   38]], dtype=int64)

In [28]:
RF_model = RandomForestClassifier().fit(X_train, y_train)

In [29]:
print('Accuracy of training: {:.4f}'.format(RF_model.score(X_train,y_train)))
print('Accuracy of testing: {:.4f}'.format(RF_model.score(X_test,y_test)))

Accuracy of training: 0.9988
Accuracy of testing: 0.7654


In [30]:
prediction1 = RF_model.predict(X_test)

confusion_matrix(prediction1, y_test)

array([[1027,  271],
       [  65,   69]], dtype=int64)

In [31]:
import xgboost as xgb
from xgboost import XGBClassifier

In [32]:
xgb_model = XGBClassifier().fit(X_train, y_train)
print('Accuracy of training: {:.4f}'.format(xgb_model.score(X_train,y_train)))
print('Accuracy of testing: {:.4f}'.format(xgb_model.score(X_test,y_test)))

Accuracy of training: 0.8034
Accuracy of testing: 0.7800


In [33]:
xgb_pred = xgb_model.predict(X_test)

In [34]:
confusion_matrix(xgb_pred, y_test)

array([[1063,  286],
       [  29,   54]], dtype=int64)

In [35]:
Claim = xgb_model.predict(test_data)

In [36]:
len(Claim)

3069

In [37]:
test_data['Claim']=Claim

In [38]:
test_data_original['Claim']=Claim

In [39]:
test_data_original.head(10)

Unnamed: 0,Customer_Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building_Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310.0,0
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310.0,0
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310.0,0
3,H7493,2014,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321.0,0
4,H7494,2016,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321.0,0
5,H10545,2012,1.0,0,V,V,V,U,3225.0,2,1988.0,.,4070.0,0
6,H8962,2015,0.986301,0,N,V,V,U,31.0,2,,.,4070.0,0
7,H1015,2013,1.0,0,V,V,V,U,1400.0,2,1980.0,.,4088.0,0
8,H9710,2012,1.0,0,V,V,V,U,1300.0,2,,.,4094.0,0
9,H9029,2012,1.0,0,N,V,V,U,1200.0,2,,.,4205.0,0


In [40]:
test_data_original.to_csv('Predicted Results.csv', index=False)