In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 

warnings.filterwarnings('ignore')

In [91]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [92]:
train_data_original = train_data.copy()
test_data_original = test_data.copy()

In [93]:
train_data.head()

Unnamed: 0,Customer_Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building_Dimension,Building_Type,Date_of_Occupancy,Claim
0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,0
1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,0
2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,0
3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,0
4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,0


In [94]:
train_data.shape

(7160, 12)

In [95]:
test_data.head()

Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code
0,H11920,2013,1.0,0,V,N,O,R,300.0,1,1960.0,3,3310
1,H11921,2016,0.997268,0,V,N,O,R,300.0,1,1960.0,3,3310
2,H9805,2013,0.369863,0,V,V,V,U,790.0,1,1960.0,.,3310
3,H7493,2014,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321
4,H7494,2016,1.0,0,V,N,O,R,1405.0,1,2004.0,3,3321


In [96]:
test_data.shape

(3069, 13)

In [97]:
train_data.dtypes


Customer_Id            object
YearOfObservation       int64
Insured_Period        float64
Residential             int64
Building_Painted       object
Building_Fenced        object
Garden                 object
Settlement             object
Building_Dimension    float64
Building_Type           int64
Date_of_Occupancy     float64
Claim                   int64
dtype: object

In [98]:
train_data['Garden'].fillna(train_data['Garden'].mode()[0], inplace=True) 
train_data['Building_Dimension'].fillna(train_data['Building_Dimension'].mode()[0], inplace=True) 
train_data['Date_of_Occupancy'].fillna(train_data['Date_of_Occupancy'].mode()[0], inplace=True) 
train_data['Geo_Code'].fillna(train_data['Geo_Code'].mode()[0], inplace=True) 

In [99]:
test_data['Garden'].fillna(test_data['Garden'].mode()[0], inplace=True) 
test_data['Building Dimension'].fillna(test_data['Building Dimension'].mode()[0], inplace=True) 
test_data['Date_of_Occupancy'].fillna(test_data['Date_of_Occupancy'].mode()[0], inplace=True) 

In [100]:
X = train_data.drop(['Customer_Id','Claim'],axis=1)
y = train_data['Claim']

from sklearn.model_selection import train_test_split



In [101]:
X = pd.get_dummies(X)
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [103]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5728, 14), (1432, 14), (5728,), (1432,))

In [104]:
from sklearn.utils import resample


In [105]:
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

In [106]:
# separate minority and majority classes
fraud = X[X.Claim==0]
not_fraud = X[X.Claim==1]

In [107]:
# upsample minority
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

In [108]:
# combine majority and upsampled minority
upsampled = pd.concat([not_fraud, fraud_upsampled])

In [109]:
# check new class counts
upsampled.Claim.value_counts()

1    1294
0    1294
Name: Claim, dtype: int64

In [110]:
import sklearn


In [111]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [112]:
y_train = upsampled.Claim
X_train = upsampled.drop('Claim', axis=1)

In [113]:
upsampled_logit_model = LogisticRegression(solver='liblinear').fit(X_train, y_train)

In [114]:
print('Accuracy on the training data: {:.4f}'.format(upsampled_logit_model.score(X_train, y_train)))
print('Accuracy on the training data: {:.4f}'.format(upsampled_logit_model.score(X_test, y_test)))

Accuracy on the training data: 0.6592
Accuracy on the training data: 0.7018


In [115]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [116]:
upsampled_RF = RandomForestClassifier().fit(X_train, y_train)
print('Accuracy on the training data: {:.4f}'.format(upsampled_RF.score(X_train, y_train)))
print('Accuracy on the training data: {:.4f}'.format(upsampled_RF.score(X_test, y_test)))

Accuracy on the training data: 0.9934
Accuracy on the training data: 0.5936


In [117]:
y_predicted = upsampled_RF.predict(X_test)
confusion_matrix(y_predicted,y_test)

array([[642, 132],
       [450, 208]], dtype=int64)

In [118]:
from sklearn.feature_selection import RFE

In [None]:
model = RandomForestClassifier
