#### Import Library

In [30]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score
import pickle

#### Load Data

In [31]:
data = pd.read_csv('https://raw.githubusercontent.com/faazanaima/ftde_prediction_insurance/main/Prediction%20Insurance.csv')

#### Exploratory Data Analysis

In [32]:
data.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28,0,> 2 Years,Yes,40454,26,217,1
1,2,Male,76,1,3,0,1-2 Year,No,33536,26,183,0
2,3,Male,47,1,28,0,> 2 Years,Yes,38294,26,27,1
3,4,Male,21,1,11,1,< 1 Year,No,28619,152,203,0
4,5,Female,29,1,41,1,< 1 Year,No,27496,152,39,0


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   id                    381109 non-null  int64 
 1   Gender                381109 non-null  object
 2   Age                   381109 non-null  int64 
 3   Driving_License       381109 non-null  int64 
 4   Region_Code           381109 non-null  int64 
 5   Previously_Insured    381109 non-null  int64 
 6   Vehicle_Age           381109 non-null  object
 7   Vehicle_Damage        381109 non-null  object
 8   Annual_Premium        381109 non-null  int64 
 9   Policy_Sales_Channel  381109 non-null  int64 
 10  Vintage               381109 non-null  int64 
 11  Response              381109 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 34.9+ MB


In [34]:
data.describe()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0,381109.0
mean,190555.0,38.822584,0.997869,26.388807,0.45821,30564.389581,112.034295,154.347397,0.122563
std,110016.836208,15.511611,0.04611,13.229888,0.498251,17213.155057,54.203995,83.671304,0.327936
min,1.0,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,95278.0,25.0,1.0,15.0,0.0,24405.0,29.0,82.0,0.0
50%,190555.0,36.0,1.0,28.0,0.0,31669.0,133.0,154.0,0.0
75%,285832.0,49.0,1.0,35.0,1.0,39400.0,152.0,227.0,0.0
max,381109.0,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [49]:
data.columns

Index(['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
       'Vehicle_Age', 'Vehicle_Damage', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Response'],
      dtype='object')

In [35]:
exclude_columns = ['id', 'Age', 'Annual_Premium', 'Region_Code', 'Vintage', 'Policy_Sales_Channel']
unique__columns = {col: data[col].value_counts() for col in data.columns if col not in exclude_columns}
unique__columns

{'Gender': Gender
 Male      206089
 Female    175020
 Name: count, dtype: int64,
 'Driving_License': Driving_License
 1    380297
 0       812
 Name: count, dtype: int64,
 'Previously_Insured': Previously_Insured
 0    206481
 1    174628
 Name: count, dtype: int64,
 'Vehicle_Age': Vehicle_Age
 1-2 Year     200316
 < 1 Year     164786
 > 2 Years     16007
 Name: count, dtype: int64,
 'Vehicle_Damage': Vehicle_Damage
 Yes    192413
 No     188696
 Name: count, dtype: int64,
 'Response': Response
 0    334399
 1     46710
 Name: count, dtype: int64}

#### Data Preparation

In [36]:
data.duplicated().sum()

0

In [38]:
data = data.drop('id', axis=1)
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
data['Vehicle_Age'] = data['Vehicle_Age'].map({'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2})
data['Vehicle_Damage'] = data['Vehicle_Damage'].map({'No': 0, 'Yes': 1})

#### Data Modelling

In [39]:
X = data.drop(columns=['Response'])
y = data['Response']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Model Evaluation

##### Random Forest Classifier

In [41]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [42]:
y_pred = model.predict(X_test)

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.97      0.93     66699
           1       0.37      0.12      0.18      9523

    accuracy                           0.87     76222
   macro avg       0.63      0.54      0.55     76222
weighted avg       0.82      0.87      0.83     76222



##### Voting Classifier: XGB, RF, GB

In [66]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [67]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [68]:
xgb_model = XGBClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

In [69]:
voting_clf = VotingClassifier(estimators=[('xgb', xgb_model), ('rf', rf_model), ('gb', gb_model)], voting='soft')

In [70]:
voting_clf.fit(X_train_resampled, y_train_resampled)

In [71]:
y_pred = voting_clf.predict(X_test)

In [72]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.79      0.86     66699
           1       0.33      0.73      0.45      9523

    accuracy                           0.78     76222
   macro avg       0.64      0.76      0.66     76222
weighted avg       0.87      0.78      0.81     76222



VotingClassifier dengan penggunaan SMOTE untuk menangani ketidakseimbangan kelas cenderung meningkatkan recall untuk kelas minoritas, tetapi dengan trade-off sedikit precision dan akurasi keseluruhan dibandingkan dengan RandomForestClassifier yang tidak diimbangi.

#### Save Model

In [None]:
with open('prediction_insurance_model.pkl', 'wb') as file:
    pickle.dump(model, file)