



[notice] A new release of pip is available: 24.1.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
import joblib


data = pd.read_csv('Insurance claims data.csv')


categorical_cols = ['region_code', 'segment', 'model', 'fuel_type', 'engine_type', 
                    'rear_brakes_type', 'transmission_type', 'steering_type']
boolean_cols = [col for col in ['is_esc', 'is_adjustable_steering', 'is_tpms', 'is_parking_sensors',
                                'is_parking_camera', 'is_front_fog_lights', 'is_rear_window_wiper',
                                'is_rear_window_washer', 'is_rear_window_defogger', 'is_brake_assist',
                                'is_power_door_locks', 'is_central_locking', 'is_power_steering',
                                'is_driver_seat_height_adjustable', 'is_day_night_rear_view_mirror',
                                'is_ecw', 'is_speed_alert']]
numerical_cols = ['subscription_length', 'vehicle_age', 'customer_age', 'region_density',
                  'displacement', 'cylinder', 'turning_radius', 'length', 'width',
                  'gross_weight', 'ncap_rating']


print(data['claim_status'].value_counts())


label_encoders = {}
for col in categorical_cols + boolean_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

X = data[categorical_cols + boolean_cols + numerical_cols]
y = data['claim_status']  


smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoders, 'label_encoders.pkl')


y_pred = model.predict(X_test)
accuracy = model.score(X_test, y_test)
print(f'Model Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


claim_status
0    54844
1     3748
Name: count, dtype: int64
Model Accuracy: 0.93
              precision    recall  f1-score   support

           0       0.93      0.93      0.93     11072
           1       0.93      0.92      0.93     10866

    accuracy                           0.93     21938
   macro avg       0.93      0.93      0.93     21938
weighted avg       0.93      0.93      0.93     21938

[[10348   724]
 [  828 10038]]
