<a href="https://colab.research.google.com/github/fjadidi2001/DataScienceJourney/blob/master/Insurance_MYV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.utils.class_weight import compute_class_weight


In [2]:
# Step 1: Load the dataset
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Specify file path
file_path = '/content/drive/My Drive/telematics_syn.csv'

# Import pandas (assuming you want to use it to read the CSV)
import pandas as pd

# Read the CSV file
data = pd.read_csv(file_path)

# Step 2: Explore the data
print(data.head())
print(data.info())

# Check for missing values
print(data.isnull().sum())

# Display basic statistics
print(data.describe())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   Duration  Insured.age Insured.sex  Car.age  Marital  Car.use  Credit.score  \
0       366           45        Male       -1  Married  Commute         609.0   
1       182           44      Female        3  Married  Commute         575.0   
2       184           48      Female        6  Married  Commute         847.0   
3       183           71        Male        6  Married  Private         842.0   
4       183           84        Male       10  Married  Private         856.0   

  Region  Annual.miles.drive  Years.noclaims  ...  Left.turn.intensity10  \
0  Urban             6213.71              25  ...                    1.0   
1  Urban            12427.42              20  ...                   58.0   
2  Urban            12427.42              14  ...                    0.0   
3  Urban             6213.71              43  ...                    0.0   
4  U

In [3]:
# Separate features and target
X = data.drop(['NB_Claim', 'AMT_Claim'], axis=1)
y = (data['NB_Claim'] > 0).astype(int)  # Convert to binary (0 for no claim, 1 for claim)

In [4]:
# Check class balance
class_counts = y.value_counts()
print("Class distribution:")
print(class_counts)
print(f"Class balance ratio: {class_counts[0] / class_counts[1]:.2f}")

Class distribution:
NB_Claim
0    95728
1     4272
Name: count, dtype: int64
Class balance ratio: 22.41


In [5]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Define numeric and categorical columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = ['Insured.sex', 'Marital', 'Car.use', 'Region']

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [10]:
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))

In [11]:
# Create Random Forest classifier with balanced class weights
rf_classifier = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=-1)


In [12]:
# Create the full pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

In [13]:
# Fit the pipeline
pipeline.fit(X_train, y_train)

In [14]:
# Make predictions
y_pred = pipeline.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98     19130
           1       0.97      0.26      0.41       870

    accuracy                           0.97     20000
   macro avg       0.97      0.63      0.70     20000
weighted avg       0.97      0.97      0.96     20000



In [15]:
# Print the confusion matrix
print(confusion_matrix(y_test, y_pred))


[[19122     8]
 [  641   229]]


In [16]:

# Perform cross-validation
cv_scores = cross_val_score(pipeline, X, y, cv=5)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")


Cross-validation scores: [0.968   0.9677  0.96725 0.96695 0.9692 ]
Mean CV score: 0.968 (+/- 0.002)


In [18]:
# Feature importance
feature_importance = pipeline.named_steps['classifier'].feature_importances_
feature_names = (preprocessor.named_transformers_['num'].get_feature_names_out().tolist() +
                 preprocessor.named_transformers_['cat'].get_feature_names_out().tolist())

In [19]:
# Sort feature importances in descending order
indices = np.argsort(feature_importance)[::-1]

# Print the top 20 feature ranking
print("\nTop 20 Feature ranking:")
for f in range(min(20, len(feature_names))):
    print("%d. %s (%f)" % (f + 1, feature_names[indices[f]], feature_importance[indices[f]]))


Top 20 Feature ranking:
1. Total.miles.driven (0.103526)
2. Annual.pct.driven (0.092099)
3. Duration (0.033678)
4. Credit.score (0.032826)
5. Pct.drive.2hrs (0.030017)
6. Avgdays.week (0.030016)
7. Pct.drive.thr (0.028129)
8. Pct.drive.rush am (0.027172)
9. Pct.drive.rush pm (0.024612)
10. Pct.drive.wed (0.024558)
11. Pct.drive.mon (0.024377)
12. Pct.drive.tue (0.023702)
13. Pct.drive.fri (0.023537)
14. Brake.06miles (0.022965)
15. Pct.drive.sat (0.022950)
16. Pct.drive.sun (0.022392)
17. Car.age (0.021313)
18. Accel.06miles (0.020572)
19. Years.noclaims (0.020419)
20. Left.turn.intensity09 (0.020307)
