<a href="https://colab.research.google.com/github/fjadidi2001/Insurance/blob/main/Nov21Insur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
# NOTE(review): nothing in this cell assigns `df`; it must already be defined
# (e.g. by a pd.read_csv call in an earlier cell) before the code below runs
# -- confirm and add the load step here if it is missing.

# 1. Feature engineering
# Bucket insured age and car age into labelled ranges. pd.cut uses
# right-closed intervals, and values outside the outermost bin edges become
# NaN. The result is cast to object dtype so the new columns are picked up by
# the 'object' dtype selection below; left as 'category' dtype they would not
# be listed as categorical features and the ColumnTransformer would drop them.
df['Age_Group'] = pd.cut(
    df['Insured.age'],
    bins=[0, 25, 35, 45, 55, 65, 100],
    labels=['18-25', '26-35', '36-45', '46-55', '56-65', '65+'],
).astype(object)
df['Car_Age_Group'] = pd.cut(
    df['Car.age'],
    bins=[-1, 0, 3, 5, 10, 20, 100],
    labels=['New', '1-3', '4-5', '6-10', '11-20', '20+'],
).astype(object)

# 2. Select feature columns by dtype
# (Missing values are handled later by the imputers in the preprocessing
# pipelines: median for numeric columns, a constant placeholder for
# categorical columns.)
# The target columns are excluded because they are dropped from X before the
# preprocessor is fitted; listing them would make the ColumnTransformer ask
# for columns that X does not contain.
target_columns = ['NB_Claim', 'AMT_Claim']
feature_frame = df.drop(columns=target_columns)
numeric_features = feature_frame.select_dtypes(include=['int64', 'float64']).columns
categorical_features = feature_frame.select_dtypes(include=['object']).columns

# 3. Create preprocessing pipelines
# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; handle_unknown='ignore' is set so categories not seen during fit do
# not raise at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# 4. Combine preprocessing steps
# Each entry is (name, transformer, columns it applies to). The names 'num'
# and 'cat' and the step name 'onehot' are looked up again later when the
# output feature names are rebuilt, so they must not change.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# 5. Fit and transform the data
X = df.drop(['NB_Claim', 'AMT_Claim'], axis=1)  # Features
y_nb = df['NB_Claim']  # Target for classification
y_amt = df['AMT_Claim']  # Target for regression

X_preprocessed = preprocessor.fit_transform(X)
# ColumnTransformer can return a SciPy sparse matrix when the one-hot block
# makes the combined output sparse enough; pd.DataFrame cannot expand that
# into named columns, so convert to a dense array first.
if hasattr(X_preprocessed, 'toarray'):
    X_preprocessed = X_preprocessed.toarray()

# 6. Convert to DataFrame for better interpretability
# Column order matches the transformer order: numeric columns first, then the
# one-hot columns. get_feature_names_out replaces get_feature_names, which was
# removed from OneHotEncoder in scikit-learn 1.2.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
onehot_names = onehot_encoder.get_feature_names_out(categorical_features)
feature_names = list(numeric_features) + list(onehot_names)
# Reuse X's index so rows stay aligned with y_nb / y_amt.
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names, index=X.index)

# 7. Print info about the preprocessed data
print("Preprocessed data shape:", X_preprocessed_df.shape)
print("\nFirst few rows of preprocessed data:")
print(X_preprocessed_df.head())

# 8. Handle target variables
# NB_Claim (classification): collapse the claim count into a binary label,
# 1 when the count is greater than zero and 0 otherwise.
y_nb = y_nb.gt(0).astype(int)

# AMT_Claim (regression): left unchanged here. If its distribution turns out
# to be heavily skewed, a log transformation may be worth considering.

nb_claim_labels = y_nb.unique()
print("\nUnique values in NB_Claim (after binarization):", nb_claim_labels)
print("AMT_Claim statistics:")
amt_claim_summary = y_amt.describe()
print(amt_claim_summary)