# **Load and Explore the Data**

In [2]:
import pandas as pd

In [4]:
# Read the insurance-claims CSV into a DataFrame named `data`.
# NOTE(review): '/content/...' looks like a Google Colab path; the file has to
# exist at exactly this location for the cell to run - confirm before reuse.
data = pd.read_csv(
    filepath_or_buffer='/content/consumer_insurance_claims_10000_rows.csv',
)


In [7]:
# Preview the first ten records as plain text to sanity-check the load.
print(data.head(n=10))

   Claim_ID  Customer_ID  Customer_Age Customer_Gender  \
0         1         3564            47            Male   
1         2         1490            67            Male   
2         3         5698            70          Female   
3         4         4897            60          Female   
4         5         7213            38            Male   
5         6         9649            26            Male   
6         7         6006            23            Male   
7         8         4960            65            Male   
8         9         1569            30            Male   
9        10         4891            29          Female   

  Customer_Marital_Status  Customer_Income Customer_Profession Insurance_Type  \
0                 Widowed         32449.82              Lawyer           Home   
1                Divorced         71833.10             Teacher           Life   
2                Divorced         95381.03              Farmer         Health   
3                Divorced         541

In [8]:
# Get a summary of the data: column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line
# after the report.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Claim_ID                 10000 non-null  int64  
 1   Customer_ID              10000 non-null  int64  
 2   Customer_Age             10000 non-null  int64  
 3   Customer_Gender          10000 non-null  object 
 4   Customer_Marital_Status  10000 non-null  object 
 5   Customer_Income          10000 non-null  float64
 6   Customer_Profession      10000 non-null  object 
 7   Insurance_Type           10000 non-null  object 
 8   Claim_Type               10000 non-null  object 
 9   Claim_Amount             10000 non-null  float64
 10  Claim_Date               10000 non-null  object 
 11  Policy_Years             10000 non-null  int64  
 12  Claim_Status             10000 non-null  object 
 13  Claim_Settlement_Time    10000 non-null  int64  
 14  Vehicle_Type           

# **Preprocessing and Feature Engineering**

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [10]:
# Target: the claim outcome column ('Claim_Status') is what the model predicts.
y = data['Claim_Status']

# Features: every other column except the two identifier columns, which are
# dropped together with the target itself.
X = data.drop(columns=['Claim_ID', 'Customer_ID', 'Claim_Status'])


In [11]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# repeatable between runs. No `stratify` argument is passed, so the class
# proportions are not forced to match between the two splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [12]:
# Partition the feature columns by dtype so that each group can be routed to
# its own transformer: `object` columns are treated as categorical, while
# int64/float64 columns are treated as numerical.
# NOTE(review): data.info() reports Claim_Date as `object`, so it ends up in
# categorical_features - confirm that treating raw date strings as categories
# is intended.
categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns


In [13]:
# Numerical columns: fill missing values with the column mean, then
# standardise with StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode. handle_unknown='ignore' lets transform() accept categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [16]:
# Route each column group through its matching transformer. Every tuple is
# (step name, transformer, columns it applies to), so each transformer only
# sees the columns named in its own tuple.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [17]:
# Apply preprocessing to the training and test data.
# The preprocessor is fitted on the training split only (fit_transform) and is
# then reused unchanged on the test split (transform), so nothing is learned
# from the held-out rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


# **Modeling with Random Forest Classifier**

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Set up the random forest used to classify claims.
clf = RandomForestClassifier(
    n_estimators=100,  # number of trees in the forest
    random_state=0,    # fixed seed so repeated runs are reproducible
)


In [20]:
# Fit the model with training data: the preprocessed training features
# (X_train_processed) and their Claim_Status labels (y_train).
clf.fit(X_train_processed, y_train)

# **Model Evaluation**

In [21]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [22]:
# Predict on the test data: produce one predicted Claim_Status label per row of
# the preprocessed held-out features. y_pred is compared against y_test by the
# evaluation cells that follow.
y_pred = clf.predict(X_test_processed)

In [23]:
# Overall share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.3315


In [24]:

# Per-class precision, recall and f1-score, printed under a heading line.
print(
    "Classification Report:",
    classification_report(y_test, y_pred),
    sep="\n",
)


Classification Report:
              precision    recall  f1-score   support

    Approved       0.33      0.39      0.36       661
     Pending       0.33      0.29      0.31       684
    Rejected       0.33      0.31      0.32       655

    accuracy                           0.33      2000
   macro avg       0.33      0.33      0.33      2000
weighted avg       0.33      0.33      0.33      2000



In [25]:
# Counts of true labels versus predicted labels on the test split, printed
# under a heading line.
print(
    "Confusion Matrix:",
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

Confusion Matrix:
[[258 197 206]
 [272 199 213]
 [244 205 206]]


In [44]:
import matplotlib.pyplot as plt
import seaborn as sns

# **Model Finalization and Saving**

In [47]:
import joblib

In [50]:
# Persist the fitted preprocessor next to the classifier. `clf` was trained on
# the output of `preprocessor`, so after reloading, raw rows must pass through
# this same fitted transformer before `clf.predict` can be called on them.
# Saving the classifier alone would leave the artifact unusable on raw data.
joblib.dump(preprocessor, 'insurance_claim_preprocessor.pkl')

# Save the trained model to a file. This stays the last expression so the cell
# still displays the list of paths returned by joblib.dump for the model file.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(clf, 'insurance_claim_model.pkl')

['insurance_claim_model.pkl']