# ⚠️ Dataset Not Found
Place `merged_dataset_532_checked.csv` in the same folder as this notebook (`ps2 front`).
Alternatively, update the path passed to `pd.read_csv` in the data-loading cell so that it points to the actual location of your CSV file.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle

In [None]:
# Load the merged dataset. The path is relative, so it is resolved against
# the current working directory.
csv_path = "merged_dataset_532_checked.csv"
df = pd.read_csv(csv_path)


In [6]:
# Keep only the rows that have a value in the target column.
df = df.dropna(axis='index', how='any', subset=['Total Insurance'])

In [7]:
# Force these columns to a numeric dtype; any entry that cannot be parsed
# as a number becomes NaN (errors='coerce').
for numeric_col in ('Net monthly income', 'Assets'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [8]:
# Handle missing income columns
# Replace missing income values with the column median.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on the
# temporary Series returned by `df[col]`, which triggers a chained-assignment
# FutureWarning on recent pandas and leaves `df` unchanged under
# Copy-on-Write.
#
# NOTE(review): the median is computed over the full dataset, i.e. before the
# train/test split, so test rows influence the fill value.
for col in ['Gross monthly income', 'Net monthly income']:
    df[col] = df[col].fillna(df[col].median())

In [9]:
# Input columns fed to the model, listed in the order they are selected.
features = [
    'Age',
    'Occupation',
    'Assets',
    'Investments',
    'Savings',
    'Debt',
    'Gross monthly income',
    'Net monthly income',
    'Rent/Mortgage',
    'Utilities',
    'Emergency Fund',
    'Dining out',
    'Groceries',
    'Miscellaneous',
    '% Expenses',
    '% Savings',
]

# Column the model is trained to predict.
target = 'Total Insurance'

In [10]:
# Keep only the rows in which every selected feature column has a value.
df = df.dropna(axis='index', how='any', subset=features)

In [11]:
# Separate the model inputs (X) from the value to predict (y).
X, y = df[features], df[target]

In [12]:
# Column types
# Partition the feature columns by dtype: numeric columns go to `num_cols`,
# everything else to `cat_cols`. Selecting with include/exclude of 'number'
# puts every column of X in exactly one list. The previous exact-dtype filter
# (int64 / float64 / object) left out columns with any other dtype (e.g.
# int32, float32, bool, or a dedicated string dtype), and a ColumnTransformer
# drops columns it is not given by default, so such features would have been
# discarded silently.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()


In [13]:
# Preprocessing
# Numeric columns: fill any missing value with the column median.
numeric_steps = [('imputer', SimpleImputer(strategy='median'))]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encode; handle_unknown='ignore' keeps
# transform from failing on categories that were not seen during fit.
categorical_steps = [('encoder', OneHotEncoder(handle_unknown='ignore'))]
categorical_pipeline = Pipeline(steps=categorical_steps)

# Route each group of columns through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_cols),
        ('cat', categorical_pipeline, cat_cols),
    ]
)


In [14]:
# Model pipeline
# Preprocessing followed by a 100-tree random forest regressor with a fixed
# random_state, so repeated fits on the same data use the same seed.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
model = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('regressor', forest),
    ]
)

In [15]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Fit the preprocessing steps and the regressor on the training split.
model.fit(X=X_train, y=y_train)

In [17]:

# Run the fitted pipeline on the held-out feature rows.
y_pred = model.predict(X=X_test)

In [18]:
# Score the held-out predictions: mean absolute error and R².
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE:", mae)
print("R² Score:", r2)

MAE: 1894.197307692308
R² Score: 0.1413933364062846


In [19]:
# Spot-check the model on a single held-out row.
first_row = 0
sample_input = X_test.iloc[[first_row]]   # double brackets keep a one-row DataFrame
sample_actual = y_test.iloc[first_row]    # true target value for the same row

# predict() returns an array; take its only element.
sample_prediction = model.predict(sample_input)[0]

# Show the input row, then the actual and predicted values side by side.
print("🔍 Input Data:")
display(sample_input)

actual_line = f"✅ Actual Total Insurance: ₹{sample_actual:.2f}"
predicted_line = f"🤖 Predicted Total Insurance: ₹{sample_prediction:.2f}"
print(actual_line)
print(predicted_line)


🔍 Input Data:


Unnamed: 0,Age,Occupation,Assets,Investments,Savings,Debt,Gross monthly income,Net monthly income,Rent/Mortgage,Utilities,Emergency Fund,Dining out,Groceries,Miscellaneous,% Expenses,% Savings
55,45,Homemaker,1883873.0,310957.0,40945.0,38262.0,955.0,51063.0,4302.0,232.0,3592.0,4536.0,4555.0,2218.0,7241.256545,37224.502618


✅ Actual Total Insurance: ₹5511.00
🤖 Predicted Total Insurance: ₹3687.84


In [20]:
# Serialize the fitted pipeline to model.pkl (relative path, binary mode).
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)