In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df=pd.read_csv("insurance_price_prediction.csv")
df.head()


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,...,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Year,Day,Month_name
0,51.0,Male,5839.0,Married,,Master's,Employed,7.352902,Suburban,Premium,...,588.0,2.0,Average,No,Rarely,Condo,710.0,2023,11,January
1,46.0,Female,917.0,Divorced,3.0,Bachelor's,Employed,18.717685,Suburban,Basic,...,441.0,4.0,Average,No,Rarely,Apartment,638.0,2024,4,June
2,44.0,Male,15982.0,Married,2.0,Bachelor's,Unemployed,49.848348,Urban,Comprehensive,...,699.0,4.0,Average,Yes,Daily,Apartment,605.0,2020,25,August
3,43.0,Male,29894.0,Single,2.0,High School,Unemployed,42.180632,Rural,Comprehensive,...,786.0,8.0,Poor,Yes,Weekly,House,1103.0,2023,27,January
4,52.0,Male,30861.0,Married,2.0,Bachelor's,Self-Employed,,Urban,Basic,...,676.0,8.0,,No,Weekly,Apartment,591.0,2023,9,August


In [3]:
numerical_features = [
    'Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
    'Previous Claims', 'Credit Score', 'Insurance Duration',"Year","Day"
]
categorical_features = [
    'Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
    'Policy Type', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 
    'Property Type', 'Month_name'
]
target_column = 'Premium Amount'

In [4]:
X=df.drop(columns=target_column,axis=1)
Y=df[target_column]

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

In [6]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
     ('StandardScaler',StandardScaler())
])
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))                      # Encode categorical features
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numerical_features),
        ('cat', cat_pipeline, categorical_features)
    ]
)

X_processed = preprocessor.fit_transform(X)

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_processed,Y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((28800, 54), (7200, 54))

In [8]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [9]:

models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 864.8910
- Mean Absolute Error: 668.4664
- R2 Score: 0.0056
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 864.7018
- Mean Absolute Error: 665.7911
- R2 Score: 0.0045


Lasso
Model performance for Training set
- Root Mean Squared Error: 865.0441
- Mean Absolute Error: 668.7356
- R2 Score: 0.0053
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 864.7825
- Mean Absolute Error: 666.2109
- R2 Score: 0.0043


Ridge
Model performance for Training set
- Root Mean Squared Error: 864.8819
- Mean Absolute Error: 668.4476
- R2 Score: 0.0056
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 864.7422
- Mean Absolute Error: 665.8280
- R2 Score: 0.0044




[WinError 2] The system cannot find the file specified
  File "c:\ProgramData\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\ProgramData\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 773.1351
- Mean Absolute Error: 592.6369
- R2 Score: 0.2054
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 937.3617
- Mean Absolute Error: 712.5304
- R2 Score: -0.1699


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1215.6220
- Mean Absolute Error: 886.4740
- R2 Score: -0.9675


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 320.8447
- Mean Absolute Error: 243.1124
- R2 Score: 0.8632
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 850.2647
- Mean Absolute Error: 650.0920
- R2 Score: 0.0374


AdaBoost Regressor
Model performance for Training set
- Root Mean Squared Error: 911.6856
- Mean Absolute Erro

In [11]:
random=RandomForestRegressor()
random=random.fit(X_train,y_train)
y_pred=random.predict(X_test)
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
print("Root mean square error of the model is : %2f" %rmse)
print("="*30)

Root mean square error of the model is : 849.711886
