In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Read the insurance CSV once; `new_df` and `df` end up as two independent
# frames holding the same raw rows.
new_df = pd.read_csv('Downloads/insurance.csv')
df = new_df.copy()
df

FileNotFoundError: [Errno 2] No such file or directory: 'Downloads/insurance.csv'

### The above data contains 1338 rows and 7 columns.



## EDA

In [None]:
# Count the missing values in each column (the author's note: none are expected).
df.isnull().sum()

In [None]:
# Print each column's dtype and non-null count.
# NOTE(review): the author judged every feature to have the right dtype — confirm against the output.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the frame.
df.describe()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of every repeated row, then preview the result.
df = df[~df.duplicated()]
df.head()

In [None]:
# How many rows fall into each category of the `smoker` column.
df.value_counts('smoker')

In [None]:
# How many rows fall into each category of the `sex` column.
df.value_counts('sex')

In [None]:
# Bar chart of row counts per region, to see how many distinct categories
# the `region` column holds and how balanced they are.
sns.countplot(data=df, x='region')


In [None]:
# Box plot of BMI: a quick visual check for values far from the bulk of the data.
sns.boxplot(data=df, x='bmi') 

In [None]:
# Same BMI box plot, drawn as one box per region for comparison.
sns.boxplot(data=df, x='bmi', y='region') 

In [None]:
# Histogram of BMI with a kernel-density curve overlaid.
sns.histplot(data=df, x='bmi', kde=True)

### Removing the outliers from the data


In [None]:
# Upper outlier fence: three standard deviations above the mean BMI.
upper_limit = df['bmi'].mean() + 3 * df['bmi'].std()
upper_limit

In [None]:
# Lower outlier fence: three standard deviations below the mean BMI.
lower_limit = df['bmi'].mean() - 3 * df['bmi'].std()
lower_limit

In [None]:
# Show the rows whose BMI lies above the upper fence or below the lower fence.
df[df.bmi.gt(upper_limit) | df.bmi.lt(lower_limit)]

In [None]:
# Keep only the rows whose BMI lies strictly between the two fences.
within_fences = df.bmi.lt(upper_limit) & df.bmi.gt(lower_limit)
df = df[within_fences]
df.shape

In [None]:
# Snapshot the cleaned frame (duplicates and BMI outliers removed) before the
# categorical columns are label-encoded in the next code cell.
# The original line referenced `df_new`, a name that is never defined in this
# notebook, so the cell raised NameError.
dataframe = df.copy()



In [None]:
# converting the categorical variables into numerical variables using label encoder



In [None]:

# Label-encode every non-numeric column so the plots and the correlation
# matrix that follow can treat them as numbers.
encoder = LabelEncoder()

# Sub-frame holding only the non-numeric columns.
cat_col = df.select_dtypes(exclude='number')

# `df` is a filtered slice at this point; take a real copy first so the column
# assignments below write to an independent frame rather than triggering
# pandas' chained-assignment (SettingWithCopy) warning.
df = df.copy()

for col in cat_col.columns:
    # The encoder is re-fitted for each column; only the encoded values are kept.
    df[col] = encoder.fit_transform(df[col])
df.head()



In [None]:
# Bar chart of row counts per (label-encoded) smoker category.
sns.countplot(data=df, x='smoker')

In [None]:
# Distribution of the number of children.
sns.histplot(data=df, x='children')

In [None]:
# Distribution of age.
sns.histplot(data=df, x='age')

In [None]:
# Distribution of charges for each (label-encoded) smoker category.
sns.violinplot(x='smoker', y='charges', data=df, color='c')


### As the violin plot of charges by smoker shows, people who smoke pay higher medical insurance charges.

In [None]:
# Distribution of charges for each number of children.
sns.violinplot(x='children', y='charges', data=df, color='c')



In [None]:
# Distribution of charges for each (label-encoded) sex category.
sns.violinplot(x='sex', y='charges', data=df, color='c')



In [None]:
# Charges by sex, with each violin split by smoker status.
# The original passed `hug='smoker'`, a typo for the `hue` keyword, so the
# smoker grouping was never applied (the call is rejected or the argument
# is not treated as the grouping variable).
sns.violinplot(data=df, x='sex', y='charges', hue='smoker', color='c')



In [None]:
# Pairwise correlations between all columns of the label-encoded frame.
correlation_matrix = df.corr()

# Annotated heatmap of the correlation matrix.
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
)




### The correlation matrix shows a correlation of 0.79 between charges and smoker, which means the smoker column has a strong influence on medical insurance charges.


In [None]:
# Pairwise relationship grid drawn from `new_df`, the copy of the data made
# at load time (i.e. without the de-duplication, outlier removal, or label
# encoding applied to `df`).
sns.pairplot(new_df)

## Data Preprocessing and Modelling

In [None]:

# `new_df` is the untouched copy taken right after loading, so using it as-is
# would feed the duplicate rows and the BMI outliers removed during EDA back
# into the models. Re-apply the same two cleaning rules here (using the BMI
# fences computed in the EDA section). The categorical columns stay as raw
# strings, ready for the one-hot encoding further down.
dataframe = new_df.drop_duplicates()
dataframe = dataframe[(dataframe.bmi < upper_limit) & (dataframe.bmi > lower_limit)]
dataframe.head()



### Splitting into Train and test set

In [None]:
# Separate the target (`charges`) from the predictor columns.
y = dataframe['charges']
x = dataframe.drop(columns='charges')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Shapes of the four resulting pieces.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

### X_train Encoding

In [None]:
# Fit the one-hot encoder on the training categoricals only. With
# handle_unknown="ignore", categories not seen during fit do not raise at
# transform time.
ohe = OneHotEncoder(handle_unknown="ignore")

# Dense array of dummy columns for sex / smoker / region.
x_train_ohe = ohe.fit_transform(x_train[['sex', 'smoker', 'region']]).toarray()

# Build the dummy-column frame directly on x_train's index so that the
# concat below lines the rows up one for one.
x_train_ohe_df = pd.DataFrame(
    x_train_ohe,
    columns=ohe.get_feature_names_out(['sex', 'smoker', 'region']),
    index=x_train.index,
)

# Append the dummy columns, then discard the original categorical columns.
x_train = pd.concat([x_train, x_train_ohe_df], axis=1).drop(
    ["sex", "smoker", "region"], axis=1
)

# Preview the encoded training features.
x_train.head()

### X_test Encoding

In [None]:
# Encode the test categoricals with the encoder already fitted on the
# training data (transform only, no re-fit).
x_test_ohe = ohe.transform(x_test[['sex', 'smoker', 'region']]).toarray()

# Build the dummy-column frame directly on x_test's index so that the
# concat below lines the rows up one for one.
x_test_ohe_df = pd.DataFrame(
    x_test_ohe,
    columns=ohe.get_feature_names_out(['sex', 'smoker', 'region']),
    index=x_test.index,
)

# Append the dummy columns, then discard the original categorical columns.
x_test = pd.concat([x_test, x_test_ohe_df], axis=1).drop(
    ["sex", "smoker", "region"], axis=1
)

# Preview the encoded test features.
x_test.head()

In [None]:
# Candidate regressors and their search grids.
# Each value is a two-item list: [unfitted estimator, hyper-parameter grid].
models_parameters = {
    "LinearRegression": [
        LinearRegression(),
        {'n_jobs': [-1]},
    ],
    "RandomForestRegressor": [
        RandomForestRegressor(),
        {
            'n_estimators': [100],
            'max_depth': [10],
            'min_samples_split': [2],
            'criterion': ['squared_error'],
        },
    ],
    "DecisionTreeRegressor": [
        DecisionTreeRegressor(),
        {
            'splitter': ['best'],
            'max_depth': [12],
            'min_samples_split': [2],
            'criterion': ['squared_error'],
        },
    ],
    "GradientBoostingRegressor": [
        GradientBoostingRegressor(),
        {
            'n_estimators': [120],
            'learning_rate': [0.1],
            'max_depth': [12],
            'min_samples_leaf': [3],
            'loss': ['squared_error'],
        },
    ],
    "SupportVectorRegressor": [
        SVR(),
        {'kernel': ['rbf'], 'gamma': ['scale']},
    ],
    "Lasso": [
        Lasso(),
        {
            'alpha': [1.0, 1.1],
            'max_iter': [1000, 1200],
            'selection': ['cyclic', 'random'],
        },
    ],
    "Ridge": [
        Ridge(),
        {
            'alpha': [1.0, 1.1],
            'max_iter': [1000, 1200],
            'solver': ['auto', 'svd', 'lsqr'],
        },
    ],
}

In [None]:
# Tune every candidate model with an exhaustive grid search and record its
# test-set RMSE.
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: `DataFrame.append` was removed in pandas 2.0, and growing a frame row
# by row is quadratic anyway.
results = []

for model_name, (model, hyperparameters) in models_parameters.items():
    # 5-fold cross-validated grid search, scored by (negated) RMSE.
    grid_search = GridSearchCV(model, hyperparameters, cv=5, scoring='neg_root_mean_squared_error')
    grid_search.fit(x_train, y_train)

    # Best estimator (refitted on the full training set) and its parameters.
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate the tuned model on the held-out test set.
    y_pred = best_model.predict(x_test)

    # RMSE as the square root of the MSE; the `squared=False` shortcut is
    # deprecated in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.append({"Model": model_name, "Best_Params": best_params, "RMSE": rmse})

results_df = pd.DataFrame(results, columns=["Model", "Best_Params", "RMSE"])
results_df

In [None]:
# Repeat the tuning with a randomized search (10-fold CV, scored by R^2) and
# record test-set RMSE and MAE per model.
# `result` maps model name -> [rmse, mae].
result = {}
for key, value in models_parameters.items():
    # value[0] is the estimator, value[1] its parameter grid.
    # random_state pins which parameter candidates get sampled, so the search
    # picks the same candidates on every run (it does not seed the estimators).
    regressor = RandomizedSearchCV(
        value[0],
        value[1],
        cv=10,
        scoring="r2",
        n_jobs=-1,
        random_state=42,
    ).fit(x_train, y_train)

    # Predict with the best candidate found by the search.
    y_pred = regressor.predict(x_test)

    root_mse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    result[key] = [root_mse, mae]

# Show the collected metrics; the original computed them but never displayed them.
pd.DataFrame.from_dict(result, orient='index', columns=['RMSE', 'MAE'])