In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

### Exploratory data analysis (EDA)

In [4]:
# Read the insurance dataset from the project-relative data directory
# and preview the first rows.
data = pd.read_csv('data/insurance.csv')
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# List the column labels of the loaded frame.
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [8]:
# Count missing values in each column.
data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [10]:
# Count rows that repeat an earlier row across all columns.
data.duplicated().sum()

1

In [12]:
# Remove fully duplicated rows. The name is rebound to the de-duplicated frame
# rather than mutating the object in place, so the frame displayed by earlier
# cells is not silently altered. Original index labels are kept (no reset).
data = data.drop_duplicates()

In [14]:
# Re-count duplicated rows to confirm the drop_duplicates step removed them.
data.duplicated().sum()

0

In [16]:
# Show the dtype of every column.
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [20]:
# One-hot encode the string-valued columns; drop_first=True omits the first
# level of each so the dummy columns are not perfectly collinear.
categorical_cols = ['sex', 'smoker', 'region']
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

### Splitting features (X) and target (y)

In [26]:
# Target is the `charges` column; every remaining column is a feature.
y = data['charges']
X = data.drop(columns='charges')

### Train/test split and feature scaling

In [28]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [30]:
# Columns that get standardized; the one-hot dummy columns are left untouched.
numeric_cols = ['age', 'bmi', 'children']

# train_test_split returns row subsets of X. Take explicit copies so the
# column assignments below write to independent frames instead of triggering
# pandas' SettingWithCopyWarning (which the notebook-wide warnings filter
# would otherwise hide).
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
# Fit on the training rows only, then reuse those statistics for the test rows
# so no information from the test split leaks into the scaling.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

### Linear regression

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Fit an ordinary least-squares model on the training split and predict the
# held-out test rows.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)

# Compute each metric once. RMSE is derived from MSE directly: the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so relying on it breaks on current
# releases.
lr_mae = mean_absolute_error(y_test, y_pred)
lr_mse = mean_squared_error(y_test, y_pred)
lr_rmse = lr_mse ** 0.5
lr_r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation Metrics:")
print(f"Mean Absolute Error (MAE): {lr_mae:.2f}")
print(f"Mean Squared Error (MSE): {lr_mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {lr_rmse:.2f}")
print(f"R-squared (R2): {lr_r2:.4f}")


Model Evaluation Metrics:
Mean Absolute Error (MAE): 4181.82
Mean Squared Error (MSE): 38940169.92
Root Mean Squared Error (RMSE): 6240.21
R-squared (R2): 0.7724


In [45]:
# Side-by-side table of true and predicted charges, indexed like y_test.
results = y_test.to_frame(name='Actual').assign(Predicted=y_pred)

print("\nPredictions vs Actual Values:")
print(results.head())


Predictions vs Actual Values:
           Actual     Predicted
900    8688.85885   8075.919658
1064   5708.86700   5499.002788
1256  11436.73815  14345.712196
298   38746.35510  31428.203720
237    4463.20510   8874.781607


### Support Vector Regression (SVR)

In [48]:
from sklearn.svm import SVR

# Linear-kernel support vector regressor trained on the same split as the
# other models; fit() returns the estimator, so construction and fitting are
# chained.
# NOTE(review): the recorded run of this cell reports a negative R-squared.
# Presumably this is because the target is left unscaled while C=1.0 and
# epsilon=0.1 are small relative to it -- consider scaling the target or
# tuning C/epsilon; confirm before changing.
svr_model = SVR(kernel='linear', C=1.0, epsilon=0.1).fit(X_train, y_train)
y_pred_svr = svr_model.predict(X_test)

# Test-set metrics (mean_squared_error / r2_score come from the earlier
# sklearn.metrics import).
svr_r2 = r2_score(y_test, y_pred_svr)
svr_mse = mean_squared_error(y_test, y_pred_svr)

print(f"Support Vector Regression - MSE: {svr_mse:.2f}, R-squared: {svr_r2:.2f}")

Support Vector Regression - MSE: 188622170.23, R-squared: -0.10


### Decision tree

In [51]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree capped at depth 5; the fixed seed keeps the fitted tree
# reproducible across runs.
decision_tree_model = DecisionTreeRegressor(max_depth=5, random_state=42)
decision_tree_model.fit(X_train, y_train)

# Score on the held-out rows (metric functions come from the earlier
# sklearn.metrics import).
y_pred_tree = decision_tree_model.predict(X_test)
decision_tree_r2 = r2_score(y_test, y_pred_tree)
decision_tree_mse = mean_squared_error(y_test, y_pred_tree)

print(f"Decision Tree Regressor - MSE: {decision_tree_mse:.2f}, R-squared: {decision_tree_r2:.2f}")

Decision Tree Regressor - MSE: 21844383.39, R-squared: 0.87


In [53]:
#END