## Data Preprocessing

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd 
import joblib

In [2]:
df = pd.read_csv("..\data\insurance.csv")

In [3]:
df.duplicated().sum()

0

In [4]:
df.isna().sum()

Id               0
age              5
gender           0
bmi              0
bloodpressure    0
diabetic         0
children         0
smoker           0
region           3
claim            0
dtype: int64

In [5]:
df.dropna(inplace=True)

In [6]:
df.isna().sum()

Id               0
age              0
gender           0
bmi              0
bloodpressure    0
diabetic         0
children         0
smoker           0
region           0
claim            0
dtype: int64

In [7]:
df.columns

Index(['Id', 'age', 'gender', 'bmi', 'bloodpressure', 'diabetic', 'children',
       'smoker', 'region', 'claim'],
      dtype='object')

In [8]:
X = df[["age", "gender", "bmi", "bloodpressure", "diabetic", "children", "smoker"]]
y = df["claim"]

In [9]:
X

Unnamed: 0,age,gender,bmi,bloodpressure,diabetic,children,smoker
0,39.0,male,23.2,91,Yes,0,No
1,24.0,male,30.1,87,No,0,No
7,19.0,male,41.1,100,No,0,No
8,20.0,male,43.0,86,No,0,No
9,30.0,male,53.1,97,No,0,No
...,...,...,...,...,...,...,...
1335,44.0,female,35.5,88,Yes,0,Yes
1336,59.0,female,38.1,120,No,1,Yes
1337,30.0,male,34.5,91,Yes,3,Yes
1338,37.0,male,30.4,106,No,0,Yes


In [10]:
cat_cols = ["gender", "diabetic", "smoker"]
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le
    
    joblib.dump(le, f"../models/label_encoder_{col}.pkl")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
X

Unnamed: 0,age,gender,bmi,bloodpressure,diabetic,children,smoker
0,39.0,1,23.2,91,1,0,0
1,24.0,1,30.1,87,0,0,0
7,19.0,1,41.1,100,0,0,0
8,20.0,1,43.0,86,0,0,0
9,30.0,1,53.1,97,0,0,0
...,...,...,...,...,...,...,...
1335,44.0,0,35.5,88,1,0,1
1336,59.0,0,38.1,120,0,1,1
1337,30.0,1,34.5,91,1,3,1
1338,37.0,1,30.4,106,0,0,1


In [12]:
label_encoders

{'gender': LabelEncoder(),
 'diabetic': LabelEncoder(),
 'smoker': LabelEncoder()}

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
num_cols = ["age", "bmi", "bloodpressure", "children"]
scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

joblib.dump(scaler, "../models/scaler.pkl")

['../models/scaler.pkl']

In [15]:
print(X_train.shape, y_train.shape)

(1065, 7) (1065,)


In [16]:
print(X_test.shape, y_test.shape)

(267, 7) (267,)


## Choose best model 

In [17]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

In [18]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return {"R2": r2, "MAE": mae, "RMSE": rmse}

In [19]:
results = {}

In [20]:
# --------------- Linear Regression ------------------
lr = LinearRegression()
lr.fit(X_train, y_train)
results["Linear Regression"] = evaluate_model(lr, X_train, X_test, y_train, y_test)
print("Linear Regression model trained")

# --------------- Polymoninal Regression ------------------
best_poly_model = None
best_poly_score = -np.inf

for degree in [2,3]:
    poly = PolynomialFeatures(degree=degree)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.transform(X_test)
    
    poly_lr = LinearRegression()
    poly_lr.fit(X_train_poly, y_train)
    
    score = poly_lr.score(X_test_poly, y_test)
    
    if score > best_poly_score:
        
        best_poly_score = score 
        best_poly_model = (degree, poly, poly_lr)
        
degree, poly, poly_lr = best_poly_model

results[f"Polynomial Regression (deg) = {degree}"] = evaluate_model(poly_lr, poly.fit_transform(X_train), poly.transform(X_test), y_train, y_test)

print("Polynominal Regression models are trained")

# --------------- Random Forest ------------------
rf = RandomForestRegressor()

rf_params = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2]
}

rf_grid = GridSearchCV(rf, rf_params, cv=3, scoring="r2", n_jobs=1, verbose=1)
rf_grid.fit(X_train, y_train)
best_rf = rf_grid.best_estimator_

results["Random Forest"] = evaluate_model(best_rf, X_train, X_test, y_train, y_test)

print("Random Forest training is completed, best parameters", rf_grid.best_params_)

# --------------- Support Vector Machine ------------------
svr = SVR()

svr_params = {
    "kernel": ["rbf", "poly", "linear"],
    "C": [1, 10, 50],
    "epsilon": [0.1, 0.2, 0.5],
    "degree": [2,3]
}

svr_grid = GridSearchCV(svr, svr_params, cv=3, scoring="r2", n_jobs=1, verbose=1)
svr_grid.fit(X_train, y_train)

best_svr = svr_grid.best_estimator_

results["SVR"] = evaluate_model(best_svr, X_train, X_test, y_train, y_test)

print("SVR training is completed, best parameters:", svr_grid.best_params_)

# --------------- XGBoost ------------------
xgb = XGBRegressor(objective="reg:squarederror")

xgb_params = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.8, 1.0] 
}

xgb_grid = GridSearchCV(xgb, xgb_params, cv=3, scoring="r2", n_jobs=1, verbose=1)
xgb_grid.fit(X_train, y_train)
best_xgb = xgb_grid.best_estimator_

results["XGBoost"] = evaluate_model(best_xgb, X_train, X_test, y_train, y_test)

print("XGBoost training is completed, best parameters:", xgb_grid.best_params_)


Linear Regression model trained
Polynominal Regression models are trained
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Random Forest training is completed, best parameters {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Fitting 3 folds for each of 54 candidates, totalling 162 fits
SVR training is completed, best parameters: {'C': 50, 'degree': 2, 'epsilon': 0.5, 'kernel': 'linear'}
Fitting 3 folds for each of 36 candidates, totalling 108 fits
XGBoost training is completed, best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}


In [21]:
results

{'Linear Regression': {'R2': 0.7324298069230433,
  'MAE': 5088.40890701428,
  'RMSE': 6496.451574903517},
 'Polynomial Regression (deg) = 3': {'R2': 0.7989290738858008,
  'MAE': 4334.150451559573,
  'RMSE': 5631.602313102015},
 'Random Forest': {'R2': 0.8347352415354936,
  'MAE': 3977.177679647776,
  'RMSE': 5105.607985700932},
 'SVR': {'R2': 0.5161536087162986,
  'MAE': 6021.900913169492,
  'RMSE': 8735.969168900529},
 'XGBoost': {'R2': 0.8392102415134834,
  'MAE': 3884.584450857327,
  'RMSE': 5036.009387053693}}

In [22]:
results_df = pd.DataFrame(results).T.sort_values(by="R2", ascending=False)
results_df

Unnamed: 0,R2,MAE,RMSE
XGBoost,0.83921,3884.584451,5036.009387
Random Forest,0.834735,3977.17768,5105.607986
Polynomial Regression (deg) = 3,0.798929,4334.150452,5631.602313
Linear Regression,0.73243,5088.408907,6496.451575
SVR,0.516154,6021.900913,8735.969169


In [23]:
models = {
    "Linear Regression": lr,
    "Polynomial Regression": poly_lr,
    "Random Forest": best_rf,
    "SVR": best_svr,
    "XGBoost": best_xgb
}

best_r2 = results_df["R2"].max()

top_model = results_df[results_df["R2"] == best_r2]

In [24]:
top_model

Unnamed: 0,R2,MAE,RMSE
XGBoost,0.83921,3884.584451,5036.009387


In [25]:
best_model = models[top_model.index[0]]

In [26]:
joblib.dump(best_model, "../models/best_model.pkl")

['../models/best_model.pkl']

In [27]:
print(f"Best model selected: {top_model.index[0]}")

Best model selected: XGBoost
