In [2]:
import numpy as np
import pandas as pd


from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score,mean_absolute_error,root_mean_squared_error

In [3]:
df = pd.read_csv("medical_insurance.csv")

In [4]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
2767,47,female,45.320,1,no,southeast,8569.86180
2768,21,female,34.600,0,no,southwest,2020.17700
2769,19,male,26.030,1,yes,northwest,16450.89470
2770,23,male,18.715,0,no,northwest,21595.38229


In [5]:
# null check
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# duplicate check 
df.duplicated().sum()

1435

In [7]:
df.shape

(2772, 7)

In [8]:
df = df.drop_duplicates()

In [9]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [10]:
# dependent and independent feature
X = df.drop('charges',axis=1)
y = np.log1p(df['charges'])

In [11]:
num_cols = X.select_dtypes(exclude='object').columns.to_list()
cat_cols = X.select_dtypes(include="object").columns.to_list()

In [12]:
cat_cols

['sex', 'smoker', 'region']

In [13]:
# data encoding and scaling

preprocess = ColumnTransformer(
        transformers=[
            ("num",StandardScaler(),num_cols),
            ("cat",OneHotEncoder(drop="first"),cat_cols),
        ]
)

In [14]:
# model
svr = SVR(kernel="rbf")


In [15]:
pipe = Pipeline(
    [
        ("prep",preprocess),
        ("model",svr)
    ]
)

In [16]:
# train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [17]:
pipe.fit(X_train,y_train)

In [18]:
y_pred_log = pipe.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

In [19]:
# metrics
print("Base model prediction")
print(mean_absolute_error(y_true,y_pred))
print(root_mean_squared_error(y_true,y_pred))
print(r2_score(y_true,y_pred))

Base model prediction
2518.3912741121917
5398.93353396909
0.8413740332526031


In [20]:
# hyperparameter tuning
params = {
    "model__C":[10,50,100,200],
    "model__epsilon":[0.1,0.2,0.5],
    "model__gamma":["sacle",0.05,0.1]
}



grid = GridSearchCV(estimator=pipe,param_grid=params,cv=5,scoring="r2",n_jobs=-1)

grid.fit(X_train,y_train)

60 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
59 fits failed with the following error:
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py", l

In [21]:
grid.best_params_

{'model__C': 50, 'model__epsilon': 0.2, 'model__gamma': 0.05}

In [22]:
grid.best_score_

0.8203691375817133

In [23]:
grid_pred_log = pipe.predict(X_test)
grid_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

In [24]:
print("Ater Hyperparameter tuning model prediction")
print(mean_absolute_error(y_true,grid_pred))
print(root_mean_squared_error(y_true,grid_pred))
print(r2_score(y_true,grid_pred))

Ater Hyperparameter tuning model prediction
2518.3912741121917
5398.93353396909
0.8413740332526031


In [27]:
import joblib 


joblib.dump(pipe, "model.pkl")


['model.pkl']

In [25]:
grid.best_estimator_