#Importing necessary data science libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display

#Metrics and Encoder Libraries

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_pinball_loss, mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

#Regression Models

In [3]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, learning_curve
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, VotingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor


#Loading Dataset

In [4]:
data = pd.read_csv("/content/insurance.csv")


#Printing some dataset properties

In [5]:
print("*"*30, "HEAD", "*"*30)
display(data.head(5))
print("*"*30, "SHAPE", "*"*30)
print(f"Rows: {data.shape[0]}\nColumns: {data.shape[1]}")
print("*"*30, "INFO", "*"*30)
display(data.info())
print("*"*30, "DESCRIBE", "*"*30)
display(data.describe().T)
print("*"*30, "NULL?", "*"*30)
display(data.isnull().sum())
print("*"*30, "DUPLICATED", "*"*30)
display(data.duplicated().sum())
print("*"*30, "EXPLAINING", "*"*30)

****************************** HEAD ******************************


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


****************************** SHAPE ******************************
Rows: 1338
Columns: 7
****************************** INFO ******************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


None

****************************** DESCRIBE ******************************


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


****************************** NULL? ******************************


age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

****************************** DUPLICATED ******************************


1

****************************** EXPLAINING ******************************


##We have Rows: 1338, Columns: 7
##There is no null values
##Data have 1 duplicated values, we need to drop them.

In [6]:
data.drop_duplicates(inplace=True)

#Preparing Data
###Feature Label Split 

In [7]:
Y = data["charges"]
X = data.drop(labels = ["charges"],axis=1)


#Encoding the Object columns to achieve full numerical dataset

In [8]:
le = LabelEncoder()
X["sex"]  = le.fit_transform(X["sex"])
X["smoker"]  = le.fit_transform(X["smoker"])
X["region"]  = le.fit_transform(X["region"])

In [9]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.77,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.88,0,0,1


#Train Test Split

In [10]:

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

#Scaling the dataset

In [11]:
Xtrain = X_train.copy()
Xtest = X_test.copy()

#Standard Scaler
scaler = StandardScaler()
X_train_standard = scaler.fit_transform(X_train)
X_test_standard = scaler.fit_transform(X_test)

#Minmax Scaler
scaler = MinMaxScaler()
X_train_minmax = scaler.fit_transform(X_train)
X_test_minmax = scaler.fit_transform(X_test)

#Normalization
X_train_normalize = preprocessing.normalize(X_train)
X_test_normalize = preprocessing.normalize(X_test)

train_list = [Xtrain,X_train_standard,X_train_minmax,X_train_normalize]
scaler_list = ["without_scaler","standard_scaler","minmax_scaler","normalize"]

### We used two different scaler and one normalizer and raw data to try out all types and decide which gives the best score

#MODEL Selection


### We used 15 different models and 4 differently scaled data and choosed the best performing couple according to R2 score

In [12]:
kfold = StratifiedKFold(n_splits=10)
random_state = 42
z = 0

for i in train_list:
    linear_model = LinearRegression().fit(i,y_train)
    knn_model = KNeighborsRegressor().fit(i, y_train)
    decision_model = DecisionTreeRegressor().fit(i,y_train)
    mlp_model = MLPRegressor().fit(i, y_train)
    gaussian_model = BayesianRidge().fit(i, y_train)
    linear_svm_model = SVR(kernel='linear').fit(i,y_train)
    adaboost_model = AdaBoostRegressor(DecisionTreeRegressor(), learning_rate=0.1).fit(i,y_train)
    randomforest_model = RandomForestRegressor().fit(i,y_train)
    extra_model = ExtraTreesRegressor().fit(i,y_train)
    gb_model = GradientBoostingRegressor().fit(i,y_train)
    xgb_model = XGBRegressor().fit(i,y_train)
    lgbm_model = LGBMRegressor().fit(i,y_train)
    elastic_model = ElasticNet().fit(i,y_train)
    sgd_model = SGDRegressor().fit(i,y_train)
    kernel_model = KernelRidge().fit(i,y_train)
    model_names = ["Linear","Knn","DecisionTree","MLP","GaussianNB","SupportVectorMachine","AdaBoost","RandomForest","ExtraTrees","GradientBoost","Xgboost","Lightgbm", "Elastic", "SGD", "Kernel Ridge"]
    model_list = [linear_model,knn_model,decision_model,mlp_model,gaussian_model,linear_svm_model,adaboost_model,randomforest_model,extra_model,gb_model,xgb_model,lgbm_model,elastic_model,sgd_model,kernel_model]
    results = []
    z +=1
    if z ==1:
        print("*"*30, f"{scaler_list[z-1]}","*"*30)
    if z ==2:
        print("*"*30, f"{scaler_list[z-1]}","*"*30)
    if z ==3:
        print("*"*30, f"{scaler_list[z-1]}","*"*30)
    if z ==4:
        print("*"*30, f"{scaler_list[z-1]}","*"*30)
    for j in model_list:
        result = cross_val_score(j, i, y_train, scoring = "r2", cv = 5, n_jobs=4)
        results.append(result.mean())

    acc_of_models = {"Model": model_names, "R2": results}    
    acc_of_models = pd.DataFrame(acc_of_models)
    display(acc_of_models)
    print(np.max(acc_of_models["R2"]))


****************************** without_scaler ******************************


Unnamed: 0,Model,R2
0,Linear,0.7292348
1,Knn,0.03195326
2,DecisionTree,0.6688155
3,MLP,-0.4505553
4,GaussianNB,0.729258
5,SupportVectorMachine,-0.160848
6,AdaBoost,0.8004527
7,RandomForest,0.8247454
8,ExtraTrees,0.8034135
9,GradientBoost,0.841702


0.8456784125717617
****************************** standard_scaler ******************************


Unnamed: 0,Model,R2
0,Linear,0.729235
1,Knn,0.786199
2,DecisionTree,0.674978
3,MLP,-1.21254
4,GaussianNB,0.729264
5,SupportVectorMachine,-0.040895
6,AdaBoost,0.803707
7,RandomForest,0.823525
8,ExtraTrees,0.799969
9,GradientBoost,0.841667


0.8456798152695658
****************************** minmax_scaler ******************************


Unnamed: 0,Model,R2
0,Linear,0.729235
1,Knn,0.735039
2,DecisionTree,0.662807
3,MLP,-1.192378
4,GaussianNB,0.729259
5,SupportVectorMachine,-0.096059
6,AdaBoost,0.799921
7,RandomForest,0.820188
8,ExtraTrees,0.804231
9,GradientBoost,0.841598


0.8457029580999172
****************************** normalize ******************************


Unnamed: 0,Model,R2
0,Linear,0.523495
1,Knn,0.215502
2,DecisionTree,0.663077
3,MLP,-1.232832
4,GaussianNB,0.523561
5,SupportVectorMachine,-0.106283
6,AdaBoost,0.78757
7,RandomForest,0.812685
8,ExtraTrees,0.794174
9,GradientBoost,0.803789


0.8126848999763527


#The best score was XGBRegrossor algorithm trained with min-max scaled data

In [13]:
model = XGBRegressor().fit(X_train_minmax,y_train)



#Model *Train* Score

In [14]:
model.score(X_train_minmax,y_train)

0.897703181488283

#Model *Test* Score

In [15]:
y_preds = model.predict(X_test_minmax)
from sklearn.metrics import r2_score

r2_score(y_test, y_preds)

0.8643217022421513

#Hyperparameter Tuning

In [16]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='r2',
                       do_probabilities = False):
    gs = GridSearchCV(
        estimator=model,
        param_grid=param_grid, 
        cv=cv, 
        n_jobs=-1, 
        scoring=scoring_fit,
        verbose=2
    )
    fitted_model = gs.fit(X_train_data, y_train_data)
    
    if do_probabilities:
      pred = fitted_model.predict_proba(X_test_data)
    else:
      pred = fitted_model.predict(X_test_data)
    
    return fitted_model, pred

In [18]:
model = XGBRegressor()
param_grid = {
    'n_estimators': [200, 300, 400],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [15,20,25],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'subsample': [0.7, 0.8, 0.9]
}

model, pred = algorithm_pipeline(X_train_minmax, X_test_minmax, y_train, y_test, model, 
                                 param_grid, cv=5)

# Root Mean Squared Error
print(model.best_score_)
print(model.best_params_)

Fitting 5 folds for each of 486 candidates, totalling 2430 fits
0.7688396759242134
{'colsample_bytree': 0.7, 'max_depth': 15, 'n_estimators': 200, 'reg_alpha': 1.2, 'reg_lambda': 1.3, 'subsample': 0.7}


In [19]:
print(r2_score(y_test, pred))

0.829471706765329
