In [1]:
pip install XGBoost

Note: you may need to restart the kernel to use updated packages.


In [10]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import RFE, RFECV
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

insurance = pd.read_csv('Insurance/insurance.csv')
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
## Changing labels to numbers 
insurance['sex'] = np.where(insurance['sex'] == 'female', 0, 1)
insurance['smoker'] = np.where(insurance['smoker'] == 'no', 0, 1)

## Extracting region dummies
region_dummies = pd.get_dummies(insurance['region']).iloc[:, 0:3]

## Appending dummies 
insurance = pd.concat([insurance, region_dummies], axis = 1)

insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,southwest,16884.924,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1
2,28,1,33.0,3,0,southeast,4449.462,0,0,1
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0


In [4]:
## Engineering features from decision-tree
insurance['interaction_1'] = np.where((insurance['smoker'] == 0) & (insurance['age'] <= 32.5), 1, 0)
insurance['interaction_2'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 32.5) & (insurance['age'] <= 44.5), 1, 0)
insurance['interaction_3'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 44.5) & (insurance['age'] < 51.5), 1, 0)
insurance['interaction_4'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 51.5), 1, 0)

In [5]:
# Defining input and target variable
X = insurance[['age', 'bmi', 'children', 'smoker', 'interaction_4']]
Y = insurance['charges']

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
## Defining the hyper-parameters for RF
RF_param_grid = {'n_estimators': [100, 300, 500],
                 'min_samples_split': [10, 15], 
                 'min_samples_leaf': [5, 7], 
                 'max_depth' : [3, 5, 7]}

# Performing GridSearch
RF_grid_search = GridSearchCV(RandomForestRegressor(), RF_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train, Y_train)

# Extracting the best model
RF_model = RF_grid_search.best_estimator_

# Predicting on test
RF_test_pred = RF_model.predict(X_test)

# Computing the mse on test
RF_test_mse = mean_squared_error(Y_test, RF_test_pred)
print("The mse of the model Random Forest Regressior on the test dataset is: ", round(RF_test_mse, 1))

The mse of the model Random Forest Regressior on the test dataset is:  19115371.6


In [9]:
## Defining the hyper-parameters for RF
XGBoost_param_grid = {'n_estimators': [500],
                        'max_depth': [3, 5, 7],
                        'min_child_weight': [5, 7],
                        'learning_rate': [0.01],
                        'gamma': [0.3, 0.1],
                        'subsample': [0.8, 1],
                        'colsample_bytree': [1]}

# Performing GridSearch
XGBoost_grid_search = GridSearchCV(XGBRegressor(), XGBoost_param_grid, cv = 3, scoring = 'neg_mean_squared_error', n_jobs = -1).fit(X_train, Y_train)

# Extracting the best model
XGBoost_model = XGBoost_grid_search.best_estimator_

# Predicting on test
XGBoost_test_pred = XGBoost_model.predict(X_test)

# Computing the mse on test
XGBoost_test_mse = mean_squared_error(Y_test, XGBoost_test_pred)
print("The mse of the model XGBoost Regressior on the test dataset is: ", round(XGBoost_test_mse, 1))

The mse of the model XGBoost Regressior on the test dataset is:  18846692.9


## Based on my results, XGBoost Regressior had a better performance.