In [2]:
import pandas as pd
import numpy as np
insurance_df=pd.read_csv('insurance.csv')

In [3]:
insurance_df.sample(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1136,44,female,25.0,1,no,southwest,7623.518
214,45,female,30.9,2,no,southwest,8520.026
82,22,male,37.62,1,yes,southeast,37165.1638
753,58,female,22.77,0,no,southeast,11833.7823
1083,32,male,31.5,1,no,southwest,4076.497


In [4]:
insurance_df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [4]:
X=insurance_df.iloc[:,:-1].values
y=insurance_df.iloc[:,-1].values

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
preprocessor = ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(sparse=False),[1,4,5]),
    ('tnf2',StandardScaler(),[0,2,3])    
    ],remainder='passthrough')

In [6]:
# spliting the data in to test and train sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20,random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size = 0.5,random_state=42)

In [7]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)
X_val=preprocessor.transform(X_val)

In [8]:
!pip install xgboost

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
Collecting xgboost
  Using cached xgboost-1.5.0-py3-none-manylinux2014_x86_64.whl (173.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.5.0


In [9]:
import xgboost as xgb


model = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 0.1, max_depth = 5, n_estimators = 100)

model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [10]:
# predict the score of the trained model using the testing dataset

result = model.score(X_test, y_test)

print("Accuracy : {}".format(result))

Accuracy : 0.864160875354439


In [11]:
# make predictions on the test data

y_predict = model.predict(X_test)


In [12]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

RMSE = 4401.225 
MSE = 19370780.272913326 
MAE = 2452.6838513279495 
R2 = 0.864160875354439 
Adjusted R2 = 0.8519130854273802


In [13]:
# Convert the array into dataframe in a way that target variable is set as the first column and followed by feature columns
# This is because sagemaker built-in algorithm expects the data in this format.

train_data = pd.DataFrame({'Target': y_train})
for i in range(X_train.shape[1]):
    train_data[i] = X_train[:,i]

In [14]:
train_data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,9,10
0,9193.8385,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.472227,-1.756525,0.734336
1,8534.6718,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.543313,-1.033082,-0.911192
2,27117.99378,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.898745,-0.943687,-0.911192
3,8596.8278,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.025379,0.622393,3.202629
4,12475.3513,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.040918,-1.504893,1.5571


In [15]:
val_data = pd.DataFrame({'Target':y_val})
for i in range(X_val.shape[1]):
    val_data[i] = X_val[:,i]

In [16]:
val_data.head()

Unnamed: 0,Target,0,1,2,3,4,5,6,7,8,9,10
0,1980.07,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.376021,0.40387,-0.088428
1,17043.3414,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.949502,-0.579482,-0.911192
2,1256.299,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.447107,-0.026553,-0.911192
3,5836.5204,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.309724,0.877336,0.734336
4,22478.6,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.258967,-0.639079,0.734336


In [17]:
# save train_data and validation_data as csv files.

train_data.to_csv('train.csv', header = False, index = False)
val_data.to_csv('validation.csv', header = False, index = False)

In [18]:
# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python
# Boto3 allows Python developer to write software that makes use of services like Amazon S3 and Amazon EC2

import sagemaker
import boto3
from sagemaker import Session

# Let's create a Sagemaker session
sagemaker_session = sagemaker.Session()
bucket = Session().default_bucket() 
prefix = 'XGBoost-Regressor'
key = 'XGBoost-Regressor'
#Roles give learning and hosting access to the data
#This is specified while opening the sagemakers instance in "Create an IAM role"
role = sagemaker.get_execution_role()

In [19]:
# read the data from csv file and then upload the data to s3 bucket
import os
with open('train.csv','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(f)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

uploaded training data location: s3://sagemaker-us-east-2-363557075783/XGBoost-Regressor/train/XGBoost-Regressor


In [20]:
# read the data from csv file and then upload the data to s3 bucket

with open('validation.csv','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training

    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', key)).upload_fileobj(f)
# Let's print out the validation data location in s3
s3_validation_data = 's3://{}/{}/validation/{}'.format(bucket, prefix, key)
print('uploaded validation data location: {}'.format(s3_validation_data))

uploaded validation data location: s3://sagemaker-us-east-2-363557075783/XGBoost-Regressor/validation/XGBoost-Regressor


In [21]:
# creates output placeholder in S3 bucket to store the output

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

training artifacts will be uploaded to: s3://sagemaker-us-east-2-363557075783/XGBoost-Regressor/output


In [24]:
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.3-1")# Latest version of XGboost

In [25]:
Xgboost_regressor1 = sagemaker.estimator.Estimator(container,
                                       role, 
                                       train_instance_count = 1, 
                                       train_instance_type = 'ml.m4.xlarge',
                                       output_path = output_location,
                                       sagemaker_session = sagemaker_session)

#We can tune the hyper-parameters to improve the performance of the model

Xgboost_regressor1.set_hyperparameters(max_depth = 6,
                           objective = 'reg:linear',
                           colsample_bytree = 0.2405025950329596,
                           eta=0.3011948580044998,            
                           alpha = 4.25822450396689,
                           num_round = 81
                           )

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [26]:
train_input = sagemaker.session.s3_input(s3_data = s3_train_data, content_type='csv',s3_data_type = 'S3Prefix')
valid_input = sagemaker.session.s3_input(s3_data = s3_validation_data, content_type='csv',s3_data_type = 'S3Prefix')


data_channels = {'train': train_input,'validation': valid_input}


Xgboost_regressor1.fit(data_channels)

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


2021-11-02 08:57:38 Starting - Starting the training job...
2021-11-02 08:58:01 Starting - Launching requested ML instancesProfilerReport-1635843457: InProgress
...
2021-11-02 08:58:29 Starting - Preparing the instances for training............
2021-11-02 09:00:31 Downloading - Downloading input data
2021-11-02 09:00:31 Training - Downloading the training image.....
2021-11-02 09:01:32 Uploading - Uploading generated training model
2021-11-02 09:01:32 Completed - Training job completed
[34m[2021-11-02 09:01:19.715 ip-10-0-234-206.us-east-2.compute.internal:1 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2021-11-02:09:01:19:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2021-11-02:09:01:19:INFO] Failed to parse hyperparameter objective value reg:linear to Json.[0m
[34mReturning the value itself[0m
[34m[2021-11-02:09:01:19:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2021-11-02:09:01:19:INFO] Running XGBoost Sagemaker in a

In [27]:
# Deploy the model to perform inference 

Xgboost_regressor = Xgboost_regressor1.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

------!

In [28]:
'''
Content type over-rides the data that will be passed to the deployed model, since the deployed model expects data
in text/csv format, we specify this as content -type.

Serializer accepts a single argument, the input data, and returns a sequence of bytes in the specified content
type

Reference: https://sagemaker.readthedocs.io/en/stable/predictors.html
'''
from sagemaker.predictor import csv_serializer, json_deserializer


Xgboost_regressor.serializer = csv_serializer

In [29]:
y_test

array([ 2643.2685  , 12957.118   ,  4266.1658  , 12404.8791  ,
       12644.589   , 18246.4955  ,  5757.41345 ,  9095.06825 ,
        6435.6237  ,  4527.18295 ,  8932.084   ,  9283.562   ,
       18765.87545 ,  2597.779   ,  4753.6368  ,  8428.0693  ,
       34439.8559  ,  7256.7231  ,  6067.12675 ,  2404.7338  ,
       18804.7524  ,  4239.89265 , 33750.2918  , 11396.9002  ,
        2020.5523  , 12096.6512  ,  3392.9768  , 42983.4585  ,
       27941.28758 , 12347.172   , 12244.531   ,  3161.454   ,
       22192.43711 ,  6079.6715  ,  8059.6791  , 23045.56616 ,
        4889.9995  ,  3167.45585 , 13555.0049  ,  4687.797   ,
        1986.9334  , 37607.5277  ,  2755.02095 ,  3208.787   ,
       16577.7795  , 28101.33305 , 38709.176   ,  9644.2525  ,
        4391.652   , 38282.7495  ,  9432.9253  ,  2710.82855 ,
       12124.9924  , 23306.547   ,  4076.497   ,  9301.89355 ,
        1727.54    ,  1708.92575 ,  7935.29115 , 12029.2867  ,
        4137.5227  , 12363.547   ,  4463.2051  ,  2396.

In [30]:
# making prediction

predicted_values = Xgboost_regressor.predict(X_test)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [31]:
predicted_values

b'7484.79052734375,14343.375,6486.638671875,7432.80029296875,18730.865234375,23956.763671875,10871.689453125,10607.94921875,12277.6025390625,2602.725830078125,9207.275390625,11235.3212890625,21617.953125,3438.016845703125,5998.60302734375,5957.41650390625,33730.93359375,9617.125,5377.6396484375,5529.59521484375,8462.658203125,2453.546630859375,28166.662109375,15720.283203125,5840.13623046875,13725.400390625,4351.9140625,35597.640625,13958.6494140625,14846.16796875,18664.080078125,3763.89990234375,9112.064453125,14825.029296875,6393.2861328125,9972.7734375,9655.5673828125,5327.0556640625,16949.99609375,2260.758544921875,8407.8076171875,30378.16796875,3687.74755859375,1889.9066162109375,26586.83203125,35027.97265625,33752.43359375,6818.8037109375,5032.88427734375,35676.81640625,15300.498046875,4110.79931640625,13545.94140625,32543.08203125,3525.4130859375,9549.8369140625,3746.58056640625,1839.8446044921875,15373.8046875,8331.466796875,4468.1328125,12772.892578125,9185.9453125,4030.686523

In [32]:
# custom code to convert the values in bytes format to array

def bytes_2_array(x):
    l = str(x).split(',')
    l[0] = l[0][2:]
    l[-1] = l[-1][:-1]
    for i in range(len(l)):
        l[i] = float(l[i])
    l = np.array(l).astype('float32')
    return l.reshape(-1,1)

In [33]:
predicted_values=bytes_2_array(predicted_values)

In [34]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, predicted_values)),'.3f'))
MSE = mean_squared_error(y_test, predicted_values)
MAE = mean_absolute_error(y_test, predicted_values)
r2 = r2_score(y_test, predicted_values)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2) 

RMSE = 5487.131 
MSE = 30108608.610516828 
MAE = 3945.068554447029 
R2 = 0.7888610071289972 
Adjusted R2 = 0.769823884820956


In [35]:
Xgboost_regressor.delete_endpoint()