In [15]:
pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [17]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna

from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.feature_selection import RFE, RFECV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

#where the dataset lives: bucket name and object key
bucket_name = 'craig-shaffer-data-445-bucket'
file_key = 'insurance.csv'

#opening the bucket through the boto3 resource API
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

#requesting the object and taking the 'Body' entry of the response
bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

#parsing the csv content into a data-frame and previewing the first rows
insurance = pd.read_csv(file_content_stream)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [18]:
#changing labels to numbers (sex: female -> 0, otherwise 1; smoker: no -> 0,
#otherwise 1) and changing region to dummies
#the raw 'region' column is only present before the encoding, so it is used as
#a guard: re-running this cell is a no-op instead of comparing the already
#encoded 0/1 values against the string labels (which would set every sex and
#smoker entry to 1) and then failing on the missing 'region' column
if 'region' in insurance.columns:
    insurance = pd.get_dummies(
        insurance.assign(sex = np.where(insurance['sex'] == 'female', 0, 1),
                         smoker = np.where(insurance['smoker'] == 'no', 0, 1)),
        columns = ['region'])
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [47]:
#flagging non-smokers by age band; the four bands are contiguous, so every
#non-smoker with a known age gets a 1 in exactly one of them, and smokers
#get 0 in all four
non_smoker = insurance['smoker'] == 0
age = insurance['age']

insurance['interaction_1'] = np.where(non_smoker & (age <= 32.5), 1, 0)
insurance['interaction_2'] = np.where(non_smoker & (age > 32.5) & (age <= 44.5), 1, 0)
#the upper bound is inclusive (<=) like the other bands, so an age of exactly
#51.5 is no longer left out of both interaction_3 and interaction_4
insurance['interaction_3'] = np.where(non_smoker & (age > 44.5) & (age <= 51.5), 1, 0)
insurance['interaction_4'] = np.where(non_smoker & (age > 51.5), 1, 0)
#defining the input and target variables
x = insurance.drop(columns = ['charges'])
y = insurance['charges']

#splitting the data (80% train / 20% test); random_state is fixed so the same
#rows end up in the train and test sets on every run
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [50]:
#the objective defined in this cell reads x and y as globals inside __call__,
#so both are pointed at the training split
x, y = x_train, y_train

class Objective:
    """Optuna objective returning the 3-fold cross-validated MSE of a random forest.

    The data is not passed in: ``__call__`` reads the module-level ``x``
    (features) and ``y`` (target) when it is invoked, so both must be defined
    before the study is optimized.
    """

    def __init__(self, seed):
        """Store ``seed``; it drives both the fold shuffling and the forests."""
        self.seed = seed

    def __call__(self, trial):
        """Evaluate one set of hyper-parameters suggested by ``trial``.

        Parameters
        ----------
        trial : object exposing ``suggest_int(name, low, high)``
            (an Optuna trial when run through ``study.optimize``).

        Returns
        -------
        The mean of the three validation-fold mean squared errors.
        """
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                      min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                      max_depth = trial.suggest_int('max_depth', 2, 10))

        folds = KFold(n_splits = 3, shuffle = True, random_state = self.seed)

        scores = list()

        for train_idx, valid_idx in folds.split(x, y):

            x_train_1, x_valid_1 = x.iloc[train_idx], x.iloc[valid_idx]
            y_train_1, y_valid_1 = y.iloc[train_idx], y.iloc[valid_idx]

            #seeding the forest so that two trials with the same parameters
            #get the same score; random_state is kept out of params so it
            #does not need to be one of the suggested hyper-parameters
            rf_md = RandomForestRegressor(random_state = self.seed, **params).fit(x_train_1, y_train_1)

            pred_valid = rf_md.predict(x_valid_1)

            scores.append(mean_squared_error(y_valid_1, pred_valid))

        return np.mean(scores)


In [51]:
SEED = 42
N_TRIALS = 20

#executing optuna; the TPE sampler is seeded with SEED so the sequence of
#suggested hyper-parameters is the same on every run
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(Objective(SEED), n_trials = N_TRIALS)

[32m[I 2023-03-29 18:24:11,875][0m A new study created in memory with name: no-name-4afc25c2-a296-4070-a200-b15e4d521cbd[0m
[32m[I 2023-03-29 18:24:14,384][0m Trial 0 finished with value: 20296035.203546632 and parameters: {'n_estimators': 475, 'min_samples_split': 22, 'min_samples_leaf': 15, 'max_depth': 7}. Best is trial 0 with value: 20296035.203546632.[0m
[32m[I 2023-03-29 18:24:15,864][0m Trial 1 finished with value: 20444497.875332933 and parameters: {'n_estimators': 243, 'min_samples_split': 19, 'min_samples_leaf': 17, 'max_depth': 4}. Best is trial 0 with value: 20296035.203546632.[0m
[32m[I 2023-03-29 18:24:17,835][0m Trial 2 finished with value: 20592353.459554594 and parameters: {'n_estimators': 422, 'min_samples_split': 8, 'min_samples_leaf': 22, 'max_depth': 5}. Best is trial 0 with value: 20296035.203546632.[0m
[32m[I 2023-03-29 18:24:19,104][0m Trial 3 finished with value: 20909691.081865113 and parameters: {'n_estimators': 291, 'min_samples_split': 18, 'mi

In [52]:
#building optimized model from the best trial's hyper-parameters;
#random_state is fixed to SEED so the reported test mse is reproducible
rf_md = RandomForestRegressor(random_state = SEED, **study.best_trial.params).fit(x_train, y_train)

#predict on test
rf_pred = rf_md.predict(x_test)

#computing mse on the held-out test set
rf_mse = mean_squared_error(y_test, rf_pred)
print('The mse of the RF model is', rf_mse)

The mse of the RF model is 24514210.987051688


In [53]:
#the objective defined in this cell reads x and y as globals inside __call__,
#so both are pointed at the training split
x, y = x_train, y_train

class Objective:
    """Optuna objective returning the 3-fold cross-validated MSE of an XGBoost regressor.

    This re-defines ``Objective``, replacing the random-forest objective of
    the same name defined earlier in the notebook.

    The data is not passed in: ``__call__`` reads the module-level ``x``
    (features) and ``y`` (target) when it is invoked, so both must be defined
    before the study is optimized.
    """

    def __init__(self, seed):
        """Store ``seed``; it drives both the fold shuffling and the boosters."""
        self.seed = seed

    def __call__(self, trial):
        """Evaluate one set of hyper-parameters suggested by ``trial``.

        Parameters
        ----------
        trial : object exposing ``suggest_int`` and ``suggest_float``
            (an Optuna trial when run through ``study.optimize``).

        Returns
        -------
        The mean of the three validation-fold mean squared errors.
        """
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                      max_depth = trial.suggest_int('max_depth', 2, 10),
                      min_child_weight = trial.suggest_int('min_child_weight', 2, 20),
                      #XGBoost documents the learning rate (eta) on [0, 1];
                      #the search range is capped at 1.0 instead of 100 so
                      #trials are not spent on out-of-range step sizes
                      learning_rate = trial.suggest_float('learning_rate', 0.01, 1.0, log = True),
                      gamma = trial.suggest_float('gamma', 0, 10),
                      colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 0.9),
                      subsample = trial.suggest_float('subsample', 0.2, 0.9)
                      )

        folds = KFold(n_splits = 3, shuffle = True, random_state = self.seed)

        scores = list()

        for train_idx, valid_idx in folds.split(x, y):

            x_train_1, x_valid_1 = x.iloc[train_idx], x.iloc[valid_idx]
            y_train_1, y_valid_1 = y.iloc[train_idx], y.iloc[valid_idx]

            #row and column subsampling are random, so the booster is tied to
            #the objective's seed; random_state is kept out of params so it
            #does not need to be one of the suggested hyper-parameters
            xgb_md = XGBRegressor(random_state = self.seed, **params).fit(x_train_1, y_train_1)

            pred_valid = xgb_md.predict(x_valid_1)

            scores.append(mean_squared_error(y_valid_1, pred_valid))

        return np.mean(scores)


In [None]:
#executing optuna; the TPE sampler is seeded with SEED so the sequence of
#suggested hyper-parameters is the same on every run
study = optuna.create_study(direction = 'minimize', sampler = optuna.samplers.TPESampler(seed = SEED))
study.optimize(Objective(SEED), n_trials = N_TRIALS)