In [34]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.metrics import recall_score, classification_report, make_scorer, confusion_matrix, mean_squared_error
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import boxcox
from sklearn.feature_selection import RFE, RFECV
from sklearn.svm import SVC, SVR
from precision_recall_cutoff import precision_recall_cutoff
from xgboost import XGBClassifier, XGBRegressor
from cost_function import cost_function, cost_function_cutoff

s3 = boto3.resource('s3')
bucket_name = 'grant-gonnerman-data-445'
bucket = s3.Bucket(bucket_name)

file_key = 'insurance.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading data file
insurance = pd.read_csv(file_content_stream)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [35]:
## Changing labels to numbers 
insurance['sex'] = np.where(insurance['sex'] == 'female', 0, 1)
insurance['smoker'] = np.where(insurance['smoker'] == 'no', 0, 1)

## Extracting region dummies
region_dummies = pd.get_dummies(insurance['region']).iloc[:, 0:3]

## Appending dummies 
insurance = pd.concat([insurance, region_dummies], axis = 1)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast
0,19,0,27.9,0,1,southwest,16884.924,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1
2,28,1,33.0,3,0,southeast,4449.462,0,0,1
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0


In [36]:
## Computing interactions from chapter 4
insurance['interaction_1'] = np.where((insurance['smoker'] == 0) & (insurance['age'] <= 32.5), 1, 0)
insurance['interaction_2'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 32.5) & (insurance['age'] <= 44.5), 1, 0)
insurance['interaction_3'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 44.5) & (insurance['age'] < 51.5), 1, 0)
insurance['interaction_4'] = np.where((insurance['smoker'] == 0) & (insurance['age'] > 51.5), 1, 0)

insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northeast,northwest,southeast,interaction_1,interaction_2,interaction_3,interaction_4
0,19,0,27.9,0,1,southwest,16884.924,0,0,0,0,0,0,0
1,18,1,33.77,1,0,southeast,1725.5523,0,0,1,1,0,0,0
2,28,1,33.0,3,0,southeast,4449.462,0,0,1,1,0,0,0
3,33,1,22.705,0,0,northwest,21984.47061,0,1,0,0,1,0,0
4,32,1,28.88,0,0,northwest,3866.8552,0,1,0,1,0,0,0


In [37]:
## Defining input and target 
x = insurance[['age', 'bmi', 'children', 'smoker', 'interaction_4']]
y = insurance['charges']

## Splitting the data 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [30]:
x = x_train
y = y_train

class objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                        min_samples_split = trial.suggest_int('min_samples_split', 5, 30),
                        min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 30),
                        max_depth = trial.suggest_int('max_depth', 2, 10))
        scores = list()
        
        skf = KFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, val_idx in skf.split(x, y):
            
            ## Splitting the data 
            x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            rf_md = RandomForestRegressor(**params).fit(x_train, y_train)
            
            pred_val = rf_md.predict(x_val)
            
            score = mean_squared_error(y_val, pred_val)
            scores. append(score)
        
        return np.mean(scores)

In [31]:
seed = 42
n_trials = 20

study = optuna.create_study(direction = 'minimize')
study.optimize(objective(seed), n_trials = n_trials)

[32m[I 2023-03-29 17:43:57,112][0m A new study created in memory with name: no-name-b9d287c3-0470-4193-b4a4-b25b0402e525[0m
[32m[I 2023-03-29 17:44:04,306][0m Trial 0 finished with value: 26129044.650832023 and parameters: {'n_estimators': 1779, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_depth': 2}. Best is trial 0 with value: 26129044.650832023.[0m
[32m[I 2023-03-29 17:44:05,672][0m Trial 1 finished with value: 22292378.837772846 and parameters: {'n_estimators': 325, 'min_samples_split': 22, 'min_samples_leaf': 22, 'max_depth': 4}. Best is trial 1 with value: 22292378.837772846.[0m
[32m[I 2023-03-29 17:44:07,886][0m Trial 2 finished with value: 26102741.610423226 and parameters: {'n_estimators': 579, 'min_samples_split': 7, 'min_samples_leaf': 30, 'max_depth': 2}. Best is trial 1 with value: 22292378.837772846.[0m
[32m[I 2023-03-29 17:44:14,762][0m Trial 3 finished with value: 22080944.02715612 and parameters: {'n_estimators': 1494, 'min_samples_split': 13, 'min

In [33]:
# optimized model 
rf_md = RandomForestRegressor(**study.best_trial.params).fit(x_train, y_train)

# predicting on test
rf_pred = rf_md.predict(x_test)

# computing mse
rf_mse = mean_squared_error(y_test, rf_pred)
print('RF MSE: ', rf_mse)

RF MSE:  15303151.580665225


In [47]:
x = x_train
y = y_train

class objective:
    
    def __init__(self, seed):
        self.seed = seed
        
    def __call__(self, trial):
        params = dict(n_estimators = trial.suggest_int('n_estimators', 100, 2000),
                        max_depth = trial.suggest_int('max_depth', 2, 10),
                        min_child_weight = trial.suggest_int('min_child_weight', 2, 20),
                        learning_rate = trial.suggest_float('learning_rate', 0.01, 100, log = True),
                        gamma = trial.suggest_float('gamma', 0, 10),
                        colsample_bytree = trial.suggest_float('colsample_bytree', 0.2, 0.9),
                        subsample = trial.suggest_float('subsample', 0.2, 0.9))

        scores = list()
        
        skf = KFold(n_splits = 3, shuffle = True, random_state = self.seed)
        
        for train_idx, val_idx in skf.split(x, y):
            
            ## Splitting the data 
            x_train, x_val = x.iloc[train_idx], x.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            xgb_md = XGBRegressor(**params).fit(x_train, y_train)
            
            pred_val = xgb_md.predict(x_val)
            
            score = mean_squared_error(y_val, pred_val)
            scores. append(score)
        
        return np.mean(scores)

In [46]:
seed = 42
n_trials = 20

study = optuna.create_study(direction = 'minimize')
study.optimize(objective(seed), n_trials = n_trials)

[32m[I 2023-03-29 17:52:26,189][0m A new study created in memory with name: no-name-1b2706d4-ea16-426d-a148-c71557657304[0m
[32m[I 2023-03-29 17:52:27,297][0m Trial 0 finished with value: 309193759.5600136 and parameters: {'n_estimators': 382, 'max_depth': 7, 'min_child_weight': 13, 'learning_rate': 0.02650019887330892, 'gamma': 4.881369954695637, 'colsample_bytree': 0.6849880997045912, 'subsample': 0.704026663608383}. Best is trial 0 with value: 309193759.5600136.[0m
[33m[W 2023-03-29 17:52:27,573][0m Trial 1 failed with parameters: {'n_estimators': 1116, 'max_depth': 8, 'min_child_weight': 14, 'learning_rate': 19.757670785153007, 'gamma': 2.2208111810759448, 'colsample_bytree': 0.5720178018113113, 'subsample': 0.41845653743161537} because of the following error: ValueError('Input contains NaN.').[0m
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_val

ValueError: Input contains NaN.

In [39]:
# optimized model 
xgb_md = XGBRegressor(**study.best_trial.params).fit(x_train, y_train)

# predicting on test
xgb_pred = xgb_md.predict(x_test)

# computing mse
xgb_mse = mean_squared_error(y_test, xgb_pred)
print('RF MSE: ', xgb_mse)

Parameters: { "min_samples_leaf", "min_samples_split" } are not used.

RF MSE:  30178180.28033087
