In [1]:
#Define the AdaBoost classifier

#In the following exercises you'll revisit the Indian Liver Patient dataset which was introduced in a previous chapter.
#Your task is to predict whether a patient suffers from a liver disease using 10 features including Albumin, age and
#gender. However, this time, you'll be training an AdaBoost ensemble to perform the classification task. In addition, given
#that this dataset is imbalanced, you'll be using the ROC AUC score as a metric instead of accuracy.

#As a first step, you'll start by instantiating an AdaBoost classifier.

# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

# Instantiate dt: a shallow (depth-2) decision tree used as the weak learner
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Instantiate ada: 180 boosting rounds over dt.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old keyword was
# removed in 1.4; the first positional slot works on both old and new releases.
ada = AdaBoostClassifier(dt, n_estimators=180, random_state=1)

In [2]:
#NOTE: Next comes training ada and evaluating the probability of obtaining the positive class in the test set.

In [3]:
#Train the AdaBoost classifier

import pandas as pd
from sklearn.model_selection import train_test_split
# Load the Indian Liver Patient records and discard rows with missing values.
df_liver = pd.read_csv('datasets/indian_liver_patient/indian_liver_patient.csv')
df_liver = df_liver.dropna()

# One-hot encode the non-numeric columns, dropping the first level of each.
df_liver = pd.get_dummies(df_liver, drop_first=True)

# Recode the target: the value 2 becomes 0, every other value is left as is.
df_liver['Dataset'] = df_liver['Dataset'].where(df_liver['Dataset'] != 2, 0)

# Keep the ten feature columns plus the target, in a fixed order.
df_liver_preprocessed = df_liver.copy()
df_liver_preprocessed = df_liver_preprocessed[['Age','Total_Bilirubin','Direct_Bilirubin','Alkaline_Phosphotase',
                                               'Alamine_Aminotransferase','Aspartate_Aminotransferase','Total_Protiens',
                                               'Albumin','Albumin_and_Globulin_Ratio','Gender_Male','Dataset']]

# NOTE(review): the '_std' suffix suggests standardized features, but no scaling
# is applied anywhere in this cell; the names are kept unchanged for compatibility.
col_names = ['Age_std','Total_Bilirubin_std','Direct_Bilirubin_std','Alkaline_Phosphotase_std',
             'Alamine_Aminotransferase_std','Aspartate_Aminotransferase_std','Total_Protiens_std','Albumin_std',
             'Albumin_and_Globulin_Ratio_std','Is_male_std','Liver_disease']

# Rename by assigning .columns: DataFrame.set_axis(..., inplace=True) raises a
# TypeError on pandas >= 2.0, where the `inplace` argument was removed.
df_liver_preprocessed.columns = col_names

# Split into feature matrix and label vector.
X = df_liver_preprocessed.drop('Liver_disease', axis=1)
y = df_liver_preprocessed['Liver_disease']

# 80/20 split, stratified on the label so both parts keep the class ratio.
SEED = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=SEED)

#Now that you've instantiated the AdaBoost classifier ada, it's time to train it. You will also predict the probabilities of
#obtaining the positive class in the test set. This can be done as follows:

#Once the classifier ada is trained, call the .predict_proba() method by passing X_test as a parameter and extract these
#probabilities by slicing all the values in the second column as follows:

#ada.predict_proba(X_test)[:,1]

#The Indian Liver dataset is processed for you and split into 80% train and 20% test. Feature matrices X_train and X_test,
#as well as the arrays of labels y_train and y_test are available in your workspace. In addition, we have also loaded the
#instantiated model ada from the previous exercise.

# Train the boosted ensemble on the training split
ada.fit(X_train, y_train)

# predict_proba returns one column per class; the second column (index 1)
# is taken as the probability of the positive class
test_probabilities = ada.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

In [4]:
#NOTE: Next, you'll evaluate ada's ROC AUC score.

In [5]:
#Evaluate the AdaBoost classifier

#Now that you're done training ada and predicting the probabilities of obtaining the positive class in the test set, it's
#time to evaluate ada's ROC AUC score. Recall that the ROC AUC score of a binary classifier can be determined using the
#roc_auc_score() function from sklearn.metrics.

#The arrays y_test and y_pred_proba that you computed in the previous exercise are available in your workspace.

# Import roc_auc_score
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the held-out labels and predicted probabilities
ada_roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

# Report the score rounded to two decimals
roc_auc_report = 'ROC AUC score: {:.2f}'.format(ada_roc_auc)
print(roc_auc_report)

ROC AUC score: 0.70


In [6]:
#NOTE: This untuned AdaBoost classifier achieved a ROC AUC score of 0.70!

In [7]:
#Define the GB regressor

#You'll now revisit the Bike Sharing Demand dataset that was introduced in the previous chapter. Recall that your task is
#to predict the bike rental demand using historical weather data from the Capital Bikeshare program in Washington, D.C.
#For this purpose, you'll be using a gradient boosting regressor.

#As a first step, you'll start by instantiating a gradient boosting regressor which you will train in the next exercise.

# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting settings: 200 trees, each at most 4 levels deep, fixed seed
gb_params = {'max_depth': 4, 'n_estimators': 200, 'random_state': 2}

# Instantiate gb from those settings
gb = GradientBoostingRegressor(**gb_params)

In [8]:
#NOTE: Time to train the regressor and predict test set labels.

In [9]:
#Train the GB regressor

import pandas as pd
from sklearn.model_selection import train_test_split
# Read the bike sharing records from disk
df_bike = pd.read_csv('datasets/bike_sharing_demand.csv')

# 'cnt' is the regression target; every remaining column is a feature
y = df_bike['cnt']
X = df_bike.drop(columns='cnt')

# Hold out 20% of the rows for testing, using a fixed seed
SEED = 1
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

#You'll now train the gradient boosting regressor gb that you instantiated in the previous exercise and predict test set
#labels.

#The dataset is split into 80% train and 20% test. Feature matrices X_train and X_test, as well as the arrays y_train and
#y_test are available in your workspace. In addition, we have also loaded the model instance gb that you defined in the
#previous exercise.

# Train the gradient boosting regressor on the training split
gb.fit(X=X_train, y=y_train)

# Predict the target for every row of the held-out split
y_pred = gb.predict(X=X_test)

In [10]:
#NOTE: Time to evaluate the test set RMSE!

In [11]:
#Evaluate the GB regressor

#Now that the test set predictions are available, you can use them to evaluate the test set Root Mean Squared Error (RMSE)
#of gb.

#y_test and predictions y_pred are available in your workspace.

# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE

# Mean squared error between held-out targets and gb's predictions
mse_test = MSE(y_true=y_test, y_pred=y_pred)

# RMSE is the square root of the MSE
rmse_test = mse_test ** 0.5

# Report the RMSE with three decimals
gb_rmse_report = 'Test set RMSE of gb: {:.3f}'.format(rmse_test)
print(gb_rmse_report)

Test set RMSE of gb: 43.113


In [12]:
#Regression with SGB

#As in the exercises from the previous lesson, you'll be working with the Bike Sharing Demand dataset. In the following set
#of exercises, you'll solve this bike count regression problem using stochastic gradient boosting.

# Import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Stochastic gradient boosting settings: each tree is fit on 90% of the rows
# (subsample) and considers 75% of the features (max_features) per split
sgbr_params = {
    'max_depth': 4,
    'subsample': 0.9,
    'max_features': 0.75,
    'n_estimators': 200,
    'random_state': 2,
}

# Instantiate sgbr from those settings
sgbr = GradientBoostingRegressor(**sgbr_params)

In [13]:
#Train the SGB regressor

#In this exercise, you'll train the SGBR sgbr instantiated in the previous exercise and predict the test set labels.

#The bike sharing demand dataset is already loaded and processed for you; it is split into 80% train and 20% test. The feature
#matrices X_train and X_test, the arrays of labels y_train and y_test, and the model instance sgbr that you defined in the
#previous exercise are available in your workspace.

# Train the stochastic gradient boosting regressor on the training split
sgbr.fit(X=X_train, y=y_train)

# Predict the target for every row of the held-out split
y_pred = sgbr.predict(X=X_test)

In [14]:
#NOTE: Next comes test set evaluation!

In [15]:
#Evaluate the SGB regressor

#You have prepared the ground to determine the test set RMSE of sgbr which you shall evaluate in this exercise.

#y_pred and y_test are available in your workspace.

# Import mean_squared_error as MSE
from sklearn.metrics import mean_squared_error as MSE

# Mean squared error between held-out targets and sgbr's predictions
mse_test = MSE(y_true=y_test, y_pred=y_pred)

# RMSE is the square root of the MSE
rmse_test = mse_test ** 0.5

# Report the RMSE with three decimals
sgbr_rmse_report = 'Test set RMSE of sgbr: {:.3f}'.format(rmse_test)
print(sgbr_rmse_report)

Test set RMSE of sgbr: 45.143


In [16]:
#NOTE: In this run the stochastic gradient boosting regressor's test set RMSE (45.143) is slightly higher than the
#gradient boosting regressor's (43.113), so subsampling did not improve on plain gradient boosting here.