In [138]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from xgboost import XGBClassifier
import lightgbm as lgbm 
from sklearn.model_selection import GridSearchCV

### 1.	The UCI Abalone dataset is available from (https://archive.ics.uci.edu/ml/datasets/Abalone). It has been pre-downloaded and made available for this tutorial. The data file is “abalone.data”. It can be read into your Jupyter notebook using pandas’ read_csv function. The “abalone.names” file contains more information about the dataset, and the names for your headers can be found in this file.

### 2.	Read the dataset into your notebook and manually populate the headers with header names.

In [139]:
# Load data
df = pd.read_csv('abalone.data', 
                 names=['sex',
                        'length',
                        'diameter',
                        'height',
                        'whole_weight',
                        'shucked_weight',
                        'viscera_weight',
                        'shell_weight',
                        'rings'],
                header=None)

### 3.	Pre-process and one hot encode the ‘sex’ variable, since this variable is categorical.

In [146]:
# 'sex' is a categorical variable and needs to be preprocessed
one_hot_encoded_df = pd.get_dummies(df, 
                                    columns=['sex'],
                                    drop_first=True)

one_hot_encoded_df.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,bins,sex_I,sex_M
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,middle,0,1
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,young,0,1
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,young,0,0
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,young,0,1
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,young,1,0


### 4.	Our target is the ‘rings’ variable. As there are many values in this column, bin the values into 3 separate bins and label them (‘young’, ‘medium’ and ‘old’). The head() of the dataframe is given here:

In [141]:
df['bins']= pd.cut(df['rings'], 3, labels=['young', 'middle', 'old'])
df['bins'] = df['bins'].astype('str')
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings,bins
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,middle
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,young
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,young
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,young
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,young


### 5.	Set up the X and y variables and split your data into the training set and testing set.

In [142]:
# Set up X and y variables
y = df['bins']
X = one_hot_encoded_df.drop(['rings'], axis=1)

In [149]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### 6.	Fit and predict the y variables using a standard Decision Tree Classifier.

In [150]:
# Create standard decision tree classifier
dt_clf = DecisionTreeClassifier(max_depth=3)

# Train Decision Tree Classifier
dt_model = dt_clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred_dt = dt_model.predict(X_test)

### 7.	Fit and predict the y variables using an ADABoost Classifier with n_estimators=10, learning_rate=1 and a decision tree base estimator with max_depth=3.

In [159]:
# Create AdaBoost classifier 
DTC = DecisionTreeClassifier(max_depth = 3)
abc = AdaBoostClassifier(base_estimator=DTC,learning_rate=1, n_estimators=10)

# Train Adaboost Classifier
abc_model = abc.fit(X_train, y_train)

# Predict the response for test dataset
y_pred_abc = abc_model.predict(X_test)

### 8.	Fit and predict the y variables using an XGBoost Classifier with max_depth=3, learning_rate=0.1,and  n_estimators=100.

In [152]:
# Create XGboost classifier 
xgb = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100)

# Train XGB Classifier
xgb_model = xgb.fit(X_train, y_train)

# Predict the response for test dataset
y_pred_xgb = xgb_model.predict(X_test)

### 9.	Fit and predict the y variables using an LightGBM Classifier with max_depth=3, learning_rate=0.1,and  n_estimators=100.

In [171]:
# Create LGBM classifier 
lgb = lgbm.LGBMClassifier(max_depth=3, learning_rate=0.1, n_estimators=100)

# Train LGBM Classifier
lgb_model = lgb.fit(X_train, y_train)

# Predict the response for test dataset
y_pred_lgb = lgb_model.predict(X_test)

### 10.	Print the accuracy score for each of these classifiers. 

In [172]:
print("DT Accuracy:", accuracy_score(y_test, y_pred_dt))
print("ABC Accuracy:", accuracy_score(y_test, y_pred_abc))
print("XGB Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("LGB Accuracy:", accuracy_score(y_test, y_pred_lgb))

DT Accuracy: 0.733652312599681
ABC Accuracy: 0.740829346092504
XGB Accuracy: 0.7639553429027113
LGB Accuracy: 0.7631578947368421


### 11.	Perform a grid search on ADABoost, XGBoost and LightGBM using the following parameters 
  'n_estimators': [100, 500, 1000],
   'learning_rate': [0.01, 0.1, 1],
            	   ‘max_depth': [1,2,3]


### ADABoost Grid Search

In [167]:
grid_param_abc = {  'n_estimators': [100, 500, 1000],
            'learning_rate': [0.01, 0.1, 1],
            'base_estimator__max_depth':[1,2,3]}

In [168]:
%%time

gd_sr_abc = GridSearchCV(estimator=abc, 
                    param_grid = grid_param_abc,
                    scoring='accuracy',
                    cv=5)

gd_sr_abc.fit(X_train, y_train)

print(gd_sr_abc.best_params_)
print(gd_sr_abc.best_score_)

{'base_estimator__max_depth': 1, 'learning_rate': 0.1, 'n_estimators': 500}
0.7495723571672939
Wall time: 8min 6s


### XGB Grid Search

In [161]:
grid_param = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [1, 2, 3]
}

In [162]:
%%time
gd_sr_xgb = GridSearchCV(estimator=xgb,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)
gd_sr_xgb.fit(X_train, y_train)
print(gd_sr_xgb.best_params_)
print(gd_sr_xgb.best_score_)

{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}
0.7724940130003422
Wall time: 39.6 s


### LightGBM Grid Search

In [163]:
%%time
gd_sr_lgb = GridSearchCV(estimator=lgb,
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)
gd_sr_lgb.fit(X_train, y_train)
print(gd_sr_lgb.best_params_)
print(gd_sr_lgb.best_score_)

  results['params'] = candidate_params


{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000}
0.7707834416695176
Wall time: 15.6 s


### 12.	Print the accuracy scores on the test set for the best estimators for each classifier from the grid search.

In [166]:
print("ABC Accuracy:", gd_sr_abc.best_estimator_.score(X_test, y_test))
print("XGB Accuracy:", gd_sr_xgb.best_estimator_.score(X_test, y_test))
print("LGB Accuracy:", gd_sr_lgb.best_estimator_.score(X_test, y_test))

ABC Accuracy: 0.7623604465709729
XGB Accuracy: 0.7655502392344498
LGB Accuracy: 0.7607655502392344
