# Capstone 2: Modeling

In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
# read data
# drop unnamed column
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

In [3]:
# check training data
X_train.head()

Unnamed: 0,tree_dbh,latitude,longitude,curb_loc_OffsetFromCurb,curb_loc_OnCurb,spc_common_'Schubert' chokecherry,spc_common_American beech,spc_common_American elm,spc_common_American hophornbeam,spc_common_American hornbeam,...,brch_light_Yes,brch_shoe_No,brch_shoe_Yes,brch_other_No,brch_other_Yes,borough_Bronx,borough_Brooklyn,borough_Manhattan,borough_Queens,borough_Staten Island
0,-1.113026,0.869411,-0.146663,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,2.024813,0.540008,1.038479,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
2,-1.247814,-1.259557,0.813491,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
3,0.241652,0.882777,-0.222273,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
4,0.805051,0.493086,1.227581,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


## Logistic Regression

In [4]:
# create and fit logistic regression
lr = LogisticRegression(max_iter=10000, random_state=14)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=10000, random_state=14)

In [5]:
# make predictions on train and test sets
lr_tr_pred = lr.predict(X_train)
lr_te_pred = lr.predict(X_test)

In [6]:
# lr model accuracy scores
lr_tr_accuracy = accuracy_score(y_train, lr_tr_pred)
lr_te_accuracy = accuracy_score(y_test, lr_te_pred)
print('lr training accuracy: ', lr_tr_accuracy)
print('lr testing accuracy: ', lr_te_accuracy)

lr training accuracy:  0.5040909311149688
lr testing accuracy:  0.5041936777384214


In [7]:
# lr model classification reports
lr_tr_class_rep = classification_report(y_train, lr_tr_pred)
lr_te_class_rep = classification_report(y_test, lr_te_pred)
print('dtc training classification report: ', lr_tr_class_rep)
print('dtc testing classifiication report: ', lr_te_class_rep)

dtc training classification report:                precision    recall  f1-score   support

           0       0.43      0.19      0.26    399012
           1       0.49      0.74      0.59    399013
           2       0.55      0.59      0.57    399013

    accuracy                           0.50   1197038
   macro avg       0.49      0.50      0.47   1197038
weighted avg       0.49      0.50      0.47   1197038

dtc testing classifiication report:                precision    recall  f1-score   support

           0       0.44      0.19      0.26     99754
           1       0.49      0.74      0.59     99753
           2       0.55      0.59      0.57     99753

    accuracy                           0.50    299260
   macro avg       0.49      0.50      0.47    299260
weighted avg       0.49      0.50      0.47    299260



## Decision Tree Classifier Hyperparameter Tuning

In [8]:
# create and fit decision tree classifier
dtc = DecisionTreeClassifier(random_state=14)
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=14)

In [9]:
# make predictions on train and test sets
dtc_tr_pred = dtc.predict(X_train)
dtc_te_pred = dtc.predict(X_test)

In [10]:
# dtc model accuracy scores
dtc_tr_accuracy = accuracy_score(y_train, dtc_tr_pred)
dtc_te_accuracy = accuracy_score(y_test, dtc_te_pred)
print('dtc training accuracy: ', dtc_tr_accuracy)
print('dtc testing accuracy: ', dtc_te_accuracy)

dtc training accuracy:  0.9999958230231621
dtc testing accuracy:  0.7802278954755062


In [11]:
# dtc model classification reports
dtc_tr_class_rep = classification_report(y_train, dtc_tr_pred)
dtc_te_class_rep = classification_report(y_test, dtc_te_pred)
print('dtc training classification report: ', dtc_tr_class_rep)
print('dtc testing classifiication report: ', dtc_te_class_rep)

dtc training classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00    399012
           1       1.00      1.00      1.00    399013
           2       1.00      1.00      1.00    399013

    accuracy                           1.00   1197038
   macro avg       1.00      1.00      1.00   1197038
weighted avg       1.00      1.00      1.00   1197038

dtc testing classifiication report:                precision    recall  f1-score   support

           0       0.72      0.72      0.72     99754
           1       0.80      0.78      0.79     99753
           2       0.81      0.84      0.83     99753

    accuracy                           0.78    299260
   macro avg       0.78      0.78      0.78    299260
weighted avg       0.78      0.78      0.78    299260



## Random Forest Classifier

In [12]:
# create and fit random forest classifier
rfc = RandomForestClassifier(random_state=14)
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=14)

In [13]:
# make predictions on train and test sets
rfc_tr_pred = rfc.predict(X_train)
rfc_te_pred = rfc.predict(X_test)

In [14]:
# rfc model accuracy scores
rfc_tr_accuracy = accuracy_score(y_train, rfc_tr_pred)
rfc_te_accuracy = accuracy_score(y_test, rfc_te_pred)
print('dtc training accuracy: ', rfc_tr_accuracy)
print('dtc testing accuracy: ', rfc_te_accuracy)

dtc training accuracy:  0.9999891398602216
dtc testing accuracy:  0.8597574015905901


In [15]:
# rfc model classification reports
rfc_tr_class_rep = classification_report(y_train, rfc_tr_pred)
rfc_te_class_rep = classification_report(y_test, rfc_te_pred)
print('dtc training classification report: ', rfc_tr_class_rep)
print('dtc testing classifiication report: ', rfc_te_class_rep)

dtc training classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00    399012
           1       1.00      1.00      1.00    399013
           2       1.00      1.00      1.00    399013

    accuracy                           1.00   1197038
   macro avg       1.00      1.00      1.00   1197038
weighted avg       1.00      1.00      1.00   1197038

dtc testing classifiication report:                precision    recall  f1-score   support

           0       0.83      0.83      0.83     99754
           1       0.85      0.82      0.84     99753
           2       0.90      0.93      0.91     99753

    accuracy                           0.86    299260
   macro avg       0.86      0.86      0.86    299260
weighted avg       0.86      0.86      0.86    299260



* Logistic regression: very poor performance in both train and test set
* Both decision tree classifier and random forest classifier seem to be overfit
* Random forest classifier has slighter better performance in the training data

## Random Forest Classifier Hyperparameter Tuning

In [16]:
# get parameters for rfc
rfc.get_params().keys()

dict_keys(['bootstrap', 'ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'])

### Random Search CV

In [17]:
# rfc random param grid
random_param_grid = {
    'n_estimators' : [5, 10, 20],
    'max_depth' : [150, 200, 300],
    'min_samples_leaf' : [30, 50, 70]
   
}

In [18]:
# instantiate randomized search
random_cv = RandomizedSearchCV(rfc, param_distributions=random_param_grid, random_state=14, cv=3)

In [19]:
# fit randomized search
random_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=14),
                   param_distributions={'max_depth': [150, 200, 300],
                                        'min_samples_leaf': [30, 50, 70],
                                        'n_estimators': [5, 10, 20]},
                   random_state=14)

In [20]:
# find best parameters
random_best = random_cv.best_params_
print(random_best)

{'n_estimators': 20, 'min_samples_leaf': 30, 'max_depth': 150}


In [21]:
# fit rfc model using rfc_best
random_best_model = RandomForestClassifier(**random_best, random_state=14)
random_best_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=150, min_samples_leaf=30, n_estimators=20,
                       random_state=14)

In [22]:
# make predictions
random_best_tr_pred = random_best_model.predict(X_train)
random_best_te_pred = random_best_model.predict(X_test)

In [23]:
# score model
random_best_tr_accuracy = accuracy_score(y_train, random_best_tr_pred)
random_best_te_accuracy = accuracy_score(y_test, random_best_te_pred)
print('random search best training accuracy: ', random_best_tr_accuracy)
print('random search best testing accuracy: ', random_best_te_accuracy)

random search best training accuracy:  0.5891525582312341
random search best testing accuracy:  0.5819187328744235


* The random forest classifier model is less overfit but still needs improvement
* Continue hyperparameter tuning using Grid Search CV

### Grid Search CV

In [24]:
# rfc grid param grid
param_grid = {
    'n_estimators' : [15, 20, 25, 30],
    'max_depth' : [120, 150, 180, 200],
    'min_samples_leaf' : [20, 25, 30, 40]  
}

In [25]:
# instantiate rfc grid search
grid_cv = GridSearchCV(rfc, param_grid=param_grid, cv=5)

In [26]:
# fit grid search
grid_cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=14),
             param_grid={'max_depth': [120, 150, 180, 200],
                         'min_samples_leaf': [20, 25, 30, 40],
                         'n_estimators': [15, 20, 25, 30]})

In [28]:
# score model
grid_mean = grid_cv.cv_results_['mean_test_score']
grid_std = grid_cv.cv_results_['std_test_score']
print('mean test scores: ', grid_mean)
print('std test scores: ', grid_std)

mean test scores:  [0.58490792 0.58886769 0.5906145  0.59181413 0.57718886 0.58130736
 0.58243515 0.5830383  0.57138453 0.57392915 0.57524991 0.57567847
 0.5599204  0.56273735 0.56482584 0.56559858 0.58490792 0.58886769
 0.5906145  0.59181413 0.57718886 0.58130736 0.58243515 0.5830383
 0.57138453 0.57392915 0.57524991 0.57567847 0.5599204  0.56273735
 0.56482584 0.56559858 0.58490792 0.58886769 0.5906145  0.59181413
 0.57718886 0.58130736 0.58243515 0.5830383  0.57138453 0.57392915
 0.57524991 0.57567847 0.5599204  0.56273735 0.56482584 0.56559858
 0.58490792 0.58886769 0.5906145  0.59181413 0.57718886 0.58130736
 0.58243515 0.5830383  0.57138453 0.57392915 0.57524991 0.57567847
 0.5599204  0.56273735 0.56482584 0.56559858]
std test scores:  [0.00088498 0.00180726 0.00135533 0.00126466 0.00204084 0.00189927
 0.0014415  0.00117227 0.00236232 0.00145494 0.00137938 0.00131573
 0.00268277 0.00327631 0.00311754 0.00279281 0.00088498 0.00180726
 0.00135533 0.00126466 0.00204084 0.00189927 0.

In [29]:
# find best parameters
grid_best = grid_cv.best_params_
print(grid_best)

{'max_depth': 120, 'min_samples_leaf': 20, 'n_estimators': 30}


In [30]:
# fit model with best paramters
grid_best_model = RandomForestClassifier(**grid_best)
grid_best_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=120, min_samples_leaf=20, n_estimators=30)

In [31]:
# make predictions
grid_best_tr_pred = grid_best_model.predict(X_train)
grid_best_te_pred = grid_best_model.predict(X_test)

In [32]:
# rfc model accuracy scores
grid_best_tr_accuracy = accuracy_score(y_train, grid_best_tr_pred)
grid_best_te_accuracy = accuracy_score(y_test, grid_best_te_pred)
print('rfc grid search best training accuracy: ', grid_best_tr_accuracy)
print('rfc grid search best testing accuracy: ', grid_best_te_accuracy)

rfc grid search best training accuracy:  0.6140949577206405
rfc grid search best testing accuracy:  0.6032580364900086


* RFC model is less overfit but accuracy is still poor
* Overfit DTC model has higher accuracy
* Consider tuning DTC model instead

## Decision Tree Classifier Hyperparameter Tuning

In [45]:
# get parameters for dtc
dtc.get_params().keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

### Random Search CV

In [37]:
# dtc random param grid
dtc_random_grid = {
    'max_depth' : [100, 200, 300, 400],
    'min_samples_leaf' : [10, 50, 200, 1000]
}

In [38]:
# instantiate randomized search
dtc_random_cv = RandomizedSearchCV(dtc, param_distributions=dtc_random_grid, random_state=14, cv=3)

In [39]:
# fit randomized search
dtc_random_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=14),
                   param_distributions={'max_depth': [100, 200, 300, 400],
                                        'min_samples_leaf': [10, 50, 200,
                                                             1000]},
                   random_state=14)

In [40]:
# find best parameters
dtc_random_best = dtc_random_cv.best_params_
print(dtc_random_best)

{'min_samples_leaf': 10, 'max_depth': 200}


In [42]:
# fit dtc model using dtc_random_best
dtc_random_best_model = DecisionTreeClassifier(**dtc_random_best, random_state=14)
dtc_random_best_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=200, min_samples_leaf=10, random_state=14)

In [43]:
# make predictions
dtc_random_tr_pred = dtc_random_best_model.predict(X_train)
dtc_random_te_pred = dtc_random_best_model.predict(X_test)

In [44]:
# score model
dtc_random_tr_accuracy = accuracy_score(y_train, dtc_random_tr_pred)
dtc_random_te_accuracy = accuracy_score(y_test, dtc_random_te_pred )
print('random search best training accuracy: ', dtc_random_tr_accuracy)
print('random search best testing accuracy: ', dtc_random_te_accuracy)

random search best training accuracy:  0.8125447980765858
random search best testing accuracy:  0.7209149234779122


* Dtc random search accuracy higher than rtc grid search accuracy
* Dtc model still overfit
* Continue hyperparameter tuning

### Grid Search CV

In [65]:
# dtc grid param grid
dtc_param_grid = {
    'max_depth' : [110, 115, 125],
    'min_samples_leaf' : [2, 4]
}

In [66]:
# instantiate dtc grid search
dtc_grid_cv = GridSearchCV(dtc, param_grid=dtc_param_grid, cv=3)

In [67]:
# fit dtc grid search
dtc_grid_cv.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=14),
             param_grid={'max_depth': [110, 115, 125],
                         'min_samples_leaf': [2, 4]})

In [68]:
# find best parameters
dtc_grid_best = dtc_grid_cv.best_params_
print(dtc_grid_best)

{'max_depth': 110, 'min_samples_leaf': 2}


In [69]:
# fit dtc model using dtc_grid_best
dtc_grid_best_model = DecisionTreeClassifier(**dtc_grid_best, random_state=14)
dtc_grid_best_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=110, min_samples_leaf=2, random_state=14)

In [70]:
# make predictions
dtc_grid_tr_pred = dtc_grid_best_model.predict(X_train)
dtc_grid_te_pred = dtc_grid_best_model.predict(X_test)

In [71]:
# score model
dtc_grid_tr_accuracy = accuracy_score(y_train, dtc_grid_tr_pred)
dtc_grid_te_accuracy = accuracy_score(y_test, dtc_grid_te_pred )
print('random search best training accuracy: ', dtc_grid_tr_accuracy)
print('random search best testing accuracy: ', dtc_grid_te_accuracy)

random search best training accuracy:  0.941605863807164
random search best testing accuracy:  0.7569772104524494


In [72]:
# dtc model classification reports
dtc_grid_tr_class = classification_report(y_train, dtc_grid_tr_pred)
dtc_grid_te_class = classification_report(y_test, dtc_grid_te_pred)
print('dtc training classification report: ', dtc_grid_tr_class)
print('dtc testing classifiication report: ', dtc_grid_te_class)

dtc training classification report:                precision    recall  f1-score   support

           0       0.89      0.98      0.93    399012
           1       0.96      0.93      0.94    399013
           2       0.99      0.92      0.95    399013

    accuracy                           0.94   1197038
   macro avg       0.94      0.94      0.94   1197038
weighted avg       0.94      0.94      0.94   1197038

dtc testing classifiication report:                precision    recall  f1-score   support

           0       0.67      0.74      0.70     99754
           1       0.79      0.75      0.77     99753
           2       0.83      0.78      0.80     99753

    accuracy                           0.76    299260
   macro avg       0.76      0.76      0.76    299260
weighted avg       0.76      0.76      0.76    299260

