# Capstone 2: Modeling

In [1]:
# imports
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
# read data
# drop unnamed column
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

In [3]:
# check training data
X_train.head()

Unnamed: 0,tree_dbh,latitude,longitude,curb_loc_OffsetFromCurb,curb_loc_OnCurb,spc_common_'Schubert' chokecherry,spc_common_American beech,spc_common_American elm,spc_common_American hophornbeam,spc_common_American hornbeam,...,brch_light_Yes,brch_shoe_No,brch_shoe_Yes,brch_other_No,brch_other_Yes,borough_Bronx,borough_Brooklyn,borough_Manhattan,borough_Queens,borough_Staten Island
0,0.218518,0.116105,0.79597,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
1,0.511784,-0.542404,-0.282504,1,0,0,0,0,0,0,...,0,1,0,1,0,0,1,0,0,0
2,-0.323348,0.135166,0.355613,0,1,0,0,0,0,0,...,0,1,0,0,1,0,0,0,1,0
3,-0.074749,1.571412,0.117033,0,1,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,0
4,-0.589734,0.076934,0.231816,0,1,0,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0


In [4]:
y_train.head()

Unnamed: 0,0
0,0
1,1
2,2
3,0
4,2


## Decision Tree Classifier

In [5]:
# create and fit decision tree classifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

In [6]:
# make predictions on train and test sets
dtc_tr_pred = dtc.predict(X_train)
dtc_te_pred = dtc.predict(X_test)

In [7]:
# dtc model accuracy scores
dtc_tr_accuracy = accuracy_score(y_train, dtc_tr_pred)
dtc_te_accuracy = accuracy_score(y_test, dtc_te_pred)
print('dtc training accuracy: ', dtc_tr_accuracy)
print('dtc testing accuracy: ', dtc_te_accuracy)

dtc training accuracy:  0.9999933168370595
dtc testing accuracy:  0.777243868208247


In [8]:
# dtc model classification reports
dtc_tr_class_rep = classification_report(y_train, dtc_tr_pred)
dtc_te_class_rep = classification_report(y_test, dtc_te_pred)
print('dtc training classification report: ', dtc_tr_class_rep)
print('dtc testing classifiication report: ', dtc_te_class_rep)

dtc training classification report:                precision    recall  f1-score   support

           0       1.00      1.00      1.00    399013
           1       1.00      1.00      1.00    399012
           2       1.00      1.00      1.00    399013

    accuracy                           1.00   1197038
   macro avg       1.00      1.00      1.00   1197038
weighted avg       1.00      1.00      1.00   1197038

dtc testing classifiication report:                precision    recall  f1-score   support

           0       0.72      0.72      0.72     99753
           1       0.80      0.78      0.79     99754
           2       0.81      0.84      0.82     99753

    accuracy                           0.78    299260
   macro avg       0.78      0.78      0.78    299260
weighted avg       0.78      0.78      0.78    299260



## Hyperparameter Tuning

In [9]:
# get parameters for dtc
dtc.get_params().keys()

dict_keys(['ccp_alpha', 'class_weight', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'random_state', 'splitter'])

In [10]:
# dtc param grid
dtc_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_features': [5, 20, 100, 150, 170],
    'max_depth': range(3,10)
}

In [11]:
# Grid search cv for dtc
dtc_grid_cv = GridSearchCV(dtc, param_grid=dtc_param_grid, cv=5)

In [12]:
# fit model
dtc_grid_cv.fit(X_train, y_train)


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(3, 10),
                         'max_features': [5, 20, 100, 150, 170]})

In [13]:
# score model
dtc_grid_mean = dtc_grid_cv.cv_results_['mean_test_score']
dtc_grid_std = dtc_grid_cv.cv_results_['std_test_score']

In [14]:
# find best parameters
dtc_best = dtc_grid_cv.best_params_

In [15]:
# fit model
dtc_best_model = DecisionTreeClassifier(**dtc_best)
dtc_best_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=9, max_features=150)

In [16]:
# make predictions
dtc_best_tr_pred = dtc_best_model.predict(X_train)
dtc_best_te_pred = dtc_best_model.predict(X_test)

In [17]:
# dtc model accuracy scores
dtc_best_tr_accuracy = accuracy_score(y_train, dtc_best_tr_pred)
dtc_best_te_accuracy = accuracy_score(y_test, dtc_best_te_pred)
print('dtc best training accuracy: ', dtc_best_tr_accuracy)
print('dtc best testing accuracy: ', dtc_best_te_accuracy)

dtc best training accuracy:  0.49312302533419994
dtc best testing accuracy:  0.49096772037692976


#### Notes:
* Model performance decreased significantly
* What parameter values should I test instead for the grid search CV? 
* I could not run the random forest model CV because of computer limitations (but new computer is on the way!) 
* I tried undersampling (instead of oversampling) in order to run logistic regression/random forest, but model performance for all models decreased significantly 
* Once my new computer arrives, I will re-run all models and resubmit the assignment