# Import Libraries and Data

In [1]:
import pandas as pd # dataframe/data cleaning/manipulation
import numpy as np # array computations
from matplotlib import pyplot as plt # plotting/graphing
import matplotlib.patches as mpatches
from sklearn.tree import plot_tree, export_text, DecisionTreeClassifier # Decision tree algorithm and plotting functions for the Decision tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier # Bagging, Random Forest, and Boosting algorithms
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbor Algorithm
from sklearn.linear_model import LogisticRegression # Logistic Regression Algorithm
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict # train test split and cross validation accuracy/prediction functions
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, roc_auc_score # Various model evaluation metrics
import warnings
warnings.filterwarnings('ignore')

Note: If you are using Google Colab, you must upload the `hr.csv` file from Canvas by doing the following:

* On the left-side bar, click the folder icon.
* Click the 'Upload to session storage' button.
* Upload the CSV file; it will appear below the 'sample_data' folder.

**Unfortunately, this process must be done every time the runtime is disconnected - just a quirk with Google Colab.**

If you are using Jupyter notebook, just make sure the CSV file is in the same folder location as this .ipynb file.

In [2]:
# Load the HR data set from the working directory; the first CSV column becomes the row index.
hr_df = pd.read_csv(
    'hr.csv',
    index_col=0,
)

# Binarize categorical variables and set X, y

In [3]:
# Dummy-encode the frame with pandas' default get_dummies behavior, then split it
# into the target column (Attrition) and the remaining feature columns.
hr_df_binarized = pd.get_dummies(hr_df)
y = hr_df_binarized['Attrition']
X = hr_df_binarized.drop(columns='Attrition')

# Create Function - Model Evaluation

In [4]:
def report_score(mymodel, X, y, mycv):
    """Print cross-validated evaluation metrics for a classifier.

    Parameters
    ----------
    mymodel : estimator handed unchanged to ``cross_val_score`` and
        ``cross_val_predict``.
    X : feature data passed to the cross-validation helpers.
    y : target labels. The confusion matrix is unpacked into four cells,
        so exactly two classes are expected.
    mycv : forwarded as the ``cv`` argument of every cross-validation call
        and interpolated into the printed "<mycv>-Fold" headings.

    Returns
    -------
    None. Every metric is written to standard output.
    """
    # Mean of the per-fold scores using cross_val_score's default scoring,
    # reported as the classification accuracy in percent.
    fold_accuracies = cross_val_score(mymodel, X, y, cv=mycv)
    accuracy_pct = round(fold_accuracies.mean() * 100, 2)
    print(f"{mycv}-Fold Cross-Validation Classification Accuracy: {accuracy_pct:.2f} %")

    # Same procedure, scored with ROC AUC instead.
    fold_aucs = cross_val_score(mymodel, X, y, cv=mycv, scoring='roc_auc')
    auc_pct = round(fold_aucs.mean() * 100, 2)
    print(f"\n{mycv}-Fold Cross-Validation ROC AUC: {auc_pct:.2f} %")

    # Cross-validated predictions drive the confusion matrix, precision and recall.
    cv_predictions = cross_val_predict(mymodel, X, y, cv=mycv)
    tn, fp, fn, tp = confusion_matrix(y, cv_predictions).ravel()
    print(f"\nTrue Negatives: {tn}, False Positives: {fp}, False Negatives: {fn}, True Positives: {tp}")

    # Precision and recall, expressed in percent and rounded to two decimals.
    precision_pct = round(precision_score(y, cv_predictions) * 100, 2)
    print("\nPrecision:", precision_pct)
    recall_pct = round(recall_score(y, cv_predictions) * 100, 2)
    print("\nRecall:", recall_pct)

# Instructions and Workspace

**For this Instapoll assignment, your task is to take the code snippets used in Ensemble_Models_Final relating to bagging, random forest, and/or boosting, and play with the base model or hyper-parameter settings to try and maximize your 10-Fold Cross Validation ROC AUC.**

- **For a bagging model, try to exceed a 10-Fold Cross-Validation ROC AUC of 78.72%.**

- **For a random forest model, try to exceed a 10-Fold Cross-Validation ROC AUC of 79.78%.**

- **For a boosting model, try to exceed a 10-Fold Cross-Validation ROC AUC of 81.04%.**

The report_score function has been provided for you.

Please try your best and if you have any questions, please reach out to the Professor or TA!

In [17]:
# Bagging ensemble: 50 entropy-criterion decision trees capped at depth 6, all seeded with 3.
base_model = DecisionTreeClassifier(
    criterion = 'entropy',
    max_depth = 6,
    random_state = 3,
)
bagging_model = BaggingClassifier(
    estimator = base_model,
    n_estimators = 50,
    random_state = 3,
)
# Fit on the full data so the fitted ensemble is available after this cell.
# NOTE(review): scikit-learn's cross-validation helpers refit clones of the estimator,
# so this fit should not influence the scores printed below.
bagging_model.fit(X, y)
report_score(bagging_model, X, y, 10)

10-Fold Cross-Validation Classification Accuracy: 85.92 %

10-Fold Cross-Validation ROC AUC: 79.21 %

True Negatives: 1212, False Positives: 21, False Negatives: 186, True Positives: 51

Precision: 70.83

Recall: 21.52


In [19]:
# Random forest: 100 entropy-criterion trees with max_features = 6.
# random_state is pinned to the same seed the other ensemble cells use; without it
# the forest is re-randomized on every run and the reported cross-validation scores
# are not reproducible. Re-run this cell to refresh the recorded output.
random_forest_model = RandomForestClassifier(
    criterion = 'entropy',
    n_estimators = 100,
    max_features = 6,
    random_state = 3,
).fit(X, y)
report_score(random_forest_model, X, y, 10)

10-Fold Cross-Validation Classification Accuracy: 85.85 %

10-Fold Cross-Validation ROC AUC: 80.92 %

True Negatives: 1225, False Positives: 8, False Negatives: 200, True Positives: 37

Precision: 82.22

Recall: 15.61


In [12]:
# Bagging over boosting: 20 bagged replicates of a 20-estimator AdaBoost model, all seeded with 3.
# `algorithm` is deliberately left at the library default. The explicit "SAMME.R" value is
# deprecated in newer scikit-learn releases and rejected once it is removed, while the
# releases that still support it already use it as the default.
# NOTE(review): on releases without SAMME.R the default differs, so scores will not match
# the recorded output — confirm against the installed scikit-learn version.
adaboost_model = AdaBoostClassifier(n_estimators=20, random_state = 3)
bagging_model_nested = BaggingClassifier(estimator = adaboost_model, n_estimators=20, random_state = 3)
report_score(bagging_model_nested, X, y, 10)

10-Fold Cross-Validation Classification Accuracy: 87.21 %

10-Fold Cross-Validation ROC AUC: 82.26 %

True Negatives: 1224, False Positives: 9, False Negatives: 179, True Positives: 58

Precision: 86.57

Recall: 24.47
