# Binary Classification

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score

from sklearn.tree import plot_tree
from sklearn.inspection import DecisionBoundaryDisplay

In [None]:
# One seeded RandomState instance, passed as random_state to the train/test split
# and to every DecisionTreeClassifier created below.
# NOTE(review): because the same instance is reused across calls, results
# presumably depend on the order in which the cells are run - confirm.
rng = np.random.RandomState(2)

## Read in dataset

In [None]:
import os

# Work out where the data files live: the current directory when running
# locally, or the mounted Google Drive when running on Colab.
if 'google.colab' not in str(get_ipython()):
    base_dir = "."
else:
    from google.colab import drive
    drive.mount('/content/drive')
    # You may need to change this, depending on where your notebooks are on Google Drive
    base_dir = "./drive/My Drive/Colab Notebooks/"
dataset_dir = os.path.join(base_dir, "datasets")

In [None]:
df = pd.read_csv(os.path.join(dataset_dir, "cs1109.csv"))

## Take a cheeky look

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe(include="all")

In [None]:
# Fraction of all rows in the dataset whose outcome is "Pass"

print(df["outcome"].eq("Pass").sum() / len(df))

In [None]:
# Names of the predictor columns used by the model
features = ["lect", "lab"]

X = df[features]

# Encode the outcome labels as integer codes; the fitted encoder is kept so that
# the codes can be mapped back to the original labels with inverse_transform.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["outcome"])

In [None]:
label_encoder.inverse_transform([0, 1])

## Split into training set and test set - stratified holdout

In [None]:
# Hold out 20% of the rows as the test set, stratifying on the outcome column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=df["outcome"],
    random_state=rng,
)

In [None]:
# Pass rate in the training set, followed by the pass rate in the test set

tuple(labels.sum() / labels.shape[0] for labels in (y_train, y_test))

## Exploratory Data Analysis (EDA) - it's safe now - on the training set

In [None]:
# Histograms: one panel per feature, side by side

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, column in zip(axes, ["lect", "lab"]):
    sns.histplot(data=X_train, x=column, binwidth=10, ax=ax)
plt.show()

In [None]:
# Histograms within class: bars are stacked and coloured by the training labels

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, column in zip(axes, ["lect", "lab"]):
    sns.histplot(data=X_train, x=column, hue=y_train, binwidth=10, multiple="stack", ax=ax)
plt.show()

In [None]:
# Scatter plot

plot = sns.scatterplot(data=X_train, x="lect", y="lab", hue=y_train, style=y_train)

In [None]:
# Strip plots - with jitter! One panel per feature, grouped by training label

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, column in zip(axes, ["lect", "lab"]):
    sns.stripplot(data=X_train, x=y_train, y=column, ax=ax)
plt.show()

In [None]:
# Box-and-whisker plots: one panel per feature, grouped by training label

fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for ax, column in zip(axes, ["lect", "lab"]):
    sns.boxplot(data=X_train, x=y_train, y=column, ax=ax)
plt.show()

## Train: fit a Decision Tree to the training set

In [None]:
decision_tree = DecisionTreeClassifier(max_depth=3, random_state=rng)

In [None]:
decision_tree.fit(X_train, y_train)

## Visualise the model (tree) it has learned

In [None]:
# Draw the fitted tree. The class names are taken from the fitted label encoder
# (in code order) rather than hard-coded, so they cannot get out of step with
# the integer codes used for y.
fig = plt.figure(figsize=(20, 8))
plot_tree(decision_tree, feature_names=features, class_names=list(label_encoder.classes_), fontsize=12)
plt.show()

## Inference - let's make some predictions

In [None]:
students = pd.DataFrame([[60, 25], [80, 90], [95, 70]], columns=["lect", "lab"])

In [None]:
label_encoder.inverse_transform(decision_tree.predict(students))

In [None]:
decision_tree.predict_proba(students)

## Visualise what it has learned

In [None]:
# Shade the plane by the tree's predicted class, then overlay the training points.
fig, ax = plt.subplots()
DecisionBoundaryDisplay.from_estimator(
    decision_tree,
    X_train,
    response_method="predict",
    cmap=plt.cm.RdBu,
    xlabel="lect",
    ylabel="lab",
    ax=ax,
)
sns.scatterplot(x="lect", y="lab", hue=y_train, style=y_train, data=X_train, ax=ax)
plt.show()

## Interpretability and Explainability

In [None]:
# Based on code from https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html
# Assumes X is a Pandas DataFrame whose columns are the features the tree was trained on, in the same order.

def explain(decision_tree, X):
    """Describe the tests each row of X passes through on its way to a leaf.

    Parameters
    ----------
    decision_tree : fitted tree estimator
        Must expose ``tree_.feature``, ``tree_.threshold``, ``decision_path``
        and ``apply``.
    X : pandas.DataFrame
        Rows to explain. Feature names are read from ``X.columns`` and values
        are read by position, so any row index (not just 0..n-1) is fine.

    Returns
    -------
    list of list of str
        One inner list per row of X, holding one string per internal node on
        the row's path, formatted as ``"(feature = value) sign threshold"``.
    """
    split_features = decision_tree.tree_.feature
    thresholds = decision_tree.tree_.threshold
    node_indicator = decision_tree.decision_path(X)
    leaf_ids = decision_tree.apply(X)
    explanations = []
    for i in range(X.shape[0]):
        # The slice of `indices` belonging to row i lists the nodes this row visits.
        start, stop = node_indicator.indptr[i], node_indicator.indptr[i + 1]
        explanation = []
        for node_id in node_indicator.indices[start:stop]:
            if node_id == leaf_ids[i]:
                # The leaf applies no test, so there is nothing to report for it.
                continue
            column = split_features[node_id]
            # Look the feature up in X itself (not a global frame) and read the
            # value by position so that a non-default row index cannot break it.
            name = X.columns[column]
            value = X.iloc[i, column]
            threshold = thresholds[node_id]
            sign = "<=" if value <= threshold else ">"
            explanation.append(f"({name} = {value}) {sign} {threshold}")
        explanations.append(explanation)
    return explanations

In [None]:
explain(decision_tree, students)

## Error estimation - evaluate the model - accuracy

In [None]:
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, decision_tree.predict(X_test))

## What happens if we increase the maximum depth?

In [None]:
decision_tree = DecisionTreeClassifier(max_depth=4, random_state=rng)

In [None]:
decision_tree.fit(X_train, y_train)

In [None]:
# Draw the deeper tree. The class names are taken from the fitted label encoder
# (in code order) rather than hard-coded, so they cannot get out of step with
# the integer codes used for y.
fig = plt.figure(figsize=(20, 8))
plot_tree(decision_tree, feature_names=features, class_names=list(label_encoder.classes_), fontsize=10, label="none")
plt.show()

In [None]:
# Shade the plane by the tree's predicted class, then overlay the training points.
fig, ax = plt.subplots()
DecisionBoundaryDisplay.from_estimator(
    decision_tree,
    X_train,
    response_method="predict",
    cmap=plt.cm.RdBu,
    xlabel="lect",
    ylabel="lab",
    ax=ax,
)
sns.scatterplot(x="lect", y="lab", hue=y_train, style=y_train, data=X_train, ax=ax)
plt.show()

In [None]:
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, decision_tree.predict(X_test))

## What if we have no maximum depth?

In [None]:
decision_tree = DecisionTreeClassifier(random_state=rng)

In [None]:
decision_tree.fit(X_train, y_train)

In [None]:
# Draw the unrestricted tree. The class names are taken from the fitted label
# encoder (in code order) rather than hard-coded, so they cannot get out of step
# with the integer codes used for y.
fig = plt.figure(figsize=(20, 8))
plot_tree(decision_tree, feature_names=features, class_names=list(label_encoder.classes_), fontsize=1, label="none")
plt.show()

In [None]:
# Shade the plane by the tree's predicted class, then overlay the training points.
fig, ax = plt.subplots()
DecisionBoundaryDisplay.from_estimator(
    decision_tree,
    X_train,
    response_method="predict",
    cmap=plt.cm.RdBu,
    xlabel="lect",
    ylabel="lab",
    ax=ax,
)
sns.scatterplot(x="lect", y="lab", hue=y_train, style=y_train, data=X_train, ax=ax)
plt.show()

In [None]:
# accuracy_score takes (y_true, y_pred): ground truth first, predictions second.
accuracy_score(y_test, decision_tree.predict(X_test))

It turns out that a maximum depth of about 6 or 7 is the best.