# Data 102 Fall 2022 Lecture 4: Binary Classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

sns.set()  # This helps make our plots look nicer

In [None]:
def make_2x2_table(reality, decisions):
    """Return the 2x2 table of true labels versus binary decisions.

    Rows are indexed by reality ("R=0", "R=1") and columns by decision
    ("D=0", "D=1"), so entry (R=i, D=j) counts the cases whose true label
    is i and whose decision is j.

    Parameters
    ----------
    reality : array-like of 0/1
        True labels.
    decisions : array-like of 0/1
        Binary decisions, same length as `reality`.

    Returns
    -------
    pandas.DataFrame
        A 2x2 table of counts.
    """
    return pd.DataFrame(
        # Pin the label set so the matrix is always 2x2, even when only one
        # class shows up in the inputs (e.g. a threshold that flags nothing
        # on a single-class sample). Without it the matrix can be 1x1 and
        # the fixed index/columns below would raise.
        confusion_matrix(reality, decisions, labels = [0, 1]),
        columns = ["D=0", "D=1"],
        index = ["R=0", "R=1"]
    )

## Decisions by thresholding: binary classification

In this section, we'll work with a dataset predicting breast cancer from biopsy data.

In [None]:
# Load the biopsy features into a DataFrame and the binary labels into y.
# NOTE(review): scikit-learn documents this dataset's target as 0 = malignant,
# 1 = benign, while the surrounding text treats R=1 as "cancer" -- confirm the
# intended encoding before interpreting the tables below.
dataset = load_breast_cancer()
X = pd.DataFrame(dataset['data'], columns = dataset['feature_names'])
# y is the same array object as dataset['target'], so the flip below also
# changes the labels stored in `dataset`.
y = dataset['target']

# Flip each label independently with probability 0.2 (roughly 20% of the
# outputs) to make the problem a little harder. The seed fixes which labels
# are flipped, so the results are reproducible.
np.random.seed(42)
mask = np.random.random(y.shape) < 0.2
y[mask] = 1 - y[mask]

# Hold out 33% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.33, random_state = 42
)

Let's use logistic regression to predict y from X:

In [None]:
# Fit logistic regression on the training split (fit returns the estimator
# itself), then make hard 0/1 predictions for the held-out rows.
model = LogisticRegression(solver="liblinear").fit(X_train, y_train)
y_hat_test = model.predict(X_test)

Now, we can use our 2x2 table to evaluate the performance:

In [None]:
# Rows are the true test labels (R); columns are the model's decisions (D).
make_2x2_table(reality=y_test, decisions=y_hat_test)

*Check: what is the false positive rate for these predictions? What about the false discovery rate?*

In [None]:
# Counts from the R=0 row of the 2x2 table above:
# false positives (R=0, D=1) and true negatives (R=0, D=0).
n_false_pos = 32
n_true_neg = 41

# False positive rate: the fraction of R=0 cases that we flagged as D=1.
fpr = n_false_pos / (n_false_pos + n_true_neg)
fpr

In [None]:
# Counts from the D=1 column of the 2x2 table above:
# false positives (R=0, D=1) and true positives (R=1, D=1).
n_false_pos = 32
n_true_pos = 99

# False discovery rate: the fraction of D=1 decisions that were wrong.
fdr = n_false_pos / (n_false_pos + n_true_pos)
fdr

In [None]:
# Counts from the R=1 row of the 2x2 table above:
# true positives (R=1, D=1) and false negatives (R=1, D=0).
n_true_pos = 99
n_false_neg = 16

# True positive rate: the fraction of R=1 cases that we flagged as D=1.
tpr = n_true_pos / (n_true_pos + n_false_neg)
tpr

For patients with cancer (R=1), we're correct $86\%$ of the time. What if this isn't enough? Suppose we need a higher true positive rate: what can we do?

Recall that a logistic regression model's predictions are probabilities between 0 and 1: we always threshold these to obtain binary decisions.

So, let's look at the probabilities directly:

In [None]:
# predict_proba returns one column per class; keep the class-1 column so each
# test row gets a single probability.
class_probs = model.predict_proba(X_test)
predicted_probs = class_probs[:, 1]
np.round(predicted_probs, 2)

The predictions we used earlier were based on thresholding these probabilities at $0.5$. What if we try a different threshold?

If we want to do better than $86\%$ on patients with cancer, should the threshold be higher or lower than $0.5$? Experiment with different thresholds in this cell. What do you find?

In [None]:
# Declare D=1 whenever the class-1 probability is strictly above the threshold.
threshold = 0.5
y_preds = np.where(predicted_probs > threshold, 1, 0)
make_2x2_table(y_test, y_preds)

In [None]:
# A lower threshold: declare D=1 whenever the class-1 probability exceeds 0.2.
threshold = 0.2
y_preds = np.where(predicted_probs > threshold, 1, 0)
make_2x2_table(y_test, y_preds)

In [None]:
# A higher threshold: declare D=1 only when the class-1 probability exceeds 0.8.
threshold = 0.8
y_preds = np.where(predicted_probs > threshold, 1, 0)
make_2x2_table(y_test, y_preds)

Let's visualize these predictions and our threshold:


In [None]:
threshold = 0.5

# One horizontal strip of points per true label (0 first, then 1), positioned
# along the x-axis by predicted class-1 probability. The vertical line marks
# the decision threshold.
fig, ax = plt.subplots(figsize=(10, 5))
sns.stripplot(
    x=predicted_probs,
    y=y_test,
    alpha=0.8,
    order=[0, 1],
    orient="h",
    ax=ax,
)
ax.axvline(threshold, c="k", label="threshold")
ax.set_xlabel("Predicted class 1 probability")
ax.set_ylabel("True label")
ax.set_title("True class vs predicted class probability")
ax.legend()
plt.show()

The blue points on the top correspond to healthy patients (R=0), and the orange points on the bottom correspond to cancer patients (R=1). For any point that falls to the left of the black line, we declare D=0; for any point to the right of it, we declare D=1.

Using this plot, where do we need to set the threshold to guarantee all the cancer patients (orange points) are classified correctly?

### ROC curves

Instead of trying one threshold at a time and seeing what we get, it would be nice if we could visualize the results from multiple thresholds all at once. This is what an ROC curve is for.

In [None]:
def get_tpr_fpr(reality, decision_probs, threshold):
    """Compute the true and false positive rates at a single threshold.

    A case is flagged (D=1) when its predicted probability is greater than
    or equal to `threshold`.

    Parameters
    ----------
    reality : array-like of 0/1
        True labels.
    decision_probs : array-like of float
        Predicted class-1 probabilities, same length as `reality`.
    threshold : float
        Cutoff applied to `decision_probs`.

    Returns
    -------
    (tpr, fpr) : tuple of float
        tpr is the fraction of R=1 cases flagged as D=1; fpr is the fraction
        of R=0 cases flagged as D=1. A rate is NaN when its denominator is
        zero (no cases of that class in `reality`).
    """
    # Accept lists / pandas objects as well as arrays, and compare by position.
    reality = np.asarray(reality)
    decisions = np.asarray(decision_probs) >= threshold

    # Denominators: the sizes of the bottom (R=1) and top (R=0) rows of the
    # 2x2 table.
    is_r1 = reality == 1
    is_r0 = reality == 0
    r1_count = is_r1.sum()
    r0_count = is_r0.sum()

    # Numerators: the D=1 column, split by reality. Counting with masks keeps
    # this well-defined even if only one class is present, where indexing a
    # confusion matrix at [1, 1] would fail because the matrix is 1x1.
    tp_count = (decisions & is_r1).sum()
    fp_count = (decisions & is_r0).sum()

    tpr = tp_count / r1_count if r1_count else np.nan
    fpr = fp_count / r0_count if r0_count else np.nan

    return tpr, fpr

In [None]:
# Sweep the threshold over 0.0, 0.1, ..., 1.0 and record the (TPR, FPR) pair
# obtained on the test set at each value, one table row per threshold.
thresholds = np.arange(0, 1.01, 0.1)
rows = []
for threshold in thresholds:
    TPR, FPR = get_tpr_fpr(y_test, predicted_probs, threshold)
    rows.append({"threshold": threshold, "TPR": TPR, "FPR": FPR})
results = pd.DataFrame(rows, columns=["threshold", "TPR", "FPR"])
results

In [None]:
# Each point is one threshold from `results`: x = FPR, y = TPR.
plt.figure(figsize=(5, 5))
plt.scatter(results["FPR"], results["TPR"])
plt.xlabel("FPR")
plt.ylabel("TPR")
# Pad the axes slightly so points at exactly 0 or 1 are not clipped.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
# Title uses the "y vs x" order, matching the axes and the other plot titles.
plt.title("TPR vs FPR")
# Render explicitly so the cell does not echo the Text object from plt.title.
plt.show()

In [None]:
# Same points as the scatter plot, joined in threshold order to trace a curve.
plt.figure(figsize=(5, 5))
plt.plot(results["FPR"], results["TPR"])
plt.xlabel("FPR")
plt.ylabel("TPR")
# Pad the axes slightly so the curve's endpoints are not clipped.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
# Title uses the "y vs x" order, matching the axes and the other plot titles.
plt.title("TPR vs FPR")
# Render explicitly so the cell does not echo the Text object from plt.title.
plt.show()

#### ROC curves in scikit-learn

`scikit-learn` already does (almost) all this work for us, using the `roc_curve` function.

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# roc_curve returns the false positive rates, the true positive rates, and the
# thresholds that produced them; the thresholds are discarded here.
# NOTE: this rebinds `fpr` and `tpr`, which held scalar rates in earlier cells.
fpr, tpr, _ = roc_curve(y_test, predicted_probs)

In [None]:
# ROC curve from scikit-learn's roc_curve output: x = FPR, y = TPR.
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(fpr, tpr)
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
# Pad both axes slightly so the curve's endpoints are not clipped.
ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_title("ROC curve (TPR vs FPR)")
plt.show()

### Precision-recall curves

The ROC curve is useful if we want to compare the tradeoff between doing well when reality = 0 and doing well when reality = 1 (in other words, between performance in the top row and the bottom row of our table).

We can also look at the tradeoff between FDP (column-wise performance for the right column) and TPR (row-wise performance for the bottom row). The standard way that people do this is by plotting a precision-recall curve. The precision is defined to be 1 - FDP (in other words, when we make a discovery, how often is that discovery correct?).

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
# precision_recall_curve returns the precision and recall values obtained as
# the decision threshold varies, plus the thresholds (discarded here).
precision, recall, _ = precision_recall_curve(y_test, predicted_probs)

In [None]:
# Precision-recall curve: x = recall (TPR), y = precision (1 - FDP).
fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(recall, precision)
ax.set_xlabel("Recall (TPR)")
ax.set_ylabel("Precision (1-FDP)")
# Pad both axes slightly so the curve's endpoints are not clipped.
ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_title("Precision (1-FDP) vs Recall (TPR)")
plt.show()