# The Area Under the Curve of the Receiver Operating Characteristic

In [84]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import numpy as np
from matplotlib import pyplot as plt

First, let's generate some data:

In [30]:
# Synthetic classification data (10,000 samples), then a train/test split.
# Both steps are seeded so that re-running the notebook gives the same data.
X, y = make_classification(random_state=42, n_samples=10000)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [1]:
# Fit a logistic-regression classifier (liblinear solver) on the training split.
logreg = LogisticRegression(solver="liblinear")
logreg.fit(X=X_train, y=y_train)

In [2]:
# First test row only: 1.0 where a class probability is at least 0.9, else 0.0.
np.greater_equal(logreg.predict_proba(X_test), 0.9)[:1].astype(float)

In [3]:
# Column index of the first class whose probability is >= 0.5 for the first
# test row.
(logreg.predict_proba(X_test)[0] >= 0.5).astype(float).tolist().index(1)

In [4]:
# Hard class predictions of the fitted model for the first five test rows.
logreg.predict(X_test)[:5]

In [36]:
# Confusion matrix of the model's hard predictions on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=logreg.predict(X_test))

In [40]:
# Pull the four cells out of the confusion matrix (row = true label,
# column = predicted label).
tp = cm[1, 1]
tn = cm[0, 0]
fp = cm[0, 1]
fn = cm[1, 0]

# True-positive rate: share of actual positives that were predicted positive.
tpr = tp / (fn + tp)
# False-positive rate: share of actual negatives that were predicted positive.
fpr = fp / (tn + fp)

In [82]:
def classify_rates(X_train, X_test, y_train, y_test, thresh):
    """Fit a logistic regression and score it at a single probability cutoff.

    A fresh ``LogisticRegression(solver='liblinear')`` is fitted on the
    training data on every call. A test row is labelled 1 when the predicted
    probability in column 0 of ``predict_proba`` is less than or equal to
    ``thresh``, and 0 otherwise. ``thresh`` is therefore a cutoff on the
    class-0 probability: ``thresh=0`` predicts (almost) nothing as positive,
    ``thresh=1`` predicts everything as positive.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the model.
    X_test, y_test
        Held-out features and true labels used for scoring. Labels are
        assumed to be 0 and 1 (the predictions are hard-coded to 0/1).
    thresh : float
        Cutoff compared against the column-0 probability.

    Returns
    -------
    tuple
        ``(tpr, fpr)``: the true-positive rate and false-positive rate on the
        test data at this cutoff.
    """
    logreg = LogisticRegression(solver='liblinear')
    logreg.fit(X_train, y_train)
    y_hat_probs = logreg.predict_proba(X_test)
    # Vectorised form of "1 if P(column 0) <= thresh else 0" for every row.
    y_hat = (y_hat_probs[:, 0] <= thresh).astype(int)
    # Pin the label order so the matrix is always 2x2. Without `labels`, the
    # matrix collapses to 1x1 when only one class occurs in y_test and y_hat
    # (e.g. an extreme cutoff on a one-class test split), and indexing fails.
    cm = confusion_matrix(y_test, y_hat, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    tpr = tp / (tp + fn)
    fpr = fp / (fp + tn)
    return (tpr, fpr)

In [5]:
# Print the (TPR, FPR) pair at eleven evenly spaced cutoffs from 0 to 1.
for cutoff in np.linspace(0, 1, num=11):
    rates = classify_rates(X_train, X_test, y_train, y_test, cutoff)
    print(rates)

In [6]:
# Trace a hand-built ROC curve over 101 cutoffs and find the point farthest
# from the chance diagonal.
tprs = []
fprs = []
diffs = []
cutoffs = np.linspace(0, 1, 101)
for x in cutoffs:
    # One call per cutoff: classify_rates refits the model on every call, so
    # fetching TPR and FPR separately would double the work.
    tpr_x, fpr_x = classify_rates(X_train, X_test, y_train, y_test, x)
    fprs.append(fpr_x)
    tprs.append(tpr_x)
    # (xy2, xy2) is the projection of (fpr, tpr) onto the diagonal y = x;
    # the distance to it measures how far the point is from chance.
    xy2 = (fprs[-1] +  tprs[-1]) / 2
    diffs.append(np.sqrt((xy2 - fprs[-1])**2 + (xy2 - tprs[-1])**2))

# Index of the point farthest from the diagonal (first one on ties).
max_dist = diffs.index(np.max(diffs))
# Report the cutoff that actually produced that point. The previous
# `(max_dist - 1) / 100` was off by one step (index i maps to i / 100).
# NOTE: classify_rates compares this cutoff against the column-0 probability.
print(f"""With a threshold of {round(float(cutoffs[max_dist]), 2)}: \n"""
      f"""\tYou\'ll have a True Positive Rate of {round(tprs[max_dist], 3)} \n"""
      f"""\tand a False Positive Rate of {round(fprs[max_dist], 3)}""")

plt.figure(figsize = (8, 6))
# Red dots: every cutoff except the best one; black marker: the best one.
plt.plot(fprs[:max_dist], tprs[:max_dist], 'r.')
plt.plot(fprs[max_dist], tprs[max_dist], 'ko', ms=10)
plt.plot(fprs[max_dist + 1:], tprs[max_dist + 1:], 'r.')
# Chance diagonal (y = x) for reference.
plt.plot(fprs, fprs);

In [7]:
# AUC must be computed from continuous scores, not hard 0/1 predictions:
# with predicted labels the "curve" has a single operating point and the
# result is not the model's real AUC. Column 1 of predict_proba is the
# probability of the class with the greater label (the positive class).
round(roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1]), 4)

In [146]:
# Feed roc_curve the positive-class probabilities rather than hard
# predictions; with 0/1 labels it can only return a single interior point
# instead of the full curve.
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:, 1])

In [8]:
# Display the false-positive rates returned by roc_curve.
fpr

In [9]:
# Plot the ROC curve: false-positive rate on x, true-positive rate on y.
plt.plot(fpr, tpr)