# Receiver Operating Characteristic (ROC)

In this notebook we work through three more examples of the ROC curve and the area under the curve (AUC).

In [1]:
from sklearn.metrics import roc_curve, roc_auc_score
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'xgboost'

## First example

Completely made-up data.

In [None]:
# Made-up ground-truth labels (1 = positive class) and the scores a model
# might have assigned to each of the 20 observations.
y_true = np.array([
    0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
    1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
])
y_pred_proba = np.array([
    0.47, 0.28, 0.44, 0.83, 0.42, 0.87, 0.36, 0.81, 0.47, 0.52,
    0.91, 0.32, 0.22, 0.68, 0.61, 0.08, 0.16, 0.18, 0.63, 0.37,
])

In [None]:
# Show each label next to its score, ordered from the lowest to the highest score.
(
    pd.DataFrame({'y_true': y_true, 'y_pred_prob': y_pred_proba})
    .sort_values(by='y_pred_prob')
)

In [None]:
# Split the scores by true class so the two score distributions can be compared.
is_positive = (y_true == 1)
is_negative = (y_true == 0)
y_pred_proba_pos = y_pred_proba[is_positive]
y_pred_proba_neg = y_pred_proba[is_negative]

# Draw both kernel density estimates on one shared axes.
fig, ax = plt.subplots()
sns.kdeplot(y_pred_proba_pos, label='Positive Class', bw_adjust=0.25, ax=ax)
sns.kdeplot(y_pred_proba_neg, label='Negative Class', bw_adjust=0.25, ax=ax)

ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Density')
ax.set_title('Kernel Density Estimation of Predicted Probabilities')
ax.legend()
ax.grid()
plt.show()

In [None]:
# ROC operating points: one (fpr, tpr) pair per candidate threshold.
fpr, tpr, thresholds = roc_curve(y_true=y_true, y_score=y_pred_proba)

# Area under that curve, summarised as a single number.
auc = roc_auc_score(y_true=y_true, y_score=y_pred_proba)

In [None]:
# Report the AUC and the first few operating points, then draw the curve.
print('AuC = ', auc)
RoC = pd.DataFrame({'thresholds': thresholds, 'fpr': fpr, 'tpr': tpr})
print(RoC.head(6))

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % auc)
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal where TPR equals FPR
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
ax.grid()
plt.show()

## Second example

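Use Gaussian distributions to create the data: the scores of each class are drawn from their own normal distribution and clipped to the interval [0, 1].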

In [None]:
# Simulate classifier scores for two classes of N observations each.
N = 1000   # observations per class
SEED = 42  # fixed seed so the draws and the shuffle are reproducible on re-run
rng = np.random.default_rng(SEED)

# Negative class: scores centred on 0.25, clipped into the valid range [0, 1].
mu1 = 0.25
sd1 = 0.25
data = rng.normal(loc=mu1, scale=sd1, size=N).clip(0, 1)
df1 = pd.DataFrame({'y_true': 0, 'y_pred_proba': data})

# Positive class: scores centred on 0.75, clipped into the valid range [0, 1].
mu2 = 0.75
sd2 = 0.25
data = rng.normal(loc=mu2, scale=sd2, size=N).clip(0, 1)
df2 = pd.DataFrame({'y_true': 1, 'y_pred_proba': data})

# Stack both classes and shuffle the rows (sampling all 2*N rows without replacement).
df = (
    pd.concat([df1, df2], ignore_index=True)
    .sample(n=2 * N, replace=False, random_state=SEED)
)

In [None]:
# Inspect the shuffled frame of true labels and simulated scores.
df

In [None]:
# ROC operating points for the simulated scores.
fpr, tpr, thresholds = roc_curve(y_true=df['y_true'], y_score=df['y_pred_proba'])

# Area under that curve.
auc = roc_auc_score(y_true=df['y_true'], y_score=df['y_pred_proba'])

In [None]:
# Report the AUC and the first few operating points, then draw the curve.
print('AuC = ', auc)
RoC = pd.DataFrame({'thresholds': thresholds, 'fpr': fpr, 'tpr': tpr})
print(RoC.head(6))

# The legend shows the AUC with three decimals, matching the other ROC plots
# in this notebook.
plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % auc)
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal where TPR equals FPR
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()

## Third example

Use data from the Titanic dataset and a k-nearest-neighbours classifier that predicts survival.

In [None]:
# Load seaborn's Titanic dataset, print its first five rows and summarise
# the numeric columns.
titanic = sns.load_dataset("titanic")
print(titanic.head(5))
titanic.describe()

In [None]:
# Remove columns that are not used as features. Reassigning (rather than dropping
# in place) and ignoring already-missing columns keeps this cell safe to re-run.
titanic = titanic.drop(columns=['embarked', 'embark_town', 'alive'], errors='ignore')

In [None]:
# How often does each deck occur, counting missing values as their own row?
# (titanic.isna().sum() would give the missing count for every column instead.)
titanic["deck"].value_counts(dropna=False)

In [None]:
# Treat a missing deck as its own category 'X'. The guard keeps the cell
# re-runnable: add_categories raises if 'X' is already a category.
if 'X' not in titanic['deck'].cat.categories:
    titanic['deck'] = titanic['deck'].cat.add_categories(['X'])

# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form acts on an intermediate object and does not update
# the frame under pandas Copy-on-Write.
titanic['deck'] = titanic['deck'].fillna('X')

# Replace missing ages with the median age.
# NOTE(review): the median is computed on the full dataset, before the
# train/test split is made.
median_age = titanic['age'].median()
titanic['age'] = titanic['age'].fillna(median_age)

In [None]:
# Separate the target from the features.
y = titanic['survived']
X = titanic.drop(columns='survived')

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Numeric feature columns: every integer/float column except the target.
num_cols = titanic.select_dtypes(['integer', 'float']).drop(columns = 'survived').columns

# Categorical feature columns: string (object) columns plus pandas `category`
# columns. Selecting only 'object' would silently leave out `deck`, which is
# handled with the .cat accessor earlier in the notebook.
cat_cols = titanic.select_dtypes(['object', 'category']).columns

print("Numeric columns are {}.".format(", ".join(num_cols)))
print("Categorical columns are {}.".format(", ".join(cat_cols)))

In [None]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# rows only, so nothing about the test rows influences the preprocessing.
# handle_unknown="ignore" encodes a category that appears only in the test rows
# as all zeros instead of raising.
onehoter = OneHotEncoder(sparse_output = False, drop = "first", handle_unknown = "ignore")
onehoter.fit(X_train[cat_cols])
onehot_cols = onehoter.get_feature_names_out(cat_cols)

# Both frames get a fresh 0..n-1 index (the original row labels are not kept).
titanic_train_onehot = pd.DataFrame(onehoter.transform(X_train[cat_cols]), columns = onehot_cols)
titanic_test_onehot = pd.DataFrame(onehoter.transform(X_test[cat_cols]), columns = onehot_cols)

In [None]:
# Standardise the numeric columns. The scaler is fitted on the training rows
# only; fitting on the full dataset would leak the test rows' mean and variance
# into the preprocessing.
znormalizer = StandardScaler()
znormalizer.fit(X_train[num_cols])

# Both frames get a fresh 0..n-1 index (the original row labels are not kept).
titanic_train_norm = pd.DataFrame(znormalizer.transform(X_train[num_cols]), columns = num_cols)
titanic_test_norm = pd.DataFrame(znormalizer.transform(X_test[num_cols]), columns = num_cols)

In [None]:
# Join the one-hot-encoded and the standardised numeric columns side by side.
# pd.concat builds new frames, so titanic_*_onehot are not modified (plain
# assignment would only alias them and the numeric columns would be added to
# the one-hot frames as well). Column order: one-hot columns, then numeric columns.
titanic_train_featurized = pd.concat([titanic_train_onehot, titanic_train_norm], axis=1)
titanic_test_featurized = pd.concat([titanic_test_onehot, titanic_test_norm], axis=1)

print("Featurized training data has {} rows and {} columns.".format(*titanic_train_featurized.shape))
print("Featurized test data has {} rows and {} columns.".format(*titanic_test_featurized.shape))

In [None]:
# Fit a 7-nearest-neighbours classifier on the featurized training data
# (fit returns the estimator itself, so construction and fitting can be chained).
knnb = KNeighborsClassifier(n_neighbors=7).fit(titanic_train_featurized, y_train)

# Hard class predictions for both splits.
y_hat_train = knnb.predict(titanic_train_featurized)
y_hat_test = knnb.predict(titanic_test_featurized)

In [None]:
# Score the test rows once and reuse the result for both metrics.
# Column 1 of predict_proba is the second entry of knnb.classes_
# (presumably class 1, "survived", for a 0/1 target).
y_test_proba = knnb.predict_proba(titanic_test_featurized)[:, 1]

# Calculate the ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_test_proba)

In [None]:
# Report the AUC and the first few operating points, then draw the curve.
print('AuC = ', auc)
RoC = pd.DataFrame({'thresholds': thresholds, 'fpr': fpr, 'tpr': tpr})
print(RoC.head(6))

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % auc)
ax.plot([0, 1], [0, 1], 'r--')  # dashed diagonal where TPR equals FPR
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
ax.grid()
plt.show()