In [6]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# import sklearn.plt as skplt

In [7]:
class ROC:
    """
    ROC curve builder class.
    Classes are assumed to be binary

    """
    # results is an numpy array formed by stacking together fpr, tpr and corresponding thresholds.
    # use results for analysis
    results = None

    def __init__(self, proba, true_labels, pos_label_value, pos_label=1):
        """
        Use these values in calc_tpr_fpr() method

        :param proba: numpy array of class probabilities
        :param true_labels: numpy array of true labels
        :param pos_label_value: The value of the positive label (usually 1)
        :param pos_label: The relative order of positive label in proba
        """
        self.proba = proba
        self.true_labels = true_labels
        self.pos_label_value = pos_label_value
        self.pos_label = pos_label

    def plot(self):
        """
        Plots an ROC curve using True Positive Rate and False Positive rate lists calculated from __calc_tpr_fpr
        Calculates and outputs AUC score on the same graph
        """
        tpr, fpr, thresholds = self.__calc_tpr_fpr()
        self.results = np.column_stack((tpr, fpr, thresholds))

        # %%% TODO START YOUR CODE HERE %%%

        # %%% END YOUR CODE HERE %%%

    def __calc_tpr_fpr(self):
        """
        Calculates True Positive Rate, False Positive Rate and thresholds lists

        First, sorts probabilities of positive label in decreasing order
        Next, moving towards the least probability locates a threshold between instances with opposite classes
        (keeping instances with the same confidence value on the same side of threshold),
        computes TPR, FPR for instances above threshold and puts them in the lists

        :return:
        tpr: list
        fpr: list
        thresholds: list
        """
        # %%% TODO START YOUR CODE HERE %%%

        # %%% END YOUR CODE HERE %%%

In [174]:
def stratified_train_test_split(X, Y, test_size, random_seed=10):
    """
    Performs the stratified train/test split
    (with the same (!) inter-class ratio in train and test sets as compared to original set)
    input:
        X: numpy array of size (n,m)
        Y: numpy array of size (n,)
        test_size: number between 0 and 1, specifies the relative size of the test_set
        random_seed: random_seed

    returns:
        X_train
        X_test
        Y_train
        Y_test
    """
    if test_size < 0 or test_size > 1:
        raise Exception("Fraction for split is not valid")

    np.random.seed(random_seed)

    # %%% TODO START YOUR CODE HERE %%%

    positive_idx = np.where(Y==1)[0]
    negative_idx = np.where(Y==0)[0]
#     print(X[negative_idx][:,0].sum())
#     positive_X = X[positive_idx]
#     negative_X = X[negative_idx]
#     positive_Y = Y[positive_idx]
#     negative_Y = Y[negative_idx]
    m = Y.size
    m_pos_test = int(test_size*positive_idx.size)
    m_neg_test = int(test_size*negative_idx.size)
    m_test = m_pos_test + m_neg_test
    test_idx_pos = np.random.choice(positive_idx, m_pos_test, replace=False)
    test_idx_neg = np.random.choice(negative_idx, m_neg_test, replace=False)
    train_idx_pos = np.where(~np.isin(positive_idx, test_idx_pos))[0]
    print(train_idx_pos[:10])
#     print(test_idx_pos.shape)
    train_idx_neg = np.where(~np.isin(negative_idx, test_idx_neg))[0]
#     print(test_idx_pos.size)
#     print(test_idx_neg.size)
#     print(m_test)
#     print(test_idx_pos.size + train_idx_pos.size + test_idx_neg.size + train_idx_neg.size)
    train_idx = np.append(train_idx_pos, train_idx_neg)

    train_idx = np.random.choice(train_idx, train_idx.size, replace=False)
    test_idx = np.append(test_idx_pos, test_idx_neg)
    test_idx = np.random.choice(test_idx, m_test, replace=False)
    print(X[train_idx][:,0].sum() + X[test_idx][:,0].sum())
    print(train_idx.shape + test_idx.shape)
    train_Y = Y[train_idx]
    train_X = X[train_idx]
    test_Y = Y[test_idx]
    test_X = X[test_idx]
#     train_Y = np.append(Y[train_idx_pos], Y[train_idx_neg], axis=0)
#     train_Y = np.random.choice(train_Y, train_Y.size, replace=False)
    print(train_X[train_Y==1][:,0].sum() + test_X[test_Y==1][:,0].sum() + train_X[train_Y==0][:,0].sum() + test_X[test_Y==0][:,0].sum())
    print(train_Y.size + test_Y.size)
    # %%% END YOUR CODE HERE %%%

In [76]:
data = load_breast_cancer()

In [77]:
# Pre-processing: Exchange labels - make malignant 1, benign 0
data['target'] = np.array(data['target'], dtype=int) ^ 1

In [78]:
Y = data['target']
X = data['data']

In [173]:
X[:,0].sum()

8038.4290000000001

In [175]:
X_train, X_test, y_train, y_test = stratified_train_test_split(X, Y, 0.3, 10)

[ 0  3  4  8  9 11 12 13 14 15]
8177.228
(399, 170)
8177.228
569


TypeError: 'NoneType' object is not iterable

In [None]:
# Check that the ratio is preserved
print("Inter-class ratio in original set:", len(np.argwhere(data['target'] == 1))/len(np.argwhere(data['target'] == 0)))
print("Inter-class ratio in train set:", len(np.argwhere(y_train == 1))/len(np.argwhere(y_train == 0)))
print("Inter-class ratio in test set:", len(np.argwhere(y_test == 1))/len(np.argwhere(y_test == 0)))
print('\n')

In [9]:
# We pick Logistic Regression because it outputs probabilities
# Try different number of iterations to change ROC curve
model = LogisticRegression(max_iter=5)
model.fit(X_train, y_train)
probabilities = model.predict_proba(X_test)
y_pred = model.predict(X_test)
print("Classifier's Accuracy:", accuracy_score(y_test, y_pred))

# Build an ROC curve
roc = ROC(probabilities, y_test, 1)
roc.plot()
# Explore the results
results = roc.results

# Use scikitplot library to compare ROC curve with the one you are getting
skplt.metrics.plot_roc_curve(y_test, probabilities)
plt.show()


# ROC analysis questions:
# 1. What are fpr, tpr rates if we choose 0.5 as a threshold?
# %%% TODO Answer HERE %%%

# 2. Let's suppose this is a second cancer check for those who have high probability of cancer.
#    What threshold value will you use in this case and why?
# %%% TODO Answer HERE %%%

TypeError: 'NoneType' object is not iterable