In [1]:
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Accumulator for one metrics record per evaluated model.
results = list()

In [3]:
# Directory holding the pre-extracted feature/label arrays. The original
# location stays the default; set BREAST_CANCER_FEATURES_DIR to run the
# notebook on another machine without editing this cell.
path = os.environ.get(
    "BREAST_CANCER_FEATURES_DIR",
    "/Users/dianaemal/breast_cancer_project/data/features/",
)
# os.path.join avoids the doubled "//" that f"{path}/..." produced when
# `path` already ends with a slash.
y_labels = np.load(os.path.join(path, "y_labels.npy"))
x_features = np.load(os.path.join(path, "X_features.npy"))
# Every feature row needs exactly one label before the arrays are sliced.
assert len(x_features) == len(y_labels), "features and labels have different sample counts"
print(x_features.shape)
print(y_labels.shape)

(647, 1, 7, 7, 512)
(647,)


In [4]:
# x_features is loaded as a 5-dimensional array, but the classifiers below
# expect a 2-D (samples, features) matrix.
# reshape keeps the first axis (one row per sample) and the -1 tells it to
# collapse all remaining axes into one: 1 x 7 x 7 x 512 = 25088 columns.
x_flattened = x_features.reshape(len(x_features), -1)
print(x_flattened.shape)


(647, 25088)


In [5]:
# The first 80 % of the samples are used for training, the remainder for testing.
# NOTE(review): the split is purely positional (no shuffling or stratification)
# — confirm the arrays are not ordered by class.
num = len(x_features) * 8 // 10
x_train, x_test = x_flattened[:num], x_flattened[num:]
y_train, y_test = y_labels[:num], y_labels[num:]
print(x_train.shape)

# Baseline: L2-regularised, class-balanced logistic regression fitted on the
# raw flattened features.
logreg = LogisticRegression(
    penalty='l2',
    C=0.01,
    class_weight='balanced',
    max_iter=250,
    random_state=42,
)
logreg.fit(x_train, y_train)


(517, 25088)


In [6]:
# Overfitting check: score the baseline model on the data it was trained on.
preds1 = logreg.predict(x_train)
print(["B", "M"])
print(confusion_matrix(y_train, preds1))
print(f"Accuracy: {accuracy_score(y_train, preds1)}")

['B', 'M']
[[341   1]
 [  0 175]]
Accuracy: 0.9980657640232108


In [7]:
# The baseline model is heavily overfitted: there are far more features (25088)
# than training images. To counter this we reduce the feature count with PCA.
# PCA is sensitive to feature scale, so the features are standardised first.
# Both transformers are fitted on the training split only and then applied to
# the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# With data of this size scikit-learn's default 'auto' solver can select the
# randomized SVD, whose result depends on the random state. Fixing the seed
# (same value as the logistic regression) keeps the components, and every
# downstream metric, reproducible across kernel restarts.
pca = PCA(n_components=70, random_state=42)
x_train_pca = pca.fit_transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

In [8]:
# Refit the same estimator on the PCA-reduced features, then repeat the
# training-set check to see whether the fit is less extreme than before.
logreg.fit(x_train_pca, y_train)
preds2 = logreg.predict(x_train_pca)
confusion = confusion_matrix(y_train, preds2)
print(["B", "M"], confusion, sep="\n")
print(f"Accuracy: {accuracy_score(y_train, preds2)}")

['B', 'M']
[[302  40]
 [ 18 157]]
Accuracy: 0.8878143133462283


In [25]:
# Evaluate the PCA-based logistic regression on the held-out test set.
preds3 = logreg.predict(x_test_pca)
cm1 = confusion_matrix(y_test, preds3)
tn, fp, fn, tp = cm1.ravel()
acc1 = accuracy_score(y_test, preds3)
p1 = precision_recall_fscore_support(y_test, preds3, pos_label=1, average='binary')
# ROC AUC has to be computed from continuous scores. Passing the hard 0/1
# predictions collapses the ROC curve to a single operating point and does not
# measure ranking quality. Column 1 of predict_proba is the probability of
# logreg.classes_[1], the class roc_auc_score treats as positive.
auc1 = roc_auc_score(y_test, logreg.predict_proba(x_test_pca)[:, 1])
print(["B", "M"])
print(cm1)
print("Accuracy: ", acc1)
print(p1)
# p1 is (precision, recall, F1, support) for the positive class.
results.append({
    "Model": "Logreg1",
    "accuracy": acc1,
    "precision": p1[0],
    "recall": p1[1],
    "F1": p1[2],
    "AUC": auc1
})

['B', 'M']
[[88  7]
 [ 7 28]]
Accuracy:  0.8923076923076924
(0.8, 0.8, 0.8, None)


In [26]:
# Class probabilities for every test sample; columns follow logreg.classes_.
y_preds = logreg.predict_proba(x_test_pca)

# Use a custom decision threshold (0.3 instead of the default 0.5) so that
# fewer positive cases are missed, i.e. fewer false negatives.
threshold = 0.3
labels = logreg.classes_
# Vectorised form of "predict labels[1] when P(class 1) exceeds the threshold,
# otherwise labels[0]".
y_pred_thresh = np.where(y_preds[:, 1] > threshold, labels[1], labels[0])

cm2 = confusion_matrix(y_test, y_pred_thresh)
tn, fp, fn, tp = cm2.ravel()
acc2 = accuracy_score(y_test, y_pred_thresh)
print(["B", "M"])
print(cm2)
print("Accuracy:", acc2)
p2 = precision_recall_fscore_support(y_test, y_pred_thresh, pos_label=1, average='binary')
# ROC AUC is computed from the probabilities, not from the thresholded labels.
# It summarises all thresholds at once, so changing the threshold leaves the
# AUC unchanged; only accuracy/precision/recall/F1 move.
auc2 = roc_auc_score(y_test, y_preds[:, 1])
print(p2)
# p2 is (precision, recall, F1, support) for the positive class.
results.append({
    "Model": "Logreg2",
    "accuracy": acc2,
    "precision": p2[0],
    "recall": p2[1],
    "F1": p2[2],
    "AUC": auc2
})

['B', 'M']
[[78 17]
 [ 4 31]]
Accuracy: 0.8384615384615385
(0.6458333333333334, 0.8857142857142857, 0.7469879518072289, None)


In [27]:
# k-nearest-neighbours (k = 7) on the PCA-reduced features.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(x_train_pca, y_train)
preds4 = knn.predict(x_test_pca)
cm3 = confusion_matrix(y_test, preds4)
tn, fp, fn, tp = cm3.ravel()

acc3 = accuracy_score(y_test, preds4)
print(["B", "M"])
print(cm3)
print("Accuracy: ", acc3)
p3 = precision_recall_fscore_support(y_test, preds4, pos_label=1, average='binary')
# ROC AUC needs a continuous score rather than hard labels. Column 1 of
# predict_proba is the probability of knn.classes_[1], the class
# roc_auc_score treats as positive.
auc3 = roc_auc_score(y_test, knn.predict_proba(x_test_pca)[:, 1])
# p3 is (precision, recall, F1, support) for the positive class.
results.append({
    "Model": "KNN",
    "accuracy": acc3,
    "precision": p3[0],
    "recall": p3[1],
    "F1": p3[2],
    "AUC": auc3
})

['B', 'M']
[[86  9]
 [11 24]]
Accuracy:  0.8461538461538461


In [28]:
# RBF-kernel SVM on the PCA-reduced features.
# probability=True fits an extra calibration step that depends on the random
# state; seeding it keeps any predict_proba output reproducible.
svm = SVC(kernel="rbf", class_weight='balanced', C=10, probability=True, random_state=42)
svm.fit(x_train_pca, y_train)
preds5 = svm.predict(x_test_pca)
cm4 = confusion_matrix(y_test, preds5)
tn, fp, fn, tp = cm4.ravel()
# ROC AUC must be computed from continuous scores, not from hard labels.
# decision_function gives the signed distance to the separating surface,
# which ranks the samples without depending on the probability calibration.
auc4 = roc_auc_score(y_test, svm.decision_function(x_test_pca))
acc4 = accuracy_score(y_test, preds5)
print(["B", "M"])
print(cm4)
print("Accuracy: ", acc4)
p4 = precision_recall_fscore_support(y_test, preds5, pos_label=1, average='binary')
print(p4)
# p4 is (precision, recall, F1, support) for the positive class.
results.append({
    "Model": "SVC",
    "accuracy": acc4,
    "precision": p4[0],
    "recall": p4[1],
    "F1": p4[2],
    "AUC": auc4
})

['B', 'M']
[[89  6]
 [ 8 27]]
Accuracy:  0.8923076923076924
(0.8181818181818182, 0.7714285714285715, 0.7941176470588235, None)


In [21]:
# Overfitting check for the SVM: score it on its own training data so the
# result can be compared with its test-set accuracy.
# NOTE(review): this reassigns preds5, which the SVM evaluation cell uses for
# its test-set predictions.
preds5 = svm.predict(x_train_pca)
conf = confusion_matrix(y_train, preds5)
print(["B", "M"], conf, sep="\n")
print(accuracy_score(y_train, preds5))

['B', 'M']
[[317  25]
 [  9 166]]
0.9342359767891683
