In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [146]:
path = "/Users/dianaemal/breast_cancer_project/data/features/"
y_labels = np.load(f"{path}/y_labels.npy")
x_features = np.load(f"{path}/X_features.npy")
print(x_features.shape)
print(y_labels.shape)

(647, 1, 7, 7, 512)
(647,)


In [147]:
# 
# Currently our x_features array is 5 dimentional.  
# We will have to flatten the array using reshape to 2D:
# The first argument to reshape function tells it to keep the first dimention as it is and the second argumant(-1) means that flatten
# rest of the dimentions (1 x 7 x 7 x 512) = 25088
x_flattened = x_features.reshape(x_features.shape[0], -1)
print(x_flattened.shape)


(647, 25088)


In [180]:
# Use 80 % of the data for training and rest for testing:
num = (len(x_features) * 8) // 10
logreg = LogisticRegression(max_iter = 250, class_weight='balanced', penalty = 'l2', C=0.01, random_state=42)
x_train = x_flattened[0:num]
x_test = x_flattened[num:]
print(x_train.shape) 
y_train = y_labels[0:num]
y_test = y_labels[num:]
logreg.fit(x_train, y_train)


(517, 25088)


In [181]:
# Check for overfitting:
preds1 = logreg.predict(x_train)
print(["B", "M"])
print( confusion_matrix(y_train, preds1))
print("Accuracy: " + str(accuracy_score(y_train, preds1)))

['B', 'M']
[[341   1]
 [  0 175]]
Accuracy: 0.9980657640232108


In [182]:
# The model is highly overfitted because there is a huge number of features (25088) compared to the number of training images(data).
# To fix the issue of overfitting, we must reduced the features using PCA. Before reducing features, we will first scale them:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

pca = PCA(n_components=70)
x_train_pca = pca.fit_transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

In [183]:
logreg.fit(x_train_pca, y_train)
preds2 = logreg.predict(x_train_pca)
confusion = confusion_matrix(y_train, preds2)
print(["B", "M"])
print(confusion)
print("Accuracy: " + str(accuracy_score(y_train, preds2)))

['B', 'M']
[[304  38]
 [ 20 155]]
Accuracy: 0.8878143133462283


In [191]:
# On testing set:
preds3 = logreg.predict(x_test_pca)
confusion = confusion_matrix(y_test, preds3)
print(["B", "M"])
print(confusion)
print("Accuracy: ", accuracy_score(y_test, preds3))
precision_recall_fscore_support(y_test, preds3, pos_label=0 , average='binary')

['B', 'M']
[[89  6]
 [ 8 27]]
Accuracy:  0.8923076923076924


(0.9175257731958762, 0.9368421052631579, 0.9270833333333334, None)

In [190]:
y_preds = logreg.predict_proba(x_test_pca)

# 2. Choose your own threshold (e.g., 0.3 instead of 0.5 to reduce false negatives)
threshold = 0.3
labels = logreg.classes_
y_pred_thresh=[]

for pred in y_preds:
    if pred[1] > threshold:
        y_pred_thresh.append(labels[1])
    else:
        y_pred_thresh.append(labels[0])


cm = confusion_matrix(y_test, y_pred_thresh)
acc = accuracy_score(y_test, y_pred_thresh)
print(["B", "M"])
print(cm)
print("Accuracy:", acc)
precision_recall_fscore_support(y_test, y_pred_thresh, pos_label=0 , average='binary')

['B', 'M']
[[77 18]
 [ 5 30]]
Accuracy: 0.823076923076923


(0.9390243902439024, 0.8105263157894737, 0.8700564971751412, None)

In [175]:
knn = KNeighborsClassifier(7)
knn.fit(x_train_pca, y_train)
preds2 = knn.predict(x_test_pca)
conf = confusion_matrix(y_test, preds2)
print(conf)
print(accuracy_score(y_test, preds2))

[[86  9]
 [13 22]]
0.8307692307692308
