Problem 3

In [83]:
import numpy  as np
import pandas as pd 
import sklearn
from sklearn import preprocessing

In [317]:
def sigmoid(x):
    return 1/(1 + np.exp(-x))

def crossEntropyLoss(x, t):
    (1 - t) * np.log(sigmoid(x)) + t * np.log(1 - sigmoid(x))

def convert2prob(y):
    return np.array([1 if label == "B" else 0 for label in y])

def mbSGD(batch_size, alpha, epochs, X, t, w):
    m, n = X.shape
    X_b = np.c_[np.ones((m, 1)), X]
    new_w = w
    
    # Convert target values to 0 or 1: M = 0, B = 1 
    t = convert2prob(t.values.flatten()) 
    
    for epoch in range(epochs):
        # shuffle data 
        permutation = np.random.permutation(m)
        X_shuffled = X_b[permutation] 
        t_shuffled = t[permutation]
        
        # Loop through each batch 
        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i + batch_size]
            t_batch = t_shuffled[i:i + batch_size]
            y_batch = sigmoid(np.dot(X_batch, w)) 
            
            error = y_batch - t_batch 
            
            gradient = np.dot(X_batch.T, error) / len(t_batch)
            
            new_w -= alpha * gradient 
    
    return new_w 

def confusion_matrix(y_pred, y_true): 
    # Assume M = 0 and B = 1 
    TP = np.sum((y_pred == "B") & (y_true == "B"))
    FP = np.sum((y_pred == "B") & (y_true == "M"))
    TN = np.sum((y_pred == "M") & (y_true == "M"))
    FN = np.sum((y_pred == "M") & (y_true == "B"))
    return TP, FP, TN, FN 

def eval(y_pred, y_true):
    TP, FP, TN, FN = confusion_matrix(y_pred, y_true)

    accuracy = (TP + TN) / len(y_pred)
    precision = TP / (TP + FP) 
    recall = TP / (TP + FN)
    F1 = 2 * precision * recall / (precision + recall)

    return accuracy, precision, recall, F1
    



Problem 4 

A. Load the data 

In [282]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 
  
# metadata 
print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# variable information 
print(breast_cancer_wisconsin_diagnostic.variables) 

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'ID': 230, 'type': 'NATIVE', 'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'venue': 'Electronic imaging', 'year': 1993, 'journal': None, 'DOI': '1

B. Split the dataset into train, validaton, and test sets.

In [283]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=42)

scaler = StandardScaler() 

X_train = scaler.fit_transform(X_train) 

X_val = scaler.transform(X_val) 
X_test = scaler.transform(X_test)

C. Report the size of each set 

In [309]:
print(f"The size of the training set is {X_train.shape[0]} rows by {X_train.shape[1]} columns. ")
print(f"The size of the validation set is {X_val.shape[0]} rows by {X_val.shape[1]} columns. ")
print(f"The size of the testing set is {X_test.shape[0]} rows by {X_test.shape[1]} columns. ")

The size of the training set is 434 rows by 30 columns. 
The size of the validation set is 49 rows by 30 columns. 
The size of the testing set is 86 rows by 30 columns. 


D. Train binary logistic model using Question 3

In [327]:
import random

w = np.array([random.gauss(0, 1) for _ in range(X.shape[1] + 1)])

epochs = 10 

# model 1 
alpha1 = 0.1
batch_size1 = 10
w1 = mbSGD(10, 0.1, 1000, X_train, y_train, w)


#model 2 
alpha2 = 0.00000001 
batch_size2 = 128
w2 = mbSGD(batch_size2, alpha2, epochs, X_train, y_train, w)


# Test on validation set 
X_val_b = np.c_[np.ones((X_val.shape[0], 1)), X_val]

predicted_probabilities1 = sigmoid(np.dot(X_val_b, w1))
predicted_probabilities2 = sigmoid(np.dot(X_val_b, w2))

predictions1 = np.where(predicted_probabilities1 > 0.5, "B", "M")
predictions2 = np.where(predicted_probabilities2 > 0.5, "B", "M")

accuracy1, precision1, recall1, F1_1 = eval(predictions1, y_val.values.flatten())
print(f"Accuracy of the model is {accuracy1}")
print(f"Precision of the model is {precision1}")
print(f"Recall of the model is {recall1}")
print(f"F1 of the model is {F1_1} \n")

accuracy2, precision2, recall2, F1_2 = eval(predictions2, y_val.values.flatten())
print(f"Accuracy of the model is {accuracy2}")
print(f"Precision of the model is {precision2}")
print(f"Recall of the model is {recall2}")
print(f"F1 of the model is {F1_2} \n")


Accuracy of the model is 0.9795918367346939
Precision of the model is 0.9655172413793104
Recall of the model is 1.0
F1 of the model is 0.9824561403508771 

Accuracy of the model is 0.9795918367346939
Precision of the model is 0.9655172413793104
Recall of the model is 1.0
F1 of the model is 0.9824561403508771 



E. Report performance of the model 

In [328]:
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]
target = convert2prob(y_test.values.flatten())

test_pred_prob = sigmoid(np.dot(X_test_b, w1))
test_pred_label = np.where(test_pred_prob > 0.5, "B", "M")


accuracy_test, precision_test, recall_test, F1_test = eval(test_pred_label, y_test.values.flatten())
print(f"Accuracy of the model is {accuracy_test}")
print(f"Precision of the model is {precision_test}")
print(f"Recall of the model is {recall_test}")
print(f"F1 of the model is {F1_test} \n")


Accuracy of the model is 0.9767441860465116
Precision of the model is 0.9814814814814815
Recall of the model is 0.9814814814814815
F1 of the model is 0.9814814814814815 



F. Summarize findings 

Using the w that we learned from the training data and finetuning the hyperparameters, alpha and batch size, with the validaton set, high scores in accuracy, precision, recall, and F1 were obtained when testing on the teseting data. Although the variation in these four metrics was minimal despite changing the learning rate and batch sizes when testing the obtained weights on the validation set, the model is accurate on both the validation and the testing data.  