In [1]:
#importing necessary libraries
import csv
import math
import random
from collections import Counter
from random import shuffle

In [2]:
#creating a class that contains our filepath, reading the csv file, training and test sets, and separating our features and labels
class DataHandler:
    """Load a labelled CSV dataset and prepare it for a classifier.

    Rows are handled as lists of strings laid out as
    ``[id, feature_1, ..., feature_n, label]``: the first column is dropped,
    the last column is used as the label and everything in between is
    converted to ``float``.
    """

    def __init__(self, filepath):
        """Store the path of the CSV file; nothing is read until read_csv()."""
        self.filepath = filepath

    def read_csv(self):
        """Read the CSV file and return its data rows.

        The first row is treated as a header and discarded. Blank lines are
        skipped so that they cannot turn into empty rows downstream.

        Returns:
            list[list[str]]: one list of string cells per data row; an empty
            list when the file is empty or holds only a header.
        """
        # newline='' lets the csv module do its own newline handling, as its
        # documentation requires for file objects passed to csv.reader.
        with open(self.filepath, 'r', newline='') as file:
            csv_reader = csv.reader(file)
            # The default stops an empty file from raising StopIteration.
            next(csv_reader, None)
            return [row for row in csv_reader if row]

    def train_test_split(self, dataset, test_size=0.2, seed=None):
        """Randomly split ``dataset`` into a training part and a test part.

        The input list is left untouched; a shuffled copy is split instead.

        Args:
            dataset: sequence of rows to split.
            test_size: fraction of rows (between 0 and 1) put in the test set.
            seed: optional seed for a private random generator, making the
                split reproducible. When None the module-level ``shuffle`` is
                used, so the split follows the global random state.

        Returns:
            tuple[list, list]: ``(train_rows, test_rows)``.

        Raises:
            ValueError: if ``test_size`` is outside ``[0, 1]``.
        """
        if not 0 <= test_size <= 1:
            raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

        # Shuffle a copy so the caller's list keeps its original order.
        rows = list(dataset)
        if seed is None:
            shuffle(rows)
        else:
            random.Random(seed).shuffle(rows)

        # Index of the first row that belongs to the test set.
        split_index = int(len(rows) * (1 - test_size))
        return rows[:split_index], rows[split_index:]

    def separate_features_labels(self, dataset):
        """Split rows into numeric feature vectors and their labels.

        Args:
            dataset: rows laid out as ``[id, feature..., label]``.

        Returns:
            tuple[list[list[float]], list]: the features (every column except
            the first and the last, converted to float) and the labels (last
            column, returned unchanged).

        Raises:
            ValueError: if a feature cell cannot be converted to float.
        """
        features = [[float(value) for value in row[1:-1]] for row in dataset]
        labels = [row[-1] for row in dataset]
        return features, labels

In [3]:
#creating our KNN Classifier class
class KNearestNeighbors:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Training only stores the data; each prediction compares the query with
    every stored sample and takes a majority vote among the ``k`` closest.
    """

    def __init__(self, k=3):
        """Create a classifier that votes among the ``k`` nearest samples.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        self.k = k
        self.X_train = []
        self.y_train = []

    def fit(self, X, y):
        """Store the training feature vectors ``X`` and their labels ``y``.

        The objects are kept by reference, not copied.

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        self.X_train = X
        self.y_train = y

    def predict_neighbors(self, input_features):
        """Return the labels of the ``k`` training samples closest to a query.

        Args:
            input_features: one feature vector.

        Returns:
            list: up to ``k`` labels ordered from nearest to farthest; empty
            when no training data has been stored.
        """
        distances = [
            (self._euclidean_distance(input_features, sample), label)
            for sample, label in zip(self.X_train, self.y_train)
        ]
        # list.sort is stable, so equally distant samples keep training order.
        distances.sort(key=lambda pair: pair[0])
        return [label for _, label in distances[:self.k]]

    def predict(self, X):
        """Predict one label for each feature vector in ``X``.

        Returns:
            list: predicted labels, in the same order as ``X``.
        """
        return [self._vote(self.predict_neighbors(x)) for x in X]

    def _vote(self, labels):
        """Return the most frequent label in ``labels``.

        Raises:
            IndexError: if ``labels`` is empty (for example when predicting
            before any training data was stored).
        """
        return Counter(labels).most_common(1)[0][0]

    def _euclidean_distance(self, x1, x2):
        """Return the Euclidean distance between two equally long vectors.

        Raises:
            ValueError: if the vectors differ in length; silently comparing
            only the shared leading dimensions would give a wrong distance.
        """
        if len(x1) != len(x2):
            raise ValueError(
                f"feature vectors must have the same length, got {len(x1)} and {len(x2)}"
            )
        return math.sqrt(sum(((a - b) ** 2 for a, b in zip(x1, x2)), 0.0))

    def classification_report(self, y_true, y_pred):
        """Compute per-class metrics from true and predicted labels.

        Each label occurring in ``y_true`` is scored one-vs-rest.

        Args:
            y_true: the correct labels.
            y_pred: the predicted labels, aligned with ``y_true``.

        Returns:
            dict: maps each label of ``y_true`` (in order of first appearance)
            to a dict with the keys 'Precision', 'Recall', 'F1-score' and
            'Accuracy'. 'Accuracy' is ``(tp + tn) / len(y_true)`` for that
            label. A ratio with a zero denominator is reported as 0.

        Raises:
            ValueError: if ``y_true`` and ``y_pred`` have different lengths.
        """
        if len(y_true) != len(y_pred):
            raise ValueError(
                f"y_true and y_pred must have the same length, "
                f"got {len(y_true)} and {len(y_pred)}"
            )

        report = {}
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # report order does not vary between runs the way set order can.
        for label in dict.fromkeys(y_true):
            tp = fp = fn = tn = 0
            # One pass fills the whole one-vs-rest confusion matrix.
            for actual, predicted in zip(y_true, y_pred):
                if predicted == label:
                    if actual == label:
                        tp += 1
                    else:
                        fp += 1
                elif actual == label:
                    fn += 1
                else:
                    tn += 1

            precision = tp / (tp + fp) if tp + fp > 0 else 0
            recall = tp / (tp + fn) if tp + fn > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
            accuracy = (tp + tn) / len(y_true)

            report[label] = {
                'Precision': precision,
                'Recall': recall,
                'F1-score': f1,
                'Accuracy': accuracy
            }

        return report

In [4]:
def main(filepath='/Users/cindyhernandez/Downloads/breast_cancerno2.csv',
         k=5, test_size=0.2, seed=None):
    """Train and evaluate the k-NN classifier on a CSV dataset.

    Reads the file, splits it into training and test sets, fits the
    classifier on the training part and prints a per-class report for the
    test part. Calling ``main()`` with no arguments uses the original
    settings.

    Args:
        filepath: path of the CSV file. The default is a machine-specific
            absolute path, so pass your own location when running elsewhere.
        k: number of neighbours that take part in each vote.
        test_size: fraction of the rows held out for testing.
        seed: optional seed for the global random generator, applied before
            the split. Leave as None for a different random split each run.
    """
    if seed is not None:
        # Fix the random sequence before splitting; without a seed the
        # printed metrics change from run to run.
        random.seed(seed)

    # Load the raw rows (lists of strings) from disk.
    data_handler = DataHandler(filepath)
    dataset = data_handler.read_csv()

    # Hold out a random part of the rows for evaluation.
    train_set, test_set = data_handler.train_test_split(dataset, test_size)

    # Turn both parts into feature vectors and labels.
    train_features, train_labels = data_handler.separate_features_labels(train_set)
    test_features, test_labels = data_handler.separate_features_labels(test_set)

    # Fit on the training part, then predict labels for the held-out rows.
    classifier = KNearestNeighbors(k=k)
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)

    # Compare the predictions with the true labels.
    report = classifier.classification_report(test_labels, predictions)

    print("Classification Report:")
    for label, metrics in report.items():
        print(f"Class {label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.2f}")
        print()

# Entry-point guard: call main() only when this code runs as the top-level
# program (__name__ == "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()
    
#class 2 refers to a benign tumor
#class 4 refers to a malignant tumor

Classification Report:
Class 4:
  Precision: 0.96
  Recall: 0.94
  F1-score: 0.95
  Accuracy: 0.96

Class 2:
  Precision: 0.97
  Recall: 0.98
  F1-score: 0.97
  Accuracy: 0.96

