# General Info

In [18]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load breast cancer dataset
data = load_breast_cancer()

# Create a Pandas DataFrame
dataset = pd.DataFrame(data.data, columns=data.feature_names)

# Add the target variable to the DataFrame
dataset['target'] = data.target

In [24]:
# Display the first few rows of the DataFrame
pd.set_option('display.max_columns', None)
dataset.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [25]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

<h2>Implementation from framework</h2>

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

In [31]:
# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target  # Ensure that y is categorical (0 or 1 for binary classification)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the KNN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=11)
knn_classifier.fit(X_train, y_train)

In [32]:
# Make predictions
predictions = knn_classifier.predict(X_test)

In [34]:
# Evaluate Model
cm = confusion_matrix(y_test,predictions)
asc = accuracy_score(y_test,predictions) 
fs = f1_score(y_test,predictions)

print("Confusion Matrix\n",cm)
print("\nAccuray Score\n",asc)
print("\nF1 Score\n",fs)

Confusion Matrix
 [[41  2]
 [ 0 71]]

Accuray Score
 0.9824561403508771

F1 Score
 0.9861111111111112


<h1>Implementation from scratch</h1>

In [84]:
import numpy as np
import pandas as pd

In [112]:
test_array = np.array([[5,0],[6,0],[7,1],[8,0],[9,1]])
testFrame = pd.DataFrame(test_array,columns=["feature","target"])
testFrame

Unnamed: 0,feature,target
0,5,0
1,6,0
2,7,1
3,8,0
4,9,1


In [113]:
def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2)**2))

# Example usage:

#print(data.data[0])
#distance = euclidean_distance(data.data[1], data.data[2])

distance = euclidean_distance(test_array[0], test_array[4])
print(f"Euclidean Distance between row1 and row2: {distance:.2f}")


Euclidean Distance between row1 and row2: 4.12


In [126]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
    
    distances = list()
    
    for i, train_row in enumerate(train):
        
        dist = euclidean_distance(test_row, train_row)
        distances.append((i, dist))
        distances.sort(key=lambda tup: tup[1])

    neighbors = [index for index, _ in distances[:num_neighbors]]
    
    return neighbors
    
# Example usage:
neighbours = get_neighbors(test_array[:,0],[7],3)
print(neighbours)


[2, 1, 3]


In [136]:
# Make a classification prediction with neighbors
def predict_classification(train, test_row, num_neighbors, y_train):
    
    negativeCounter = 0
    positiveCounter = 0
    
    neighbors = get_neighbors(train, test_row, num_neighbors)

    print(y_train)
    
    for n in neighbors:
        if (y_train[n] == 0): negativeCounter += 1
        if (y_train[n] == 1): positiveCounter += 1

    if (negativeCounter > positiveCounter): return 0
    if (negativeCounter < positiveCounter): return 1
    if (negativeCounter == positiveCounter): print("choose anoter value for K")

predict_classification(test_array[:,0],[8.5],3,test_array[:,1])

[0 0 1 0 1]


1

<h2>Train data</h2>

In [137]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Cross-Validation

In [3]:
import numpy as np
def cross_validation(X,y,k):
    # Calculate the number of samples for testing
    num_test_samples = int(len(X) / k)
    remider = len(X)% k
    
    for i in range (k):
        X_test = X[i*num_test_samples:(i+1)*num_test_samples, :]
        X_train = np.concatenate((X[0:i*num_test_samples, :], X[(i+1)*num_test_samples:, :]), axis=0)

        y_test = y [i*num_test_samples:(i+1)*num_test_samples]
        y_train = np.concatenate((y[0:i*num_test_samples], y[(i+1)*num_test_samples:]), axis=0)

    
        print("Set" , i)
        print("test")
        print(X_test)
        print("train")
        print(X_train)
        print("y_test")
        print(y_test)
        print("y_train")
        print(y_train)


cross_validation(np.array([[1,1,1,1],[1,1,1,0],[1,1,0,0],[1,0,0,0]]),[0,0,1,1],2)

Set 0
test
[[1 1 1 1]
 [1 1 1 0]]
train
[[1 1 0 0]
 [1 0 0 0]]
y_test
[0, 0]
y_train
[1. 1.]
Set 1
test
[[1 1 0 0]
 [1 0 0 0]]
train
[[1 1 1 1]
 [1 1 1 0]]
y_test
[1, 1]
y_train
[0. 0.]


In [13]:
def cross_validation(X,y,k):
    # Calculate the number of samples for testing
    num_test_samples = int(len(X) / k)
    remider = len(X)% k
    X_test_set = list()
    X_train_set = list()
    y_test_set = list()
    y_train_set = list()
    
    for i in range (k):
        # Get test and train for features X
        X_test = X[i*num_test_samples:(i+1)*num_test_samples, :]
        X_train = np.concatenate((X[0:i*num_test_samples, :], X[(i+1)*num_test_samples:, :]), axis=0)

        # Get test and train for target Y
        y_test = y [i*num_test_samples:(i+1)*num_test_samples]
        y_train = np.concatenate((y[0:i*num_test_samples], y[(i+1)*num_test_samples:]), axis=0)

        # Append test and train data
        X_test_set.append(X_test)
        X_train_set.append(X_train)
        y_test_set.append(y_test)
        y_train_set.append(y_train)
    
    return X_test_set,X_train_set,y_test_set,y_train_set


cross_validation(np.array([[1,1,1,1],[1,1,1,0],[1,1,0,0],[1,0,0,0]]),[0,0,1,1],3)

([array([[1, 1, 1, 1]]), array([[1, 1, 1, 0]]), array([[1, 1, 0, 0]])],
 [array([[1, 1, 1, 0],
         [1, 1, 0, 0],
         [1, 0, 0, 0]]),
  array([[1, 1, 1, 1],
         [1, 1, 0, 0],
         [1, 0, 0, 0]]),
  array([[1, 1, 1, 1],
         [1, 1, 1, 0],
         [1, 0, 0, 0]])],
 [[0], [0], [1]],
 [array([0., 1., 1.]), array([0, 1, 1]), array([0, 0, 1])])

In [10]:
X_test,X_train,y_test,y_train = cross_validation(np.array([[1,1,1,1],[1,1,1,0],[1,1,0,0],[1,0,0,0]]),[0,0,1,1],2)

In [11]:
X_train

[array([[1, 1, 0, 0],
        [1, 0, 0, 0]]),
 array([[1, 1, 1, 1],
        [1, 1, 1, 0]])]