In [7]:
import warnings
import csv
from collections import Counter

1. Load the three datasets from the UCI-Machine Learning Repository:

In [8]:
# File names of the datasets
file1 = 'hayes-roth.data'
file2 = 'car.data'
file3 = 'breast-cancer.data'

In [9]:
print('hayes-roth.data')

hayes-roth.data


In [10]:
#Handle data
def loadCsv(filename):
	lines = csv.reader(open(filename, "r"))
	dataset = list(lines)
	return dataset

In [11]:
#Load All the three datasets

# 1. Hayes Roth data
Hayes_Roth_data1 = loadCsv(file1)
# 2. Car data
Car_data2 = loadCsv(file2)
# 3. Breast Cancer data
Breast_Cancer_data3 = loadCsv(file3)

#### Hayes-Roth Dataset Attributes
        1. name:              distinct for each instance and represented numerically
        2. hobby:             nominal values ranging between 1 and 3
        3. age:               nominal values ranging between 1 and 4
        4. educational level: nominal values ranging between 1 and 4
        5. marital status:    nominal values ranging between 1 and 4
        6. class:             nominal value between 1 and 3

#### Car Evaluation Dataset Attributes

        1. buying:   vhigh, high, med, low.
        2. maint:    vhigh, high, med, low.
        3. doors:    2, 3, 4, 5more.
        4. persons:  2, 4, more.
        5. lug_boot: small, med, big.
        6. safety:   low, med, high.
        7. class:    unacc, acc, good, vgood

#### Breast Cancer Dataset Attributes

        1. Class:       no-recurrence-events, recurrence-events
        2. age:         10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99.
        3. menopause:   lt40, ge40, premeno.
        4. tumor-size:  0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44, 45-49, 50-54, 55-59.
        5. inv-nodes:   0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26, 27-29, 30-32, 33-35, 36-39.
        6. node-caps:   yes, no.
        7. deg-malig:   1, 2, 3.
        8. breast:      left, right.
        9. breast-quad: left-up, left-low, right-up,	right-low, central.
        10. irradiat:   yes, no.

2. Prepare the dataset - Perform any necessary data preprocessing (e.g., one-hot encoding).

In [12]:
Hayes_Roth_data1[0:10]

[['92', '2', '1', '1', '2', '1'],
 ['10', '2', '1', '3', '2', '2'],
 ['83', '3', '1', '4', '1', '3'],
 ['61', '2', '4', '2', '2', '3'],
 ['107', '1', '1', '3', '4', '3'],
 ['113', '1', '1', '3', '2', '2'],
 ['80', '3', '1', '3', '2', '2'],
 ['125', '3', '4', '2', '4', '3'],
 ['36', '2', '2', '1', '1', '1'],
 ['105', '3', '2', '1', '1', '1']]

In [13]:
def encode_class_labels(data):
    class_labels = {label: i for i, label in enumerate(set(row[-1] for row in data))}
    for row in data:
        row[-1] = class_labels[row[-1]]
    return data

In [14]:
Hayes_Roth_data1 = encode_class_labels(Hayes_Roth_data1)
for i in range(len(Hayes_Roth_data1)):
    Hayes_Roth_data1[i] = [int(x) for x in Hayes_Roth_data1[i]]


In [15]:
# Encoding for the Car Evaluation Data

# Define attribute value mappings
attribute_maps_car = {
    'buying': {'vhigh': 0, 'high': 1, 'med': 2, 'low': 3},
    'maint': {'vhigh': 0, 'high': 1, 'med': 2, 'low': 3},
    'doors': {'2': 0, '3': 1, '4': 2, '5more': 3},
    'persons': {'2': 0, '4': 1, 'more': 2},
    'lug_boot': {'small': 0, 'med': 1, 'big': 2},
    'safety': {'low': 0, 'med': 1, 'high': 2}
}


# Loop through each row of the dataset and replace attribute values with mappings
for row in Car_data2:
    row[:6] = [attribute_maps_car[attr][value] for attr, value in zip(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'], row[:6])]

# encode the class variables for the Car Evaluation data
Car_data2 = encode_class_labels(Car_data2)


In [16]:
# Encoding for the Breast Cancer Data

# Move the class label to the last column
for row in Breast_Cancer_data3:
    row.append(row.pop(0))

# Define attribute value mappings
attribute_maps_breast = {
    'age': {'10-19': 0, '20-29': 1, '30-39': 2, '40-49': 3, '50-59': 4, '60-69': 5, '70-79': 6, '80-89': 7, '90-99': 8},
    'menopause': {'lt40': 0, 'ge40': 1, 'premeno': 2},
    'tumor_size': {'0-4': 0, '5-9': 1, '10-14': 2, '15-19': 3, '20-24': 4, '25-29': 5, '30-34': 6, '35-39': 7, '40-44': 8, '45-49': 9, '50-54': 10},
    'inv_nodes': {'0-2': 0, '3-5': 1, '6-8': 2, '9-11': 3, '12-14': 4, '15-17': 5, '18-20': 6, '21-23': 7, '24-26': 8, '27-29': 9, '30-32': 10, '33-35': 11, '36-39': 12, '40-44': 13, '45-49': 14, '50-54': 15, '55-59': 16},
    'node_caps': {'yes': 0, 'no': 1},
    'deg_malig': {'1': 0, '2': 1, '3': 2},
    'breast': {'left': 0, 'right': 1},
    'breast_quad': {'left_up': 0, 'left_low': 1, 'right_up': 2, 'right_low': 3, 'central': 4},
    'irradiat': {'yes': 0, 'no': 1}
}


# Replace missing values with the most common value for each attribute
for i in range(len(Breast_Cancer_data3[0])):
    col = [row[i] for row in Breast_Cancer_data3 if row[i] != '?']
    most_common = Counter(col).most_common(1)[0][0]
    for row in Breast_Cancer_data3:
        if row[i] == '?':
            row[i] = most_common

# Loop through each row and replace attribute values with their integer mappings
for row in Breast_Cancer_data3:
    row[:9] = [attribute_maps_breast[attr][value] for attr, value in zip(['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irradiat'], row[:9])]

# encode the class variables for the breast cancer data
Breast_Cancer_data3 = encode_class_labels(Breast_Cancer_data3)

3. Implementation of  the K-Nearest Neighbors algorithm from scratch.

In [17]:
# Euclidean distance function
def euclidean_distance(x1, x2):
    distance = 0
    for i in range(len(x1)):
        distance += (x1[i] - x2[i]) ** 2
    return distance ** 0.5


# KNN algorithm
def knn(train_data, test_instance, k):
    distances = []
    for i, train_instance in enumerate(train_data):
        dist = euclidean_distance(train_instance[:-1], test_instance[:-1])
        distances.append((i, dist))
    distances.sort(key=lambda x: x[1])
    neighbors = [train_data[i] for i, _ in distances[:k]]
    return neighbors


# Predict the class label based on k neighbors
def predict(neighbors):
    class_labels = [neighbor[-1] for neighbor in neighbors]
    most_common = Counter(class_labels).most_common(1)
    return most_common[0][0]

4. Implementation of  the K-Folds Cross Validation algorithm from scratch.

In [18]:
# K-fold cross-validation
def k_fold_cross_validation(data, k_folds, k_neighbors):
    kf = KFold(n_splits=k_folds, shuffle=True)
    accuracies = []

    for train_index, test_index in kf.split(data):
        train_data = [data[i] for i in train_index]
        test_data = [data[i] for i in test_index]

        correct_predictions = 0
        for test_instance in test_data:
            neighbors = knn(train_data, test_instance, k_neighbors)
            prediction = predict(neighbors)
            if prediction == test_instance[-1]:
                correct_predictions += 1

        accuracy = correct_predictions / len(test_data)
        accuracies.append(accuracy)
        print(f"Fold {len(accuracies)}, Accuracy: {accuracy:.2%}")

    average_accuracy = sum(accuracies) / k_folds
    print(f"\nAverage Accuracy over {k_folds} folds: {average_accuracy:.2%}")
    return accuracies

5. Apply KNN algorithm using 10-Fold cross validation on the following datasets:

        A. Hayes-Roth
        B. Car Evaluation
        C. Breast Cancer

6. Implementation of Scikit Learn based K-Nearest Neighbors Classifier for all Dataset:

In [19]:
#import the required libaries for the scikit learn based knn classifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [20]:
def knn_with_cross_validation(dataset, k_neighbors=3, k_folds=10):
    X = [row[:-1] for row in dataset]
    y = [row[-1] for row in dataset]

    knn_classifier = KNeighborsClassifier(n_neighbors=k_neighbors)
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    accuracies = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

        knn_classifier.fit(X_train, y_train)
        y_pred = knn_classifier.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
        print(f"Accuracy (Scikit-learn KNN): {accuracy:.2%}")

    average_accuracy = sum(accuracies) / k_folds
    print(f"\nAverage Accuracy (Scikit-learn KNN) over {k_folds} folds: {average_accuracy:.2%}")
    return accuracies


In [21]:
# data1 is Hayes-Roth
print("\n(Scikit-learn KNN) with Cross-Validation for Hayes-Roth:")
hayes_roth_accuracies_sk = knn_with_cross_validation(Hayes_Roth_data1)


(Scikit-learn KNN) with Cross-Validation for Hayes-Roth:
Accuracy (Scikit-learn KNN): 42.86%
Accuracy (Scikit-learn KNN): 50.00%
Accuracy (Scikit-learn KNN): 46.15%
Accuracy (Scikit-learn KNN): 30.77%
Accuracy (Scikit-learn KNN): 46.15%
Accuracy (Scikit-learn KNN): 38.46%
Accuracy (Scikit-learn KNN): 38.46%
Accuracy (Scikit-learn KNN): 46.15%
Accuracy (Scikit-learn KNN): 38.46%
Accuracy (Scikit-learn KNN): 38.46%

Average Accuracy (Scikit-learn KNN) over 10 folds: 41.59%


In [22]:
# data2 is Car Evaluation
print("\n(Scikit-learn KNN) with Cross-Validation for Car Evaluation:")
car_evaluation_accuracies_sk = knn_with_cross_validation(Car_data2)


(Scikit-learn KNN) with Cross-Validation for Car Evaluation:
Accuracy (Scikit-learn KNN): 87.28%
Accuracy (Scikit-learn KNN): 87.86%
Accuracy (Scikit-learn KNN): 87.86%
Accuracy (Scikit-learn KNN): 91.33%
Accuracy (Scikit-learn KNN): 94.80%
Accuracy (Scikit-learn KNN): 87.28%
Accuracy (Scikit-learn KNN): 87.86%
Accuracy (Scikit-learn KNN): 90.17%
Accuracy (Scikit-learn KNN): 88.37%
Accuracy (Scikit-learn KNN): 84.30%

Average Accuracy (Scikit-learn KNN) over 10 folds: 88.71%


In [23]:
# data3 is Breast Cancer
print("\n(Scikit-learn KNN) with Cross-Validation for Breast Cancer:")
breast_cancer_accuracies_sk = knn_with_cross_validation(Breast_Cancer_data3)


(Scikit-learn KNN) with Cross-Validation for Breast Cancer:
Accuracy (Scikit-learn KNN): 75.86%
Accuracy (Scikit-learn KNN): 65.52%
Accuracy (Scikit-learn KNN): 75.86%
Accuracy (Scikit-learn KNN): 62.07%
Accuracy (Scikit-learn KNN): 72.41%
Accuracy (Scikit-learn KNN): 65.52%
Accuracy (Scikit-learn KNN): 71.43%
Accuracy (Scikit-learn KNN): 67.86%
Accuracy (Scikit-learn KNN): 60.71%
Accuracy (Scikit-learn KNN): 82.14%

Average Accuracy (Scikit-learn KNN) over 10 folds: 69.94%


7. Comparison of the Result Accuracies for all dataset btw KNN Classifier From Scratch and Scikit Learn based KNN Classifier

        Hayes Roth

In [24]:
def perform_paired_t_test(data1, data2):
    from scipy.stats import ttest_rel
    t_statistic, p_value = ttest_rel(data1, data2)
    print(f"T-statistic: {t_statistic}")
    print(f"P-value: {p_value}")
    if p_value < 0.05:
        print("The difference is statistically significant.")
    else:
        print("The difference is not statistically significant.")


        Car Evaluation

        Breast Cancer

In [26]:
print("\nPaired t-test results for Hayes-Roth:")
perform_paired_t_test(hayes_roth_accuracies, hayes_roth_accuracies_sk)

print("\nPaired t-test results for Car Evaluation:")
perform_paired_t_test(car_evaluation_accuracies, car_evaluation_accuracies_sk)

print("\nPaired t-test results for Breast Cancer:")
perform_paired_t_test(breast_cancer_accuracies, breast_cancer_accuracies_sk)


Paired t-test results for Hayes-Roth:


TypeError: perform_paired_t_test() missing 1 required positional argument: 'data2'

In [27]:
import warnings
import csv
from collections import Counter
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import ttest_rel

warnings.filterwarnings('ignore')

# 1. Load the three datasets from the UCI-Machine Learning Repository:

# File names of the datasets
file1 = 'hayes-roth.data'
file2 = 'car.data'
file3 = 'breast-cancer.data'


# Handle data
def load_csv(filename):
    with open(filename, "r") as file:
        lines = csv.reader(file)
        dataset = list(lines)
    return dataset


# Load All the three datasets

# 1. Hayes Roth data
Hayes_Roth_data1 = load_csv(file1)
# 2. Car data
Car_data2 = load_csv(file2)
# 3. Breast Cancer data
Breast_Cancer_data3 = load_csv(file3)


# Function to Encode class labels to numerical values
def encode_class_labels(data):
    class_labels = {label: i for i, label in enumerate(set(row[-1] for row in data))}
    for row in data:
        row[-1] = class_labels[row[-1]]
    return data


# Encoding for Hayes data
Hayes_Roth_data1 = encode_class_labels(Hayes_Roth_data1)
for i in range(len(Hayes_Roth_data1)):
    Hayes_Roth_data1[i] = [int(x) for x in Hayes_Roth_data1[i]]

# Encoding for the Car Evaluation Data

# Define attribute value mappings
attribute_maps_car = {
    'buying': {'vhigh': 0, 'high': 1, 'med': 2, 'low': 3},
    'maint': {'vhigh': 0, 'high': 1, 'med': 2, 'low': 3},
    'doors': {'2': 0, '3': 1, '4': 2, '5more': 3},
    'persons': {'2': 0, '4': 1, 'more': 2},
    'lug_boot': {'small': 0, 'med': 1, 'big': 2},
    'safety': {'low': 0, 'med': 1, 'high': 2}
}


# Loop through each row of the dataset and replace attribute values with mappings
for row in Car_data2:
    row[:6] = [attribute_maps_car[attr][value] for attr, value in zip(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'], row[:6])]

# encode the class variables for the Car Evaluation data
Car_data2 = encode_class_labels(Car_data2)

# Encoding for the Breast Cancer Data

# Move the class label to the last column
for row in Breast_Cancer_data3:
    row.append(row.pop(0))

# Define attribute value mappings
attribute_maps_breast = {
    'age': {'10-19': 0, '20-29': 1, '30-39': 2, '40-49': 3, '50-59': 4, '60-69': 5, '70-79': 6, '80-89': 7, '90-99': 8},
    'menopause': {'lt40': 0, 'ge40': 1, 'premeno': 2},
    'tumor_size': {'0-4': 0, '5-9': 1, '10-14': 2, '15-19': 3, '20-24': 4, '25-29': 5, '30-34': 6, '35-39': 7, '40-44': 8, '45-49': 9, '50-54': 10},
    'inv_nodes': {'0-2': 0, '3-5': 1, '6-8': 2, '9-11': 3, '12-14': 4, '15-17': 5, '18-20': 6, '21-23': 7, '24-26': 8, '27-29': 9, '30-32': 10, '33-35': 11, '36-39': 12, '40-44': 13, '45-49': 14, '50-54': 15, '55-59': 16},
    'node_caps': {'yes': 0, 'no': 1},
    'deg_malig': {'1': 0, '2': 1, '3': 2},
    'breast': {'left': 0, 'right': 1},
    'breast_quad': {'left_up': 0, 'left_low': 1, 'right_up': 2, 'right_low': 3, 'central': 4},
    'irradiat': {'yes': 0, 'no': 1}
}


# Replace missing values with the most common value for each attribute
for i in range(len(Breast_Cancer_data3[0])):
    col = [row[i] for row in Breast_Cancer_data3 if row[i] != '?']
    most_common = Counter(col).most_common(1)[0][0]
    for row in Breast_Cancer_data3:
        if row[i] == '?':
            row[i] = most_common

# Loop through each row and replace attribute values with their integer mappings
for row in Breast_Cancer_data3:
    row[:9] = [attribute_maps_breast[attr][value] for attr, value in zip(['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irradiat'], row[:9])]

# encode the class variables for the breast cancer data
Breast_Cancer_data3 = encode_class_labels(Breast_Cancer_data3)


# Euclidean distance function
def euclidean_distance(x1, x2):
    distance = 0
    for i in range(len(x1)):
        distance += (x1[i] - x2[i]) ** 2
    return distance ** 0.5


# KNN algorithm
def knn(train_data, test_instance, k):
    distances = []
    for i, train_instance in enumerate(train_data):
        dist = euclidean_distance(train_instance[:-1], test_instance[:-1])
        distances.append((i, dist))
    distances.sort(key=lambda x: x[1])
    neighbors = [train_data[i] for i, _ in distances[:k]]
    return neighbors


# Predict the class label based on k neighbors
def predict(neighbors):
    class_labels = [neighbor[-1] for neighbor in neighbors]
    most_common = Counter(class_labels).most_common(1)
    return most_common[0][0]


# K-fold cross-validation
def k_fold_cross_validation(data, k_folds, k_neighbors):
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=None)
    accuracies = []
    for train_index, test_index in kf.split(data):
        train_data = [data[i] for i in train_index]
        test_data = [data[i] for i in test_index]
        correct = 0
        for test_instance in test_data:
            neighbors = knn(train_data, test_instance, k_neighbors)
            prediction = predict(neighbors)
            if prediction == test_instance[-1]:
                correct += 1
        accuracies.append(correct / len(test_data))
    return accuracies


# Perform paired t-test
def perform_paired_t_test(data1, data2):
    t_stat, p_value = ttest_rel(data1, data2)
    print(f"T-statistic: {t_stat}")
    print(f"P-value: {p_value}")
    if p_value < 0.05:
        print("The difference is statistically significant.")
    else:
        print("The difference is not statistically significant.")


# Apply KNN algorithm using 10-Fold cross validation on the datasets

# Hayes-Roth
hayes_roth_accuracies = k_fold_cross_validation(Hayes_Roth_data1, k_folds=10, k_neighbors=3)

# Car Evaluation
car_evaluation_accuracies = k_fold_cross_validation(Car_data2, k_folds=10, k_neighbors=3)

# Breast Cancer
breast_cancer_accuracies = k_fold_cross_validation(Breast_Cancer_data3, k_folds=10, k_neighbors=3)

# Scikit-learn based KNN classifier
def knn_with_cross_validation(dataset, k_neighbors=3, k_folds=10):
    X = [row[:-1] for row in dataset]
    y = [row[-1] for row in dataset]
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=None)
    accuracies = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]
        knn_classifier = KNeighborsClassifier(n_neighbors=k_neighbors)
        knn_classifier.fit(X_train, y_train)
        y_pred = knn_classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)
    return accuracies

# Hayes-Roth
hayes_roth_accuracies_sk = knn_with_cross_validation(Hayes_Roth_data1)

# Car Evaluation
car_evaluation_accuracies_sk = knn_with_cross_validation(Car_data2)

# Breast Cancer
breast_cancer_accuracies_sk = knn_with_cross_validation(Breast_Cancer_data3)

# Comparison of the Result Accuracies for all datasets between KNN Classifier From Scratch and Scikit Learn based KNN Classifier

print("\nPaired t-test results for Hayes-Roth:")
perform_paired_t_test(hayes_roth_accuracies, hayes_roth_accuracies_sk)

print("\nPaired t-test results for Car Evaluation:")
perform_paired_t_test(car_evaluation_accuracies, car_evaluation_accuracies_sk)

print("\nPaired t-test results for Breast Cancer:")
perform_paired_t_test(breast_cancer_accuracies, breast_cancer_accuracies_sk)



Paired t-test results for Hayes-Roth:
T-statistic: -1.176383517921496
P-value: 0.2696183557138195
The difference is not statistically significant.

Paired t-test results for Car Evaluation:
T-statistic: -0.3097139905233788
P-value: 0.7638294630615718
The difference is not statistically significant.

Paired t-test results for Breast Cancer:
T-statistic: -0.5998239009644394
P-value: 0.563418407031062
The difference is not statistically significant.
