In [1]:
import pandas as pd
import numpy as np

class Node:
    """One node of the decision tree.

    ``classify`` treats a node whose ``label`` is not ``None`` as a leaf and
    returns that label. Otherwise ``attribute_name`` names the attribute the
    node tests and ``branches`` maps each attribute value to a child node.
    ``attribute_value`` is stored as given; nothing visible in this notebook
    reads it back.
    """

    def __init__(self, label=None, attribute_name=None, attribute_value=None, branches=None):
        # A falsy ``branches`` (None or an empty mapping) is swapped for a
        # brand-new dict, so nodes never share a default container.
        child_map = branches if branches else {}
        self.label, self.attribute_name = label, attribute_name
        self.attribute_value = attribute_value
        self.branches = child_map

def entropy(data):
    """Shannon entropy, in bits, of the labels in the last column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequencies of the labels.
        A table with a single distinct label (or no rows) yields zero.
    """
    label_counts = data.iloc[:, -1].value_counts().values
    total = len(data)
    p = label_counts / total
    # value_counts() lists unobserved categories of a categorical column with
    # a count of 0, and 0 * log2(0) evaluates to NaN. Such labels contribute
    # nothing to the entropy, so drop them before taking the logarithm.
    p = p[p > 0]
    return -np.sum(p * np.log2(p))

def information_gain(data, attribute_name):
    """Entropy reduction obtained by splitting ``data`` on ``attribute_name``.

    The result is the entropy of the whole table minus the size-weighted
    entropy of the partitions induced by each distinct value of the attribute.
    """
    n_rows = len(data)
    distinct_values, occurrences = np.unique(data[attribute_name], return_counts=True)

    # Accumulate each partition's entropy, weighted by its share of the rows.
    conditional_entropy = 0
    for value, count in zip(distinct_values, occurrences):
        partition = data[data[attribute_name] == value]
        conditional_entropy += (count / n_rows) * entropy(partition)

    return entropy(data) - conditional_entropy

def decision_tree_algorithm(data, attributes, depth=0, max_depth=5):
    """Recursively grow a decision tree by greedy information-gain splitting.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; the last column holds the class labels.
    attributes : list
        Column names still available for splitting. The list is not modified.
    depth : int, default 0
        Depth of the node being built (the root is built at depth 0).
    max_depth : int, default 5
        Depth at which growth stops and a leaf is emitted.

    Returns
    -------
    Node
        A leaf when every row shares one label, when no attributes remain, or
        when ``depth`` equals ``max_depth`` (the last two use the first mode
        of the label column). Otherwise an internal node with one branch per
        distinct value of the chosen attribute.
    """
    labels = data.iloc[:, -1]
    unique_labels = labels.unique()
    if len(unique_labels) == 1:
        return Node(label=unique_labels[0])
    if not attributes or depth == max_depth:
        return Node(label=labels.mode().values[0])

    best_attribute = max(attributes, key=lambda attr: information_gain(data, attr))
    # Build a new list rather than calling attributes.remove(): removing in
    # place would silently shrink the list object the caller passed in.
    remaining = [attr for attr in attributes if attr != best_attribute]

    tree = Node(attribute_name=best_attribute)
    for value in data[best_attribute].unique():
        subset = data[data[best_attribute] == value]
        # ``remaining`` is never mutated, so sibling subtrees can share it.
        tree.branches[value] = decision_tree_algorithm(subset, remaining, depth + 1, max_depth)
    return tree

def classify(instance, tree):
    """Walk ``tree`` from the root and return the label predicted for ``instance``.

    ``instance`` is indexed by attribute name (e.g. a DataFrame row). The
    result is ``None`` when the instance carries an attribute value for which
    the current node has no branch.
    """
    node = tree
    # Descend until a node carrying a label (a leaf) is reached.
    while node.label is None:
        observed = instance[node.attribute_name]
        if observed not in node.branches:
            return None
        node = node.branches[observed]
    return node.label

def k_fold_cross_validation(data, k=5, shuffle=False, random_state=None):
    """Estimate decision-tree accuracy with k-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset; the last column holds the class labels.
    k : int, default 5
        Number of folds.
    shuffle : bool, default False
        Shuffle the rows before splitting. The default keeps the rows in
        their given order; if that order is not random (e.g. the file is
        sorted), contiguous folds can give a biased estimate.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` when ``shuffle`` is true.

    Returns
    -------
    numpy.float64
        Mean, over the folds, of the fraction of validation rows whose
        predicted label equals the true label. Rows for which ``classify``
        returns ``None`` count as wrong.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 or larger than the number of rows.
    """
    n_rows = len(data)
    if k < 2 or k > n_rows:
        raise ValueError(f"k must be between 2 and the number of rows ({n_rows}), got {k}")
    if shuffle:
        data = data.sample(frac=1, random_state=random_state)

    # Split row positions rather than the DataFrame itself: handing a
    # DataFrame to np.array_split relies on the deprecated
    # DataFrame.swapaxes. The resulting fold sizes are the same.
    fold_positions = np.array_split(np.arange(n_rows), k)
    accuracies = []

    for i in range(k):
        train_positions = np.concatenate([fold_positions[j] for j in range(k) if j != i])
        train = data.iloc[train_positions]
        validation = data.iloc[fold_positions[i]]

        attributes = train.columns[:-1].tolist()
        tree = decision_tree_algorithm(train, attributes)

        # .iloc[-1] reads the label by position; row[-1] on a label-indexed
        # Series is a deprecated positional fallback.
        correct = 0
        for _, row in validation.iterrows():
            if classify(row, tree) == row.iloc[-1]:
                correct += 1

        accuracies.append(correct / len(validation))
    return np.mean(accuracies)

# Load the dataset and replace every column (features and label alike) with
# the integer codes of its categories.
data = pd.read_csv('Car_evaluation.csv')
for column in list(data):
    as_category = data[column].astype('category')
    data[column] = as_category.cat.codes

# Cross-validate the decision tree with the default number of folds and report the mean accuracy.
accuracy = k_fold_cross_validation(data)
print(f"Accuracy with 5-Fold Cross-Validation: {accuracy * 100:.2f}%")


Accuracy with 5-Fold Cross-Validation: 72.78%


In [2]:
import pandas as pd
import numpy as np

def euclidean_distance(instance1, instance2):
    """Euclidean (L2) distance between two equally sized numeric vectors.

    Parameters
    ----------
    instance1, instance2 : array-like of numbers
        Feature vectors of the same length.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((instance1 - instance2) ** 2))``.
    """
    # Do the arithmetic in float64. The notebook encodes columns with
    # ``cat.codes`` (small integer dtypes such as int8), and squaring a
    # difference in that dtype can wrap around and produce NaN or a wrong
    # distance.
    delta = np.asarray(instance1, dtype=float) - np.asarray(instance2, dtype=float)
    return np.sqrt(np.sum(delta ** 2))

def knn(training_data, test_instance, k=3):
    """Predict the label of ``test_instance`` by majority vote of its nearest neighbours.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Labelled rows; every column but the last is a numeric feature and the
        last column is the label.
    test_instance : pandas.Series or sequence
        Row laid out like ``training_data``. Its last element is ignored.
    k : int, default 3
        Number of neighbours that vote. If it exceeds the number of training
        rows, every training row votes.

    Returns
    -------
    The label with the most votes among the ``k`` nearest rows (Euclidean
    distance). Equal distances keep training order, and a tied vote goes to
    the label met first when scanning neighbours from nearest to farthest.

    Raises
    ------
    ValueError
        If ``training_data`` has no rows or ``k`` is smaller than 1.
    """
    n_train = len(training_data)
    if n_train == 0:
        raise ValueError("training_data must contain at least one row")
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Drop the trailing label by position; .iloc avoids the deprecated
    # positional fallback of Series[...] on a label-indexed row.
    query_features = test_instance.iloc[:-1] if hasattr(test_instance, "iloc") else test_instance[:-1]
    query = np.asarray(query_features, dtype=float)
    features = training_data.iloc[:, :-1].to_numpy(dtype=float)
    labels = training_data.iloc[:, -1].to_numpy()

    # One vectorised pass replaces the per-row Python loop. Float arithmetic
    # also avoids overflow when the columns are small integer codes.
    distances = np.sqrt(np.sum((features - query) ** 2, axis=1))
    nearest = np.argsort(distances, kind="stable")[:min(k, n_train)]

    votes = {}
    for response in labels[nearest]:
        votes[response] = votes.get(response, 0) + 1
    return max(votes, key=votes.get)

def k_fold_cross_validation(data, k_neighbors=3, k_folds=5, random_state=None):
    """Estimate k-NN accuracy with shuffled k-fold cross-validation.

    Parameters
    ----------
    data : pandas.DataFrame
        Full dataset; the last column holds the class labels.
    k_neighbors : int, default 3
        Number of neighbours passed to ``knn``.
    k_folds : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for the shuffle. Leave as
        ``None`` to draw from NumPy's global random state.

    Returns
    -------
    numpy.float64
        Mean, over the folds, of the fraction of validation rows whose
        predicted label equals the true label.

    Raises
    ------
    ValueError
        If ``k_folds`` is smaller than 2 or larger than the number of rows.
    """
    shuffled_data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    n_rows = len(shuffled_data)
    if k_folds < 2 or k_folds > n_rows:
        raise ValueError(
            f"k_folds must be between 2 and the number of rows ({n_rows}), got {k_folds}"
        )

    # Split row positions rather than the DataFrame itself: handing a
    # DataFrame to np.array_split relies on the deprecated
    # DataFrame.swapaxes. The resulting fold sizes are the same.
    fold_positions = np.array_split(np.arange(n_rows), k_folds)

    accuracies = []
    for i in range(k_folds):
        train_positions = np.concatenate(
            [fold_positions[j] for j in range(k_folds) if j != i]
        )
        training_data = shuffled_data.iloc[train_positions]
        validation_data = shuffled_data.iloc[fold_positions[i]]

        correct_predictions = 0
        for _, instance in validation_data.iterrows():
            prediction = knn(training_data, instance, k_neighbors)
            # .iloc[-1] reads the label by position; instance[-1] on a
            # label-indexed Series is a deprecated positional fallback.
            if prediction == instance.iloc[-1]:
                correct_predictions += 1
        accuracies.append(correct_predictions / len(validation_data))

    return np.mean(accuracies)

# Seed the random number generator so the printed accuracy is reproducible on
# Restart & Run All. DataFrame.sample draws from NumPy's global state when no
# random_state is given, so seeding it also fixes the shuffle performed
# inside k_fold_cross_validation.
SEED = 42
np.random.seed(SEED)

# Load the dataset and replace every column (features and label alike) with
# the integer codes of its categories.
data = pd.read_csv('Car_evaluation.csv')
for column in data.columns:
    data[column] = data[column].astype('category').cat.codes

# Evaluate on a 5% subsample of the rows, as the printed message states.
data_sample = data.sample(frac=0.05, random_state=SEED).reset_index(drop=True)

accuracy = k_fold_cross_validation(data_sample, k_neighbors=13, k_folds=5)
print(f"Accuracy with 5-Fold Cross-Validation on 5% of the data: {accuracy * 100:.2f}%")


Accuracy with 5-Fold Cross-Validation on 5% of the data: 74.38%
