In [22]:
import os
import requests
import zipfile
import pandas as pd

def load_vertebral_column_uci():
    # URL do arquivo ZIP
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip"
    # Caminho local para salvar o arquivo ZIP
    zip_path = "dados/vertebral_column_data.zip"
    # Caminho local para o arquivo de dados extraído
    data_path = "dados/column_3C.dat"

    # Baixar o arquivo ZIP se ainda não foi baixado
    if not os.path.exists(zip_path):
        r = requests.get(url)
        with open(zip_path, "wb") as f:
            f.write(r.content)

    # Extrair o arquivo de dados do ZIP se ainda não foi extraído
    if not os.path.exists(data_path):
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall("dados")

    # Ler o arquivo de dados
    column_names = ['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle', 'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis', 'class']
    vertebral_data = pd.read_csv(data_path, header=None, sep=' ', names=column_names)
    X = vertebral_data.iloc[:, :-1].values
    y = vertebral_data.iloc[:, -1].replace({'DH': 0, 'SL': 1, 'NO': 2}).values

    return X, y


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_data(X, y, test_size=0.2, random_state=42):
    # Dividir os dados em conjuntos de treinamento e teste
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Padronizar os conjuntos de treinamento e teste separadamente
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test


In [24]:
from sklearn.neighbors import KNeighborsClassifier

def train_knn(X_train, y_train, n_neighbors=5):
    knn_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn_classifier.fit(X_train, y_train)
    return knn_classifier


In [25]:
import numpy as np

class DMC:
    def __init__(self):
        self.centroids = None

    def fit(self, X_train, y_train):
        self.centroids = {}
        labels = np.unique(y_train)
        for label in labels:
            self.centroids[label] = np.mean(X_train[y_train == label], axis=0)

    def predict(self, X_test):
        y_pred = []
        for x in X_test:
            min_distance = float('inf')
            predicted_label = None
            for label, centroid in self.centroids.items():
                distance = np.linalg.norm(x - centroid)
                if distance < min_distance:
                    min_distance = distance
                    predicted_label = label
            y_pred.append(predicted_label)
        return np.array(y_pred)


In [26]:
class NaiveBayes:
    def __init__(self):
        self.class_probs = None
        self.means = None
        self.stds = None

    def fit(self, X_train, y_train):
        self.class_probs = {}
        self.means = {}
        self.stds = {}
        labels = np.unique(y_train)
        for label in labels:
            self.class_probs[label] = np.mean(y_train == label)
            self.means[label] = np.mean(X_train[y_train == label], axis=0)
            self.stds[label] = np.std(X_train[y_train == label], axis=0)

    def predict(self, X_test):
        y_pred = []
        for x in X_test:
            max_prob = float('-inf')
            predicted_label = None
            for label, prob in self.class_probs.items():
                likelihood = np.sum(np.log((1 / (np.sqrt(2 * np.pi) * self.stds[label] + 1e-9)) * np.exp(-((x - self.means[label]) ** 2) / (2 * ((self.stds[label] + 1e-9) ** 2)))))
                posterior = np.log(prob) + likelihood
                if posterior > max_prob:
                    max_prob = posterior
                    predicted_label = label
            y_pred.append(predicted_label)
        return np.array(y_pred)


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np
import os
import requests
import zipfile
import pandas as pd

# Função para carregar os dados do conjunto de dados vertebral column da UCI
def load_vertebral_column_uci():
    # URL do arquivo ZIP
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00212/vertebral_column_data.zip"
    # Caminho local para salvar o arquivo ZIP
    zip_path = "dados/vertebral_column_data.zip"
    # Caminho local para o arquivo de dados extraído
    data_path = "dados/column_3C.dat"

    # Baixar o arquivo ZIP se ainda não foi baixado
    if not os.path.exists(zip_path):
        r = requests.get(url)
        with open(zip_path, "wb") as f:
            f.write(r.content)

    # Extrair o arquivo de dados do ZIP se ainda não foi extraído
    if not os.path.exists(data_path):
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall("dados")

    # Ler o arquivo de dados
    column_names = ['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle', 'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis', 'class']
    vertebral_data = pd.read_csv(data_path, header=None, sep=' ', names=column_names)
    X = vertebral_data.iloc[:, :-1].values
    y = vertebral_data.iloc[:, -1].replace({'DH': 0, 'SL': 1, 'NO': 2}).values

    return X, y

# Função para dividir os dados em treino e teste e executar o holdout com 20 realizações
def holdout(X, y, test_size=0.3, random_state=42, num_runs=20):
    best_accuracy_knn = -1
    best_accuracy_dmc = -1
    best_accuracy_nb = -1
    accuracies_knn = []
    accuracies_dmc = []
    accuracies_nb = []

    for run in range(1, num_runs + 1):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        # Treinamento e teste do k-NN
        knn = KNeighborsClassifier()
        knn.fit(X_train, y_train)
        y_pred_knn = knn.predict(X_test)
        accuracy_knn = accuracy_score(y_test, y_pred_knn)
        accuracies_knn.append(accuracy_knn)
        if accuracy_knn > best_accuracy_knn:
            best_accuracy_knn = accuracy_knn

        # Treinamento e teste do DMC (Discriminante de Mínima Distância)
        dmc = NearestCentroid()
        dmc.fit(X_train, y_train)
        y_pred_dmc = dmc.predict(X_test)
        accuracy_dmc = accuracy_score(y_test, y_pred_dmc)
        accuracies_dmc.append(accuracy_dmc)
        if accuracy_dmc > best_accuracy_dmc:
            best_accuracy_dmc = accuracy_dmc

        # Treinamento e teste do Naive Bayes
        nb = GaussianNB()
        nb.fit(X_train, y_train)
        y_pred_nb = nb.predict(X_test)
        accuracy_nb = accuracy_score(y_test, y_pred_nb)
        accuracies_nb.append(accuracy_nb)
        if accuracy_nb > best_accuracy_nb:
            best_accuracy_nb = accuracy_nb

    # Calcula a média e o desvio padrão das acurácias para cada classificador
    mean_accuracy_knn = np.mean(accuracies_knn)
    std_accuracy_knn = np.std(accuracies_knn)
    mean_accuracy_dmc = np.mean(accuracies_dmc)
    std_accuracy_dmc = np.std(accuracies_dmc)
    mean_accuracy_nb = np.mean(accuracies_nb)
    std_accuracy_nb = np.std(accuracies_nb)

    return (best_accuracy_knn, mean_accuracy_knn, std_accuracy_knn), \
           (best_accuracy_dmc, mean_accuracy_dmc, std_accuracy_dmc), \
           (best_accuracy_nb, mean_accuracy_nb, std_accuracy_nb)

# Carregar os dados
X, y = load_vertebral_column_uci()

# Executar o holdout
best_knn, best_dmc, best_nb = holdout(X, y)

print("Melhor acurácia para k-NN:")
print("Acurácia máxima:", best_knn[0])
print("Média das acurácias:", best_knn[1])
print("Desvio padrão das acurácias:", best_knn[2])

print("\nMelhor acurácia para DMC:")
print("Acurácia máxima:", best_dmc[0])
print("Média das acurácias:", best_dmc[1])
print("Desvio padrão das acurácias:", best_dmc[2])

print("\nMelhor acurácia para Naive Bayes:")
print("Acurácia máxima:", best_nb[0])
print("Média das acurácias:", best_nb[1])
print("Desvio padrão das acurácias:", best_nb[2])


  y = vertebral_data.iloc[:, -1].replace({'DH': 0, 'SL': 1, 'NO': 2}).values


NameError: name 'NearestCentroid' is not defined