In [29]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.tree import DecisionTreeClassifier

pio.templates.default = "seaborn"
RANDOM_STATE = 1234

# Algoritmos

## Regressão Logística

In [None]:
class LogisticRegression:
    """Classe de implementação da Regressão Logistica"""

    def __init__(self, alfa=0.5, n_iter=250):
        self.alfa = alfa
        self.n_iter = n_iter

    def sigmoid(self, z):
        return 1.0 / (1.0 + np.exp(-z))

    def fit(self, X, y):
        self.W = np.ones(shape=(1 + X.shape[1])) * 0.5
        self.cost_values = []
        N = y.size

        for i in range(self.n_iter):
            output = self.sigmoid(np.dot(X, self.W[1:]) + self.W[0])
            errors = y - output
            self.W[1:] += (self.alfa / N) * X.T.dot(errors)
            self.W[0] += (self.alfa / N) * errors.sum()

            cost = -(1 / N) * y.dot(np.log(output)) - ((1 - y).dot(np.log(1 - output)))
            self.cost_values.append(cost)

    def predict(self, X):
        return np.where(np.dot(X, self.W[1:]) + self.W[0] >= 0.0, 1, 0)

## Discriminante Gaussiano

In [None]:
class ADG:
    """Classe de implementação do Discriminante Gaussiano"""

    def __init__(self):
        self.P_CK = dict()
        self.mu = dict()
        self.sigma = dict()

    def fit(self, X, y):
        self.labels = np.unique(y)
        for k in self.labels:
            X_k = X[y == k]
            self.P_CK[k] = X_k.shape[0] / X.shape[0]
            self.mu[k] = np.mean(X_k, axis=0)
            self.sigma[k] = np.cov(X_k, rowvar=False)

    def predict(self, X):
        pred_list = list()
        for xi in X:
            logs_list = list()
            for k in self.labels:
                sigma_inv_mat = np.linalg.pinv(self.sigma[k])
                sigma_inv_det = np.linalg.det(sigma_inv_mat)
                x_minus_mu = xi - self.mu[k]

                log_p_ck = np.log(self.P_CK[k])

                ps = (
                    0.5 * np.log(sigma_inv_det)
                    - 0.5 * x_minus_mu.T @ sigma_inv_mat @ x_minus_mu
                )

                log_ck = log_p_ck + ps

                logs_list.append(log_ck)
            pred = self.labels[np.argmax(logs_list)]
            pred_list.append(pred)
        return np.array(pred_list)

## Naive Bayes Gaussiano

In [None]:
class NBG:
    """Classe de implementação do Naive Bayes Gaussiano"""

    def __init__(self):
        self.P_CK = dict()
        self.mu = dict()
        self.sigma = dict()

    def fit(self, X, y):
        self.labels = np.unique(y)
        for k in self.labels:
            X_k = X[y == k]
            self.P_CK[k] = X_k.shape[0] / X.shape[0]
            self.mu[k] = np.mean(X_k, axis=0)
            self.sigma[k] = np.diag(np.diag(np.cov(X_k, rowvar=False)))

    def predict(self, X):
        pred_list = list()
        for xi in X:
            logs_list = list()
            for k in self.labels:
                sigma_inv_mat = np.linalg.inv(self.sigma[k])
                sigma_inv_det = np.linalg.det(sigma_inv_mat)
                x_minus_mu = xi - self.mu[k]

                log_p_ck = np.log(self.P_CK[k])

                ps = (
                    0.5 * np.log(sigma_inv_det)
                    - 0.5 * x_minus_mu.T @ sigma_inv_mat @ x_minus_mu
                )

                log_ck = log_p_ck + ps

                logs_list.append(log_ck)
            pred = self.labels[np.argmax(logs_list)]
            pred_list.append(pred)
        return np.array(pred_list)

## K Nearest Neighbors

In [None]:
class KNN:
    """Classe de implementação do K Vizinhos Mais Proximos"""

    def __init__(self, k):
        self.k = k
        self.k_indices = []

    def _get_distances(self, X_source, X_target):
        dists = (
            -2 * np.dot(X_target, X_source.T)
            + np.sum(X_source ** 2, axis=1)
            + np.sum(X_target ** 2, axis=1)[:, np.newaxis]
        )
        return dists

    def _find_k_nearest_index(self, distances, k):
        return np.argpartition(distances, k, axis=1)[:, :k]

    def _find_k_nearest_classes(self, y_target, k_indices):
        y_pred = y_target.to_numpy()[k_indices]
        return np.where(np.sum(y_pred, axis=1) > k_indices.shape[1] / 2, 1, 0)

    def fit(self, X_train, X_test):
        distances = self._get_distances(X_train, X_test)
        self.k_indices = self._find_k_nearest_index(distances, self.k)

    def predict(self, y_train):
        return self._find_k_nearest_classes(y_train, self.k_indices)

# Inicialização

In [None]:
df = pd.read_csv("breastcancer.csv", header=None)

In [None]:
def split_data(df, train_size=0.7, random_state=RANDOM_STATE):
    """
    Divisão dos dados
    """
    df_train = df.sample(frac=train_size, random_state=random_state)
    df_test = df.drop(index=df_train.index)

    x_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
    x_test, y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

    return x_train, x_test, y_train, y_test

In [None]:
# Criando variaveis auxiliares para utilizar posteriormente
x_train, x_test, y_train, y_test = split_data(
    df
)  
df_train = pd.DataFrame(
    np.hstack((x_train.to_numpy(), y_train.to_numpy()[:, np.newaxis]))
)
df_test = pd.DataFrame(np.hstack((x_test.to_numpy(), y_test.to_numpy()[:, np.newaxis])))

# Pré Processamento

In [None]:
class Preprocesser:
    def __init__(self, x):
        self.min = np.min(x, axis=0)
        self.max = np.max(x, axis=0)

    def make_scaling(self, x):
        x_scaled = (x - self.min) / (self.max - self.min)
        return x_scaled

    def reverse_scaling(self, x_scaled):
        x_original = x_scaled * (self.max - self.min) + self.min
        return x_original

In [None]:
x_train_scaled = Preprocesser(x_train).make_scaling(x_train)
x_test_scaled = Preprocesser(x_train).make_scaling(x_test)

# Métricas

In [None]:
def get_metrics(y_test, y_pred):
    """
    Função para contar os as ocorrências das classes e armazenar os valores da métricas.

    TP = True Positives
    TN = True Negatives
    FP = False Positives
    FN = False Negtives
    """
    tp, tn, fp, fn = 0, 0, 0, 0
    # counting tp, tn, fn, fp
    for x, y in list(zip(y_test, y_pred)):
        if x == 1 and x == y:  # y == 1
            tp += 1
        elif x == 1 and x != y:  # y == 0
            fn += 1
        elif x == 0 and x == y:  # y == 0
            tn += 1
        elif x == 0 and x != y:  # y == 1
            fp += 1
    # create dict of metrics
    metrics = dict.fromkeys(["precision", "recall", "f1_score", "accuracy"])
    metrics["precision"] = tp / (tp + fp)
    metrics["recall"] = tp / (tp + fn)
    metrics["f1_score"] = (
        2
        * (metrics["recall"] * metrics["precision"])
        / (metrics["recall"] + metrics["precision"])
    )
    metrics["accuracy"] = (tp + tn) / (tp + fp + tn + fn)

    return metrics

In [None]:
def get_mean_metrics(metrics_):
    """
    Função para sumarizar os dados que serão recebidos posteriormente pelo kfold.
    Necessária pois armazeno todas as informações de cada fold e tiro a média depois.
    """

    # if k_folded get the results from the list of metrics
    precision_list = [item["metrics"]["precision"] for item in metrics_]
    accuracy_list = [item["metrics"]["accuracy"] for item in metrics_]
    recall_list = [item["metrics"]["recall"] for item in metrics_]
    f1_score_list = [item["metrics"]["f1_score"] for item in metrics_]

    values = dict(
        precision=[np.mean(precision_list), np.std(precision_list)],
        accuracy=[np.mean(accuracy_list), np.std(accuracy_list)],
        recall=[np.mean(recall_list), np.std(recall_list)],
        f1_score=[np.mean(f1_score_list), np.std(f1_score_list)],
    )

    # print the list of metrics generated through k-fold
    pretty_text = (
        f"Precision: {values['precision'][0]:.2f} ± {values['precision'][1]:.2f}"
    )
    pretty_text += (
        f"\nAccuracy: {values['accuracy'][0]:.2f} ± {values['accuracy'][1]:.2f}"
    )
    pretty_text += f"\nRecall: {values['recall'][0]:.2f} ± {values['recall'][1]:.2f}"
    pretty_text += (
        f"\nF1 Score: {values['f1_score'][0]:.2f} ± {values['f1_score'][1]:.2f}"
    )
    print(pretty_text)

# K - Fold

In [None]:
class KFoldValidation:
    def __init__(self, X, k):
        self.X_v = X.copy()
        self.k = k
        self.k_metrics = []
        self.k_indices = np.array_split(X.index, k)

    def _make_folds(self, i):
        
        """"
        Função para dividir os indices do dataset em k partes iguais.
        A partir dessas k partes, separo 1 para teste e k - 1 para treino,
        para fazer a validação. 
        """

        # create test fold, using the i-th chunk of the list of indices
        X_v_test = self.X_v.loc[self.k_indices[i]]

        # create train fold, using the rest that it wasn't used on test
        X_v_train = self.X_v.drop(index=self.k_indices[i])

        return X_v_train, X_v_test

    def _preprocess(self, X_v_train, X_v_test):
        X_v_train_scaled = Preprocesser(X_v_train.iloc[:, :-1]).make_scaling(
            X_v_train.iloc[:, :-1]
        )
        X_v_test_scaled = Preprocesser(X_v_train.iloc[:, :-1]).make_scaling(
            X_v_test.iloc[:, :-1]
        )
        return X_v_train_scaled, X_v_test_scaled

    def _save_results(self, y_pred, y_test, fold):
        # get_scores
        metrics = get_metrics(y_test, y_pred)
        # save_results
        self.k_metrics.append(dict(indices=self.k_indices[fold], metrics=metrics))

    def execute(self, classifier, how="with_std"):
        """
        Função de execução do kfold.
        Recebe dois argumentos:
            1. Classifier -> Classificador que será utilizado
            2. How -> Maneira de como se trabalhar os dados (normalização na reg. logistica
            mas não nos classificadores estatisticos) .
        """

        for fold in range(self.k):

            X_v_train, X_v_test = self._make_folds(fold)

            if how == "with_std":
                X_v_train_scaled, X_v_test_scaled = self._preprocess(
                    X_v_train, X_v_test
                )
                classifier.fit(X_v_train_scaled, X_v_train.iloc[:, -1])
                y_pred = classifier.predict(X_v_test_scaled)

            if how == "knn_only":
                X_v_train_scaled, X_v_test_scaled = self._preprocess(
                    X_v_train, X_v_test
                )
                classifier.fit(X_v_train_scaled.to_numpy(), X_v_test_scaled.to_numpy())
                y_pred = classifier.predict(X_v_train.iloc[:, -1])

            if how == "without_std":
                classifier.fit(
                    X_v_train.iloc[:, :-1].to_numpy(), X_v_train.iloc[:, -1].to_numpy()
                )
                y_pred = classifier.predict(X_v_test.iloc[:, :-1].to_numpy())

            self._save_results(y_pred, X_v_test.iloc[:, -1], fold)

# Implementação

## Regressão Logística

### Validação

In [None]:
lr_classifier = LogisticRegression()
lr_kfold = KFoldValidation(df_train, 10)
lr_kfold.execute(lr_classifier, how="with_std")
get_mean_metrics(lr_kfold.k_metrics)

Precision: 0.93 ± 0.06
Accuracy: 0.94 ± 0.05
Recall: 0.99 ± 0.02
F1 Score: 0.96 ± 0.03


### Teste Final

In [None]:
lr_classifier.fit(x_train_scaled, y_train)
get_metrics(y_test, lr_classifier.predict(x_test_scaled))

{'accuracy': 0.9649122807017544,
 'f1_score': 0.9714285714285713,
 'precision': 0.9622641509433962,
 'recall': 0.9807692307692307}

In [None]:
fig = px.line(
    x=list(range(np.size(lr_classifier.cost_values))),
    y=lr_classifier.cost_values,
    title="Custo x Iteração",
)
fig.update_layout(
    autosize=False, xaxis=dict(title="Iteração"), yaxis=dict(title="Custo no Treino")
)

## Analise do Discriminante Gaussiano

### Validação

In [None]:
qda_classifier = ADG()
qda_kfold = KFoldValidation(
    df_train, 3
)  # Para valores acima de 3, tenho problemas com a inversa da matriz.
qda_kfold.execute(qda_classifier, how="without_std")
get_mean_metrics(qda_kfold.k_metrics)

Precision: 0.97 ± 0.03
Accuracy: 0.94 ± 0.01
Recall: 0.94 ± 0.01
F1 Score: 0.95 ± 0.01


### Teste Final

In [None]:
qda_classifier.fit(df_train.iloc[:, :-1].to_numpy(), df_train.iloc[:, -1].to_numpy())
get_metrics(
    qda_classifier.predict(df_test.iloc[:, :-1].to_numpy()),
    df_test.iloc[:, -1].to_numpy(),
)

{'accuracy': 0.935672514619883,
 'f1_score': 0.9452736318407959,
 'precision': 0.9134615384615384,
 'recall': 0.979381443298969}

## Naive Bayes Gaussiano

### Validação

In [None]:
nbg_classifier = NBG()
nbg_kfold = KFoldValidation(
    df_train, 10
)
nbg_kfold.execute(nbg_classifier, how="without_std")
get_mean_metrics(nbg_kfold.k_metrics)

Precision: 0.93 ± 0.05
Accuracy: 0.92 ± 0.03
Recall: 0.95 ± 0.03
F1 Score: 0.94 ± 0.03


### Teste Final

In [None]:
nbg_classifier.fit(df_train.iloc[:, :-1].to_numpy(), df_train.iloc[:, -1].to_numpy())

get_metrics(
    nbg_classifier.predict(df_test.iloc[:, :-1].to_numpy()),
    df_test.iloc[:, -1].to_numpy(),
)

{'accuracy': 0.9298245614035088,
 'f1_score': 0.9411764705882353,
 'precision': 0.9230769230769231,
 'recall': 0.96}

## KNN ( K = 3, Euclideano)

### Validação

In [None]:
knn_classifier = KNN(3)
knn_kfold = KFoldValidation(df_train, 10)
knn_kfold.execute(knn_classifier, how="knn_only")
get_mean_metrics(knn_kfold.k_metrics)

Precision: 0.96 ± 0.03
Accuracy: 0.96 ± 0.02
Recall: 0.98 ± 0.02
F1 Score: 0.97 ± 0.02


### Teste Final

In [None]:
X_train_scaled = x_train_scaled.to_numpy()
X_test_scaled = x_test_scaled.to_numpy()

In [None]:
knn = KNN(3)
knn.fit(X_train_scaled, X_test_scaled)
y_pred = knn.predict(y_train)
get_metrics(y_test, y_pred)

{'accuracy': 0.9532163742690059,
 'f1_score': 0.9607843137254902,
 'precision': 0.98,
 'recall': 0.9423076923076923}

## Decision Trees

### Validação

In [None]:
dt_classifier = DecisionTreeClassifier(criterion="gini", random_state=RANDOM_STATE)
dt_kfold = KFoldValidation(df_train, 10)

In [None]:
dt_kfold.execute(dt_classifier, how="without_std")

In [None]:
get_mean_metrics(dt_kfold.k_metrics)

Precision: 0.95 ± 0.04
Accuracy: 0.93 ± 0.03
Recall: 0.95 ± 0.04
F1 Score: 0.95 ± 0.02


### Teste Final

In [None]:
dt_classifier.fit(X=x_train_scaled, y=y_train)
get_metrics(y_test, dt_classifier.predict(x_test_scaled))

{'accuracy': 0.9122807017543859,
 'f1_score': 0.9261083743842364,
 'precision': 0.9494949494949495,
 'recall': 0.9038461538461539}