# **Parte 1: Projeto Machine Learning (Algoritmo de Classificação)**

# 1. Importando as bibliotecas

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
  accuracy_score,
  confusion_matrix,
  ConfusionMatrixDisplay,
  f1_score,
  classification_report,
)

# 2. Limpando o dataset

### 2.1. Versão VSCode

In [None]:
# Columns that are discarded right after loading the raw CSV.
UNUSED_COLUMNS = ['Person ID', 'Daily Steps', 'Heart Rate', 'Blood Pressure', 'BMI Category']

# Local (VSCode) variant: comma-separated file in the working directory.
raw_df = pd.read_csv('Sleep_health_and_lifestyle_dataset.csv', delimiter=',')
df = raw_df.drop(columns=UNUSED_COLUMNS)

### 2.2. Versão Google Colab

In [None]:
# df = pd.read_csv('/content/Sleep_health_and_lifestyle_dataset.csv', delimiter=';')
# df = df.drop(columns=['Unnamed: 13','Unnamed: 14', 'Person ID', 'Daily Steps', 'Heart Rate', 'Blood Pressure', 'BMI Category'])

### 2.3. Trocando valores string por valores inteiros em cada coluna

In [None]:
# Encode the categorical columns as integers with explicit, fixed mappings so
# the codes do not depend on how many distinct values happen to be present.
df['Gender'] = df['Gender'].map({'Male': 0, 'Female': 1})

# pandas >= 2.0 parses the literal text "None" as a missing value when reading
# the CSV. Those rows mean "no sleep disorder", so restore the label before
# encoding instead of filling the gap with floor(mean) of the other codes,
# which relabelled every healthy person as code 1 ('Sleep Apnea').
# Assigning the result back (rather than a chained `inplace=True`) also keeps
# this working under pandas copy-on-write.
df['Sleep Disorder'] = (
    df['Sleep Disorder']
    .fillna('None')
    .map({'None': 0, 'Sleep Apnea': 1, 'Insomnia': 2})
)

# Occupations get codes in order of first appearance (same order as
# `unique()`); a missing occupation would be encoded as -1.
df['Occupation'] = pd.factorize(df['Occupation'])[0]

# 3. Criando a coluna "Classe" a partir da coluna "Quality of Sleep"

In [None]:
# Preview the rows whose sleep-quality score is below 6.
low_quality_mask = df['Quality of Sleep'] < 6
display(df.loc[low_quality_mask])

def classificar_sono(qualidade):
    """Translate a 'Quality of Sleep' score into its sleep-class label.

    Scores 4-5 map to 'Sono ruim', 6-7 to 'Sono mediano' and 8-9 to
    'Sono bom'; any other value yields 'valor inválido'.
    """
    score_bands = (
        ((4, 5), 'Sono ruim'),
        ((6, 7), 'Sono mediano'),
        ((8, 9), 'Sono bom'),
    )
    for scores, label in score_bands:
        if qualidade in scores:
            return label
    return 'valor inválido'

# Derive the target column from the sleep-quality score.
df['Classe'] = df['Quality of Sleep'].apply(classificar_sono)

display(df)

# Fixed label -> code mapping, applied to the 'Classe' column only. Sizing the
# replacement list from the number of distinct labels (and replacing across the
# whole frame) fails with a length-mismatch error whenever a class is absent or
# an invalid label is present.
class_codes = {'Sono ruim': 0, 'Sono mediano': 1, 'Sono bom': 2}
unexpected_rows = ~df['Classe'].isin(list(class_codes))
if unexpected_rows.any():
    raise ValueError(
        "Unexpected 'Quality of Sleep' values: "
        f"{sorted(df.loc[unexpected_rows, 'Quality of Sleep'].unique())}"
    )
df['Classe'] = df['Classe'].map(class_codes)
display(df)

# The raw score would leak the target, so it is removed from the features.
df = df.drop(columns=['Quality of Sleep'])

df.describe()

# 4. Mostrando os Histogramas de todas as colunas

In [None]:
# DataFrame.hist creates its own figure, so no explicit plt.figure() is needed
# (it would only leave an extra, empty figure behind).
df.hist(figsize=(10, 6))
plt.tight_layout()  # keep subplot titles and tick labels from overlapping
plt.show()

# 5. 1NN

In [None]:
# Features and target are identical for every split size, so build them once.
x = df.drop('Classe', axis=1)
y = df['Classe']
# Encoded sleep classes. Passing them explicitly keeps the confusion matrix
# 3x3 even when a small test split happens to miss one class.
labels = [0, 1, 2]

for test_size in np.arange(0.1, 1, 0.1):
    scores = []

    # Repeat the random hold-out split to collect a distribution of accuracies.
    for _ in range(200):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=float(test_size))

        # The scaler is fitted on the training rows only and then applied to both sets.
        scaler, model = StandardScaler(), KNeighborsClassifier(n_neighbors=1)
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)

    # The figures printed below describe the LAST of the 200 splits; the
    # histogram at the end summarises all of them.
    # Metrics take (y_true, y_pred): the weighted F1 weights each class by its
    # support in the first argument, so the order matters.
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()

    print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

    plt.figure()
    sns.histplot(scores)
    plt.yticks([])
    plt.title(f"Acurácias do 1NN {test_size:.1f}")
    plt.show()

# 6. KNN com 8 vizinhos

In [None]:
# Features and target are identical for every split size, so build them once.
x = df.drop('Classe', axis=1)
y = df['Classe']
# Encoded sleep classes. Passing them explicitly keeps the confusion matrix
# 3x3 even when a small test split happens to miss one class.
labels = [0, 1, 2]

for test_size in np.arange(0.1, 1, 0.1):
    scores = []

    # Repeat the random hold-out split to collect a distribution of accuracies.
    for _ in range(200):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=float(test_size))

        # The scaler is fitted on the training rows only and then applied to both sets.
        scaler, model = StandardScaler(), KNeighborsClassifier(n_neighbors=8)
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)

    # The figures printed below describe the LAST of the 200 splits; the
    # histogram at the end summarises all of them.
    # Metrics take (y_true, y_pred): the weighted F1 weights each class by its
    # support in the first argument, so the order matters.
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()

    print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

    plt.figure()
    sns.histplot(scores)
    plt.yticks([])
    plt.title(f"Acurácias do KNN {test_size:.1f}")
    plt.show()

# 7. Gaussiana

In [None]:
# Features and target are identical for every split size, so build them once.
x = df.drop('Classe', axis=1)
y = df['Classe']
# Encoded sleep classes. Passing them explicitly keeps the confusion matrix
# 3x3 even when a small test split happens to miss one class.
labels = [0, 1, 2]

for test_size in np.arange(0.1, 1, 0.1):
    scores = []

    # Repeat the random hold-out split to collect a distribution of accuracies.
    for _ in range(200):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=float(test_size))

        # The scaler is fitted on the training rows only and then applied to both sets.
        scaler, model = StandardScaler(), GaussianNB()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)

    # The figures printed below describe the LAST of the 200 splits; the
    # histogram at the end summarises all of them.
    # Metrics take (y_true, y_pred): the weighted F1 weights each class by its
    # support in the first argument, so the order matters.
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()

    print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

    plt.figure()
    sns.histplot(scores)
    plt.yticks([])
    plt.title(f"Acurácias da Gaussiana {test_size:.1f}")
    plt.show()

# 8. Bernoulli

In [None]:
# Features and target are identical for every split size, so build them once.
x = df.drop('Classe', axis=1)
y = df['Classe']
# Encoded sleep classes. Passing them explicitly keeps the confusion matrix
# 3x3 even when a small test split happens to miss one class.
labels = [0, 1, 2]

for test_size in np.arange(0.1, 1, 0.1):
    scores = []

    # Repeat the random hold-out split to collect a distribution of accuracies.
    for _ in range(200):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=float(test_size))

        # The scaler is fitted on the training rows only and then applied to both sets.
        scaler, model = StandardScaler(), BernoulliNB()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)

    # The figures printed below describe the LAST of the 200 splits; the
    # histogram at the end summarises all of them.
    # Metrics take (y_true, y_pred): the weighted F1 weights each class by its
    # support in the first argument, so the order matters.
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()

    print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

    plt.figure()
    sns.histplot(scores)
    plt.yticks([])
    plt.title(f"Acurácias da Bernoulli {test_size:.1f}")
    plt.show()

# 9. Multinomial

In [None]:
# Features and target are identical for every split size, so build them once.
x = df.drop('Classe', axis=1)
y = df['Classe']
# Encoded sleep classes. Passing them explicitly keeps the confusion matrix
# 3x3 even when a small test split happens to miss one class.
labels = [0, 1, 2]

for test_size in np.arange(0.1, 1, 0.1):
    scores = []

    # Repeat the random hold-out split to collect a distribution of accuracies.
    for _ in range(200):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=float(test_size))

        # No StandardScaler in this section: standardised features would be
        # negative, which MultinomialNB does not accept.
        model = MultinomialNB()

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)

    # The figures printed below describe the LAST of the 200 splits; the
    # histogram at the end summarises all of them.
    # Metrics take (y_true, y_pred): the weighted F1 weights each class by its
    # support in the first argument, so the order matters.
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()

    print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

    plt.figure()
    sns.histplot(scores)
    plt.yticks([])
    plt.title(f"Acurácias da Multinomial {test_size:.1f}")
    plt.show()

# **Parte 2: Redes Neurais Artificiais (Rede Perceptron Uni e Multicamadas).**

## Para esta parte do trabalho final da disciplina, deverá ser feito:
#### Utilizando a base de dados selecionada e utilizada nos trabalhos anteriores, implemente uma RNA que seja capaz de classificar/entender/resolver seu problema, de forma que:
* Utilize a melhor base de treinamento, ou seja, você deverá conseguir obter os melhores exemplos para treinamento de acordo com as técnicas de ML utilizadas anteriormente. Este processo é manual, ou seja, você deve observar quais são os registros que fazem com que a rede "erre", eliminá-los da base de treinamento e adicionar registros mais apropriados; desta forma, espera-se que a acurácia do sistema seja melhor. Lembre-se de manter pelo menos 100 registros em cada classe;
* Tendo em mãos esta nova base de treinamento, execute novamente todos os testes nas diferentes técnicas de ML e compare os resultados com o que foi executado anteriormente (esta comparação deve ser textual e explicativa, colocada no arquivo texto e no Colab).
* Implemente então uma RNA Perceptron de camada única e multicamadas com aprendizagem supervisionada e descreva os resultados.
* Ao final, realize testes com 5 entradas de dados aleatórias e veja se o sistema realiza a classificação corretamente.

# 10. Melhor base de dados

### Como a melhor base de dados definida anteriormente no trabalho de classificação foi obtida utilizando o algoritmo com um modelo `GaussianNB()`, utilizaremos ele.
> **NOTA:**
> Por mais que esta seja a melhor base de dados, ela não foi treinada com os dados corretos, pois o algoritmo faz isso à sua própria maneira.

In [None]:
# Features and target are identical for every split size, so build them once.
x = df.drop('Classe', axis=1)
y = df['Classe']
# Encoded sleep classes. Passing them explicitly keeps the confusion matrix
# 3x3 even when a small test split happens to miss one class.
labels = [0, 1, 2]

for test_size in np.arange(0.1, 1, 0.1):
    scores = []

    # Repeat the random hold-out split to collect a distribution of accuracies.
    for _ in range(200):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=float(test_size))

        # The scaler is fitted on the training rows only and then applied to both sets.
        scaler, model = StandardScaler(), GaussianNB()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)

    # The figures printed below describe the LAST of the 200 splits; the
    # histogram at the end summarises all of them.
    # Metrics take (y_true, y_pred): the weighted F1 weights each class by its
    # support in the first argument, so the order matters.
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()

    print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

    plt.figure()
    sns.histplot(scores)
    plt.yticks([])
    plt.title(f"Acurácias da Gaussiana {test_size:.1f}")
    plt.show()

# 11. Testes e Comparações

### Fazendo os testes com os conjuntos de treino e teste selecionados manualmente e comparando os resultados com os anteriores. Como definido no projeto anterior, utilizaremos o modelo Gaussiano, pois foi o que mais se destacou entre os outros. Desta vez, ao invés de deixar o algoritmo definir os conjuntos de treino e teste, criaremos os conjuntos na mão, pegando 50% de cada classe para teste e os outros 50% para treino.

        for test_size in np.arange(0.1, 1, 0.1):
            x = df.drop('Classe', axis=1)
            y = df['Classe']

            scores = []

            for i in range(200):
                x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=float(test_size))

                scaler, model = StandardScaler(), GaussianNB()
                scaler.fit(x_train)

                x_train = scaler.transform(x_train)
                x_test = scaler.transform(x_test)

                model.fit(x_train, y_train)
                y_pred = model.predict(x_test)
                accuracy = accuracy_score(y_pred, y_test)
                scores.append(accuracy)

            f1 = f1_score(y_pred, y_test, average="weighted")

            print(f"Accuracy: {accuracy * 100:.2f}%")
            print(f"F1 Score: {f1 * 100:.2f}%")

            labels = [0,1,2]
            cm = confusion_matrix(y_test, y_pred)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
            disp.plot()

            print(classification_report(y_test, y_pred))

            plt.figure()
            sns.histplot(scores)
            plt.yticks([])
            plt.title(f"Acurácias da Gaussiana {test_size:.1f}")
            plt.show()

In [None]:
# Manual, deterministic 50/50 split: for every class the first half of its rows
# goes to the test set and the remaining rows go to the training set. Both sets
# therefore contain every class. Splitting each class on its own and training
# on that single-class subset gives a model that can only ever predict that
# one class, so its scores say nothing.
feature_columns = df.columns.drop('Classe')
test_parts, train_parts = [], []

for class_label in df['Classe'].unique():
    class_data = df[df['Classe'] == class_label]
    half = len(class_data) // 2
    class_50_teste = class_data.iloc[:half]
    # Everything after the first half, so the middle row of an odd-sized class
    # is kept for training instead of being dropped.
    class_50_treino = class_data.iloc[half:]
    test_parts.append(class_50_teste)
    train_parts.append(class_50_treino)

train_df = pd.concat(train_parts)
test_df = pd.concat(test_parts)

x_train, y_train = train_df[feature_columns], train_df['Classe']
x_test, y_test = test_df[feature_columns], test_df['Classe']

scaler = StandardScaler()
model = GaussianNB()

# The scaler is fitted on the training rows only and then applied to both sets.
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

model.fit(x_train_scaled, y_train)
y_pred = model.predict(x_test_scaled)

# The split is deterministic, so one evaluation replaces the 200 random
# repetitions (and the accuracy histogram) used with train_test_split.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

# Sorted so the confusion-matrix axes follow the class codes in order.
labels = np.sort(df['Classe'].unique())
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()
plt.show()

print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

# 12. Implementando a rede Perceptron 

> n_classes = número de classes da coluna que se deseja trabalhar

### Itens necessários para a criação do Perceptron
- x (vetor de entrada com dados)
- y (saída esperada)
- Threshold (limiar)
- Learning Rate (taxa de aprendizado)
- Epochs (épocas)

In [None]:
#Criando classe Perceptron
class Perceptron:
    """Single-layer perceptron with one bipolar output unit per class.

    The net input of each unit is ``x . weights - bias``: the bias acts as a
    threshold attached to a constant input of -1, which is the convention the
    learning rule in :meth:`update_weights` uses. Each output is -1, 0 or 1.
    """

    def __init__(self, n_columns, n_classes, learning_rate=0.01, n_iterations=1000):
        """Store the hyper-parameters and draw random initial parameters.

        Args:
            n_columns: number of input features.
            n_classes: number of output units.
            learning_rate: step size applied to every correction.
            n_iterations: maximum number of passes over the data in ``fit``.
        """
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

        # Needed to size the weight matrix and the bias vector.
        self.n_columns = n_columns
        self.n_classes = n_classes

        # Weights and bias start at random values drawn from [0, 1).
        self.weights = np.random.rand(self.n_columns, self.n_classes)
        self.bias = np.random.rand(self.n_classes)

    def bipolar_step_function(self, x):
        """Return 1 for positive, 0 for zero and -1 for negative net inputs.

        ``np.sign`` works element-wise, so scalars and arrays are both
        accepted; a plain ``if x > 0`` fails on arrays with several values.
        """
        return np.sign(x)

    def weighted_sum(self, x):
        """Return the net input ``x . weights - bias`` of every output unit.

        ``x`` may be a single sample of length ``n_columns`` or a 2-D array
        with one sample per row.
        """
        return np.dot(x, self.weights) - self.bias

    def predict(self, x):
        """Return the bipolar output of every unit for ``x``.

        Uses the same net input as training (bias subtracted), so predictions
        agree with what ``update_weights`` optimises.
        """
        return self.bipolar_step_function(self.weighted_sum(x))

    def output(self, x):
        """Alias of :meth:`predict`, kept for existing callers."""
        return self.bipolar_step_function(self.weighted_sum(x))

    def update_weights(self, x, expected_value, obtained_value):
        """Compute corrected parameters for one sample without storing them.

        Applies ``new_weight = weight + learning_rate * x_i * error`` to every
        (feature, unit) pair; the bias uses the same rule with its constant
        input of -1.

        Args:
            x: one sample of length ``n_columns``.
            expected_value: target output(s) for the sample.
            obtained_value: output(s) the network produced for the sample.

        Returns:
            Tuple ``(new_weights, new_bias)``; the caller decides whether to
            assign them to the instance.
        """
        error = np.asarray(expected_value) - np.asarray(obtained_value)
        # The outer product yields one correction per (feature, unit) pair,
        # matching the (n_columns, n_classes) shape of the weight matrix.
        new_weights = self.weights + self.learning_rate * np.outer(x, error)
        new_bias = self.bias + self.learning_rate * -1 * error
        return new_weights, new_bias

    def fit(self, x, y):
        """Train with the perceptron rule for at most ``n_iterations`` passes.

        Args:
            x: 2-D array-like, one sample per row.
            y: bipolar targets (-1, 0 or 1), one row per sample and one column
                per output unit; a 1-D array is accepted when ``n_classes``
                is 1.

        Returns:
            The instance itself, so calls can be chained.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float).reshape(len(x), -1)

        for _ in range(self.n_iterations):
            corrections = 0
            for sample, target in zip(x, y):
                obtained = self.output(sample)
                if np.any(obtained != target):
                    self.weights, self.bias = self.update_weights(sample, target, obtained)
                    corrections += 1
            # A full pass without corrections means every sample is classified
            # correctly, so further passes would change nothing.
            if corrections == 0:
                break
        return self

In [None]:
def perceptron(x, y, learning_rate, epochs):
    """Train a single bias-free perceptron unit with a bipolar step output.

    Args:
        x: 2-D array-like (a list of rows, an ndarray or a DataFrame), one
            sample per row.
        y: 1-D array-like with the expected output of every sample.
        learning_rate: step size of each weight correction.
        epochs: number of complete passes over the samples.

    Returns:
        Tuple ``(weights, cost_history)``: the learned weight vector and a
        list with one value per epoch, half the sum of squared errors of the
        outputs produced during that epoch.
    """
    # Plain arrays give positional access to rows; indexing a DataFrame with
    # x[0] would look for a column named 0 instead.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    weights = np.zeros(x.shape[1])
    cost_history = []

    # One iteration of this loop is one epoch (a full pass over the samples).
    for _ in range(epochs):
        y_vet = np.empty(len(y))

        for i, sample in enumerate(x):
            # Bipolar step activation: 1, 0 or -1 depending on the sign.
            y_ = np.sign(np.dot(sample, weights))
            y_vet[i] = y_

            # Perceptron rule applied to every weight of the sample at once.
            weights += learning_rate * (y[i] - y_) * sample

        errors = (y - y_vet) ** 2
        cost_history.append(0.5 * np.sum(errors))

    return weights, cost_history

In [None]:
x = df.drop('Classe', axis=1)
# Keeps codes 0 and 1 as they are and maps any other value to 2.
# NOTE(review): the perceptron's step output is -1, 0 or 1, so a target of 2
# can never be matched exactly - confirm the intended target encoding.
y = np.where(df['Classe'] == 0, 0, np.where(df['Classe'] == 1, 1, 2))

# The training function indexes rows by position (x[0], x[i][j]), which on a
# DataFrame is a column lookup and fails; hand it the underlying array.
trained_weights, _ = perceptron(x.to_numpy(), y, 0.5, 50)
print(trained_weights, "\n")

### 12.1 Perceptron meh 💀

In [None]:
from sklearn.linear_model import Perceptron
from sklearn.calibration import LabelEncoder

### sklearn.linear_model

In [None]:
def perceptron_decision_boundary(model, x_min, x_max, y_min, y_max, scaler=None):
    """Predict a class for every point of a regular 2-D grid.

    Args:
        model: fitted classifier that was trained on exactly two features.
        x_min, x_max, y_min, y_max: grid bounds in original feature units.
        scaler: optional fitted transformer applied to the grid points before
            prediction (needed when the model was trained on scaled data).

    Returns:
        Tuple ``(xx, yy, z)``: the mesh coordinates and the predicted class of
        each grid point, all with the same 2-D shape.
    """
    # Generate the grid and flatten it into one (x, y) row per point.
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))

    if scaler is not None:
        grid_points = scaler.transform(grid_points)

    # Predict every grid point, then restore the 2-D grid shape.
    z = model.predict(grid_points).reshape(xx.shape)
    return xx, yy, z


# Features, target and labels do not depend on the split size: build them once.
x = df.drop('Classe', axis=1)
le = LabelEncoder()
y = le.fit_transform(df['Classe'])
labels = np.unique(y)  # sorted class codes for the metric tables

# The two features used as axes of the decision-boundary plot.
boundary_features = ['Age', 'Physical Activity Level']
x_min, x_max = x['Age'].min(), x['Age'].max()
y_min, y_max = x['Physical Activity Level'].min(), x['Physical Activity Level'].max()

for test_size in np.arange(0.1, 1, 0.1):
    scores = []

    for _ in range(200):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=float(test_size))

        scaler = StandardScaler()
        # Named `model` so this cell does not overwrite the `perceptron`
        # training function defined earlier in the notebook.
        model = Perceptron(random_state=0)

        # The scaler is fitted on the training rows only and then applied to both sets.
        x_train_scaled = scaler.fit_transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        model.fit(x_train_scaled, y_train)
        y_pred = model.predict(x_test_scaled)

        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)

    # The figures printed below describe the LAST of the 200 splits; the
    # histogram at the end summarises all of them.
    f1 = f1_score(y_test, y_pred, average='weighted')

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"F1 Score: {f1 * 100:.2f}%")

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot()
    plt.show()

    print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

    # The evaluated model was trained on every feature, so it cannot predict
    # on a two-column grid. Fit a separate two-feature perceptron on the last
    # training split purely to draw the boundary in the Age/Activity plane.
    plane_train = x_train[boundary_features].to_numpy()
    plane_scaler = StandardScaler().fit(plane_train)
    plane_model = Perceptron(random_state=0)
    plane_model.fit(plane_scaler.transform(plane_train), y_train)

    xx, yy, z = perceptron_decision_boundary(
        plane_model, x_min, x_max, y_min, y_max, scaler=plane_scaler
    )

    # Draw on a fresh figure; otherwise the contour is painted over the
    # confusion-matrix axes.
    plt.figure()
    plt.contourf(xx, yy, z, cmap='coolwarm', alpha=0.8)
    plt.scatter(x['Age'], x['Physical Activity Level'], c=y)
    plt.title(f"Perceptron Decision Boundary (Test Size: {test_size:.1f})")
    plt.xlabel("Age")
    plt.ylabel("Physical Activity Level")
    plt.show()

    plt.figure()
    # One decimal place avoids float noise such as 0.30000000000000004.
    sns.histplot(scores, label=f"Test Size: {test_size:.1f}")
    plt.yticks([])
    plt.legend()
    plt.title("Distribution of Accuracy Scores for Perceptron")
    plt.show()