In [8]:
# Lista de Exercícios de Mineração de Dados
# Gustavo Nunes Lopes

In [9]:
# Questão 1
# A)

import pathlib
import numpy as np
import pandas as pd

def remove_incomplete_data(data: list, missing_value=-100):
    """Return only the rows of ``data`` that contain no missing-value marker.

    Args:
        data: Sequence of rows; each row is itself a sequence of values.
        missing_value: Sentinel that marks a missing measurement. Defaults to
            ``-100``, the marker this function originally had hard-coded.

    Returns:
        A new list with the rows (the same row objects, in their original
        order) in which no element compares equal to ``missing_value``.
        ``data`` itself is not modified.
    """
    return [
        row for row in data
        if not any(value == missing_value for value in row)
    ]

def split_X_y(data):
    """Split rows into a float feature matrix and a label vector.

    For every row, the first seven entries are taken as features and the
    entry at index 7 is taken as the label.

    Returns:
        A tuple ``(X, y)``: ``X`` is a float array built from the feature
        slices, ``y`` is an array built from the labels.
    """
    feature_rows = [row[:7] for row in data]
    X = np.asarray(feature_rows, dtype=float)

    labels = [row[7] for row in data]
    y = np.asarray(labels)

    return X, y

def accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of positions where the two label sequences agree.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, aligned with ``actual``.

    Returns:
        ``correct / len(actual) * 100.0`` as a float.

    Raises:
        ValueError: If the sequences have different lengths (the comparison
            would otherwise silently ignore the extra predictions) or if
            they are empty (accuracy is undefined for zero samples).
    """
    total = len(actual)
    if total != len(predicted):
        raise ValueError(
            f"actual and predicted must have the same length "
            f"(got {total} and {len(predicted)})"
        )
    if total == 0:
        raise ValueError("cannot compute accuracy of an empty sequence")

    correct = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    return correct / total * 100.0

def rocchio_train(X_train, y_train):
    """Compute one centroid per distinct label in ``y_train``.

    Each centroid is the mean over axis 0 of the ``X_train`` rows selected by
    the boolean mask ``y_train == label``.

    Returns:
        A dict mapping every label from ``np.unique(y_train)`` to its centroid.
    """
    return {
        label: X_train[y_train == label].mean(axis=0)
        for label in np.unique(y_train)
    }

def rocchio_predict(centroids, X_test):
    """Label every sample in ``X_test`` with the class of its closest centroid.

    Closeness is measured with ``euclidean_distance``. When two centroids are
    equally close, the one that comes first in ``centroids`` wins. If
    ``centroids`` is empty, every prediction is ``None``.

    Returns:
        An array with one predicted label per sample, in sample order.
    """
    labels = list(centroids)
    predictions = [
        min(
            labels,
            key=lambda label: euclidean_distance(sample, centroids[label]),
            default=None,
        )
        for sample in X_test
    ]
    return np.array(predictions)

# Euclidean distance
def euclidean_distance(point1, point2):
    """Return the square root of the summed squared differences between the two points."""
    delta = point1 - point2
    squared_sum = np.sum(delta ** 2)
    return np.sqrt(squared_sum)
def nn_predict(X_train, y_train, X_test):
    """Label every sample in ``X_test`` with the label of its nearest training sample.

    Closeness is measured with ``euclidean_distance``. On ties the training
    sample with the lowest index wins. If ``X_train`` is empty, every
    prediction is ``None``.

    Returns:
        An array with one predicted label per sample, in sample order.
    """
    train_indices = range(len(X_train))
    predictions = []

    for sample in X_test:
        nearest = min(
            train_indices,
            key=lambda idx: euclidean_distance(sample, X_train[idx]),
            default=None,
        )
        predictions.append(None if nearest is None else y_train[nearest])

    return np.array(predictions)


# Locations of the whitespace-separated data files, resolved against the
# current working directory.
nebulosa_path_train = pathlib.Path().absolute() / 'db' / 'nebulosa_train.txt'
nebulosa_path_test = pathlib.Path().absolute() / 'db' / 'nebulosa_test.txt'


def _load_nebulosa(path, n_columns=8):
    """Read a whitespace-separated file and group its tokens into rows.

    Args:
        path: File to read.
        n_columns: Number of tokens per row; the last one is kept as a string
            (the label), the ones before it are converted to ``float``.

    Returns:
        A tuple ``(tokens, rows)``: the flat list of raw string tokens and the
        list of parsed rows. A trailing chunk shorter than ``n_columns`` is
        kept as a shorter row, with every token before the label position
        converted to ``float``.
    """
    with open(path, 'r') as file:
        tokens = file.read().split()

    label_index = n_columns - 1
    rows = []
    for start in range(0, len(tokens), n_columns):
        chunk = tokens[start:start + n_columns]
        rows.append([float(value) for value in chunk[:label_index]] + chunk[label_index:])

    return tokens, rows


# Both files share one layout, so they go through the same loader.
nebulosa_train, new_nebulosa_train = _load_nebulosa(nebulosa_path_train)
nebulosa_test, new_nebulosa_test = _load_nebulosa(nebulosa_path_test)

# Drop every row that carries the missing-value marker.
nebulosa_train_cleaned = remove_incomplete_data(new_nebulosa_train)
nebulosa_test_cleaned = remove_incomplete_data(new_nebulosa_test)

print("Treino original:", len(new_nebulosa_train), " | Treino limpo:", len(nebulosa_train_cleaned))
print("Teste original :", len(new_nebulosa_test),  " | Teste limpo :", len(nebulosa_test_cleaned))

X_train, y_train = split_X_y(nebulosa_train_cleaned)
X_test, y_test = split_X_y(nebulosa_test_cleaned)

print("\n===== QUESTÃO 1A =====")

# ---- NN ----
y_pred_nn = nn_predict(X_train, y_train, X_test)
acc_nn = accuracy_metric(y_test, y_pred_nn)
print(f"Acurácia NN: {acc_nn:.2f}%")

# ---- Rocchio ----
# NOTE(review): the recorded run of this cell reports 0.00% for Rocchio, which
# is suspiciously low next to NN on the same data -- worth confirming the
# label column and the raw feature scales before trusting this number.
centroids = rocchio_train(X_train, y_train)
y_pred_rocchio = rocchio_predict(centroids, X_test)
acc_rocchio = accuracy_metric(y_test, y_pred_rocchio)
print(f"Acurácia Rocchio: {acc_rocchio:.2f}%")

Treino original: 143  | Treino limpo: 134
Teste original : 28  | Teste limpo : 25

===== QUESTÃO 1A =====
Acurácia NN: 56.00%
Acurácia Rocchio: 0.00%


In [10]:
# Questão 1
# B)

# Limit outliers with the IQR rule (values are clipped to the fences; no rows are dropped)
def clip_outliers_iqr(X_train, X_test, k=1.5):
    """Clip both datasets to IQR fences computed from ``X_train`` only.

    For every column (axis 0) of ``X_train`` the 25th and 75th percentiles
    are taken, and the fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``. The
    same fences are then applied to ``X_train`` and ``X_test``.

    Args:
        X_train: Data used to derive the fences.
        X_test: Data clipped with the fences derived from ``X_train``.
        k: Multiplier applied to the IQR when building the fences.

    Returns:
        A tuple ``(X_train_clipped, X_test_clipped)``.
    """
    q1, q3 = np.percentile(X_train, [25, 75], axis=0)
    spread = q3 - q1
    fence_low, fence_high = q1 - k * spread, q3 + k * spread

    return (
        np.clip(X_train, fence_low, fence_high),
        np.clip(X_test, fence_low, fence_high),
    )

# Min-Max normalization
def minmax_normalize(X_train, X_test):
    """Rescale both datasets with the per-column min and max of ``X_train``.

    Every value is mapped to ``(value - min) / (max - min)``, with ``min`` and
    ``max`` taken over axis 0 of ``X_train`` only, so test values may fall
    outside ``[0, 1]``.

    A column that is constant in ``X_train`` has ``max - min == 0``; its range
    is treated as 1 so the column maps to 0 for training data instead of
    producing NaN/inf through a division by zero.

    Returns:
        A tuple ``(X_train_scaled, X_test_scaled)``.
    """
    min_val = np.min(X_train, axis=0)
    max_val = np.max(X_train, axis=0)

    value_range = max_val - min_val
    # Guard against constant columns: dividing by 0 would poison every
    # distance computed from the scaled data with NaN.
    safe_range = np.where(value_range == 0, 1.0, value_range)

    X_train_scaled = (X_train - min_val) / safe_range
    X_test_scaled = (X_test - min_val) / safe_range

    return X_train_scaled, X_test_scaled

# Rebuild the feature matrices and labels from the cleaned rows, so this cell
# starts from the unscaled values.
X_train, y_train = split_X_y(nebulosa_train_cleaned)
X_test, y_test = split_X_y(nebulosa_test_cleaned)

print("===== QUESTÃO 1B =====")

# Pre-processing: clip outliers to the IQR fences first, then min-max scale
# the clipped data. Both steps derive their statistics from the training set.
X_train_clp, X_test_clp = clip_outliers_iqr(X_train, X_test)
X_train_mm, X_test_mm = minmax_normalize(X_train_clp, X_test_clp)

# ---- NN on the pre-processed features ----
y_pred_nn_mm = nn_predict(X_train_mm, y_train, X_test_mm)
acc_nn_mm = accuracy_metric(y_test, y_pred_nn_mm)
print(f"Acurácia NN: {acc_nn_mm:.2f}%")

# ---- Rocchio on the pre-processed features ----
centroids_mm = rocchio_train(X_train_mm, y_train)
y_pred_rocchio_mm = rocchio_predict(centroids_mm, X_test_mm)
acc_rocchio_mm = accuracy_metric(y_test, y_pred_rocchio_mm)
print(f"Acurácia Rocchio: {acc_rocchio_mm:.2f}%")

===== QUESTÃO 1B =====
Acurácia NN: 60.00%
Acurácia Rocchio: 4.00%


In [11]:
# Questão 1
# C)

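# Os resultados obtidos na questão 1B mostram que a aplicação de técnicas de pré-processamento ajuda a melhorar a acurácia dos classificadores NN e Rocchio (NN: de 56,00% para 60,00%; Rocchio: de 0,00% para 4,00%).
# Isso se deve ao fato de que a limitação dos outliers (pelo método do IQR) reduz o impacto de valores extremos que podem distorcer a análise dos dados, e de que a normalização coloca todos os atributos na mesma escala, evitando que atributos de maior magnitude dominem a distância euclidiana.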

In [14]:
# Questão 2
# A)

# Location of the tweets CSV, resolved against the current working directory.
tweets_path = pathlib.Path().absolute().joinpath('db', 'Tweets_Mg.csv')

# NOTE(review): the recorded output shows an "Unnamed: 0" column that mirrors
# the row index -- presumably the CSV was saved with its index; confirm
# whether it should be read with index_col=0 before later cells rely on it.
tweets_db = pd.read_csv(tweets_path)
print(tweets_db)


      Unnamed: 0                      Created At  \
0              0  Sun Jan 08 01:22:05 +0000 2017   
1              1  Sun Jan 08 01:49:01 +0000 2017   
2              2  Sun Jan 08 01:01:46 +0000 2017   
3              3  Wed Jan 04 21:43:51 +0000 2017   
4              4  Mon Jan 09 15:08:21 +0000 2017   
...          ...                             ...   
8194        8194  Thu Feb 09 11:48:07 +0000 2017   
8195        8195  Thu Feb 09 12:10:19 +0000 2017   
8196        8196  Thu Feb 09 12:04:17 +0000 2017   
8197        8197  Thu Feb 09 12:10:04 +0000 2017   
8198        8198  Tue Feb 07 15:01:30 +0000 2017   

                                                   Text  \
0     ���⛪ @ Catedral de Santo Antônio - Governador ...   
1     � @ Governador Valadares, Minas Gerais https:/...   
2     �� @ Governador Valadares, Minas Gerais https:...   
3                           ��� https://t.co/BnDsO34qK0   
4     ��� PSOL vai questionar aumento de vereadores ...   
...                  