## Requisitos

In [13]:
import pandas as pd
import numpy as np
import neural_net as nn
import math

## Funções Auxiliares

In [14]:
def k_folds(df, target_attr, k=5, shuffle_seed=99):
    """Split ``df`` into ``k`` stratified folds for cross-validation.

    The rows of each class (each distinct value of ``target_attr``) are
    shuffled and cut into ``k`` consecutive chunks of ``ceil(n_rows / k)``
    rows; chunk ``i`` goes to fold ``i``. Because the chunk size is rounded
    up, the trailing folds can receive fewer rows of a class (or none).

    Args:
        df: DataFrame holding the whole dataset.
        target_attr: Name of the column that holds the class label.
        k: Number of folds; must be at least 1.
        shuffle_seed: ``random_state`` used for every shuffle, so the same
            input always yields the same folds.

    Returns:
        A list with ``k`` DataFrames. Rows keep their original index labels.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # One shuffled sub-frame per class value.
    data_by_class = [
        df.loc[df[target_attr] == c].sample(frac=1, random_state=shuffle_seed)
        for c in df[target_attr].unique()
    ]

    # Gather the pieces of every fold and concatenate once per fold
    # (DataFrame.append no longer exists in pandas 2.x).
    fold_parts = [[] for _ in range(k)]
    for class_data in data_by_class:
        # len() counts every row. Counting the non-null cells of the last
        # column would shrink the chunk size, and therefore drop rows,
        # whenever that column contains NaN.
        n_rows = len(class_data)
        fold_size = math.ceil(n_rows / k)
        for i in range(k):
            chunk = class_data.iloc[i * fold_size:(i + 1) * fold_size]
            if not chunk.empty:
                fold_parts[i].append(chunk)

    folds = []
    for parts in fold_parts:
        # A fold that received nothing is an empty frame with df's columns.
        fold = pd.concat(parts) if parts else df.iloc[0:0]
        # Shuffle again so the classes are interleaved inside the fold.
        folds.append(fold.sample(frac=1, random_state=shuffle_seed))
    return folds

In [15]:
def remove_index(list, index):
    """Return a copy of ``list`` without the element at position ``index``.

    The input is not modified. Only positions ``0 .. len(list) - 1`` are
    accepted; negative positions are rejected rather than being interpreted
    relative to the end.

    Args:
        list: Sequence to copy from (the name shadows the builtin but is kept
            so existing keyword callers continue to work).
        index: Zero-based position of the element to leave out.

    Returns:
        A new sequence with every element except the one at ``index``.

    Raises:
        IndexError: If ``index`` is negative or past the last element.
    """
    if not 0 <= index < len(list):
        raise IndexError('index inválido')
    # When index is the last position the right-hand slice is simply empty.
    return list[:index] + list[index + 1:]

In [16]:
def read_attr_list(file, sep):
    """Read the description of the predictive attributes from a CSV file.

    Expected layout, with SEP standing for the separator character:

        attr1 SEP attr2        <- header row: attribute names
        attr1_type SEP attr2_type   <- first data row: attribute types
        value1 SEP value2      <- remaining rows: possible values (optional)

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        sep: Column separator used in the file.

    Returns:
        A list with one tuple per column, in file order:
        ``(attr_name, attr_type, [possible_values])`` when the column lists
        at least one value below the type row, otherwise
        ``(attr_name, attr_type)``. A blank type cell is returned as read by
        pandas (NaN).
    """
    attr_df = pd.read_csv(file, sep=sep)

    attr_list = []
    for name in attr_df.columns:
        column = attr_df[name]
        attr_type = column.iloc[0]
        # Skip the type row by position before dropping blanks; dropping the
        # label 0 after dropna() fails when the type cell itself is blank.
        possible_values = list(column.iloc[1:].dropna())
        if possible_values:
            attr_list.append((name, attr_type, possible_values))
        else:
            attr_list.append((name, attr_type))

    return attr_list

In [17]:
# Treina uma rede neural, testa um conjunto de teste e retorna a acurácia
def test_neural_net(training_df, testing_df, attr_list, target_attr="target", alpha=0.05, lamb=0.0, intermediate_net=[2], n_outputs=1, n_epochs=100):
    correct_predictions = 0
    incorrect_predictions = 0

    dataset = []
    network = [len(attr_list), *intermediate_net, n_outputs]
    net = nn.NeuralNetwork(network, alpha=0.05, lamb=0.0)

    for idx, instance in training_df.iterrows(): 
      dataset.append([
        instance.drop(target_attr).to_numpy(),
        [instance[target_attr]]
        ])

    net.train(dataset, n_epochs)

    for idx, instance in testing_df.iterrows():
      inputs = instance.drop(target_attr).to_numpy()
      predicted_class = net.classify(inputs, n_outputs)
      actual_class = testing_df.at[idx, target_attr]
      if actual_class == predicted_class:
        correct_predictions += 1
      else:
        incorrect_predictions += 1
    accuracy = round(correct_predictions / testing_df.shape[0], 2)
    print(correct_predictions)
    return accuracy

## Dataset 1 - house-votes-84.tsv

In [12]:
raw_df = pd.read_csv('datasets/house-votes-84.tsv', sep='\t')

# Min-max normalisation of every column.
# NOTE(review): a constant column gives 0/0 = NaN here - confirm the dataset
# has none.
df = (raw_df - raw_df.min()) / (raw_df.max() - raw_df.min())

# Shrink the dataset for quick experiments. The sample is seeded (same value
# as the default shuffle seed of k_folds) so a re-run evaluates the same rows.
df = df.sample(50, random_state=99)

target_attr = "target"
n_folds = 5

# Build the stratified folds used by the cross-validation.
folds = k_folds(df, target_attr, n_folds)
attr_list = read_attr_list("datasets/house-votes-attr.csv", ";")
folds_accuracy = []

# Evaluate the network once per fold: fold i is the test set, the remaining
# folds together form the training set.
for i in range(len(folds)):
    training_df = pd.concat(remove_index(folds, i))
    testing_df = folds[i]

    accuracy = test_neural_net(training_df, testing_df, attr_list, target_attr, alpha=0.005, lamb=0.25, intermediate_net=[12], n_epochs=1000)
    folds_accuracy.append([accuracy])

# Aggregate the results: per-fold accuracy followed by its mean and std.
stats = pd.DataFrame(folds_accuracy, columns=["Accuracy"])
desc = stats.describe()
results = pd.concat([stats, desc.loc[['mean', 'std']] ])
results

9
9
10
8
5


Unnamed: 0,Accuracy
0,0.82
1,0.82
2,0.91
3,0.73
4,0.83
mean,0.822
std,0.063797
