In [1]:
import numpy as np
import pandas as pd
from scipy.special import expit
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Select the class 
def get_class(group, dataset):
        size = len(dataset[0]) - 1
        for i in range(len(dataset)):
            dataset[i][size] = 1 if dataset[i][size] == group else -1
        return dataset

# Normalize
def normalize(dataset):
        for i in range(dataset.shape[1]-1):
            max_ = max(dataset[:, i])
            min_ = min(dataset[:, i])
            for j in range(dataset.shape[0]):
                dataset[j, i] = (dataset[j, i] - min_) / (max_ - min_)
        return dataset

# Insert the bias
def insert_bias(data):
    group = []
    for i in range(len(data)):
        group.append(np.insert(data[i], 0, -1))
    group = np.asarray(group)
    return group

# Divide into train and test
def divide_samples(dataset):
    return train_test_split(dataset, test_size=0.20)

# Divide into x and y 
def split_samples(dataset, n_attributes):
    return np.array(dataset[:, 0:n_attributes-1]), np.array(dataset[:, -1])
 
 # Gradient Descendent
def gradient_descendent(x, y, u):
    u = np.array(u, dtype=np.float64)
    n = (y * x)
    d = 1 + np.exp(u)
    return (np.array(sum(n/d), ndmin=2)).T

# Return the error
def get_error(u):
    return np.log(1 - np.exp(-u))

# Sigmoid logistic function
def sigmoid_logistic(y):
    y = np.array(y, dtype=np.float64)
    return expit(-y)

# Predict
def predict(y, m):
    y_pred = np.zeros((1, m))
    for i in range(y.shape[0]):
        y_pred[0][i] = 1 if y[i][0] > 0.5 else -1
    return y_pred

# Training
def training(training_set, epochs, weights, rate):
    epoch = 0
    allerrors = []
    x_train, y_train = split_samples(training_set, len(training_set[0]))
    y_train = np.array(y_train, ndmin=2).T

    for i in range(epochs):
        u = y_train * x_train.dot(weights)
        h = sigmoid_logistic(u)
        y = predict(h, x_train.shape[0])
        error = get_error(h)
        allerrors.append(error)
        weights = weights + gradient_descendent(x_train, y_train, u) * rate
        epoch = i
        
    y = np.array(y, dtype=np.float64)
    return weights, epoch, allerrors, get_accuracy(y.T, y_train)

# Test
def test(test_set, weights):
    x_test, y_test = split_samples(test_set, len(test_set[0]))
    y_test = np.array(y_test, ndmin=2).T
    u = y_test * x_test.dot(weights)
    h = sigmoid_logistic(u)
    y = predict(h, x_test.shape[0])
    y_test = np.array(y_test, ndmin=2)
    return get_accuracy(y.T, y_test), get_confusion_matrix(y.T, y_test)

# Reset the weights in each iteration
def reset(n_attributes):
    return np.random.rand(n_attributes-1, 1)

# Return accuracy
def get_accuracy(y_output, y_test):
    return abs(sum(y_test == y_output)) * 1.0 / len(y_test) * 1.0

# Return the confusion matrix
def get_confusion_matrix(y_output, y_test):
    return confusion_matrix(y_test.tolist(), y_output.tolist())
 
# Evaluate an algorithm using hold-out   
def execute(realizations, samples, rate, epochs):
    print("### REGRESSÃO LOGÍSTICA ###")
    print("PARÂMETROS: ")
    print("\t Taxa de aprendizagem: ", rate)
    print("\t Número máximo de épocas: ", epochs)
    print("\t Total de realizações: ", realizations, "\n")
    rates = []
    allerrors = []

    for i in range(realizations):
        weights = reset(len(samples[0]))
        np.random.shuffle(samples)
        training_set, test_set = divide_samples(samples)
        print("### REALIZAÇÃO ", (i+1), "###")
        print("### FASE DE training ###")
        weights, epoch, allerrors, accuracy = training(training_set, epochs, weights, rate)
        print("### FASE DE TESTES ###")
        taxa, matriz = test(test_set, weights)
        print("Taxa de acerto: ", taxa, "\n")
        rates.append(taxa)
        print("Matriz de confusão: ", matriz, "\n")

    rates = np.array(rates)
    allerrors = np.array(allerrors)
    print("Acurácia: ", rates.mean())
    print("Variância da Acurácia: ", rates.var())
    print("Variância do Erro: ", allerrors.var())
    print("Desvio Padrão da Acurácia: ", rates.std())
    print("### FIM REGRESSÃO LOGÍSTICA ###")


dataset = np.array(pd.read_csv("base/column_2C.dat", delimiter=",", header=None))
dataset = get_class("NO", dataset)
dataset = normalize(dataset)
dataset = insert_bias(dataset)
execute(5, dataset, 1e-4, 100)

### REGRESSÃO LOGÍSTICA ###
PARÂMETROS: 
	 Taxa de aprendizagem:  0.0001
	 Número máximo de épocas:  100
	 Total de realizações:  5 

### REALIZAÇÃO  1 ###
### FASE DE training ###
### FASE DE TESTES ###
Taxa de acerto:  [0.98387097] 

Matriz de confusão:  [[42  1]
 [ 0 19]] 

### REALIZAÇÃO  2 ###
### FASE DE training ###
### FASE DE TESTES ###
Taxa de acerto:  [0.96774194] 

Matriz de confusão:  [[37  2]
 [ 0 23]] 

### REALIZAÇÃO  3 ###
### FASE DE training ###
### FASE DE TESTES ###
Taxa de acerto:  [0.98387097] 

Matriz de confusão:  [[39  1]
 [ 0 22]] 

### REALIZAÇÃO  4 ###
### FASE DE training ###
### FASE DE TESTES ###
Taxa de acerto:  [0.98387097] 

Matriz de confusão:  [[46  1]
 [ 0 15]] 

### REALIZAÇÃO  5 ###
### FASE DE training ###
### FASE DE TESTES ###
Taxa de acerto:  [0.85483871] 

Matriz de confusão:  [[32  7]
 [ 2 21]] 

Acurácia:  0.9548387096774194
Variância da Acurácia:  0.002539021852237253
Variância do Erro:  0.030238328609481755
Desvio Padrão da Acurácia:  0.