# Universidad Autónoma de Yucatán

## Facultad de Matemáticas

### Machine Learning

**Teacher:** Dr. Victor Uc Cetina

**Student:** Dayan Bravo Fraga

# Logistic Regression (Binary Classification)

## Download Corpus from GitHub (only for Colab)

In [None]:
import sys
import os

# Detect Google Colab by checking whether its package has been loaded.
in_colab: bool = 'google.colab' in sys.modules
if not in_colab:
    print('Is not running in Colab')
else:
    print('Is running in Colab')
    # Fetch the corpus only when no local copy is present yet.
    if not os.path.isfile('data.txt'):
        import gdown

        print("Downloading Data")
        url = "https://raw.githubusercontent.com/dayan3847/machine_learning/master/binary_classification/colab/data.txt"
        gdown.download(url, quiet=False)

## Import libraries

In [None]:
import numpy as np
import sympy as sp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Import Data

In [None]:
# Load the space-separated file and transpose it, so that each line of the
# file becomes one column of `data_points`.
data_points: np.ndarray = np.transpose(np.loadtxt('data.txt', delimiter=' '))
data_points

## Split data into training and test

In [None]:
# train_test_split partitions along the first axis, so the data is transposed
# before the split and each resulting subset is transposed back afterwards.
data_train, data_test = (
    subset.T
    for subset in train_test_split(data_points.T, test_size=.3, random_state=42)
)

## Plot Data Points

In [None]:
def plot_data(print_polynomial: bool = False):
    """Scatter-plot the train and test samples, optionally with the decision boundary.

    Reads the module-level ``data_train`` and ``data_test`` arrays. Rows 0 and
    1 of each array are used as the x and y coordinates and row 2 as the class
    label. Train samples are drawn with 'o' markers and test samples with 'x'
    markers; samples whose integer label is non-zero are green, the others red.

    Args:
        print_polynomial: When True, also draw the line on which the
            expression returned by ``get_polynomial()`` equals zero, for x in
            [-3, 3]. The line is skipped when that expression cannot be solved
            for ``y`` (for example when the coefficient of ``y`` is zero).
    """
    plt.title('Data')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.axvline(color='black')
    plt.axhline(color='black')

    # Each mask is computed from the same array it indexes, so train indices
    # can never be taken from a different data set.
    for dataset, suffix, marker in ((data_train, 'train', 'o'), (data_test, 'test', 'x')):
        is_one = dataset[2].astype(int) != 0
        plt.scatter(dataset[0][is_one], dataset[1][is_one], color='green', label=f'one_{suffix}',
                    marker=marker)
        plt.scatter(dataset[0][~is_one], dataset[1][~is_one], color='red', label=f'zero_{suffix}',
                    marker=marker)

    if print_polynomial:
        x_symbol, y_symbol = sp.symbols('x y')
        # The model predicts class 1 when sigmoid(h) >= .5, which is the same
        # as h >= 0, so the decision boundary is where the polynomial is zero.
        # Solving once for y avoids running the solver for every x value.
        boundary = sp.solve(get_polynomial(), y_symbol)
        if boundary:
            x_ = np.linspace(-3, 3)
            y_ = [float(boundary[0].subs(x_symbol, x_i)) for x_i in x_]
            plt.plot(x_, y_, label='polynomial', color='blue', linestyle='-')

    plt.legend()
    plt.grid()
    plt.show()


plot_data()

# Logistic Regression

## Generate Thetas

In [None]:
def generate_thetas() -> np.ndarray:
    """Return three random initial weights drawn uniformly from [-0.01, 0.01)."""
    bound = .01
    return np.random.uniform(-bound, bound, 3)


thetas = generate_thetas()
thetas

In [None]:
def get_polynomial() -> sp.Expr:
    """Return the current model as a SymPy expression in ``x`` and ``y``.

    The expression is ``thetas[0] * x + thetas[1] * y + thetas[2]``, built
    from the module-level ``thetas`` at the time of the call.
    """
    coef_x, coef_y, bias = thetas[0], thetas[1], thetas[2]
    return sp.sympify(f'{coef_x} * x + {coef_y} * y + {bias}')


get_polynomial()

In [None]:
# Plot the samples together with the line given by the current thetas.
plot_data(print_polynomial=True)

In [None]:
def sigmoid(x: float) -> float:
    """Return the logistic function ``1 / (1 + e**-x)`` of *x*."""
    decay = np.exp(-x)
    denominator = 1 + decay
    return 1 / denominator

In [None]:
def h(x_vector: np.ndarray) -> float:
    """Return the dot product of the module-level ``thetas`` with *x_vector*."""
    return np.dot(thetas, x_vector)

In [None]:
def h_activation(x_vector: np.ndarray) -> float:
    """Return the model output for *x_vector*: the sigmoid of ``h(x_vector)``."""
    linear_output = h(x_vector)
    return sigmoid(linear_output)

In [None]:
def get_error(use_train_data: bool = False) -> float:
    """Return half the sum of squared differences between outputs and labels.

    Args:
        use_train_data: Evaluate on ``data_train`` when True, otherwise on
            ``data_test``.
    """
    samples = (data_train if use_train_data else data_test).T
    # Each sample is (x, y, label); a constant 1 is appended as the bias input.
    squared_residuals = (
        (h_activation(np.append(sample[:2], 1)) - sample[2]) ** 2
        for sample in samples
    )
    return sum(squared_residuals) / 2


get_error()

In [None]:
def get_error_rms(use_train_data: bool = False) -> float:
    """Return the root-mean-square error of the model on the chosen data set.

    ``get_error`` returns half the sum of squared differences, so the value is
    doubled and divided by the number of samples before taking the square root.

    Args:
        use_train_data: Evaluate on ``data_train`` when True, otherwise on
            ``data_test``.
    """
    d = data_train if use_train_data else data_test
    # Forward the flag so the error and the sample count come from the same set.
    e = get_error(use_train_data)
    m = d.shape[1]
    return (2 * e / m) ** .5


get_error_rms()

In [None]:
def get_accuracy(use_train_data: bool = False):
    """Return the fraction of samples that the model classifies correctly.

    A sample counts as correct when its label is 1 and the model output is at
    least .5, or when its label is 0 and the model output is below .5.

    Args:
        use_train_data: Evaluate on ``data_train`` when True, otherwise on
            ``data_test``.
    """
    dataset = data_train if use_train_data else data_test
    hits = 0
    for sample in dataset.T:
        label = sample[2]
        output = h_activation(np.append(sample[:2], 1))
        positive_hit = label == 1 and output >= .5
        negative_hit = label == 0 and output < .5
        if positive_hit or negative_hit:
            hits += 1
    return hits / dataset.shape[1]


get_accuracy()

### Training

In [None]:
iterations_count: int = 100  # number of passes over the training set
a: float = .1  # step size applied to every theta update
errors = []  # RMS error on the training set after each pass
errors_test = []  # RMS error on the test set after each pass
accuracy = []  # accuracy on the training set after each pass
accuracy_test = []  # accuracy on the test set after each pass


def to_train():
    """Fit the module-level ``thetas`` in place and record metrics per pass.

    Runs ``iterations_count`` passes over ``data_train``. For every sample the
    thetas are moved by ``a * (label - output) * input``, where the input is
    the sample's two coordinates followed by a constant 1. After each pass the
    RMS error and the accuracy on both data sets are appended to the history
    lists: the train values go to ``errors`` / ``accuracy`` and the test values
    to ``errors_test`` / ``accuracy_test``.
    """
    for _ in range(iterations_count):
        for data in data_train.T:
            x = np.append(data[:2], 1)
            y = data[2]
            hi = h_activation(x)
            step = a * (y - hi)
            # Update every theta at once; slice assignment mutates the global
            # array in place without rebinding the name.
            thetas[:] += step * x
        # Passing True selects the training set; the default is the test set.
        errors.append(get_error_rms(True))
        errors_test.append(get_error_rms())
        accuracy.append(get_accuracy(True))
        accuracy_test.append(get_accuracy())


print('\033[92m' + 'training...' + '\033[0m')
to_train()
print('\033[92m' + 'training finished' + '\033[0m')

# Results

## Error

In [None]:
# Draw the per-iteration error histories recorded during training.
plt.plot(errors, label='train')
plt.plot(errors_test, label='test')
# Decorate the axes once the curves have been drawn.
plt.title('Error')
plt.xlabel('iteration')
plt.ylabel('error')
plt.legend()
plt.show()

## Accuracy

In [None]:
# Draw the per-iteration accuracy histories recorded during training.
plt.plot(accuracy, label='train')
plt.plot(accuracy_test, label='test')
# Decorate the axes once the curves have been drawn.
plt.title('Accuracy')
plt.xlabel('iteration')
plt.ylabel('accuracy')
plt.legend()
plt.show()

## Final Polynomial

In [None]:
# Display the expression built from the current values of `thetas`.
get_polynomial()

## Final Polynomial Graph

In [None]:
# Plot the samples together with the line given by the current thetas.
plot_data(print_polynomial=True)