In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Excercise 1. Logistic Regression

In [2]:
def load_data(file_name):
    data = np.array(pd.read_csv(file_name, header=None))
    np.random.shuffle(data)
    split_index = int(len(data)*0.8)
    X_train, y_train = data[0:split_index,0:-1], data[0:split_index,-1]
    X_test, y_test = data[split_index:,0:-1], data[split_index:,-1]
    return X_train, y_train, X_test, y_test

In [3]:
class LogisticRegression:
    def __init__(self, num_iter, learning_rate, loss='loss'):
        self.num_iter = num_iter
        self.lr = learning_rate
        self.loss_func = loss
        
    def __sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def __loss(self, h, y):
        return -np.mean(y*np.log(h) + (1-y)*np.log(1-h))
    
    def __cross_entropy(self, h, y):
        return -(y*np.log(h) + (1-y)*np.log(1-h))
    
    def __squared_error(self, h, y):
        h = (h >= 0.5)
        return np.sum((h - y)**2)
    
    def fit(self, X, y):
        self.weights = np.zeros(X.shape[1])
        self.accuracy = []
        for i in range(self.num_iter):
            z = np.dot(X, self.weights)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h-y)) / len(y)
            self.weights = self.weights - self.lr * gradient
            
            if self.loss_func == 'cross_entropy':
                loss = self.__cross_entropy(h, y)
            elif self.loss_func == 'squared error':
                loss = self.__squared_error(h, y)
            else:
                loss = self.__loss(h, y)
            
            self.accuracy.append(1-loss)
        return self
    
    def predict(self, X_test):
        return self.__sigmoid(np.dot(X_test, self.weights)) >= 0.5

In [4]:
X_train, y_train, X_test, y_test = load_data(file_name='spambase/spambase.data')

In [5]:
print(X_train)

[[4.700e-01 9.400e-01 9.400e-01 ... 5.723e+01 6.360e+02 7.440e+02]
 [0.000e+00 3.800e-01 3.800e-01 ... 1.843e+00 1.100e+01 1.180e+02]
 [0.000e+00 0.000e+00 0.000e+00 ... 7.983e+00 7.200e+01 4.950e+02]
 ...
 [0.000e+00 0.000e+00 1.230e+00 ... 1.736e+00 1.000e+01 6.600e+01]
 [0.000e+00 0.000e+00 0.000e+00 ... 1.071e+00 2.000e+00 1.500e+01]
 [0.000e+00 1.090e+00 0.000e+00 ... 4.941e+00 2.500e+01 8.400e+01]]


In [6]:
log_reg = LogisticRegression(100, 0.01)

In [7]:
log_reg.fit(X_train, y_train)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  


<__main__.LogisticRegression at 0x1066f7ba8>

In [8]:
prediction = log_reg.predict(X_test)

In [9]:
accuracy = len(np.where(prediction == y_test))/len(y_test) * 100
print(accuracy)

0.10857763300760044
