In [5]:
import numpy as np

class LogisticRegression:
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=False, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
    
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # weights initialization
        self.theta = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient
            
            if(self.verbose and i % 10 == 0):
#                 print(self.theta[0:10])
                print(i + 1)
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                print(f'loss: {self.__loss(h, y)} \t')

                pred_result = self.predict_new(X)
                
                print('error: ')
                print(self.compute_error_rate(pred_result, y))
                
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold
    
    def predict_new(self, X):
        wx = np.dot(X, self.theta)
        
        return [1 if pred > 0 else -1 for pred in wx]
    
    def compute_error_rate(self, pred_result, labels):
        # We compute the error rate here, so wrong prediction will yield 1
        # and correct prediction will yield 0
        pred_verdict = [1 if pred_result[i] != labels[i] else 0 for i in range(0, len(labels))]

        # Sum the wrong predictions and divide it by total test data
        return np.sum(pred_verdict) / len(pred_verdict)
    
from scipy.io import loadmat

def load_news_data(filepath):
    news = loadmat(filepath)

    # From scipy csc matrix to 2D array
    train_data = news['data'].toarray()
    # From 2D array to 1D array
    train_labels = news['labels'].flatten()

    test_data = news['testdata'].toarray()
    test_labels = news['testlabels'].flatten()

    return train_data, train_labels, test_data, test_labels

In [None]:
model = LogisticRegression(lr=1.0, num_iter=10000, verbose=True)

train_data, train_labels, test_data, test_labels = load_news_data('news_binary.mat')

model.fit(train_data, train_labels)

1
loss: -0.01794606997592052 	
error: 
0.5194848084544254
11
loss: -5.741865899076585 	
error: 
0.4692866578599736
21
loss: nan 	
error: 
0.46136063408190225




31
loss: nan 	
error: 
0.4570673712021136
41
loss: nan 	
error: 
0.45409511228533683
51
loss: nan 	
error: 
0.4534346103038309
61
loss: nan 	
error: 
0.45310435931307796
71
loss: nan 	
error: 
0.45310435931307796
81
loss: nan 	
error: 
0.45277410832232495
91
loss: nan 	
error: 
0.45277410832232495
101
loss: nan 	
error: 
0.45277410832232495
111
loss: nan 	
error: 
0.45277410832232495
121
loss: nan 	
error: 
0.452443857331572
131
loss: nan 	
error: 
0.452443857331572


  from ipykernel import kernelapp as app


141
loss: nan 	
error: 
0.452443857331572
151
loss: nan 	
error: 
0.45211360634081904
161
loss: nan 	
error: 
0.45211360634081904
171
loss: nan 	
error: 
0.45211360634081904
181
loss: nan 	
error: 
0.45211360634081904
191
loss: nan 	
error: 
0.45211360634081904
201
loss: nan 	
error: 
0.45211360634081904
211
loss: nan 	
error: 
0.45211360634081904
221
loss: nan 	
error: 
0.45211360634081904
231
loss: nan 	
error: 
0.45211360634081904
241
loss: nan 	
error: 
0.45211360634081904
251
loss: nan 	
error: 
0.45211360634081904
261
loss: nan 	
error: 
0.45211360634081904
271
loss: nan 	
error: 
0.45211360634081904
281
loss: nan 	
error: 
0.45211360634081904
291
loss: nan 	
error: 
0.45211360634081904
301
loss: nan 	
error: 
0.45178335535006603
311
loss: nan 	
error: 
0.45178335535006603
321
loss: nan 	
error: 
0.45178335535006603
331
loss: nan 	
error: 
0.45178335535006603
341
loss: nan 	
error: 
0.45178335535006603
351
loss: nan 	
error: 
0.45178335535006603
361
loss: nan 	
error: 
0.45211360

error: 
0.45178335535006603
1991
loss: nan 	
error: 
0.45178335535006603
2001
loss: nan 	
error: 
0.45178335535006603
2011
loss: nan 	
error: 
0.45178335535006603
2021
loss: nan 	
error: 
0.45178335535006603
2031
loss: nan 	
error: 
0.45178335535006603
2041
loss: nan 	
error: 
0.45178335535006603
2051
loss: nan 	
error: 
0.45178335535006603
2061
loss: nan 	
error: 
0.45178335535006603
2071
loss: nan 	
error: 
0.45178335535006603
2081
loss: nan 	
error: 
0.45178335535006603
2091
loss: nan 	
error: 
0.45178335535006603
2101
loss: nan 	
error: 
0.45178335535006603
2111
loss: nan 	
error: 
0.45178335535006603
2121
loss: nan 	
error: 
0.45178335535006603
2131
loss: nan 	
error: 
0.45178335535006603
2141
loss: nan 	
error: 
0.45178335535006603
2151
loss: nan 	
error: 
0.45178335535006603
2161
loss: nan 	
error: 
0.45178335535006603
2171
loss: nan 	
error: 
0.45178335535006603
2181
loss: nan 	
error: 
0.45178335535006603
2191
loss: nan 	
error: 
0.45178335535006603
2201
loss: nan 	
error: 
0.

3811
loss: nan 	
error: 
0.45178335535006603
3821
loss: nan 	
error: 
0.45178335535006603
3831
loss: nan 	
error: 
0.45178335535006603
3841
loss: nan 	
error: 
0.45178335535006603
3851
loss: nan 	
error: 
0.45178335535006603
3861
loss: nan 	
error: 
0.45178335535006603
3871
loss: nan 	
error: 
0.45178335535006603
3881
loss: nan 	
error: 
0.45178335535006603
3891
loss: nan 	
error: 
0.45178335535006603
3901
loss: nan 	
error: 
0.45178335535006603
3911
loss: nan 	
error: 
0.45178335535006603
3921
loss: nan 	
error: 
0.45178335535006603
3931
loss: nan 	
error: 
0.45178335535006603
3941
loss: nan 	
error: 
0.45178335535006603
3951
loss: nan 	
error: 
0.45178335535006603
3961
loss: nan 	
error: 
0.45178335535006603
3971
loss: nan 	
error: 
0.45178335535006603
3981
loss: nan 	
error: 
0.45178335535006603
3991
loss: nan 	
error: 
0.45178335535006603
4001
loss: nan 	
error: 
0.45178335535006603
4011
loss: nan 	
error: 
0.45178335535006603
4021
loss: nan 	
error: 
0.45178335535006603
4031
loss:

error: 
0.45178335535006603
5641
loss: nan 	
error: 
0.45178335535006603
5651
loss: nan 	
error: 
0.45178335535006603
5661
loss: nan 	
error: 
0.45178335535006603
5671
loss: nan 	
error: 
0.45178335535006603
5681
loss: nan 	
error: 
0.45178335535006603
