In [26]:
import time
import math
import numpy as np

from collections import defaultdict
from scipy.io import loadmat

def load_news_data(filepath):
    news = loadmat(filepath)

    # From scipy csc matrix to 2D array
    train_data = news['data'].toarray()
    # From 2D array to 1D array
    train_labels = news['labels'].flatten()

    test_data = news['testdata'].toarray()
    test_labels = news['testlabels'].flatten()

    return train_data, train_labels, test_data, test_labels

def create_dictionary(filepath='news.vocab'):
    with open(filepath, 'r') as f:
        raw_list = f.readlines()

    list_of_words = []
    for elem in raw_list:
        list_of_words.append(elem.strip('\n'))

    return list_of_words

# def sigmoid(x):
#     print(type(x))
#     minus_x = np.multiply(x, -1)
    
#     return np.power((1 + np.exp(minus_x)), -1)

# https://stackoverflow.com/questions/51976461/optimal-way-of-defining-a-numerically-stable-sigmoid-function-for-a-list-in-pyth
# def sigmoid(x):
#     return np.where(x >= 0, 
#                     1 / (1 + np.exp(-x)), 
#                     np.exp(x) / (1 + np.exp(x)))

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# def update_weight(train_data, train_labels, weights):
#     Aw = np.dot(train_data, weights)
#     sig_Aw = self.__sigmoid(Aw)
#     gradient = np.dot(X.T, (h - y)) / y.size
#     self.theta -= self.lr * gradient

def calculate_negative_gradients(train_data, train_labels, weights):
    A = train_data
    b = train_labels
    w = weights
    
    Aw = np.dot(A, w)
    sigmoid_Aw = sigmoid(Aw)
    
    diff = sigmoid_Aw - b
    
    total_error = np.dot(A.transpose(), diff)
    
    return total_error / len(A)
        
def update_weights(weights, negative_gradients, learning_rate=1.0):
#     print(weights.shape)
#     print(negative_gradients.shape)
    return weights - negative_gradients

# For binary experiment only
# Question: Do we use bias in the logistic regression
def predict(feature_vectors, weights, bias=0):
    wx = np.dot(feature_vectors, weights)
#     b = bias
#     y = wx + b
    print('CALL PREDICT')
#     y = sigmoid(wx)

#     return [1 if pred > 0 else -1 for pred in y]
    return [1 if pred > 0 else -1 for pred in wx]

def compute_error_rate(pred_result, labels):
    # We compute the error rate here, so wrong prediction will yield 1
    # and correct prediction will yield 0
    pred_verdict = [1 if pred_result[i] != labels[i] else 0 for i in range(0, len(labels))]
    
    print(np.sum(pred_verdict))
    # Sum the wrong predictions and divide it by total test data
    return np.sum(pred_verdict) / len(pred_verdict)

In [27]:
# Get the data
train_data, train_labels, test_data, test_labels = load_news_data('news_binary.mat')

# train_data = np.array(train_data, dtype=np.float128)
# train_labels = np.array(train_labels, dtype=np.float128)
# print(train_labels)

# test_data = np.array(test_data, dtype=np.float128)
# test_labels = np.array(test_labels, dtype=np.float128)

# train_data = np.array(train_data)
# train_labels = np.array(train_labels)

# test_data = np.array(test_data)
# test_labels = np.array(test_labels)

In [29]:
# Set w_j(t=0) = 0 for all j [0, 61188)
# weights = np.zeros(len(train_data[0]), dtype=np.float128)
weights = np.zeros(len(train_data[0]))

for t in range(0, 2000):
    print('Iteration: ' + str(t+1))
    
    pred_result = predict(train_data, weights)
    
#     print(pred_result[0:10])
#     print(train_labels[0:10])
    print('WEIGHT')
    print(weights[0:10])
    
    error_rate = compute_error_rate(pred_result, train_labels)
    
    if error_rate == 0.0:
        print('Error rate 0')
        break
        
    print(error_rate)
        
    negative_gradients = calculate_negative_gradients(train_data, train_labels, weights)
    
    print(negative_gradients)
    
    weights = update_weights(weights, negative_gradients, 1.0)

Iteration: 1
CALL PREDICT
WEIGHT
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
1573
0.5194848084544254
[0.00082563 0.05085865 0.05102378 ... 0.         0.         0.        ]
Iteration: 2
CALL PREDICT
WEIGHT
[-0.00082563 -0.05085865 -0.05102378 -0.001321   -0.02625495 -0.02559445
 -0.00412814  0.00511889 -0.01700793 -0.05333554]
1573
0.5194848084544254
[-0.00165109  0.00598392  0.03401926 ...  0.          0.
  0.        ]
Iteration: 3
CALL PREDICT
WEIGHT
[ 0.00082546 -0.05684257 -0.08504304  0.00395559 -0.03848728  0.01825425
 -0.00511893  0.01667705 -0.02132178 -0.0886747 ]
1572
0.5191545574636723
[-0.00165083  0.00599428  0.03402069 ...  0.          0.
  0.        ]
Iteration: 4
CALL PREDICT
WEIGHT
[ 0.00247629 -0.06283685 -0.11906374  0.00922139 -0.05072734  0.06205189
 -0.00610982  0.02823257 -0.02563744 -0.1240153 ]
1572
0.5191545574636723
[-0.00164664  0.00601955  0.03402304 ...  0.          0.
  0.        ]
Iteration: 5
CALL PREDICT
WEIGHT
[ 0.00412293 -0.0688564  -0.15308678  0.01445756 -0.0

[0.00042481 0.01302631 0.03401585 ... 0.         0.         0.        ]
Iteration: 34
CALL PREDICT
WEIGHT
[-0.01000126 -0.43019302 -1.13963897  0.10480341 -0.42070739  0.79253777
 -0.05455627  0.26721592 -0.21074088 -1.18422418]
1381
0.4560766182298547
[0.0004197  0.01303372 0.03401585 ... 0.         0.         0.        ]
Iteration: 35
CALL PREDICT
WEIGHT
[-0.01042095 -0.44322674 -1.17365482  0.10775462 -0.43297483  0.81383216
 -0.05621479  0.27504383 -0.21694037 -1.21956104]
1380
0.4557463672391017
[0.00041494 0.01304078 0.03401585 ... 0.         0.         0.        ]
Iteration: 36
CALL PREDICT
WEIGHT
[-0.01083589 -0.45626753 -1.20767068  0.11070307 -0.4452404   0.83509765
 -0.05787256  0.28287727 -0.22313404 -1.2548979 ]
1377
0.4547556142668428
[0.0004105  0.0130475  0.03401585 ... 0.         0.         0.        ]
Iteration: 37
CALL PREDICT
WEIGHT
[-0.01124639 -0.46931503 -1.24168653  0.11364878 -0.4575042   0.85633581
 -0.05952966  0.29071582 -0.22932224 -1.29023475]
1377
0.45475

[0.00034894 0.0131473  0.03401585 ... 0.         0.         0.        ]
Iteration: 67
CALL PREDICT
WEIGHT
[-0.02237696 -0.86264624 -2.26216209  0.20105617 -0.8249201   1.48558266
 -0.10911688  0.5271249  -0.41348082 -2.35034043]
1372
0.45310435931307796
[0.00034804 0.01314881 0.03401585 ... 0.         0.         0.        ]
Iteration: 68
CALL PREDICT
WEIGHT
[-0.022725   -0.87579505 -2.29617794  0.20394698 -0.83715666  1.50639322
 -0.11076835  0.53502673 -0.41959029 -2.38567729]
1372
0.45310435931307796
[0.00034717 0.01315027 0.03401585 ... 0.         0.         0.        ]
Iteration: 69
CALL PREDICT
WEIGHT
[-0.02307216 -0.88894532 -2.33019379  0.20683681 -0.84939277  1.52719715
 -0.1124198   0.54292924 -0.42569864 -2.42101414]
1372
0.45310435931307796
[0.00034633 0.01315167 0.03401585 ... 0.         0.         0.        ]
Iteration: 70
CALL PREDICT
WEIGHT
[-0.02341849 -0.90209699 -2.36420965  0.20972571 -0.86162842  1.54799467
 -0.11407124  0.55083241 -0.43180591 -2.456351  ]
1372
0.45

CALL PREDICT
WEIGHT
[-0.03353523 -1.29712172 -3.38468521  0.29608576 -1.22853983  2.1697858
 -0.16361043  0.78812099 -0.61466449 -3.51645668]
1371
0.45277410832232495
[0.00033086 0.01317908 0.03401585 ... 0.         0.         0.        ]
Iteration: 101
CALL PREDICT
WEIGHT
[-0.0338661  -1.3103008  -3.41870106  0.29895748 -1.24076625  2.19046156
 -0.16526169  0.79603465 -0.62075109 -3.55179354]
1371
0.45277410832232495
[0.00033058 0.01317963 0.03401585 ... 0.         0.         0.        ]
Iteration: 102
CALL PREDICT
WEIGHT
[-0.03419668 -1.32348043 -3.45271691  0.30182892 -1.25299247  2.21113504
 -0.16691295  0.80394848 -0.62683729 -3.58713039]
1371
0.45277410832232495
[0.0003303  0.01318016 0.03401585 ... 0.         0.         0.        ]
Iteration: 103
CALL PREDICT
WEIGHT
[-0.03452698 -1.3366606  -3.48673276  0.30470007 -1.26521851  2.23180629
 -0.16856421  0.81186246 -0.63292308 -3.62246725]
1371
0.45277410832232495
[0.00033003 0.01318068 0.03401585 ... 0.         0.         0.      

1370
0.452443857331572
[0.00032475 0.01319103 0.03401585 ... 0.         0.         0.        ]
Iteration: 133
CALL PREDICT
WEIGHT
[-0.04433889 -1.7322557  -4.50720833  0.39074318 -1.63193388  2.85117376
 -0.21810192  1.04933343 -0.81534811 -4.68257293]
1370




0.452443857331572
[0.00032463 0.01319126 0.03401585 ... 0.         0.         0.        ]
Iteration: 134
CALL PREDICT
WEIGHT
[-0.04466352 -1.74544696 -4.54122418  0.39360923 -1.64415607  2.87180056
 -0.21975317  1.05725036 -0.82142496 -4.71790979]
1370
0.452443857331572
[0.00032451 0.01319148 0.03401585 ... 0.         0.         0.        ]
Iteration: 135
CALL PREDICT
WEIGHT
[-0.04498804 -1.75863844 -4.57524003  0.39647519 -1.65637819  2.89242649
 -0.22140443  1.06516734 -0.8275016  -4.75324664]
1370
0.452443857331572
[0.0003244  0.0131917  0.03401585 ... 0.         0.         0.        ]
Iteration: 136
CALL PREDICT
WEIGHT
[-0.04531244 -1.77183014 -4.60925588  0.39934107 -1.66860022  2.91305158
 -0.22305568  1.07308438 -0.83357803 -4.7885835 ]
1370
0.452443857331572
[0.00032429 0.01319191 0.03401585 ... 0.         0.         0.        ]
Iteration: 137
CALL PREDICT
WEIGHT
[-0.04563672 -1.78502205 -4.64327173  0.40220687 -1.68082219  2.93367586
 -0.22470694  1.08100148 -0.83965426 -4.82

CALL PREDICT
WEIGHT
[-0.05500218 -2.16765516 -5.62973144  0.4852914  -2.03523422  3.53151448
 -0.27259333  1.31061579 -1.01578989 -5.84868918]
1369
0.45211360634081904
[0.00032188 0.01319589 0.03401585 ... 0.         0.         0.        ]
Iteration: 167
CALL PREDICT
WEIGHT
[-0.05532406 -2.18085105 -5.6637473   0.48815585 -2.04745468  3.55212292
 -0.27424459  1.31853404 -1.02186131 -5.88402603]
1369
0.45211360634081904
[0.00032182 0.01319596 0.03401585 ... 0.         0.         0.        ]
Iteration: 168
CALL PREDICT
WEIGHT
[-0.05564588 -2.19404702 -5.69776315  0.49102027 -2.0596751   3.57273105
 -0.27589584  1.32645231 -1.02793261 -5.91936289]
1369
0.45211360634081904
[0.00032176 0.01319603 0.03401585 ... 0.         0.         0.        ]
Iteration: 169
CALL PREDICT
WEIGHT
[-0.05596764 -2.20724305 -5.731779    0.49388467 -2.0718955   3.59333888
 -0.2775471   1.33437061 -1.03400377 -5.95469975]
1369
0.45211360634081904
[0.00032171 0.0131961  0.03401585 ... 0.         0.         0.     

CALL PREDICT
WEIGHT
[-0.0655989  -2.60314326 -6.75225456  0.57981056 -2.43849632  4.21147215
 -0.32708475  1.57193017 -1.21608707 -7.01480543]
1369
0.45211360634081904
[0.00032044 0.01319696 0.03401585 ... 0.         0.         0.        ]
Iteration: 200
CALL PREDICT
WEIGHT
[-0.06591935 -2.61634022 -6.78627041  0.58267464 -2.45071608  4.23207427
 -0.328736    1.57984912 -1.22215499 -7.05014228]
1369
0.45211360634081904
[0.00032041 0.01319696 0.03401585 ... 0.         0.         0.        ]
Iteration: 201
CALL PREDICT
WEIGHT
[-0.06623976 -2.62953719 -6.82028626  0.58553872 -2.46293582  4.2526763
 -0.33038726  1.58776809 -1.22822282 -7.08547914]
1369
0.45211360634081904
[0.00032038 0.01319696 0.03401585 ... 0.         0.         0.        ]
Iteration: 202
CALL PREDICT
WEIGHT
[-0.06656014 -2.64273415 -6.85430212  0.5884028  -2.47515555  4.27327823
 -0.33203851  1.59568707 -1.23429056 -7.12081599]
1369
0.45211360634081904
[0.00032035 0.01319696 0.03401585 ... 0.         0.         0.      

KeyboardInterrupt: 

In [4]:
np.multiply(np.array([1,2]),2)

array([2, 4])

In [9]:
import collections
collections.Counter(train_labels)

Counter({1: 1573, -1: 1455})