In [2]:
import sys
import time
import math 
import numpy as np
from collections import defaultdict
from scipy.io import loadmat

def load_news_data(filepath):
    news = loadmat(filepath)

    # From scipy csc matrix to 2d ndarray
    train_data = news['data'].toarray()
    # From 2d ndarray to 1d ndarray
    train_labels = news['labels'].flatten()

    test_data = news['testdata'].toarray()
    test_labels = news['testlabels'].flatten()

    return train_data, train_labels, test_data, test_labels

def create_dictionary(filepath):
    with open(filepath, 'r') as f:
        list_of_words = f.readlines()

    return list_of_words

# This is going to be the pi_y
def calculate_label_count_and_probability(labels):
    label_count = defaultdict(int)

    for label in labels:
        label_count[label] = label_count[label] + 1

    label_probability = {}

    for label in labels:
        label_probability[label] = label_count[label] / len(labels)

    return label_count, label_probability

def check_sum_probability(collection):
    sum_of_probs = 0

    if isinstance(collection, dict):
        for label in label_probability:
            sum_of_probs = sum_of_probs + label_probability[label]
    else:
#         print('MASUK LIST')
        sum_of_probs = np.sum(collection)

    print(sum_of_probs)
    
    return abs(sum_of_probs - 1.0) < 0.001

# This is going to be miu_y_j
# The idea is to calculate separately for each value of y
# This is done by getting train_data only for a particular value of y
# Then, sum over the word index and divide by the number of data
def calculate_word_given_label_probability(train_data, label_count, label_probability, word_list):
    # Return array of shape(20, 61188)
    ret_arr = np.zeros((len(label_count), len(word_list)))
    
    # Iterate over label
    for label in label_count:
        # GOBLOK ANJING TADI train_labels == 1 WOKWOKWOK
        idx, = np.where(train_labels == label)
        
        first_idx = idx[0]
        last_idx = idx[-1]
        
#         print(train_data.shape)
        
        # Get the corresponding train_data
        corr_train_data = train_data[first_idx:last_idx]
#         print(corr_train_data.shape)
        
        # Sum over axis=0, i.e. sum word occurrence
        word_sum = np.sum(corr_train_data, axis=0)
#         print(word_sum.shape)
        word_sum = np.add(word_sum, 1)
#         print(np.max(word_sum) < label_count[label] + 2)
        
        # Divide by label_count + 2 (laplace_smoothing)
#         print(label_count[label])
#         word_prob = np.divide(word_sum, label_count[label]) 
        word_prob = np.divide(word_sum, label_count[label] + 2) 
#         print(word_prob[0:20])
        
        # Assign to ret_arr
        ret_arr[label - 1] = word_prob
        
    return ret_arr

# Check if for all j (words), miu_y_j sums up to 1
# def check_word_probability(word_prob):
#     sum_over_word_probs = np.sum(word_prob, axis=0)
    
#     print(sum_over_word_probs)
    
#     for elem in sum_over_word_probs:
#         if not (abs(elem - 1.0) < 0.001):
#             return False

#     return True

# This is going to be the P(X|Y)
def predict(feature_vector, word_prob, label_probability):
    # Arr of 20 elems
    likelihood_arr = np.zeros(len(label_count))
    
    one_minus_feature_vector = np.add(np.multiply(feature_vector, -1), 1)
    
    one_minus_word_prob = np.add(np.multiply(word_prob, -1), 1)
    
    log_word_prob = np.log(word_prob)
        
    log_one_minus_word_prob = np.log(one_minus_word_prob)
    
    for label in label_probability:
        pi_log_prob = np.log(label_probability[label])
            
        dot_product_one = np.dot(feature_vector, log_word_prob[label-1])
        
        dot_product_two = np.dot(one_minus_feature_vector, log_one_minus_word_prob[label-1])
        
        likelihood_arr[label - 1] = pi_log_prob + dot_product_one + dot_product_two
        
    return np.argmax(likelihood_arr) + 1

def predict_multiple(feature_vectors, word_prob, label_probability):
    # 2d ndarray of shape(len(feature_vectors), 20)
#     likelihood_arr = np.zeros(len(feature_vectors), len(label_count))
    
    one_minus_feature_vectors = np.add(np.multiply(feature_vectors, -1), 1)
    
    one_minus_word_prob = np.add(np.multiply(word_prob, -1), 1)
    
    # Change label_probability to list
    label_prob_list = []
    for i in range(0, len(label_probability)):
        label_prob_list.append(label_probability[i+1])
        
    pi_log_prob = np.log(label_prob_list)
    
    log_word_prob = np.log(word_prob)
        
    log_one_minus_word_prob = np.log(one_minus_word_prob)
    
    # Shape: 7520, 20
    dot_product_one = np.dot(feature_vectors, log_word_prob.transpose())
    
    # Shape: 7520, 20
    dot_product_two = np.dot(one_minus_feature_vectors, log_one_minus_word_prob.transpose())
    
    # pi_log_prob will be broadcasted
    # Shape: 7520, 20
    final_log_probs = dot_product_one + dot_product_two + pi_log_prob
    
    # Shape: 7520, 1
    return np.add(np.argmax(final_log_probs, axis=1), 1)
    
def compute_error_rate_2(test_data, test_labels, word_prob, label_probability):
    pred_result = predict_multiple(test_data, word_prob, label_probability)
    
    # COMPUTING ERROR RATES SO 1 IF WRONG PRED
    pred_verdict = [1 if pred_result[i] != test_labels[i] else 0 for i in range(0, len(test_data))]
    
    return np.sum(pred_verdict) / len(pred_verdict)    

def compute_error_rate(test_data, test_labels, word_prob, label_probability):
    pred_result = np.zeros(len(test_data))
    
    for i in range(0, len(test_data)):
        pred_label = predict(test_data[i], word_prob, label_probability)
        
        if i % 100 == 0:
            print(i)
        
        # COMPUTING ERROR RATES SO 1 IF WRONG PRED
        if pred_label != test_labels[i]:
            pred_result[i] = 1.0
            
    return np.sum(pred_result) / len(pred_result)

In [3]:
train_data, train_labels, test_data, test_labels = load_news_data('news.mat')

word_list = create_dictionary('news.vocab')

label_count, label_probability = calculate_label_count_and_probability(train_labels)

word_prob = calculate_word_given_label_probability(train_data, label_count, label_probability, word_list)

# for elem in word_prob[19]:
#     print(elem)

# print(check_word_probability(word_prob))

# predict(train_data[1000], word_prob, label_probability)

In [50]:
predict_multiple(test_data, word_prob, label_probability)

(7505, 20)
(7505, 20)
20
(7505, 20)


In [17]:
compute_error_rate(train_data, train_labels, word_prob, label_probability)

0.9857130180140208

In [4]:
compute_error_rate_2(test_data, test_labels, word_prob, label_probability)

0.3769487008660893

In [25]:
# compute_error_rate(test_data, test_labels, word_prob, label_probability)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500


0.3769487008660893

In [199]:
x = np.array(([1,2], [3,4]))

In [95]:
x[0]

array([1, 2])

In [96]:
x[-1]

array([3, 4])

In [97]:
np.sum(x, axis=0)

array([4, 6])

In [98]:
np.divide(x, 2)

array([[0.5, 1. ],
       [1.5, 2. ]])

In [105]:
y = np.zeros((2,2))

In [106]:
y

array([[0., 0.],
       [0., 0.]])

In [107]:
y[0] = x[0]

In [108]:
y[0]

array([1., 2.])

In [145]:
y

array([[1., 2.],
       [0., 0.]])

In [149]:
np.add(np.multiply(y, -1), 1)

array([[ 0., -1.],
       [ 1.,  1.]])

In [152]:
np.log(1)

0.0

In [4]:
np.zeros(1)

array([0.])

In [2]:
import numpy as np

a = np.zeros((2,5))

In [3]:
a

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [4]:
b = np.zeros(5)

In [5]:
b

array([0., 0., 0., 0., 0.])

In [6]:
(a + b).shape

(2, 5)

In [7]:
b = np.array([1.0, 1.0, 1.0, 1.0, 1.0])

In [8]:
a + b

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [9]:
a

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [10]:
b

array([1., 1., 1., 1., 1.])

In [12]:
z = np.array(([1,2,3], [4,5,6]))

In [14]:
np.argmax(z, axis=1)

array([2, 2])