In [213]:
import sys
import time
import math 
import numpy as np
from collections import defaultdict
from scipy.io import loadmat

def load_news_data(filepath):
    news = loadmat(filepath)

    # From scipy csc matrix to 2d ndarray
    train_data = news['data'].toarray()
    # From 2d ndarray to 1d ndarray
    train_labels = news['labels'].flatten()

    test_data = news['testdata'].toarray()
    test_labels = news['testlabels'].flatten()

    return train_data, train_labels, test_data, test_labels

def create_dictionary(filepath):
    with open(filepath, 'r') as f:
        list_of_words = f.readlines()

    return list_of_words

# This is going to be the pi_y
def calculate_label_count_and_probability(labels):
    label_count = defaultdict(int)

    for label in labels:
        label_count[label] = label_count[label] + 1

    label_probability = {}

    for label in labels:
        label_probability[label] = label_count[label] / len(labels)

    return label_count, label_probability

def check_sum_probability(collection):
    sum_of_probs = 0

    if isinstance(collection, dict):
        for label in label_probability:
            sum_of_probs = sum_of_probs + label_probability[label]
    else:
#         print('MASUK LIST')
        sum_of_probs = np.sum(collection)

    print(sum_of_probs)
    
    return abs(sum_of_probs - 1.0) < 0.001

# This is going to be miu_y_j
# The idea is to calculate separately for each value of y
# This is done by getting train_data only for a particular value of y
# Then, sum over the word index and divide by the number of data
def calculate_word_given_label_probability(train_data, train_labels, word_list):
    label_count, label_probability = calculate_label_count_and_probability(train_labels)
    
    # Return array of shape(20, 61188)
    ret_arr = np.zeros((len(label_count), len(word_list)))
    
    # Iterate over label
    for label in label_count:
        # GOBLOK ANJING TADI train_labels == 1 WOKWOKWOK
        idx, = np.where(train_labels == label)
        
        first_idx = idx[0]
        last_idx = idx[-1]
        
#         print(train_data.shape)
        
        # Get the corresponding train_data
        corr_train_data = train_data[first_idx:last_idx]
#         print(corr_train_data.shape)
        
        # Sum over axis=0, i.e. sum word occurrence
        word_sum = np.sum(corr_train_data, axis=0)
#         print(word_sum.shape)
        word_sum = np.add(word_sum, 1)
#         print(np.max(word_sum) < label_count[label] + 2)
        
        # Divide by label_count + 2 (laplace_smoothing)
#         print(label_count[label])
#         word_prob = np.divide(word_sum, label_count[label]) 
        word_prob = np.divide(word_sum, label_count[label] + 2) 
#         print(word_prob[0:20])
        
        # Assign to ret_arr
        ret_arr[label - 1] = word_prob
        
    return ret_arr

# Check if for all j (words), miu_y_j sums up to 1
# def check_word_probability(word_prob):
#     sum_over_word_probs = np.sum(word_prob, axis=0)
    
#     print(sum_over_word_probs)
    
#     for elem in sum_over_word_probs:
#         if not (abs(elem - 1.0) < 0.001):
#             return False

#     return True

# This is going to be the P(X|Y)
def predict(feature_vector, word_prob, train_labels):
    label_count, label_probability = calculate_label_count_and_probability(train_labels)
    
    # Arr of 20 elems
    likelihood_arr = np.zeros(len(label_count))
    
    print('Feature vector')
    print(feature_vector)
    one_minus_feature_vector = np.add(np.multiply(feature_vector, -1), 1)
    
    print('1 - Feature vector')
    print(one_minus_feature_vector)
    
    print('word_prob')
    print(word_prob[0])
    
    print('1 - word_prob')
    one_minus_word_prob = np.add(np.multiply(word_prob, -1), 1)
#     for elem in one_minus_word_prob[19]:
#         print(elem)
#     print(one_minus_word_prob[0])
    
    # Logs
    log_word_prob = np.log(word_prob)
    print('log_word_prob')
    print(log_word_prob[0])
    
    # NEGATIVE VALUES IN LOG
        
    log_one_minus_word_prob = np.log(one_minus_word_prob)
    print('log_one_minus_word_prob')
#     for elem in log_one_minus_word_prob[19]:
#         print(elem)
#         print(log_one_minus_word_prob[0])
    
    for label in label_probability:
        pi_log_prob = np.log(label_probability[label])
        if label == 19 or label == 20:
            print('pi_log_prob')
            print(pi_log_prob)
            
        dot_product_one = np.dot(feature_vector, log_word_prob[label-1])
        if label == 19 or label == 20:
            print('dot_product_one')
            print(dot_product_one)
        
        dot_product_two = np.dot(one_minus_feature_vector, log_one_minus_word_prob[label-1])
        if label == 19 or label == 20:
            print(log_one_minus_word_prob[label-1])
            print('dot_product_two')
            print(dot_product_two)
        
        likelihood_arr[label - 1] = pi_log_prob + dot_product_one + dot_product_two
        
    print(likelihood_arr)
    # AGAIN, add 1 to get label from index
    return np.argmax(likelihood_arr) + 1

In [215]:
train_data, train_labels, test_data, test_labels = load_news_data('news.mat')

word_list = create_dictionary('news.vocab')

word_prob = calculate_word_given_label_probability(train_data, train_labels, word_list)

# for elem in word_prob[19]:
#     print(elem)

# print(check_word_probability(word_prob))

predict(train_data[1000], word_prob, train_labels)

Feature vector
[0. 0. 0. ... 0. 0. 0.]
1 - Feature vector
[1. 1. 1. ... 1. 1. 1.]
word_prob
[0.01244813 0.08091286 0.17219917 ... 0.00207469 0.00207469 0.00207469]
1 - word_prob
log_word_prob
[-4.38618464 -2.51438247 -1.75910351 ... -6.17794411 -6.17794411
 -6.17794411]
log_one_minus_word_prob
pi_log_prob
-3.189926319726094
dot_product_one
-38.7538658138926
[-0.00214823 -0.0688396  -0.00214823 ... -0.00214823 -0.00214823
 -0.00214823]
dot_product_two
-331.2225635420257
pi_log_prob
-3.400221728562455
dot_product_one
-38.401284278883985
[-0.00264901 -0.08845542 -0.02409755 ... -0.00264901 -0.00264901
 -0.00264901]
dot_product_two
-334.62731970018376
[-338.79766664 -242.81857707 -246.06343715 -247.65848721 -244.7364244
 -261.80117251 -225.89305206 -273.6445026  -265.29455211 -266.27006394
 -285.52770531 -326.46372171 -253.38570367 -297.52491586 -291.91461697
 -337.14562134 -343.45268277 -370.12750408 -373.16635568 -376.42882571]


7

In [199]:
x = np.array(([1,2], [3,4]))

In [95]:
x[0]

array([1, 2])

In [96]:
x[-1]

array([3, 4])

In [97]:
np.sum(x, axis=0)

array([4, 6])

In [98]:
np.divide(x, 2)

array([[0.5, 1. ],
       [1.5, 2. ]])

In [105]:
y = np.zeros((2,2))

In [106]:
y

array([[0., 0.],
       [0., 0.]])

In [107]:
y[0] = x[0]

In [108]:
y[0]

array([1., 2.])

In [145]:
y

array([[1., 2.],
       [0., 0.]])

In [149]:
np.add(np.multiply(y, -1), 1)

array([[ 0., -1.],
       [ 1.,  1.]])

In [152]:
np.log(1)

0.0