# Use Naive Bayes to predict the News Category

In [1]:
import numpy as np
import random
import os
import csv

from NewsCategoryData import NewsCategory
from NewsCategoryData import NewsCategoryTrainTestSet
from NewsCategoryData import LABEL_LIST

# File name of the raw dataset (JSON format).
jason_data_file_name  = "News_Category_Dataset_v2.json"
# Presumably the CSV holding the train/test split -- verify against NewsCategoryData.
# NOTE(review): neither name is referenced by the cells visible in this notebook.
train_test_dataset_file_name = "news_cat_train_test_data.csv"


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wangdi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 1st step: count the words by category

The function below returns a dictionary that records how many times each word is used in each category of the given dataset. Every count starts from 0.1 instead of 0, which is why all the values in the sample end in .1.

A sample of the data:

"there":[5.1, 65.1, 8.1, 13.1, 136.1, 11.1, 12.1, 15.1, 16.1, 24.1, 18.1, 20.1, 32.1, 11.1, 8.1, 11.1, 8.1, 7.1, 3.1, 5.1, 13.1, 6.1, 12.1, 8.1, 10.1, 40.1, 4.1, 0.1, 15.1, 5.1, 5.1, 73.1, 37.1, 1.1, 31.1, 10.1, 7.1, 15.1, 6.1, 2.1, 0.1]

## 2nd step: generate word matrix and word list
Based on the word dictionary created in the 1st step, this step calculates the probability of each word being used in each category. It returns a NumPy array of shape (number of words x number of classes) together with a word list, which is a dictionary mapping each word to its row index in the array. For example, if word_list["there"] = 0, then word_matrix[0] is the list of probabilities of "there" in the 41 categories.


In [2]:
def count_words_by_category(data, label, class_num, smoothing=0.1):
    """Count how often each word occurs in each category.

    Words are lower-cased before counting, so "The" and "the" share one entry.

    Args:
        data: Sequence of records; each record is an iterable of word strings.
        label: Sequence of integer class indices, one per record in ``data``.
            Each index must be a valid position in a list of ``class_num``
            items.
        class_num: Number of categories, i.e. the length of every count list.
        smoothing: Starting pseudo-count placed in every (word, category)
            cell so that no cell is ever zero. Defaults to 0.1, the value
            that used to be hard-coded.

    Returns:
        dict mapping each lower-cased word to a list of ``class_num`` numbers:
        ``smoothing`` plus the number of times the word appears in records of
        that category. Keys appear in order of first occurrence.

    Raises:
        AssertionError: If ``data`` and ``label`` differ in length.
    """
    assert len(data) == len(label), "data and label must have the same length"
    word_dict = {}
    for words, category in zip(data, label):
        for word in words:
            key = word.lower()  # normalise once instead of on every lookup
            counts = word_dict.get(key)
            if counts is None:
                # First sighting of this word: start every category at the
                # pseudo-count.
                counts = word_dict[key] = [smoothing] * class_num
            counts[category] += 1
    return word_dict

def generate_word_matrix(word_dict):
    """Convert per-word category counts into a column-normalised matrix.

    Args:
        word_dict: dict mapping each word to its list of per-category counts.

    Returns:
        A ``(word_matrix, word_list)`` pair. ``word_matrix`` is a NumPy array
        with one row per word, in the iteration order of ``word_dict``, where
        every entry has been divided by the sum of its column. ``word_list``
        is a dict mapping each word to the index of its row.
    """
    word_list = {word: row for row, word in enumerate(word_dict)}
    counts = np.asarray(list(word_dict.values()))
    # Dividing by the per-column totals makes each category column sum to 1.
    column_totals = counts.sum(axis=0)
    return counts / column_totals, word_list

def save_word_matrix(file_name, word_matrix, word_list):
    """Write the word matrix to a CSV file, one row per word.

    Each row is the word followed by the values of its matrix row.

    Args:
        file_name: Path of the CSV file to create or overwrite.
        word_matrix: Indexable collection of rows (e.g. a 2-D NumPy array).
        word_list: dict mapping each word to the index of its row in
            ``word_matrix``.
    """
    # newline="" lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows. The with-block closes
    # the file even if a write fails part-way through.
    with open(file_name, "w", newline="") as fo:
        writer = csv.writer(fo)
        for key, row_index in word_list.items():
            writer.writerow([key] + list(word_matrix[row_index]))

## Use the JSON-format raw data to generate the word counts

In [3]:
# Settings used to load the data and size the per-word count lists.
batch_size, max_length, num_class = 64, 100, 41

data = NewsCategory(batch_size=batch_size, max_length=max_length)
word_count_dic = count_words_by_category(data.data, data.label, num_class)

# Vocabulary size, followed by a single sample entry as a sanity check.
print(len(word_count_dic))
for item, counts in word_count_dic.items():
    print(item, counts)
    break

file_name = "naive_bayes_word_matrix.csv"

FileNotFoundError: [Errno 2] No such file or directory: 'News_Category_Dataset_v2.json'

## Load train and test dataset to generate the word_matrix and save as files 

In [9]:
# Settings used to load the data and size the per-word count lists.
batch_size, max_length, num_class = 64, 100, 41

data = NewsCategoryTrainTestSet(batch_size=batch_size, max_length=max_length)
word_count_dic = count_words_by_category(data.train_data, data.train_label, num_class)

# Vocabulary size, followed by a single sample entry as a sanity check.
print(len(word_count_dic))
for item, counts in word_count_dic.items():
    print(item, ":", counts)
    break

# Normalise the training counts, report the dimensions, then persist to CSV.
word_matrix, word_list = generate_word_matrix(word_count_dic)
print(word_matrix.shape)
print(len(word_list))

file_name = "naive_bayes_word_matrix_ver1.csv"
save_word_matrix(file_name, word_matrix, word_list)

Total 200847 recorders are read. 180787 train data and 20060 test data.
55530
there : [5.1, 61.1, 7.1, 13.1, 126.1, 10.1, 12.1, 13.1, 15.1, 22.1, 18.1, 18.1, 32.1, 10.1, 7.1, 9.1, 7.1, 6.1, 2.1, 5.1, 11.1, 5.1, 10.1, 8.1, 10.1, 35.1, 4.1, 0.1, 15.1, 5.1, 5.1, 67.1, 31.1, 1.1, 29.1, 9.1, 7.1, 14.1, 4.1, 2.1, 0.1]
(55530, 41)
55530


In [7]:
def load_word_matrix(file_name):
    """Read a word-matrix CSV whose rows are ``word, value_0, value_1, ...``.

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        A ``(word_matrix, word_list)`` pair: a NumPy array with one row of
        floats per CSV row, and a dict mapping each word (first column) to the
        index of its row in that array.
    """
    rows = []
    index_by_word = {}
    # newline="" is what the csv module expects for files it parses; the
    # with-block guarantees the handle is closed.
    with open(file_name, "r", newline="") as fo:
        for row in csv.reader(fo):
            if not row:
                # Blank lines come back as empty lists; skip them rather than
                # fail on row[0].
                continue
            index_by_word[row[0]] = len(rows)
            rows.append([float(value) for value in row[1:]])
    return np.array(rows), index_by_word


# The parsing lives in a function so its loop variables stay local; the
# previous inline loop rebound the notebook-level name `data` to a string.
# NOTE(review): this file name differs from the "_ver1" file written by the
# cell above -- confirm which file is meant to be loaded.
word_matrix_2, word_list_2 = load_word_matrix("naive_bayes_word_matrix.csv")
print(len(word_list_2))
print(word_matrix_2.shape)

58834
(58834, 41)


In [7]:
import numpy as np

# Score in log space: multiplying many probabilities smaller than 1 can
# underflow to 0.0, which would make every category tie. Summing logarithms
# ranks the categories the same way without that risk.
# Assumes every entry of word_matrix is > 0 (true when the counts start from a
# positive pseudo-count) -- TODO confirm.
log_word_matrix = np.log(word_matrix)

accuracy = 0
icount = 0
prediction = []
# NOTE(review): this scores `data.data`; confirm that it is the held-out set
# and not the records the matrix was built from.
for title in data.data:
    # Row indices of the title's words; words missing from the vocabulary are
    # skipped instead of raising KeyError.
    rows = [
        word_list[key]
        for key in (word.lower() for word in title)
        if key in word_list
    ]
    # With no known words the score is all zeros, so argmax falls back to
    # category 0 (the same result the plain product gave for an empty title).
    y = log_word_matrix[np.asarray(rows, dtype=np.intp)].sum(axis=0)
    prediction.append(y.argmax())
    if y.argmax() == data.label[icount]:
        accuracy += 1
    icount += 1

total = len(data.label)
# Guard the percentage against an empty dataset.
accuracy_rate = accuracy / total * 100 if total else 0.0
print("Accuracy rate = %.2f %%" % accuracy_rate)

Accuracy rate = 77.40 %
