In [3]:
import os
from collections import defaultdict
from pandas import read_table
import numpy as np
import math

In [4]:
from ekonlpy.sentiment import MPCK
# Shared MPCK instance; NaiveBayesClassifier.count_words calls its tokenize() and ngramize() methods.
mpck = MPCK()

In [5]:
class NaiveBayesClassifier:
    """Score tokens/n-grams from labeled news files and split them into positive/negative word lists.

    Workflow (see ``train``): load ``(date, label)`` rows from a CSV, count how often
    each token/n-gram occurs in the news files whose name carries that date, turn the
    counts into smoothed per-class scores, and append every feature to either
    ``positive.csv`` or ``negative.csv``.

    Tokenization relies on the module-level ``mpck`` object (an ``ekonlpy`` ``MPCK``
    instance created in an earlier cell).

    Attributes:
        k: additive smoothing constant used by ``word_probabilities``.
        word_probs: list of ``(word, p_positive, p_negative)`` tuples filled by ``train``.
    """

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def load_corpusData(self, path):
        """Read a header-less, comma-separated file and return its rows as a NumPy array.

        Args:
            path: path of the UTF-8 encoded CSV file.

        Returns:
            ``numpy.ndarray`` with one row per CSV line. ``count_words`` and ``train``
            unpack each row as ``(dataDate, label)``, so the file is expected to have
            exactly two columns.
        """
        corpusData = read_table(path, sep=',', header=None, names=None, encoding='utf-8')
        return np.array(corpusData)

    def count_words(self, training_set, news_dir='data/news/testNews/'):
        """Count token and n-gram occurrences per class over the dated news files.

        Args:
            training_set: iterable of ``(dataDate, label)`` pairs. A label equal to ``1``
                is counted as class 0 (positive); any other label as class 1 (negative).
            news_dir: directory holding the news files. A file belongs to a row when
                characters ``[5:15]`` of its file name equal ``dataDate``.

        Returns:
            ``defaultdict`` mapping each feature to ``[positive_count, negative_count]``.
        """
        counts = defaultdict(lambda: [0, 0])

        # Index the directory once by the date slice of the file name, instead of
        # rescanning the whole listing for every training row. Listing order is kept
        # inside each bucket, so features are still encountered in the same order.
        files_by_date = defaultdict(list)
        for file_name in os.listdir(news_dir):
            files_by_date[file_name[5:15]].append(file_name)

        for dataDate, label in training_set:
            class_idx = 0 if label == 1 else 1
            for file_name in files_by_date.get(dataDate, ()):
                # Context manager so the file handle is always closed.
                with open(os.path.join(news_dir, file_name), 'r', encoding='utf-8') as f:
                    corpus = f.read()

                # Tokenize with eKoNLPy's MPCK (original note: keeps nouns, adjectives,
                # adverbs, verbs and negations), then build n-grams from the tokens.
                tokens = mpck.tokenize(corpus)
                ngrams = mpck.ngramize(tokens)

                # Every occurrence counts (no de-duplication within a document).
                for ngram in ngrams + tokens:
                    counts[ngram][class_idx] += 1

        return counts

    def word_probabilities(self, counts, total_class0, total_class1, k):
        """Convert raw counts into smoothed per-class scores.

        Args:
            counts: mapping ``word -> (class0_count, class1_count)``.
            total_class0: normalizer for class 0 (``train`` passes the number of positive rows).
            total_class1: normalizer for class 1 (``train`` passes the number of negative rows).
            k: additive smoothing constant.

        Returns:
            List of ``(word, p(w|positive), p(w|negative))`` tuples in ``counts`` order.
        """
        # NOTE(review): count_words counts occurrences while train() normalizes by the
        # number of labeled rows, so these values can exceed 1. Only their relative
        # order is used when the word lists are written.
        return [(w, (class0 + k) / (total_class0 + 2*k), (class1 + k) / (total_class1 + 2*k))
                for w, (class0, class1) in counts.items()]

    def train(self, trainfile_path, news_dir='data/news/testNews/', result_dir='data/res/'):
        """Fit the word scores and append the resulting word lists to CSV files.

        Args:
            trainfile_path: CSV of ``(date, label)`` rows, read by ``load_corpusData``.
            news_dir: directory with the news files, forwarded to ``count_words``.
            result_dir: existing directory that receives ``positive.csv`` / ``negative.csv``.

        Side effects:
            Sets ``self.word_probs`` and appends one word per line to the result files.
        """
        training_set = self.load_corpusData(trainfile_path)

        # Class sizes used as normalizers: rows labeled 1 vs. all remaining rows.
        positive = sum(1 for _, label in training_set if label == 1)
        negative = len(training_set) - positive

        word_counts = self.count_words(training_set, news_dir)
        self.word_probs = self.word_probabilities(word_counts, positive, negative, self.k)

        self._save_words(result_dir)

    def _save_words(self, result_dir):
        """Append each scored word to positive.csv or negative.csv inside ``result_dir``."""
        positive_words, negative_words = [], []
        for word, p_positive, p_negative in self.word_probs:
            # Direct comparison instead of ``p_positive / p_negative > 1``: same decision,
            # but no ZeroDivisionError when p_negative is 0 (possible with k=0).
            if p_positive > p_negative:
                positive_words.append(word)
            else:
                negative_words.append(word)

        # NOTE(review): files are opened in append mode, so calling train() again adds
        # the words a second time; clear the result files first if that is unwanted.
        for file_name, words in (('positive.csv', positive_words),
                                 ('negative.csv', negative_words)):
            if not words:
                # Do not create a file for an empty list.
                continue
            with open(os.path.join(result_dir, file_name), 'a', encoding='utf-8') as f:
                f.writelines(word + '\n' for word in words)

In [6]:
# Build a classifier with the default smoothing constant (k=0.5).
model = NaiveBayesClassifier()
# train() reads the labeled rows from the CSV below and appends the resulting
# word lists to data/res/positive.csv and data/res/negative.csv.
model.train(trainfile_path='data/labeledCallRate.csv')