# Naive Bayes Text Classification

We have a two datasets of Digikala comments and each comment has a label :'recommended' or 'not_recommended'. We will use training dataset for training a naive bayes classifier. Then we will predict the 

In [189]:
import numpy as np
import pandas as pd
from hazm import Normalizer, Stemmer, Lemmatizer, word_tokenize, stopwords_list, POSTagger

In [190]:
normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()

In [191]:
data_folder = './CA3_dataset/'
data_train = pd.read_csv(data_folder + 'comment_train.csv')
data_test = pd.read_csv(data_folder + 'comment_test.csv')

In [192]:
class FeedbackClassifier():
    
    def __init__(self, preProcess = True):
        self.preProcess = preProcess
        self.sw = set(stopwords_list()[:30])
        
    def normalize_text(self, text):
        return normalizer.normalize(text)
    
    def stemLem(self, words):
        words = self.remove_stops(words)
        for i in range(len(words)):
            words[i] = lemmatizer.lemmatize(words[i])
            words[i] = stemmer.stem(words[i])
            
        return words
    
    def count_words(self, words):
        result = {}
        for w in words:
            if w in result:
                result[w] += 1
            else:
                result[w] = 0
        return {k: v for k, v in reversed(sorted(result.items(), key=lambda item: item[1]))}
    
    def remove_stops(self, words):
        for w in words:
            if w in self.sw:
                words.remove(w)
        return words
    
    def merge_columns(self, data):
        data['combined'] = data['title'] + ' ' + data['comment']
        data = data.drop(columns=['title', 'comment'])
        return data
    
    def fit(self, data_train, alpha = 1, additive_smooth = True):
        data_train = self.merge_columns(data_train)
        
        data_train_good = data_train[data_train['recommend'] == 'recommended']
        data_train_bad = data_train[data_train['recommend'] == 'not_recommended']
        
        good_text = data_train_good['combined'].str.cat(sep=' ')
        bad_text = data_train_bad['combined'].str.cat(sep=' ')
        
        if self.preProcess:
            good_text = self.normalize_text(good_text)
            bad_text = self.normalize_text(bad_text)
             
        good_words = word_tokenize(good_text)
        bad_words = word_tokenize(bad_text)
        
        if self.preProcess:
            good_words = self.stemLem(good_words)
            bad_words = self.stemLem(bad_words)
        
        self.good_prior = np.log(len(data_train_good) / len(data_train))
        self.bad_prior = np.log(len(data_train_bad) / len(data_train))
        
        self.vocabulary = set()
        for w in good_words + bad_words:
            if self.preProcess and w not in self.sw:
                self.vocabulary.add(w)
            else:
                self.vocabulary.add(w)
        
        good_words_count = self.count_words(good_words)
        bad_words_count = self.count_words(bad_words)
        
        self.loglh_good = {}
        
        for w in self.vocabulary:
            if additive_smooth:
                nom = good_words_count.get(w, 0) + 1                
                denom = len(good_words) + len(self.vocabulary) * alpha
                self.loglh_good[w] =  np.log(nom / denom)
            else:
                nom = good_words_count.get(w, 0) + 1e2
                denom = len(good_words)
                self.loglh_good[w] = np.log(nom / denom)
            
        self.loglh_bad = {}
        for w in self.vocabulary:
            if additive_smooth:
                nom = bad_words_count.get(w, 0) + 1
                denom = len(bad_words) + len(self.vocabulary) * alpha
                self.loglh_bad[w] =  np.log(nom / denom)
            else:
                nom = bad_words_count.get(w, 0) + 1e2
                denom = len(bad_words)
                self.loglh_bad[w] = np.log(nom / denom)
                
    def _predict(self, words):
        pred_for_good = self.good_prior                
        pred_for_bad = self.bad_prior
        
        for w in words:
            if w in self.vocabulary:
                pred_for_good += self.loglh_good[w]
                pred_for_bad += self.loglh_bad[w]
                
        if pred_for_good > pred_for_bad:
            return 'recommended'
        else:
            return 'not_recommended'
    
    def predict(self, data_test):
        data_test = self.merge_columns(data_test)['combined'].to_numpy()
        preds = []
        for text in data_test:
            if self.preProcess:
                text = self.normalize_text(text)
            words = word_tokenize(text)
            if self.preProcess:
                words = self.stemLem(words)
            preds.append(self._predict(words))
        return preds

    def print_metrics(self, acc, confusion_matrix):
        correct_detected_recommended = confusion_matrix[1][1]
        all_detected_recommended = confusion_matrix.sum(0)[1]
        total_recommended = confusion_matrix.sum(1)[1]

        presision = correct_detected_recommended / all_detected_recommended
        recall = correct_detected_recommended / total_recommended
        f1 = 2 * (presision * recall) / (presision + recall)

        print('accuracy:', acc)
        print('presision:', presision)
        print('recall:', recall)
        print('f1-score:', f1)
    
    def score(self, data_test):
        confusion_matrix = np.zeros((2,2))
        preds = self.predict(data_test)
        accuracy = (preds == data_test['recommend']).sum() / len(data_test)
        class_idx = {'recommended': 1, 'not_recommended': 0}
        for i in range(len(preds)):
            confusion_matrix[class_idx[data_test['recommend'].iloc[i]], class_idx[preds[i]]] += 1
        
        self.print_metrics(accuracy, confusion_matrix)

In [193]:
feedbackClassifier = FeedbackClassifier(preProcess=True)
feedbackClassifier.fit(data_train, alpha=1, additive_smooth=True)
feedbackClassifier.score(data_test)

accuracy: 0.91875
presision: 0.9055690072639225
recall: 0.935
f1-score: 0.9200492004920049


In [194]:
feedbackClassifier = FeedbackClassifier(preProcess=False)
feedbackClassifier.fit(data_train, alpha=1, additive_smooth=True)
feedbackClassifier.score(data_test)

accuracy: 0.92375
presision: 0.910411622276029
recall: 0.94
f1-score: 0.9249692496924968


In [195]:
feedbackClassifier = FeedbackClassifier(preProcess=True)
feedbackClassifier.fit(data_train, alpha=1, additive_smooth=False)
feedbackClassifier.score(data_test)

accuracy: 0.82875
presision: 0.9255663430420712
recall: 0.715
f1-score: 0.8067700987306065


In [196]:
feedbackClassifier = FeedbackClassifier(preProcess=False)
feedbackClassifier.fit(data_train, alpha=1, additive_smooth=False)
feedbackClassifier.score(data_test)

accuracy: 0.82875
presision: 0.933993399339934
recall: 0.7075
f1-score: 0.8051209103840682
