In [1]:
import numpy as np
import pandas as pd
from hazm import Normalizer, Stemmer, Lemmatizer, word_tokenize, stopwords_list, POSTagger

In [2]:
normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()

In [3]:
data_folder = './CA3_dataset/'
data_train = pd.read_csv(data_folder + 'comment_train.csv')
data_test = pd.read_csv(data_folder + 'comment_test.csv')

In [60]:
class FeedbackClassifier():
    
    def __init__(self, doPreProcess = True):
        self.preprocess = doPreProcess
        
    def normalize_text(self, text):
        return normalizer.normalize(text)
    
    def stemLem(self, words):
        for i in range(len(words)):
            words[i] = lemmatizer.lemmatize(words[i])
            words[i] = stemmer.stem(words[i])
        return words
    
    def count_words(self, words):
        result = {}
        for w in words:
            if w in result:
                result[w] += 1
            else:
                result[w] = 0
        return {k: v for k, v in reversed(sorted(result.items(), key=lambda item: item[1]))}

    def merge_columns(self, data):
        data['combined'] = data['title'] + ' ' + data['comment']
        data = data.drop(columns=['title', 'comment'])
        return data
    
    def fit(self, data_train, alpha = 1, additive_smooth = True):
        data_train = self.merge_columns(data_train)
        
        data_train_good = data_train[data_train['recommend'] == 'recommended']
        data_train_bad = data_train[data_train['recommend'] == 'not_recommended']
        
        good_text = data_train_good['combined'].str.cat(sep=' ')
        bad_text = data_train_bad['combined'].str.cat(sep=' ')
        
        if self.preprocess:
#             print('normalizing data...')
            good_text = self.normalize_text(good_text)
            bad_text = self.normalize_text(bad_text)
#             print('finished normalizing data...')
             
        good_words = word_tokenize(good_text)
        bad_words = word_tokenize(bad_text)
        
        if self.preprocess:
            good_words = self.stemLem(good_words)
            bad_words = self.stemLem(bad_words)
        
        self.good_prior = np.log(len(data_train_good) / len(data_train))
        self.bad_prior = np.log(len(data_train_bad) / len(data_train))

        self.vocabulary = good_words  + bad_words
        
        good_words_count = self.count_words(good_words)
        bad_words_count = self.count_words(bad_words)
        
        self.loglh_good = {}
        for w in self.vocabulary:
            if additive_smooth:
                nom = good_words_count.get(w, 0) + 1
                denom = len(good_words) + len(self.vocabulary) * alpha
                self.loglh_good[w] =  np.log(nom / denom)
            else:
                nom = good_words_count.get(w, 0)
                denom = len(good_words)
                self.loglh_good[w] = np.log(nom / denom)
            
        self.loglh_bad = {}
        for w in self.vocabulary:
            if additive_smooth:
                nom = bad_words_count.get(w, 0) + 1
                denom = len(bad_words) + len(self.vocabulary) * alpha
                self.loglh_bad[w] =  np.log(nom / denom)
            else:
                nom = bad_words_count.get(w, 0)
                denom = len(bad_words)
                self.loglh_bad[w] = np.log(nom / denom)
                
    def _predict(self, words):
        pred_for_good = self.good_prior                
        pred_for_bad = self.bad_prior
        
        for w in words:
            if w in self.vocabulary:
                pred_for_good += self.loglh_good[w]
                pred_for_bad += self.loglh_bad[w]
                
        if pred_for_good > pred_for_bad:
            return 'recommended'
        else:
            return 'not_recommended'
    
    def predict(self, data_test):
        data_test = self.merge_columns(data_test)['combined'].to_numpy()
        preds = []
        for t in data_test:
            text = self.normalize_text(t)
            words = self.stemLem(word_tokenize(text))
            preds.append(self._predict(words))
        return preds
        
#         return data_test['combined'].apply(lambda x : self._predict(x))


In [61]:
feedbackClassifier = FeedbackClassifier(doPreProcess = False)

In [62]:
feedbackClassifier.fit(data_train)

In [63]:
preds = feedbackClassifier.predict(data_test)
(preds == data_test['recommend']).sum() / len(data_test)

0.85875