In [126]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from utils import load_dataset, split_dataset, split_dataset_data_frame
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [127]:
# Upload datasets
train_messages, train_labels, test_messages, test_labels, corpus = load_dataset()
print('corpus shape:', corpus.shape)
print('train shape:', train_messages.shape)
print('test shape:', test_labels.shape)

corpus shape: (1118, 2)
train shape: (895, 1)
test shape: (223, 1)


### Text Data Cleaning and Preprocessing

In [128]:
def tokenize_and_normalize(text, remove_small_words = True, leave_only_letters = True, to_lower_case = True):
    """
    Converting a sentence into list of words. Normalize text.
    
    Argument:
    text -- a sentence that should be tokenized and normalized
    to_lower_case -- reduced all words to lowercase. Default value is True
    leave_only_letters -- remove all irrelevant characters (any non-letter characters). Default value is True
    remove_small_words -- remove all small words (less than 3 characters). Default value is True
    
    Returns:
    words -- list of words

    """
    if to_lower_case:
        text=text.lower()
    pattern = r'[A-Z,a-z]' if leave_only_letters else r'\S' 
    pattern += r'{3,}' if remove_small_words else r'{1,}' 
    words=re.findall(pattern,text)
    return words

In [129]:
print('number of words in row string: ', len(train_messages[3, 0].split()))
words = tokenize_and_normalize(train_messages[3, 0])
print('number of words in normalized string: ', len(words))

number of words in row string:  481
number of words in normalized string:  372


### Remove stopwords

In [130]:
def remove_stopwords(row_words):
    """
    Remove stopwords from list of words.
    
    Argument:
    row_words -- a list of words that contains stopwords that should be removed
    
    Returns:
    words -- list of words

    """
    
    clean_words = row_words.copy()
    
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = tokenize_and_normalize(' '.join(stopwords))
    stopwords = list(set(stopwords))
    
    clean_words = [x for x in clean_words if x not in stopwords]
    
    return clean_words

In [131]:
print('number of words in string before remove stopwords:', len(words))
words = remove_stopwords(words)
print('number of words in string after stopwords have been removed:', len(words))

number of words in string before remove stopwords: 372
number of words in string after stopwords have been removed: 358


### Create the NaiveBayes class

In [132]:
class NaiveBayes:
    def __init__(self):
        self.corpus = pd.DataFrame({'messages' : [], 'labels' : []})
        self.classes = np.empty(0)
        self.tokens = []
        self.frequency_table = pd.DataFrame({'col' : []})
        self.likelihoods_of_tokens = []
        self.likelihoods_of_classes = []
        self.likelihoods_of_tokens_for_each_classes = []
    
    def fit(self, corpus, withLogarithmTrick = False):
        self.corpus = corpus.copy(deep=True)
        self.classes = self.corpus['labels'].unique()
        train_corpus, validation_corpus = self.split_corpus(self.corpus)
        
        self.frequency_table, self.tokens = self.__to_frequency_table(train_corpus)
        self.likelihoods_of_tokens = self.__calc_likelihoods_of_tokens()
        self.likelihoods_of_classes = self.__calc_likelihoods_of_classes()
        self.likelihoods_of_tokens_for_each_classes = self.__calc_likelihoods_of_tokens_for_each_classes()
        
        # Calculate accuracy        
        messages = validation_corpus['messages'].values.tolist()
        predicted_corpus = self.predict(messages, withLogarithmTrick)
        predicted = predicted_corpus.loc[:,'predicted classes'].values
        real = validation_corpus.loc[:,'labels'].values
        accuracy = self.calc_accuracy(predicted, real)

        return accuracy
    
    def predictOne(self, text, withLogarithmTrick = False):
        likelihoods_of_classes = self.__calc_likelihoods_of_classes_for_each_tokens(tokenize_and_normalize(text), withLogarithmTrick)

        predicted_class = likelihoods_of_classes.idxmax(axis=0)
        
        return (text, predicted_class.values[0], likelihoods_of_classes)

    
    def predict(self, corpus, withLogarithmTrick = False):
        x = np.array(corpus).reshape(len(corpus), 1)
        y = np.repeat(-1, len(corpus)).reshape(len(corpus), 1)
        data = np.append(x, y, axis=1)
        
        corpus_with_predicted_classes = pd.DataFrame(data=data,
                                                     index=np.arange(len(corpus)),
                                                     columns=["messages", "predicted classes"])
        
        for idx, text in enumerate(corpus):
            prediction = self.predictOne(corpus_with_predicted_classes.at[idx, "messages"], withLogarithmTrick)
            corpus_with_predicted_classes.at[idx, "predicted classes"] = prediction[1]

        return corpus_with_predicted_classes
    
    def __to_frequency_table(self, input_corpus):
        corpus = input_corpus.copy(deep=True)
        tokens = tokenize_and_normalize(' '.join(corpus['messages'].values.tolist()))
        tokens = list(set(tokens))
        classes = corpus['labels'].unique()
        
        frequency_table = pd.DataFrame(1,
                                     index=classes,
                                     columns=tokens)
        
        for clss in classes:
            class_corpus = corpus[corpus['labels'] == clss]
            all_class_tokens = tokenize_and_normalize(' '.join(class_corpus['messages'].values.tolist()))
            for token in all_class_tokens:
                frequency_table.at[clss, token] += 1
        
        return frequency_table, tokens
    
    def __calc_likelihoods_of_classes(self):
        sum_of_frequencies_of_tokens_by_classes = self.frequency_table.sum(axis=1)
        sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_classes.sum(axis=0)
        likelihoods_of_classes = sum_of_frequencies_of_tokens_by_classes / sum_of_frequencies_of_tokens_at_all
        return likelihoods_of_classes
    
    def __calc_likelihoods_of_tokens(self):
        sum_of_frequencies_of_tokens_by_tokens = self.frequency_table.sum(axis=0)
        sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_tokens.sum(axis=0)
        likelihoods_of_tokens = sum_of_frequencies_of_tokens_by_tokens / sum_of_frequencies_of_tokens_at_all
        return likelihoods_of_tokens
    
    def __calc_likelihoods_of_tokens_for_each_classes(self):
        sum_of_frequencies_of_classes_by_classes = self.frequency_table.sum(axis=1)
        likelihoods_of_tokens_for_each_classes = self.frequency_table.loc[:,:] \
                                                     .div(sum_of_frequencies_of_classes_by_classes, axis=0)
        return likelihoods_of_tokens_for_each_classes
    
    def __calc_likelihoods_of_classes_for_each_tokens(self, tokens, withLogarithmTrick = False):
        likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1), dtype=int),
                     index=self.classes,
                     columns=["likelihood"])
            
        for clss in self.classes:
            if withLogarithmTrick:
                tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]
                tokens_likelihoods_with_logarithm_trick = np.log(self.likelihoods_of_classes[clss]) + \
                         np.log(tokens_likelihoods[tokens_likelihoods > 0]).sum()
                likelihoods_of_classes.loc[clss] = tokens_likelihoods_with_logarithm_trick    
            else:
                tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]
                tokens_likelihoods_prod = tokens_likelihoods[tokens_likelihoods > 0].prod()
                likelihoods_of_classes.loc[clss] = self.likelihoods_of_classes[clss] * tokens_likelihoods_prod


        return likelihoods_of_classes
    
    
    def split_corpus(self, corpus):
        train_corpus, test_corpus = split_dataset_data_frame(corpus)
        return train_corpus, test_corpus
    
    def calc_accuracy(self, X, Y):
        """
        Calculate the model accuracy. Predicted labels vs true ones.

        Argument:
        X -- a numpy array (labels) of shape (num_samples, 1 - a label). Usually, it's a matrix of predicted labels.
        Y -- a numpy array (labels) of shape (num_samples, 1 - a label). Usually, it's a matrix of real labels.

        Returns:
        accuracy -- a classification accuracy
        """
        accuracy = (np.copy(X) == np.copy(Y)).mean()
        return accuracy

### Validation

#### Prepare test corpus

In [133]:
data = np.array([["Chinese Beijing Chinese","0"],
            ["Chinese Chinese Shanghai","0"], 
            ["Chinese Macao","0"],
            ["Tokyo Japan Chinese","1"]])
                
data = pd.DataFrame(data=data[0:,0:],
                  columns=["messages","labels"])

data

Unnamed: 0,messages,labels
0,Chinese Beijing Chinese,0
1,Chinese Chinese Shanghai,0
2,Chinese Macao,0
3,Tokyo Japan Chinese,1


#### Train and validation a NaiveBayes model

In [134]:
# Create instance of NaiveBayes class
nb = NaiveBayes()

# Train our model
# Tips: inside fit method it would be nice to split input data into train / test (80/20) sets and return model’ accuracy, e.g.:
accuracy = nb.fit(data, False)  # return accuracy 
print('train accuracy:', accuracy)

# Try to predict class of text
prediction = nb.predictOne("Chinese Chinese Chinese Tokyo Japan", False)
print('classify text "{}"'.format(prediction[0]))
print('predicted class:', prediction[1])
print('pobability of classes:', dict(prediction[2]["likelihood"]))
# Must return[ ('Chinese Chinese Chinese Tokyo Japan', '0')]
# pobability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}
# or log     {'1': -7.906681345001262, '0': -7.10769031284391}

train accuracy: 1.0
classify text "Chinese Chinese Chinese Tokyo Japan"
predicted class: 1
pobability of classes: {'0': 0.00035555555555555574, '1': 0.00043402777777777775}


#### Spam recognition with NaiveBayes model

In [135]:
train_corpus, test_corpus = nb.split_corpus(corpus)

train_accuracy = nb.fit(train_corpus, False)
print('train accuracy:', train_accuracy)

train accuracy: 0.797752808988764


In [136]:
list_of_messages = test_corpus['messages'].values.tolist()
predicted_corpus = nb.predict(list_of_messages, False)

In [137]:
predicted = predicted_corpus.loc[:,'predicted classes'].values
real = test_corpus.loc[:,'labels'].values
test_accuracy = nb.calc_accuracy(predicted, real)
print('test accuracy:', test_accuracy)

test accuracy: 0.7847533632286996


**Result table for Simple NaiveBayes model**: 

<style type="text/css">td {text-align:left}</style>
<table style="width:auto">
    <tr>
        <td>  </td>
        <td> Train </td>
        <td> Test </td>
    </tr>
    <tr>
        <td> Accuracy </td>
        <td> 79.8% </td>
        <td> 78.5%</td>
    </tr>
</table>

#### Logarithm trick

*Note:* If we have a lot of words in document, we will have zero value for P(text|class), because Python float limitation. We can use natural logarithm trick and change formula for P(text|class) into:
log(P(class|text))=log(P(class))+log(P(word_1|class))+…+log(P(word_n|class))

In [138]:
train_accuracy_with_log_trick = nb.fit(train_corpus, True)
print('train accuracy with logarithm trick:', train_accuracy_with_log_trick)

train accuracy with logarithm trick: 0.9550561797752809


In [139]:
list_of_messages = test_corpus['messages'].values.tolist()
predicted_corpus = nb.predict(list_of_messages, True)

In [140]:
predicted = predicted_corpus.loc[:,'predicted classes'].values
real = test_corpus.loc[:,'labels'].values
test_accuracy_with_log_trick = nb.calc_accuracy(predicted, real)
print('test accuracy with logarithm trick:', test_accuracy_with_log_trick)

test accuracy with logarithm trick: 0.9282511210762332


**Result table for NaiveBayes model with logarithn trick**: 

<style type="text/css">td {text-align:left}</style>
<table style="width:auto">
    <tr>
        <td>  </td>
        <td> Train </td>
        <td> Test </td>
    </tr>
    <tr>
        <td> Accuracy </td>
        <td> 95.5% </td>
        <td> 92.8%</td>
    </tr>
</table>

In [141]:
class NaiveBayesTfIdf(NaiveBayes):

    def predict(self, corpus, withLogarithmTrick = False):
        x = np.array(corpus).reshape(len(corpus), 1)
        y = np.repeat(-1, len(corpus)).reshape(len(corpus), 1)
        data = np.append(x, y, axis=1)
        
        corpus_with_predicted_classes = pd.DataFrame(data=data,
                                                     index=np.arange(len(corpus)),
                                                     columns=["messages", "predicted classes"])
        
        for idx, text in enumerate(corpus):
            prediction = self.predictOne(corpus_with_predicted_classes.at[idx, "messages"], withLogarithmTrick)
            corpus_with_predicted_classes.at[idx, "predicted classes"] = prediction[1]

        return corpus_with_predicted_classes
    
    def predictOne(self, text, withLogarithmTrick = False):
        likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1), dtype=int),
             index=self.classes,
             columns=["likelihood"])
        
        for clss in self.classes:
            tokens = tokenize_and_normalize(text)
            all_docs_in_class = self.corpus[self.corpus['labels'] == clss]            
            class_prior = all_docs_in_class.shape[0] / self.corpus.shape[0]
            probability = class_prior
            
            for token in tokens:
                freq_w = tokens.count(token) / len(tokens)
                count_text_token = 1
                corpus_docs = tokenize_and_normalize(' '.join(all_docs_in_class['messages'].values.tolist()))



                for it in range(self.corpus.shape[0]):
                    count_text_token += token in corpus_docs

                IDF = np.log(all_docs_in_class.shape[0] / count_text_token)
                probability *= (freq_w * IDF)
        
            likelihoods_of_classes.loc[clss] = probability
            
        predicted_class = likelihoods_of_classes.idxmax(axis=0)


        return (text, predicted_class.values[0], likelihoods_of_classes)

In [143]:
nbTfIdf = NaiveBayesTfIdf()
train_corpus, test_corpus = nb.split_corpus(corpus)
train_corpus, test_corpus = nb.split_corpus(test_corpus)
# train_corpus, test_corpus = nb.split_corpus(test_corpus)

train_accuracy_tf_idf = nbTfIdf.fit(train_corpus)
print('train accuracy with TF-IDF:', train_accuracy_tf_idf)

train accuracy with TF-IDF: 0.5714285714285714


In [144]:
list_of_messages = test_corpus['messages'].values.tolist()
predicted_corpus_tf_idf = nbTfIdf.predict(list_of_messages, False)

In [146]:
predicted_tf_idf = predicted_corpus_tf_idf.loc[:,'predicted classes'].values
real = test_corpus.loc[:,'labels'].values
test_accuracy_tf_idf = nbTfIdf.calc_accuracy(predicted_tf_idf, real)
print('test accuracy with TF-IDF:', test_accuracy_tf_idf)

test accuracy with TF-IDF: 0.4318181818181818


**Result table for NaiveBayes model with TF-IDF**: 

<style type="text/css">td {text-align:left}</style>
<table style="width:auto">
    <tr>
        <td>  </td>
        <td> Train </td>
        <td> Test </td>
    </tr>
    <tr>
        <td> Accuracy </td>
        <td> 57,1% </td>
        <td> 43.2%</td>
    </tr>
</table>