In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from utils import load_dataset, split_dataset, split_dataset_data_frame
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Upload datasets

train_messages, train_labels, test_messages, test_labels, corpus = load_dataset()
print('corpus shape', corpus.shape)
print('train shape', train_messages.shape)
print('test shape', test_labels.shape)

corpus shape (1118, 2)
train shape (895, 1)
test shape (223, 1)


### Text Data Cleaning and Preprocessing

In [3]:
def tokenize_and_normalize(text, remove_small_words = True, leave_only_letters = True, to_lower_case = True):
    """
    Converting a sentence into list of words. Normalize text.
    
    Argument:
    text -- a sentence that should be tokenized and normalized
    to_lower_case -- reduced all words to lowercase. Default value is True
    leave_only_letters -- remove all irrelevant characters (any non-letter characters). Default value is True
    remove_small_words -- remove all small words (less than 3 characters). Default value is True
    
    Returns:
    words -- list of words

    """
    if to_lower_case:
        text=text.lower()
    pattern = r'[A-Z,a-z]' if leave_only_letters else r'\S' 
    pattern += r'{3,}' if remove_small_words else r'{1,}' 
    words=re.findall(pattern,text)
    return words

In [4]:
print('number of words in row string: ', len(train_messages[3, 0].split()))
words = tokenize_and_normalize(train_messages[3, 0])
print('number of words in normalized string: ', len(words))


number of words in row string:  481
number of words in normalized string:  372


### Remove stopwords

In [5]:
def remove_stopwords(row_words):
    """
    Remove stopwords from list of words.
    
    Argument:
    row_words -- a list of words that contains stopwords that should be removed
    
    Returns:
    words -- list of words

    """
    
    clean_words = row_words.copy()
    
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = tokenize_and_normalize(' '.join(stopwords))
    stopwords = list(set(stopwords))
    
    clean_words = [x for x in clean_words if x not in stopwords]
    
    return clean_words

print('number of words in string before remove stopwords:', len(words))
words = remove_stopwords(words)
print('number of words in string after stopwords have been removed:', len(words))



number of words in string before remove stopwords: 372
number of words in string after stopwords have been removed: 358


In [6]:
pd.DataFrame({'messages' : [], 'labels' : []})

Unnamed: 0,labels,messages


### Create the NaiveBayes class

In [17]:
class NaiveBayes:
    def __init__(self):
        self.corpus = pd.DataFrame({'messages' : [], 'labels' : []})
        self.classes = np.empty(0)
        self.tokens = []
        self.frequency_table = pd.DataFrame({'col' : []})
        self.likelihoods_of_tokens = []
        self.likelihoods_of_classes = []
        self.likelihoods_of_tokens_for_each_classes = []
    
    def fit(self, corpus):
        self.corpus = corpus.copy(deep=True)
        self.classes = self.corpus['labels'].unique()
        train_corpus, validation_corpus = self.split_corpus(self.corpus)
        
        self.frequency_table, self.tokens = self.__to_frequency_table(self.corpus)
        self.likelihoods_of_tokens = self.__calc_likelihoods_of_tokens()
        self.likelihoods_of_classes = self.__calc_likelihoods_of_classes()
        self.likelihoods_of_tokens_for_each_classes = self.__calc_likelihoods_of_tokens_for_each_classes()
        
        accuracy = 0.01
        return accuracy
    
    def predictOne(self, text):
#         print('!!!!!!!!!!! call predictOne with tokenized text', tokenize_and_normalize(text))
        likelihoods_of_classes = self.__calc_likelihoods_of_classes_for_each_tokens(tokenize_and_normalize(text))
#         print('!!!!!!!!!!! likelihoods_of_classes', likelihoods_of_classes)
        predicted_class = likelihoods_of_classes.idxmax(axis=0)
        return predicted_class.values[0]

    
    def predict(self, corpus):
        x = np.array(corpus).reshape(len(corpus), 1)
        y = np.repeat(-1, len(corpus)).reshape(len(corpus), 1)
        data = np.append(x, y, axis=1)
        
        corpus_with_predicted_classes = pd.DataFrame(data=data,
                                                     index=np.arange(len(corpus)),
                                                     columns=["messages", "predicted classes"])
        
#         print('!!!!!!!!!!! corpus_with_predicted_classes', corpus_with_predicted_classes)
#         print('!!!!!!!!!!! enumerate(corpus', enumerate(corpus))

        for idx, text in enumerate(corpus):
#             print('!!!!!!!!!!! TEXT FOR PREDICT', corpus_with_predicted_classes.at[idx, "messages"])
            clazz = self.predictOne(corpus_with_predicted_classes.at[idx, "messages"])
            corpus_with_predicted_classes.at[idx, "predicted classes"] = clazz

        return corpus_with_predicted_classes
    
    def __to_frequency_table(self, inputCorpus):
        corpus = inputCorpus.copy(deep=True)
        tokens = tokenize_and_normalize(' '.join(corpus['messages'].values.tolist()))
        tokens = list(set(tokens))
        classes = corpus['labels'].unique()
        
        frequency_table = pd.DataFrame(0,
                                     index=classes,
                                     columns=tokens)
        
        for clss in classes:
            class_corpus = corpus[corpus['labels'] == clss]
            all_class_tokens = tokenize_and_normalize(' '.join(class_corpus['messages'].values.tolist()))
            for token in all_class_tokens:
                frequency_table.at[clss, token] += 1
#         print(frequency_table)
        
        return frequency_table, tokens
    
    def __calc_likelihoods_of_classes(self):
        sum_of_frequencies_of_tokens_by_classes = self.frequency_table.sum(axis=1)
        sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_classes.sum(axis=0)
        likelihoods_of_classes = sum_of_frequencies_of_tokens_by_classes / sum_of_frequencies_of_tokens_at_all
        return likelihoods_of_classes
    
    def __calc_likelihoods_of_tokens(self):
        sum_of_frequencies_of_tokens_by_tokens = self.frequency_table.sum(axis=0)
        sum_of_frequencies_of_tokens_at_all = sum_of_frequencies_of_tokens_by_tokens.sum(axis=0)
        likelihoods_of_tokens = sum_of_frequencies_of_tokens_by_tokens / sum_of_frequencies_of_tokens_at_all
        return likelihoods_of_tokens
    
    def __calc_likelihoods_of_tokens_for_each_classes(self):
        sum_of_frequencies_of_classes_by_classes = self.frequency_table.sum(axis=1)
        likelihoods_of_tokens_for_each_classes = self.frequency_table.loc[:,:] \
                                                     .div(sum_of_frequencies_of_classes_by_classes, axis=0)
        return likelihoods_of_tokens_for_each_classes
    
    def __calc_likelihoods_of_classes_for_each_tokens(self, tokens):

#         p2 = 1;
#         for token in list(set(tokens)):
#             p2 *= self.likelihoods_of_tokens[token]
        likelihoods_of_classes = pd.DataFrame(data=np.zeros((len(self.classes), 1), dtype=int),
                     index=self.classes,
                     columns=["likelihood"])
    
        for clss in self.classes:
            tokens_likelihoods = self.likelihoods_of_tokens_for_each_classes.loc[clss][tokens]
            tokens_likelihoods_prod = tokens_likelihoods[tokens_likelihoods > 0].prod()
        
            likelihoods_of_classes.loc[clss] = self.likelihoods_of_classes[clss] * tokens_likelihoods_prod
            
        return likelihoods_of_classes
    
    def split_corpus(self, corpus):
        train_corpus, test_corpus = split_dataset_data_frame(corpus)
        return train_corpus, test_corpus
    
    def calc_accuracy(self, X, Y):
        """
        Calculate the model accuracy. Predicted labels vs true ones.

        Argument:
        X -- a numpy array (labels) of shape (num_samples, 1 - a label). Usually, it's a matrix of predicted labels.
        Y -- a numpy array (labels) of shape (num_samples, 1 - a label). Usually, it's a matrix of real labels.

        Returns:
        accuracy -- a classification accuracy
        """
        accuracy = (np.copy(X) == np.copy(Y)).mean()
        return accuracy

### Validation

#### Prepare test corpus

In [18]:
data = np.array([["Chinese Beijing Chinese","0"],
            ["Chinese Chinese Shanghai","0"], 
            ["Chinese Macao","0"],
            ["Tokyo Japan Chinese","1"]])
                
data = pd.DataFrame(data=data[0:,0:],
                  columns=["messages","labels"])

data

Unnamed: 0,messages,labels
0,Chinese Beijing Chinese,0
1,Chinese Chinese Shanghai,0
2,Chinese Macao,0
3,Tokyo Japan Chinese,1


#### Train and validation a NaiveBayes model

In [19]:
# Create instance of NaiveBayes class
nb = NaiveBayes()

# Train our model
# Tips: inside fit method it would be nice to split input data into train / test (80/20) sets and return model’ accuracy, e.g.:
Accuracy = nb.fit(data)  # return accuracy 

# Try to predict class of text
LIK = nb.predictOne("Chinese Chinese Chinese Tokyo Japan")
LIK
# Must return[ ('Chinese Chinese Chinese Tokyo Japan', '0')]
# pobability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}
# or log     {'1': -7.906681345001262, '0': -7.10769031284391}

'0'

In [20]:
nb2 = NaiveBayes()
train_corpus, test_corpus = nb.split_corpus(corpus)

In [21]:
nb2.fit(train_corpus)

0.01

In [22]:
list_of_messages = test_corpus['messages'].values.tolist()
predicted_corpus = nb2.predict(list_of_messages)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


In [23]:
predicted = predicted_corpus.loc[:,'predicted classes'].values
real = test_corpus.loc[:,'labels'].values
nb2.calc_accuracy(predicted, real)

0.4977578475336323