In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from utils import load_dataset, split_dataset, split_dataset_data_frame
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Upload datasets

train_messages, train_labels, test_messages, test_labels, corpus = load_dataset()
print('corpus shape', corpus.shape)
print('train shape', train_messages.shape)
print('test shape', test_labels.shape)

corpus shape (1118, 2)
train shape (895, 1)
test shape (223, 1)


### Text Data Cleaning and Preprocessing

In [3]:
def tokenize_and_normalize(text, remove_small_words = True, leave_only_letters = True, to_lower_case = True):
    """
    Converting a sentence into list of words. Normalize text.
    
    Argument:
    text -- a sentence that should be tokenized and normalized
    to_lower_case -- reduced all words to lowercase. Default value is True
    leave_only_letters -- remove all irrelevant characters (any non-letter characters). Default value is True
    remove_small_words -- remove all small words (less than 3 characters). Default value is True
    
    Returns:
    words -- list of words

    """
    if to_lower_case:
        text=text.lower()
    pattern = r'[A-Z,a-z]' if leave_only_letters else r'\S' 
    pattern += r'{3,}' if remove_small_words else r'{1,}' 
    words=re.findall(pattern,text)
    return words

In [4]:
print('number of words in row string: ', len(train_messages[3, 0].split()))
words = tokenize_and_normalize(train_messages[3, 0])
print('number of words in normalized string: ', len(words))


number of words in row string:  481
number of words in normalized string:  372


### Remove stopwords

In [5]:
def remove_stopwords(row_words):
    """
    Remove stopwords from list of words.
    
    Argument:
    row_words -- a list of words that contains stopwords that should be removed
    
    Returns:
    words -- list of words

    """
    
    clean_words = row_words.copy()
    
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = tokenize_and_normalize(' '.join(stopwords))
    stopwords = list(set(stopwords))
    
    clean_words = [x for x in clean_words if x not in stopwords]
    
    return clean_words

print('number of words in string before remove stopwords:', len(words))
words = remove_stopwords(words)
print('number of words in string after stopwords have been removed:', len(words))



number of words in string before remove stopwords: 372
number of words in string after stopwords have been removed: 358


In [6]:
pd.DataFrame({'messages' : [], 'labels' : []})

Unnamed: 0,messages,labels


### Create the NaiveBayes class

In [7]:
class NaiveBayes:
    def __init__(self):
        self.corpus = pd.DataFrame({'messages' : [], 'labels' : []})
        self.classes = np.empty(0)
        self.bag_of_words = []
     
    def fit(self, corpus):
        self.corpus = corpus.copy(deep=True)
        labels = corpus['labels'].unique()
        self.classes = labels

        train_messages, train_labels, test_messages, test_labels = split_dataset(self.corpus)
        
        train_corpus, test_corpus = split_dataset_data_frame(self.corpus)
        
#         for message in train_messages.reshape(train_messages.shape[0],):
#             self.bag_of_words += tokenize_and_normalize(message)
            
        self.bag_of_words = tokenize_and_normalize(' '.join(train_corpus['messages'].values.tolist()))
        self.bag_of_words = list(set(self.bag_of_words))
        
        counter_words_by_class = pd.DataFrame(data=np.zeros((len(self.classes), len(self.bag_of_words)), dtype=int),
                                     index=self.classes,
                                     columns=self.bag_of_words)
        
        for label in self.classes:
            ds_with_label = train_corpus[train_corpus['labels'] == label]
            all_words_l = tokenize_and_normalize(' '.join(ds_with_label['messages'].values.tolist()))
            for word in all_words_l:
                counter_words_by_class.at[label, word] += 1

        print(counter_words_by_class)
                
        
        accuracy = 0.01
        return accuracy
    
    def predict(self, text):
        pobability = {str(label):0.0001 for label in self.classes}

        return pobability

In [8]:
# nothing

### Validation

#### Prepare test corpus

In [9]:
data = np.array([["Chinese Beijing Chinese","0"],
            ["Chinese Chinese Shanghai","0"], 
            ["Chinese Macao","0"],
            ["Tokyo Japan Chinese","1"]])
                
data = pd.DataFrame(data=data[0:,0:],
                  columns=["messages","labels"])

data

Unnamed: 0,messages,labels
0,Chinese Beijing Chinese,0
1,Chinese Chinese Shanghai,0
2,Chinese Macao,0
3,Tokyo Japan Chinese,1


#### Train and validation a NaiveBayes model

In [10]:
# Create instance of NaiveBayes class
nb = NaiveBayes()

# Train our model
# Tips: inside fit method it would be nice to split input data into train / test (80/20) sets and return model’ accuracy, e.g.:
Accuracy = nb.fit(data)  # return accuracy 

# Try to predict class of text
nb.predict(["Chinese Chinese Chinese Tokyo Japan"])

# Must return[ ('Chinese Chinese Chinese Tokyo Japan', '0')]
# pobability {'1': 0.00013548070246744226, '0': 0.00030121377997263036}
# or log     {'1': -7.906681345001262, '0': -7.10769031284391}

   tokyo  chinese  macao  shanghai  japan
0      0        3      1         1      0
1      1        1      0         0      1


{'0': 0.0001, '1': 0.0001}