In [1]:
import numpy as np
import re
from utils import load_dataset
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/eugene/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Upload datasets

train_messages, train_labels, test_messages, test_labels, datasets_all = load_dataset()
print('dataset shape', datasets_all.shape)
print('train shape', train_messages.shape)
print('test shape', test_labels.shape)

dataset shape (1118, 2)
train shape (895, 1)
test shape (223, 1)


### Text Data Cleaning and Preprocessing

In [3]:
def tokenize_and_normalize(text, remove_small_words = True, leave_only_letters = True, to_lower_case = True):
    """
    Converting a sentence into list of words. Normalize text.
    
    Argument:
    text -- a sentence that should be tokenized and normalized
    to_lower_case -- reduced all words to lowercase. Default value is True
    leave_only_letters -- remove all irrelevant characters (any non-letter characters). Default value is True
    remove_small_words -- remove all small words (less than 3 characters). Default value is True
    
    Returns:
    words -- list of words

    """
    if to_lower_case:
        text=text.lower()
    pattern = r'[A-Z,a-z]' if leave_only_letters else r'\S' 
    pattern += r'{3,}' if remove_small_words else r'{1,}' 
    words=re.findall(pattern,text)
    return words

In [4]:
print('number of words in row string: ', len(train_messages[3, 0].split()))
words = tokenize_and_normalize(train_messages[3, 0])
print('number of words in normalized string: ', len(words))


number of words in row string:  481
number of words in normalized string:  372


### Remove stopwords

In [51]:
def remove_stopwords(row_words):
    """
    Remove stopwords from list of words.
    
    Argument:
    row_words -- a list of words that contains stopwords that should be removed
    
    Returns:
    words -- list of words

    """
    
    clean_words = row_words.copy()
    
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords = tokenize_and_normalize(' '.join(stopwords))
    stopwords = list(set(stopwords))
    
    clean_words = [x for x in clean_words if x not in stopwords]
    
    return clean_words

print('number of words in string before remove stopwords:', len(words))
words = remove_stopwords(words)
print('number of words in string after stopwords have been removed:', len(words))



number of words in string before remove stopwords: 372
number of words in string after stopwords have been removed: 358
