#### References:

@InProceedings{maas-EtAl:2011:ACL-HLT2011,<br/>
  &ensp;&ensp;&ensp;author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},<br/>
  &ensp;&ensp;&ensp;title = {Learning Word Vectors for Sentiment Analysis},<br/>
  &ensp;&ensp;&ensp;booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},<br/>
  &ensp;&ensp;&ensp;month = {June},<br/>
  &ensp;&ensp;&ensp;year = {2011},<br/>
  &ensp;&ensp;&ensp;address = {Portland, Oregon, USA},<br/>
  &ensp;&ensp;&ensp;publisher = {Association for Computational Linguistics},<br/>
  &ensp;&ensp;&ensp;pages = {142--150},<br/>
  &ensp;&ensp;&ensp;url = {http://www.aclweb.org/anthology/P11-1015}<br/>
}

In [72]:
# common libraries
import os
import re
import numpy as np
import pandas as pd
import string
import timeit

# natural language toolkit libraries
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# deep learning libraries
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing.text import Tokenizer

# files
from contractions import CONTRACTION_MAP

In [2]:
# report the installed tensorflow version and whether tensorflow can see a gpu
is_available = "NOT AVAILABLE"
if tf.config.list_physical_devices("GPU"):      # non-empty list -> at least one gpu
    is_available = "AVAILABLE"
print(f"Version: {tf.__version__}\nGPU: {is_available}")

Version: 2.7.0
GPU: AVAILABLE


In [4]:
# fetch the IMDB reviews dataset into the chosen data directory
tfds.load(
    name='imdb_reviews',
    data_dir='[replace with your own data directory]',
    download=True,
)

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to D:\Portfolio\nlp-web-app\model\imdb_reviews\plain_text\1.0.0...[0m


Dl Size...: 100%|██████████| 80/80 [06:03<00:00,  4.55s/ MiB]
Dl Completed...: 100%|██████████| 1/1 [06:03<00:00, 363.95s/ url]


[1mDataset imdb_reviews downloaded and prepared to D:\Portfolio\nlp-web-app\model\imdb_reviews\plain_text\1.0.0. Subsequent calls will reuse this data.[0m


{Split('train'): <PrefetchDataset shapes: {label: (), text: ()}, types: {label: tf.int64, text: tf.string}>,
 Split('test'): <PrefetchDataset shapes: {label: (), text: ()}, types: {label: tf.int64, text: tf.string}>,
 Split('unsupervised'): <PrefetchDataset shapes: {label: (), text: ()}, types: {label: tf.int64, text: tf.string}>}

In [148]:
# open the already-prepared IMDB dataset straight from its directory on disk
builder = tfds.core.builder_from_directory(
    'D:\\Portfolio\\nlp-web-app\\model\\imdb_reviews\\plain_text\\1.0.0'
)

# as_supervised=True structures every example as an (input, label) pair
# the train split is cut 80% / 20%, and the whole test split is loaded as well
dataset = builder.as_dataset(
    split=('train[:80%]', 'train[80%:]', 'test'),
    shuffle_files=True,
    as_supervised=True,
)

# note the naming: `test` is the 20% slice of the train split,
# while `valid` holds the dataset's own test split
train, test, valid = dataset
print(f"Train Size: {len(train)}, Test Size: {len(test)}, Valid Size: {len(valid)}")

Train Size: 20000, Test Size: 5000, Valid Size: 25000


In [149]:
# turn the train split into a dataframe (tfds.as_dataframe requires jinja2)
data = tfds.as_dataframe(ds=train, ds_info=builder.info)
df = data.copy(deep=True)       # work on a deep copy so `data` keeps the raw values

# decode the raw bytes as utf-8, trim leading/trailing whitespace and lowercase
df['text'] = df['text'].map(lambda raw: raw.decode('utf-8').strip().lower())
df.head()       # label 0 = negative; 1 = positive

Unnamed: 0,label,text
0,0,this was an absolutely terrible movie. don't b...
1,0,"i have been known to fall asleep during films,..."
2,0,mann photographs the alberta rocky mountains i...
3,1,this is the kind of film for a snowy sunday af...
4,1,"as others have mentioned, all the women that g..."


In [159]:
# reference: https://towardsdatascience.com/nlp-learning-series-part-1-text-preprocessing-methods-for-deep-learning-20085601684b
# reference: https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
# things to consider: drop rows with empty text and spelling corrections
def remove_html_tags(text):
    """Replace every ``<...>`` tag in ``text`` with a single space.

    The match is non-greedy, so each tag is substituted on its own. Text that
    contains no tag comes back unchanged, which is why no separate search is
    needed before substituting.
    """
    return re.sub(r'<.*?>', ' ', text)


# expand contractions, e.g., don't -> do not, purpose is to standardize our text
def get_contractions(contraction_mapping):
    """Build the regex used to find contractions listed in ``contraction_mapping``.

    Parameters
    ----------
    contraction_mapping : dict
        Maps a contraction (e.g. ``"don't"``) to its expanded form.

    Returns
    -------
    tuple
        ``(contraction_mapping, contraction_regex)`` - the mapping unchanged and
        a compiled pattern with one capturing group that matches any key.
    """
    # longest keys go first: regex alternation takes the first alternative that
    # matches, so "he'd" listed before "he'd've" would leave a dangling "'ve"
    # (sorted() is stable, so keys of equal length keep their original order)
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)

    if ordered_keys:
        # keys are literal text, so escape anything the regex engine treats specially
        alternatives = '|'.join(re.escape(key) for key in ordered_keys)
    else:
        # an empty alternation would match the empty string at every position;
        # use a pattern that can never match instead
        alternatives = '(?!)'

    contraction_regex = re.compile('(%s)' % alternatives)
    return contraction_mapping, contraction_regex

# build the lookup table and its compiled pattern once, when this cell runs,
# so that replace_contractions can reuse them on every call
contractions, contractions_pattern = get_contractions(CONTRACTION_MAP)

def replace_contractions(text):
    """Expand every contraction found in ``text``.

    Each match of the module-level ``contractions_pattern`` is swapped for the
    value stored under the matched text in the module-level ``contractions``.
    """
    return contractions_pattern.sub(lambda found: contractions[found.group(0)], text)


def add_space_between_punctuations(text):
    """Insert a space after any of ``, . ! ( )`` that directly follows an ASCII letter.

    The letter and the punctuation mark are kept as they are; only a trailing
    space is added, e.g. ``"good.bad"`` becomes ``"good. bad"``.
    """
    letter_then_mark = r'([a-zA-Z])([,.!()])'
    return re.sub(letter_then_mark, lambda hit: hit.group(1) + hit.group(2) + ' ', text)


def remove_stopwords(text):
    """Drop English stopwords from ``text``.

    The text is tokenized with ``word_tokenize`` and the surviving tokens are
    re-joined with single spaces, so the original spacing is not preserved.
    The words 'no', 'not', 'nor', 'any' and 'too' are never removed.
    """
    # these words hold useful information, so they are kept even though nltk
    # lists them as stopwords
    kept_words = {'no', 'not', 'nor', 'any', 'too'}

    # a set gives constant-time membership checks per token instead of
    # scanning the whole stopword list for every word
    stopword_set = set(stopwords.words('english')) - kept_words

    return ' '.join(word for word in word_tokenize(text) if word not in stopword_set)


def remove_punctuations(text, filters):
    """Delete from ``text`` every character that appears in ``filters``.

    ``filters`` is a string of characters to strip; nothing is inserted in
    their place.
    """
    deletion_table = str.maketrans('', '', filters)
    return text.translate(deletion_table)


def remove_multiple_whitespace(text):
    """Collapse every run of consecutive space characters into one space.

    Only the space character is handled; tabs and newlines are left alone.
    """
    return re.sub(r' {2,}', ' ', text)


# main function to clean text - ordering of function can be a factor for data cleaning
def clean_text(text, remove_stopword=False, use_bert=False):
    """Run the full cleaning pipeline over one piece of text.

    Steps, in order: strip html tags, add a space after punctuation that
    follows a letter, expand contractions, optionally drop '+' signs and
    digits, optionally drop stopwords, remove punctuation, and collapse
    repeated spaces.

    Parameters
    ----------
    text : str
        The text to clean.
    remove_stopword : bool, default False
        When True, stopwords are removed via ``remove_stopwords``.
    use_bert : bool, default False
        When True, '+' signs and digits are kept, the punctuation marks
        ``- ' ! ? ) . ; , / : (`` are preserved, repeated '!' and '?' are
        collapsed to one, and empty brackets are removed.

    Returns
    -------
    str
        The cleaned text.
    """
    text = remove_html_tags(text)
    text = add_space_between_punctuations(text)
    text = replace_contractions(text)

    if not use_bert:        # plus signs and digits are only kept for bert
        text = re.sub(r'\+|\d+', '', text)

    if remove_stopword:
        text = remove_stopwords(text)

    # reference: https://github.com/hmohebbi/SentimentAnalysis/blob/master/main.ipynb
    # save certain punctuations if using bert because bert embeddings was trained on wikipedia
    filters = string.punctuation + "\t\n"
    if use_bert:
        # collapse runs of each preserved emphasis mark down to a single one
        # (the second substitution used to repeat the '!' one and did nothing)
        text = re.sub(r'!+', '!', text)
        text = re.sub(r'\?+', '?', text)
        filters = ''.join(set(filters) - set("-'!?).;,/:("))
    text = remove_punctuations(text, filters)

    if use_bert:        # remove empty brackets
        text = re.sub(r'\( *\)', ' ', text)

    text = remove_multiple_whitespace(text)
    return text

In [160]:
# part-of-speech tagging and wordnet lemmatization
# convert penn treebank tag to wordnet tag
# reference: https://github.com/prateek22sri/Sentiment-analysis/blob/master/unigramSentiWordNet.py
# reference: https://github.com/KT12/tag-lemmatize/blob/master/tag-lemmatize.py
# reference: https://wordnet.princeton.edu/documentation/wnintro3wn
# other techniques include stemming
# stemming is not used in this context as it removes or stems the last few characters, often leading to incorrect spelling
def convert_tag(penn_tag):
    """
    Map a Penn Treebank tag to the matching WordNet tag.

    Only the first character of ``penn_tag`` decides the result. Tags that do
    not start with N, V, J, S or R (including the empty string) give ``None``.
    """
    initial = penn_tag[:1]

    if initial == 'N':      # Noun
        return wordnet.NOUN
    if initial == 'V':      # Verb
        return wordnet.VERB
    if initial == 'J':      # Adjective
        return wordnet.ADJ
    if initial == 'S':      # Adjective Satellite
        # NOTE(review): the only Penn Treebank tag starting with 'S' appears to
        # be SYM (symbol) - confirm this branch is intended
        return 's'
    if initial == 'R':      # Adverb
        return wordnet.ADV

    return None  # every other part of speech is reported as none

def pos_and_lemm(text):       # part-of-speech tagging and word lemmatization
    """Tokenize ``text``, tag each token and return the lemmatized words.

    Tokens whose Penn Treebank tag has no WordNet equivalent (``convert_tag``
    returns ``None``) are dropped from the result. The remaining lemmas are
    joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    tagged_tokens = nltk.pos_tag(word_tokenize(text))      # tokenize, then tag
    lemmas = []

    # list of tuples [('token', 'tag'), ('token2', 'tag2'), ...]
    for word, tag in tagged_tokens:
        wn_tag = convert_tag(tag)
        if wn_tag is None:
            continue
        # the method is `lemmatize`; the former `limmetize` raised AttributeError
        lemmas.append(lemmatizer.lemmatize(word, wn_tag))

    return ' '.join(lemmas)      # O(n) time complexity, if use += it will be O(n^2)

In [161]:
# clean the entire data and separate into new column
# run the cleaning pipeline over every review and keep the result in its own column
df['clean'] = df['text'].map(clean_text)
df.head()

Unnamed: 0,label,text,clean
0,0,this was an absolutely terrible movie. don't b...,this was an absolutely terrible movie do not b...
1,0,"i have been known to fall asleep during films,...",i have been known to fall asleep during films ...
2,0,mann photographs the alberta rocky mountains i...,mann photographs the alberta rocky mountains i...
3,1,this is the kind of film for a snowy sunday af...,this is the kind of film for a snowy sunday af...
4,1,"as others have mentioned, all the women that g...",as others have mentioned all the women that go...


In [41]:
# num_words - number of words to keep in vocab after tokenization for training the network
tokenizer = Tokenizer(
    num_words=None,         # no cap on the vocabulary size
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,       # word-level tokens, not characters
    document_count=0,
)
# tokenizer.fit_on_texts(texts)     # texts are normal array
# list(tokenizer.word_index.items())[:5]