# Sentiment Analyses with imdb data


## Connecting notebook with drive

In [1]:
import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
os.chdir('drive/MyDrive/Colab Notebooks')

## Libraries

In [4]:
# libraries to clean text
import re
from   nltk                                   import download
from   nltk.tokenize                          import word_tokenize
from   nltk.corpus                            import stopwords
from   nltk.tokenize                          import word_tokenize
from   nltk.stem                              import WordNetLemmatizer
from   nltk                                   import pos_tag

# required resources for NLTK
download("stopwords")
download("punkt")
download("wordnet")
download("omw-1.4")
download('averaged_perceptron_tagger') 

# libraries to minupulate data
import numpy                                  as     np
import pandas                                 as     pd
from   sklearn.feature_extraction.text        import TfidfVectorizer

# libraries to plot
import wordcloud
import matplotlib.pyplot                      as     plt

# libraries for sentiment analysis baseline
import pickle
import tensorflow_datasets                    as     tfds
from   sklearn.model_selection                import cross_validate
from   sklearn.naive_bayes                    import MultinomialNB
from   tensorflow.keras.preprocessing.text    import text_to_word_sequence

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Retriving and Cleaning Data

In [5]:
data_imdb = pd.read_csv("imdb_data.csv")
data_imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
data_imdb['sentiment'] = data_imdb['sentiment'].map({'positive': 1, 'negative': 0})
data_imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [7]:
def split_into_sentences( text, split_pattern = [".", "!", "?"] ):
    """Split a text on . or ? or ! symbol.
    Note: the text is not cleaned, only split.
    Args
        text: string with some text.
        split_pattern: a list with characters to split the sentence.
            These characters must be . or ? or !
    Return
        sentences: list with split text."""

    # check if split_pattern is a list object
    if not isinstance(split_pattern, list):
        # invalid split_patter format
        raise Exception("Invalid param: split_pattern format! It must be a list.")

    # check if split pattern contains only . or ! or ?
    if len(set(split_pattern) - {".", "!", "?"}) > 0:
        # invalid split patter
        raise Exception("Invalid param: split_pattern! It must be . or ! or ?")

    # import required libraries
    import re

    # define split
    regex_split = "".join(split_pattern)

    # split when . or ? or ! is found
    sentences = re.split(f"[{regex_split}]+", text)

    return sentences


def remove_emails( text, replacer = " " ):
    """Remove email addresses from a text.
    Args
        text: string with some text.
        replacer: string with the value to substitute the emails.
    Return
        text: string with processed text.
    NOTE: to avoid errors on email removing, remove_email function
        must be used before removing_mentions fucntion (@)"""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # check replacer param
    if not isinstance(replacer, str):
        # invalid format
        raise Exception("Invalid input: replacer must be a string")

    # import required library
    import re

    # remove emails with regex
    text = re.sub("[a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+", replacer, text)

    return text


def remove_mentions( text, replacer = " "):
    """Remove mentions of the @some_word format from a text.
    Args
        text: string with some text.
        replacer: string with the value to substitute the mentions.
    Return
        text: string with processed text.
    NOTE: to avoid errors on mentions removing, remove_email function
        must be used before removing_mentions fucntion (@)"""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # check replacer param
    if not isinstance(replacer, str):
        # invalid format
        raise Exception("Invalid input: replacer must be a string")

    # import required library
    import re

    # remove mention with regex
    text = re.sub("@\w+", replacer, text)

    return text


def remove_hashtags( text, replacer = " " ):
    """Remove hashtag of the #some_word format from a text.
    Args
        text: string with some text.
        replacer: string with the value to substitute the hashtags.
    Return
        text: string with processed text."""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # check replacer param
    if not isinstance(replacer, str):
        # invalid format
        raise Exception("Invalid input: replacer must be a string")

    # import required library
    import re

    # remove hashtag with regex
    text = re.sub("#\w+", replacer, text)

    return text


def remove_urls( text, replacer = " " ):
    """Remove any url from a text.
    Args
        text: string with some text.
        replacer: string with the value to substitute the urls.
    Return
        text: string with processed text."""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # check replacer param
    if not isinstance(replacer, str):
        # invalid format
        raise Exception("Invalid input: replacer must be a string")

    # import required library
    import re

    # remove any url with regex
    text = re.sub("(https://|http://|www.)\S+",
                  replacer,
                  text)

    return text


def remove_html_tags( text, replacer = " " ):
    """Remove any html tags from a text.
    Args
        text: string with some text.
        replacer: string with the value to substitute the html tags.
    Return
        text: string with processed text."""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # check replacer param
    if not isinstance(replacer, str):
        # invalid format
        raise Exception("Invalid input: replacer must be a string")

    # import required library
    import re

    # remove any url with regex
    text = re.sub("<.*?>", replacer, text)

    return text


def remove_spaces( text, replacer = " " ):
    """Remove any spaces from a text.
    Args
        text: string with some text.
        replacer: string with the value to substitute the spaces
    Return
        text: string with processed text.
    NOTE: When removing word with regex, the best prectice seems to be
        removing the substituted word with " " (space) and not "" (empty).
        Then, after all replacings, use the remove_space the deal with all spaces
        that were created. In other words, use the remove_spaces function
        as the last step of data cleaning"""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # check replacer param
    if not isinstance(replacer, str):
        # invalid format
        raise Exception("Invalid input: replacer must be a string")

    # import required library
    import re

    # remove any spaces with regex
    text = re.sub("\s+", replacer, text)

    return text


def lower_caser( text ):
    """Lower the case of the words in a text.
    Args
        text: string with some text.
    Return
        text: string with lower cased text."""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    return text.lower() # lower case


def remove_punctuation( text, replacer = " " ):
    """Remove the punctuation of a text.
    Args:
        text: string with some text.
        replacer: string with the value to substitute the punctuations
    Return
        text: string with processed text."""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # check replacer param
    if not isinstance(replacer, str):
        # invalid format
        raise Exception("Invalid input: replacer must be a string")

    # import required libraries
    import string

    # iterate over punctuation symbols
    for punctuation in string.punctuation:

        # remove punctuation from string
        text = text.replace(punctuation, replacer)

    return text


def lemmatize( text ):
    """Lemmatize the words in a text.
    Args
        text: string with some text.
    Return
        text: string with processed text."""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # import required libraries
    from nltk.stem import WordNetLemmatizer

    # instanciate lemmatizer
    lemmatizer = WordNetLemmatizer()

    # iterate over words in text and lemmatize
    lemmatized_text = [lemmatizer.lemmatize(word) for word in text.split()]

    # join lemmatized items
    lemmatized_text = " ".join(lemmatized_text)

    return lemmatized_text


def remove_stopwords( text ):
    """First it gets the negative stop words and convert it into "not" word.
    Then, it removes all other stop words (but not the "not" word previously created.
    Args
        text: string with some text.
    Return
        text: string with processed text."""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # import required library
    from nltk.corpus import stopwords

    # define the english negative stop words
    negative_stop_words = ["no", "nor", "not", "don", "don't", "ain", "ain't", "aren", "aren't",
                           "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't",
                           "hasn", "hasn't", "haven",  "haven't", "isn", "isn't", "mightn", "mightn't",
                           "mustn", "mustn't", "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't",
                           "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]

    # iterate over words in text
    # if word is one of the negative stop words, replace with "not"
    # else keep the word as it is
    # result will be a tokenized text
    tokens_with_neg = ["not" if word in negative_stop_words else word
                       for word in text.split()]

    # get the standard unique stopwords in English
    # according to NLTK
    std_stop_words = stopwords.words('english')

    # remove negative stop words from standard stop words
    final_stop_words = set(std_stop_words) - set(negative_stop_words)

    # remove stop words
    text = " ".join( [word for word in tokens_with_neg if not word in final_stop_words] )

    return text


def remove_numbers( text, replacer = " ", remove_numbers_only = True):
    """Remove numbers (or numbers + special characters) from a text.
    Args
        text: string with some text.
        replacer: string with the value to substitute the emails.
        remove_numbers_only: a boolean to indicate if user wants
            to remove only numbers (remove_numbers_only = True) or
            "numbers + special characters" (remove_numbers_only = False)
    Return
        text: string with processed text."""

    # check text param
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # check replacer param
    if not isinstance(replacer, str):
        # invalid format
        raise Exception("Invalid input: replacer must be a string")

    # check remove_numbers_only param
    if not isinstance(remove_numbers_only, bool):
        # invalid format
        raise Exception("Invalid input: remove_numbers_only must be a boolean")

    # import required library
    import re

    # check if user wants to remove only numbers from words
    if remove_numbers_only:
        # remove only numbers from words with regex
        text = re.sub("\d+", replacer, text)
    # user wants to remove numbers and special characters from the words
    else:
        # remove numbers and special characters from the words with regex
        text = re.sub("[^a-zA-Z]+", replacer, text)

    return text


def part_of_speech_cleaning( text ):
    """Use Part-Of-Speech technique to keep only adj, noun, adverb and verbs
    on the document once usually they are the words that carry
    the most relevant information in a text.
    Args
        text: a list with senteces of the document.
    Return
        pos_text: a list with part-of-speech senteces of the document."""

    # check if text is a string
    if not isinstance(text, str):
        # invalid format
        raise Exception("Invalid input: text must be a string")

    # import required library
    from nltk import pos_tag

    # split the text on spaces then
    # check apply POS to each word (word, POS).
    # Keep the word if the POS of the word is adj ["JJ"],
    # noun ["NN"], adverb ["RB"] or verbs ["VB"].
    # Otherwise, remove the word
    pos_text = [ word for word, pos in pos_tag( text.split() )
                 if pos.startswith( ("JJ", "NN", "RB", "VB") ) ]

    # join the words together to compose a text (instead of a list)
    pos_text = " ".join(pos_text)
    # remove any additional leading of trailing spaces
    pos_text = pos_text.strip()

    return pos_text


def clean_document( split_document, remove_numbers_only = True ):
    """Clean the document text. Clean means:
    (1) remove the emails on the article
    (2) remove the mentions (@someone) on the article
    (3) remove the hashtags (#something) on the article
    (4) remove the urls on the article
    (5) remove html tags (<tag>something</tag>) on the article
    (6) lower the case of all words in the article
    (7) remove words that are commposed of only digitis
    (8) remove puntuation of the sentences
    (9) remove stopwords from sentences
    (10) lemmatize words in sentences
    (11) use part-of-speech technique of the text
    (11) remove spaces on the article
    Args
        split_document: a list with sentences of the document.
        remove_numbers_only: a boolean to indicate if user wants
            to remove only numbers (remove_numbers_only = True) or
            "numbers + special characters" (remove_numbers_only = False).
            This is the parameter used for calling remove_numbers function.
    Return
        cleaned_sentences: a list with cleaned senteces of the document."""

    # remove the emails on the article
    removed_emails = [remove_emails( sentence ) for sentence in split_document]

    # remove the mentions (@someone) on the article
    removed_mentions = [remove_mentions( sentence ) for sentence in removed_emails]

    # remove the hashtags (#something) on the article
    removed_hashtags = [remove_hashtags( sentence ) for sentence in removed_mentions]

    # remove the urls on the article
    removed_urls = [remove_urls( sentence ) for sentence in removed_hashtags]

    # remove html tags (<tag>something</tag>) on the article
    removed_html_tags = [remove_html_tags( sentence ) for sentence in removed_urls]

    # lower the case of all words in the article
    sentences_lower_cased = [lower_caser(sentence) for sentence in removed_html_tags]

    # remove words that are commposed of only digitis
    sentences_without_nums = [remove_numbers(sentence, remove_numbers_only = remove_numbers_only) for sentence in sentences_lower_cased]

    # remove puntuation of the sentences
    sentences_removed_punct = [remove_punctuation(sentence) for sentence in sentences_without_nums]

    # remove stopwords from sentences
    sentences_removed_stopwords = [remove_stopwords(sentence) for sentence in sentences_removed_punct]

    # lemmatize words in sentences
    lemmatized_sentences = [lemmatize(sentence) for sentence in sentences_removed_stopwords]

    # use part-of-speech on the sentences
    pos_sentences = [part_of_speech_cleaning( sentence ) for sentence in lemmatized_sentences]

    # remove spaces on the article
    cleaned_sentences = [remove_spaces( sentence ) for sentence in pos_sentences]

    return cleaned_sentences

In [8]:
data_imdb.drop_duplicates(inplace=True)
data_imdb.dropna(axis=0, inplace=True)

In [9]:
data_imdb['clean_text'] = clean_document(data_imdb['review'])
data_imdb

Unnamed: 0,review,sentiment,clean_text
0,One of the other reviewers has mentioned that ...,1,reviewer mentioned watching oz episode hooked ...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stunnin...
...,...,...,...
49995,I thought this movie did a down right good job...,1,thought movie right good job not creative orig...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,0,catholic taught parochial elementary school nu...
49998,I'm going to have to disagree with the previou...,0,going disagree previous comment side maltin se...


In [11]:
data_imdb.to_csv('dataset.csv', index=False)

## Preprocessing and modelling data
Applying Deep Learning to the preprocessed data.


In [12]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import regularizers, Sequential, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [13]:
data_imdb.drop_duplicates(inplace=True)
data_imdb.dropna(axis=0, inplace=True)

X = data_imdb['clean_text']
y = data_imdb['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

tk = Tokenizer()
tk.fit_on_texts(X_train)

    

X_train_tokenized = tk.texts_to_sequences(X_train)
X_test_tokenized = tk.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokenized, dtype='float32', padding='post', maxlen=300)
X_test_pad = pad_sequences(X_test_tokenized, dtype='float32', padding='post', maxlen=300)

In [14]:
reg_l2 = regularizers.L2(0.3)

embedding_size = 40

model = Sequential()
model.add(layers.Embedding(
    input_dim= len(tk.word_index)+1,
    output_dim=embedding_size,
    mask_zero=True
))

model.add(layers.LSTM(20, kernel_regularizer=reg_l2))
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 40)          3089680   
                                                                 
 lstm (LSTM)                 (None, 20)                4880      
                                                                 
 dense (Dense)               (None, 1)                 21        
                                                                 
Total params: 3,094,581
Trainable params: 3,094,581
Non-trainable params: 0
_________________________________________________________________


In [15]:
es = EarlyStopping(monitor='loss', patience=10, restore_best_weights=True)

model_ = model.fit(X_train_pad, y_train, validation_split=0.25, epochs=20, callbacks = [es])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
#Evaluate

results = model.evaluate(X_test_pad, y_test, batch_size=128)
results



[0.34080061316490173, 0.8868571519851685]