# LSTM Sentiment Analysis
This is important.

In [37]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

### Define our functions for preprocessing

In [49]:
def remove_tags(text):
    TAG_RE = re.compile(r'<[^>]+>')
    return TAG_RE.sub('', text)

def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

### Load our dataset

In [46]:
movie_reviews = pd.read_csv("datasets/IMDB Dataset.csv")
movie_reviews.isnull().values.any()
movie_reviews.insert(1, "processed", '', True) # add empty processed column
movie_reviews.head()

Unnamed: 0,review,processed,sentiment
0,One of the other reviewers has mentioned that ...,,positive
1,A wonderful little production. <br /><br />The...,,positive
2,I thought this was a wonderful way to spend ti...,,positive
3,Basically there's a family where a little boy ...,,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",,positive


### Process reviews

In [50]:
for i, row in tqdm(movie_reviews.iterrows(), total=movie_reviews.shape[0]): # process the review
    movie_reviews.iloc[i]['processed'] = preprocess_text(movie_reviews.iloc[i]['review'])

100%|██████████| 50000/50000 [00:19<00:00, 2572.68it/s]


In [51]:
movie_reviews.head()

Unnamed: 0,review,processed,sentiment
0,One of the other reviewers has mentioned that ...,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,I thought this was wonderful way to spend time...,positive
3,Basically there's a family where a little boy ...,Basically there a family where little boy Jake...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",Petter Mattei Love in the Time of Money is vis...,positive


### Split the dataframe into X and y

In [53]:
def split_dataframe(df, xname, yname):
    X = list(df[xname])
    y = array(list(map(lambda x: 1 if x=="positive" else 0, df[yname])))
    return X, y

In [62]:
X, y = split_dataframe(movie_reviews, 'processed', 'sentiment')

### Split the dataframe into train and test

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Prepare the embedding layer

In [64]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

Apply padding so all lists have the same length

In [65]:
# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)