<a href="https://colab.research.google.com/github/grantKinsley/Sentiment_Analysis/blob/main/M156_Simulation_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# dependencies
import os
import numpy as np
import pandas as pd
from keras import layers
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Data Import



Download dataset from
https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data

In [None]:
# Upload CSV File
from google.colab import files
uploaded = files.upload()

In [None]:
df = pd.read_csv("IMDB Dataset.csv",engine='python', error_bad_lines=False)
df.head(5)

Skipping line 41745: unexpected end of data


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

In [51]:
# Replace non alphabetical characters
df.replace(to_replace=r'<br />', value='', regex=True, inplace=True)
df.replace(to_replace=r'[*,/()\'\".?0-9]', value='', regex=True, inplace=True)

# Convert positive to 1, negative to 0
df['sentiment'][df['sentiment'] == 'negative'] = 0
df['sentiment'][df['sentiment'] == 'positive'] = 1
# Convert sentiment int to float
df['sentiment'] = np.asarray(df['sentiment']).astype(np.float32)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1.0
1,A wonderful little production The filming tech...,1.0
2,I thought this was a wonderful way to spend ti...,1.0
3,Basically theres a family where a little boy J...,0.0
4,Petter Matteis Love in the Time of Money is a ...,1.0


In [39]:
avg_num_words = df.review.apply(lambda x: len(x.split(" "))).mean()
embedding_dim = int(avg_num_words)
embedding_dim

226

# CNN

In [56]:
def CNN(sample_size = 10000, verbose = False):
  num_words = 6000
  maxlen = 130

  # Sample Dataset
  df_subset = df.sample(sample_size)

  # Split into Training and Testing Data
  training_data = df_subset.sample(frac=0.8)
  testing_data = df_subset.drop(training_data.index)

  # Prepare Data
  tokenizer = Tokenizer(num_words=num_words)
  tokenizer.fit_on_texts(training_data['review'])

  X_train = tokenizer.texts_to_sequences(training_data['review'])
  X_test = tokenizer.texts_to_sequences(testing_data['review'])

  y_train = training_data['sentiment']
  y_test = testing_data['sentiment']

  vocab_size = len(tokenizer.word_index) + 1

  X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
  X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

  # Prepare Model
  model = Sequential()
  model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
  model.add(layers.Conv1D(128, 5, activation='relu'))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(10, activation='relu'))
  model.add(layers.Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

  model.fit(X_train, y_train,
                    epochs=3,
                    verbose=verbose,
                    validation_data=(X_test, y_test),
                    batch_size=100)
  
  _, training_accuracy = model.evaluate(X_train, y_train, verbose=False)
  _, testing_accuracy = model.evaluate(X_test, y_test, verbose=False)

  return training_accuracy, testing_accuracy

In [58]:
training_accuracy, testing_accuracy = CNN(verbose=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [59]:
training_accuracy, testing_accuracy

(0.9952499866485596, 0.8485000133514404)

# LSTM

In [60]:
def LSTM(sample_size = 10000, verbose = False, max_pooling = False):
  num_words = 6000
  maxlen = 130

  # Sample Dataset
  df_subset = df.sample(sample_size)

  # Split into Training and Testing Data
  training_data = df_subset.sample(frac=0.8)
  testing_data = df_subset.drop(training_data.index)

  # Prepare Data
  tokenizer = Tokenizer(num_words=num_words)
  tokenizer.fit_on_texts(training_data['review'])

  X_train = tokenizer.texts_to_sequences(training_data['review'])
  X_test = tokenizer.texts_to_sequences(testing_data['review'])

  y_train = training_data['sentiment']
  y_test = testing_data['sentiment']

  vocab_size = len(tokenizer.word_index) + 1

  X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
  X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

  # Prepare Model
  model = Sequential()
  model.add(layers.Embedding(num_words, embedding_dim))
  model.add(layers.LSTM(32, return_sequences = True))
  if max_pooling: # conditionally apply max pooling layer
    model.add(layers.GlobalMaxPool1D())
  model.add(layers.Dense(20, activation="relu"))
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(1, activation="sigmoid"))
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  model.fit(X_train, y_train,
                    epochs=3,
                    verbose=verbose,
                    batch_size=100)

  _, training_accuracy = model.evaluate(X_train, y_train, verbose=False)
  _, testing_accuracy = model.evaluate(X_test, y_test, verbose=False)

  return training_accuracy, testing_accuracy

In [61]:
training_accuracy, testing_accuracy = LSTM(verbose=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [62]:
training_accuracy, testing_accuracy

(0.8929891586303711, 0.7681461572647095)

In [63]:
training_accuracy, testing_accuracy = LSTM(verbose = True, max_pooling=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [64]:
training_accuracy, testing_accuracy

(0.9421250224113464, 0.8220000267028809)

# Test sample sizes

In [66]:
sample_sizes = [500, 1000, 10000, 25000, 40000]

for sample_size in sample_sizes:
  print("Sample size:", sample_size)
  cnn_train, cnn_test = CNN(sample_size=sample_size)
  lstm_train, lstm_test = LSTM(sample_size=sample_size, max_pooling=True)
  print("CNN test accuracy:", cnn_test)
  print("LSTM test accuracy:", lstm_test)
  print(os.linesep)

Sample size: 500
CNN test accuracy: 0.5899999737739563
LSTM test accuracy: 0.6100000143051147


Sample size: 1000
CNN test accuracy: 0.5350000262260437
LSTM test accuracy: 0.625


Sample size: 10000
CNN test accuracy: 0.8414999842643738
LSTM test accuracy: 0.8295000195503235


Sample size: 25000
CNN test accuracy: 0.8560000061988831
LSTM test accuracy: 0.8500000238418579


Sample size: 40000
CNN test accuracy: 0.8755000233650208
LSTM test accuracy: 0.8653749823570251


