<a href="https://colab.research.google.com/github/grantKinsley/Sentiment_Analysis/blob/main/M156_Simulation_Results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
# dependencies
import os
import numpy as np
import pandas as pd
from keras import layers
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Data Import



Download dataset from
https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data

In [None]:
# Upload CSV File
from google.colab import files
uploaded = files.upload()

In [None]:
df = pd.read_csv("IMDB Dataset.csv",engine='python', error_bad_lines=False)
df.head(5)

Skipping line 41745: unexpected end of data


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocessing

In [51]:
# Replace non alphabetical characters
df.replace(to_replace=r'<br />', value='', regex=True, inplace=True)
df.replace(to_replace=r'[*,/()\'\".?0-9]', value='', regex=True, inplace=True)

# Convert positive to 1, negative to 0
df['sentiment'][df['sentiment'] == 'negative'] = 0
df['sentiment'][df['sentiment'] == 'positive'] = 1
# Convert sentiment int to float
df['sentiment'] = np.asarray(df['sentiment']).astype(np.float32)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1.0
1,A wonderful little production The filming tech...,1.0
2,I thought this was a wonderful way to spend ti...,1.0
3,Basically theres a family where a little boy J...,0.0
4,Petter Matteis Love in the Time of Money is a ...,1.0


In [39]:
avg_num_words = df.review.apply(lambda x: len(x.split(" "))).mean()
embedding_dim = int(avg_num_words)
embedding_dim

226

# CNN

In [42]:
def CNN(sample_size = 10000, global_avg = False):
  num_words = 6000
  maxlen = 130

  # Sample Dataset
  df_subset = df.sample(sample_size)

  # Split into Training and Testing Data
  training_data = df_subset.sample(frac=0.8)
  testing_data = df_subset.drop(training_data.index)

  # Prepare Data
  tokenizer = Tokenizer(num_words=num_words)
  tokenizer.fit_on_texts(training_data['review'])

  X_train = tokenizer.texts_to_sequences(training_data['review'])
  X_test = tokenizer.texts_to_sequences(testing_data['review'])

  y_train = training_data['sentiment']
  y_test = testing_data['sentiment']

  vocab_size = len(tokenizer.word_index) + 1

  X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
  X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

  # Prepare Model
  model = Sequential()
  model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
  model.add(layers.Conv1D(128, 5, activation='relu'))
  model.add(layers.GlobalMaxPooling1D())
  model.add(layers.Dense(10, activation='relu'))
  model.add(layers.Dense(1, activation='sigmoid'))
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

  model.fit(X_train, y_train,
                    epochs=3,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=100)
  
  print(os.linesep)
  _, accuracy = model.evaluate(X_train, y_train, verbose=False)
  print("Training Accuracy: {:.4f}".format(accuracy))
  _, accuracy = model.evaluate(X_test, y_test, verbose=False)
  print("Testing Accuracy:  {:.4f}".format(accuracy))

In [43]:
CNN()

Epoch 1/3
Epoch 2/3
Epoch 3/3


Training Accuracy: 0.9944
Testing Accuracy:  0.8675


# LSTM

In [48]:
def LSTM(sample_size = 10000, max_pooling = False):
  num_words = 6000
  maxlen = 130

  # Sample Dataset
  df_subset = df.sample(sample_size)

  # Split into Training and Testing Data
  training_data = df_subset.sample(frac=0.8)
  testing_data = df_subset.drop(training_data.index)

  # Prepare Data
  tokenizer = Tokenizer(num_words=num_words)
  tokenizer.fit_on_texts(training_data['review'])

  X_train = tokenizer.texts_to_sequences(training_data['review'])
  X_test = tokenizer.texts_to_sequences(testing_data['review'])

  y_train = training_data['sentiment']
  y_test = testing_data['sentiment']

  vocab_size = len(tokenizer.word_index) + 1

  X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
  X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

  # Prepare Model
  model = Sequential()
  model.add(layers.Embedding(num_words, embedding_dim))
  model.add(layers.LSTM(32, return_sequences = True))
  if max_pooling: # conditionally apply max pooling layer
    model.add(layers.GlobalMaxPool1D())
  model.add(layers.Dense(20, activation="relu"))
  model.add(layers.Dropout(0.5))
  model.add(layers.Dense(1, activation="sigmoid"))
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  model.fit(X_train, y_train,
                    epochs=3,
                    verbose=True,
                    batch_size=100)


  print(os.linesep)
  _, accuracy = model.evaluate(X_train, y_train, verbose=False)
  print("Training Accuracy: {:.4f}".format(accuracy))
  _, accuracy = model.evaluate(X_test, y_test, verbose=False)
  print("Testing Accuracy:  {:.4f}".format(accuracy))

In [49]:
LSTM()

Epoch 1/3
Epoch 2/3
Epoch 3/3


Training Accuracy: 0.8970
Testing Accuracy:  0.7626


In [50]:
LSTM(max_pooling=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Training Accuracy: 0.9436
Testing Accuracy:  0.8370
