<a href="https://colab.research.google.com/github/georgeliu1998/sentiment_word_embedding/blob/master/sentiment_word_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis with Word Embedding and RNN

## Setting up the Environment

In [1]:
from numpy.random import seed
seed(8)

from tensorflow import set_random_seed
set_random_seed(8)

import numpy as np
seed = np.random.RandomState(8)

import urllib.request
import tarfile
import os
import re
import string
from time import time

import numpy as np
import pandas as pd
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer

Using TensorFlow backend.


In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz#egg=en_core_web_md==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz (120.8MB)
[K    100% |████████████████████████████████| 120.9MB 56.7MB/s 
[?25hInstalling collected packages: en-core-web-md
  Running setup.py install for en-core-web-md ... [?25l- \ | / - \ | / - done
[?25hSuccessfully installed en-core-web-md-2.0.0

[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_md -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en_core_web_md

    You can now load the model via spacy.load('en_core_web_md')



In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## Getting the Data

In [0]:
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

path = './gdrive/My Drive/WorkingDir/sentiment_word_embedding'
download_folder = 'raw_data'
download_name = 'imdb.tar.gz'
path_name = os.path.join(path, download_folder, download_name)

# Download data to the specified path
urllib.request.urlretrieve(url, path_name)

#extract_path = os.path.join(path, download_folder)

with tarfile.open(path_name) as tar:
  #tar.extractall(path=extract_path)
  tar.extractall()

('./gdrive/My Drive/WorkingDir/sentiment_word_embedding/raw_data/imdb.tar.gz',
 <http.client.HTTPMessage at 0x7fa4dbb55898>)

In [0]:
def load_imdb(path):
  """
  Loads train and test data into dataframes.

  Params
  ---------------------
  path: str
    The path to the unzipped aclImdb folder.

  Returns
  ---------------------
  df_train, df_test: tuple of of pandas df
    The dataframes created from data
  """

  data = {}

  for split in ['train', 'test']:
    data[split] = []

    for label in ['pos', 'neg']:
      sentiment = 1 if label == 'pos' else 0
      file_names = os.listdir(os.path.join(path, split, label))

      for file_name in file_names:
        file_path = os.path.join(path, split, label, file_name)
        with open(file_path, "r") as f:
          review = f.read()

          data[split].append([review, sentiment])

  np.random.shuffle(data['train'])        
  df_train = pd.DataFrame(data['train'], columns=['review', 'sentiment'])

  np.random.shuffle(data['test'])
  df_test = pd.DataFrame(data['test'], columns=['review', 'sentiment'])

  return df_train, df_test

In [0]:
df_train, df_test = load_imdb('./aclImdb')

In [0]:
def save_df(df, path, save_name):
  file_name = save_name + '.csv'
  path_name = os.path.join(path, file_name)
  
  df.to_csv(path_name, index=False)

In [0]:
save_df(df=df_train, path=path, save_name='df_train')
save_df(df=df_test, path=path, save_name='df_test')

In [0]:
path = './gdrive/My Drive/WorkingDir/sentiment_word_embedding'

df_train = pd.read_csv(os.path.join(path, 'df_train.csv'))
df_test = pd.read_csv(os.path.join(path, 'df_test.csv'))

In [0]:
def load_data():
  """
  Loads all train test data
  """
  
  path = './gdrive/My Drive/WorkingDir/sentiment_word_embedding'
  
  df_train = pd.read_csv(os.path.join(path, 'df_train.csv'))
  df_test = pd.read_csv(os.path.join(path, 'df_test.csv'))
  
  X_train = np.loadtxt(os.path.join(path, 'X_train.csv'), delimiter=",")
  X_test = np.loadtxt(os.path.join(path, 'X_test.csv'), delimiter=",")
  y_train = df_train['sentiment'].values
  y_test = df_test['sentiment'].values
  
  return X_train, X_test, y_train, y_test 

## Data Preprocessing

In [0]:
def preprocess_text(text, lemmatizing=False):
    """
    Preprocesses text by removing all stop words and lemmatizing.
    
    Params
    --------------
    text: str
      the string to be cleaned
    
    Returns
    --------------
    text: str
      the cleaned string

    """
    text = text.lower()
    
    if lemmatizing:
      lemmatizer = WordNetLemmatizer()
      text = ' '.join([lemmatizer.lemmatize(word, pos='v') for word in text.split()])
    
    # Remove html tags
    text = re.sub(r'<.*?>', '', text)
    
    # Replace punctuation with spaces
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    text = text.translate(translator)

    # Remove stop words
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    
    # Remove additional white spaces
    text = ' '.join(text.split())
    
    return text

In [0]:
t1 = time()

X_train = df_train['review'].apply(preprocess_text)

t2 = time()

t = (t2 - t1) / 60

print('Took {} min'.format(t))

Took 14.381852738062541 min


In [0]:
df_train['review_processed'] = X_train
df_train.head()

Unnamed: 0,review,sentiment,review_processed
0,I could never remember the name of this show. ...,1,could never remember name show use watch 8 rem...
1,"Going into this movie, I had heard good things...",1,going movie heard good things coming really am...
2,The only scary thing about this movie is the t...,0,scary thing movie thought whoever made might m...
3,This deserves a 12 out of 10. An absolutely re...,1,deserves 12 10 absolutely refreshing show real...
4,"Joan Fontaine is ""A Damsel in Distress"" in thi...",1,joan fontaine damsel distress 1937 musical sta...


In [0]:
save_df(df=df_train, path=path, save_name='df_train')

In [0]:
t1 = time()

df_test['review_processed'] = df_test['review'].apply(preprocess_text)

t2 = time()

t = (t2 - t1) / 60

save_df(df=df_test, path=path, save_name='df_test')

print('Took {} min'.format(t))

Took 13.990289672215779 min


In [0]:
df_test.head()

Unnamed: 0,review,sentiment,review_processed
0,"""Thunderbolt"" is probably Jackie Chan's worst ...",0,thunderbolt probably jackie chan worst movie s...
1,This is the worst movie that I have ever seen....,0,worst movie ever seen first thought going good...
2,(Warning: Some spoilers ahead.)<br /><br />Wha...,0,warning spoilers ahead incredibly crappy movie...
3,This film is a perfect example of how to take ...,0,film perfect example take fascinating subject ...
4,"Ok, everybody agreed on what was the best seas...",1,ok everybody agreed best season first killing ...


## Modeling with CountVectorizer and SVC

In [0]:
# Transform each text into a vector of word counts
vectorizer = CountVectorizer()

training_features = vectorizer.fit_transform(df_train['review_processed'])    
test_features = vectorizer.transform(df_test['review_processed'])

# Training
model = LinearSVC()
model.fit(training_features, df_train['sentiment'])
y_pred = model.predict(test_features)

# Evaluation
acc = accuracy_score(df_test['sentiment'], y_pred)

print("Accuracy on test set: {:.2%}".format(acc))

Accuracy on test set: 84.14%




## Modeling with Word Embedding and Neural Networks

In [0]:
# Load the spacy model
nlp = spacy.load('en_core_web_md')

In [0]:
t1 = time()

X_train = np.array(list(df_train['review_processed'].apply(lambda x: nlp(x).vector.tolist())))

t2 = time()

t = (t2 - t1) / 60

path_name = os.path.join(path, 'X_train_vector.csv')

np.savetxt(path_name, X_train, delimiter=',') 

print('Took {} min'.format(t))

Took 21.16675995985667 min


In [0]:
t1 = time()

X_test = np.array(list(df_test['review_processed'].apply(lambda x: nlp(x).vector.tolist())))

t2 = time()

t = (t2 - t1) / 60

path_name = os.path.join(path, 'X_test_vector.csv')

np.savetxt(path_name, X_test, delimiter=',') 

print('Took {} min'.format(t))

Took 20.361665081977844 min


In [0]:
X_train, X_test, y_train, y_test = load_data()

In [8]:
# Get the first 1000 samples from train dataset
#X_train, y_train = X_train[:1000,], y_train[:1000]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((25000, 300), (25000,), (25000, 300), (25000,))

In [15]:
from keras.optimizers import RMSprop

model = Sequential()
model.add(Dense(64, input_dim=300, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

rmsprop = RMSprop(lr=0.001)
model.compile(loss='binary_crossentropy',
              optimizer=rmsprop,
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=100,
          batch_size=128)

score = model.evaluate(X_test, y_test, batch_size=128)
print("\nAccuracy on the test set is: {}".format(score[1]))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Modeling with Wprd Embedding and RNN

In [0]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM

#max_features = 1024

model = Sequential()
#model.add(Embedding(max_features, output_dim=256))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train, batch_size=16, epochs=10)
score = model.evaluate(X_test, y_test, batch_size=16)

ValueError: ignored

## To-Do's
 
- preprocess using spaCy
- lemmatize based on pos: https://stackoverflow.com/questions/41824782/lemmatize-string-according-to-pos-nlp

In [0]:
def cosine_similarity(v1, v2):
  return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

# Log
- original linearSVC: 84.14%
- Initial NN: 85.05%
- SVC with preprocess: 84.14%
- NN with preprocess and new embed: 85.53%


took 37.5 min to convert to embedding before preprocessing, 20 min after preprocessing.