<a href="https://colab.research.google.com/github/ezratawil/LSTM-Sentiment-Analysis-/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from google.colab import files
from sklearn.pipeline import Pipeline
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter
from sklearn.preprocessing import FunctionTransformer
import string
from tqdm.notebook import tqdm
tqdm.pandas()
from keras.layers import Embedding, Dense, Dropout, Input, LSTM, GlobalMaxPooling1D
from keras.callbacks import EarlyStopping,ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Import curated data for processing

In [5]:
amazon = pd.read_csv('/content/amazon.csv')
imdb = pd.read_csv('/content/imdb_proc.csv')
yelp = pd.read_csv('/content/yelp.csv')

In [6]:
print(amazon.value_counts(amazon['label']))
print(imdb.value_counts(imdb['label']))
print(yelp.value_counts(yelp['label']))

label
1    2000000
0    2000000
dtype: int64
label
1    25000
0    25000
dtype: int64
label
1    299000
0    299000
dtype: int64


In [None]:
# split into train and test at this point, 
# because datasets vary in size, so we would like a porportional sample of each 
# dataset for training, and we would also like to stratify our sampling so we
# get a porportional amount of negative and positive sentiment texts to 
# train/fit the model to

# amazon data too large for model training time , reduce to 1M samples
amazon_keep , amazon_drop =  train_test_split(amazon, test_size=0.8, random_state=0, stratify=amazon[['label']])

amazon_train,amazon_test = train_test_split(amazon_keep,test_size=0.2,random_state=0,stratify=amazon_keep[['label']])
amazon_train, amazon_val =  train_test_split(amazon_train, test_size=0.2, random_state=0, stratify=amazon_train[['label']])

yelp_train, yelp_test =  train_test_split(yelp, test_size=0.15, random_state=0, stratify=yelp[['label']])
yelp_train, yelp_val =  train_test_split(yelp_train, test_size=0.2, random_state=0, stratify=yelp_train[['label']])

imdb_train, imdb_test = train_test_split(imdb, test_size=0.15, random_state=0, stratify=imdb[['label']])
imdb_train, imdb_val = train_test_split(imdb_train, test_size=0.2, random_state=0, stratify=imdb_train[['label']])

In [None]:
# total 952640 training instances
train_data = pd.concat([amazon_train,yelp_train,imdb_train],ignore_index=True)
# total 257200 validation instances
val_data = pd.concat([amazon_val,yelp_val,imdb_val],ignore_index=True)
# total 238160 testing instances
test_data = pd.concat([amazon_test,yelp_test,imdb_test],ignore_index=True)
# remove unnecessary cols
train_data.drop(['Unnamed: 0','website'],axis=1,inplace=True)
test_data.drop(['Unnamed: 0','website'],axis=1,inplace=True)
val_data.drop(['Unnamed: 0','website'],axis=1, inplace=True)

In [None]:
print(f'train: {train_data.shape}, test: {test_data.shape}, val: {val_data.shape}')

train: (952640, 2), test: (257200, 2), val: (238160, 2)


# Preprocessing

In [None]:
STOPWORDS = set(stopwords.words('english'))
counter = Counter()
FREQ = set([word for (word,word_count) in counter.most_common(15)])
rare_words = 200000
RARE = set([word for (word, word_count) in counter.most_common()[:-rare_words-1:-1]])
# func to remove all punctuation
def remove_punc(text):
  return text.translate(str.maketrans('', '', string.punctuation))
# func to remove stopwords
def remove_stopwords(text):
  return " ".join([word for word in str(text).split() if word not in STOPWORDS])
# func to remove 15 most frequent words ()
def remove_freq(text):
  return " ".join([word for word in str(text).split() if word not in FREQ])

def remove_rare(text):
  return " ".join([word for word in str(text).split() if word not in RARE])



def preprocess(df):
  df["text"] = df["text"].progress_apply(lambda text: remove_punc(text))
  df["text"] = df["text"].progress_apply(lambda text: remove_stopwords(text))
  df["text"] = df["text"].progress_apply(lambda text: remove_freq(text))
  df["text"] = df["text"].progress_apply(lambda text: remove_rare(text))
  
preprocessing = FunctionTransformer(preprocess) # make transformer for pipeline use



In [None]:
prep_pipe = Pipeline([('preprocessing',preprocessing)])

prep_pipe.fit_transform(train_data)

prep_pipe.transform(val_data)

prep_pipe.transform(test_data)

  0%|          | 0/952640 [00:00<?, ?it/s]

  0%|          | 0/952640 [00:00<?, ?it/s]

  0%|          | 0/952640 [00:00<?, ?it/s]

  0%|          | 0/952640 [00:00<?, ?it/s]

# Tokenize and pad sequences

In [2]:
train_data = train_data.sample(frac=1).reset_index(drop=True) # shuffle datasets
val_data = val_data.sample(frac=1).reset_index(drop=True)
test_data = test_data.sample(frac=1).reset_index(drop=True)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

train_data.dropna(inplace=True)
val_data.dropna(inplace=True)
X_train,y_train = train_data['text'],train_data['label']
X_val,y_val = val_data['text'],val_data['label']
X_test,y_test = test_data['text'],test_data['label']

In [None]:
# keep only 100000 most common words, oov = out of vocabulary token, 
# used to replace words not in vocab 
tokenizer = Tokenizer(num_words=100000, oov_token='<UNK>') 

tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1
vocab_size


1098481

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def get_sequences(tokenizer, text):
    sequences = tokenizer.texts_to_sequences(text)
    padded_sequences = pad_sequences(sequences, truncating='post', maxlen=100, padding='post')
    return padded_sequences.astype('float32')

In [None]:
padded_train_sequences = get_sequences(tokenizer, X_train)
padded_val_sequences = get_sequences(tokenizer, X_val)
padded_test_sequences = get_sequences(tokenizer, X_test)

In [None]:
padded_train_sequences[100808]

array([2.0000e+00, 1.1680e+03, 7.0000e+00, 8.3000e+01, 1.3000e+01,
       5.8700e+02, 1.9000e+02, 2.0000e+00, 3.3300e+02, 2.0000e+00,
       9.5330e+03, 2.6670e+03, 3.6740e+03, 4.6000e+01, 2.7860e+03,
       1.8560e+03, 2.0000e+00, 1.8100e+02, 3.5660e+03, 8.2000e+01,
       1.1000e+01, 7.0900e+02, 5.2000e+01, 8.7600e+02, 1.6100e+03,
       3.7000e+01, 4.0000e+01, 4.9000e+02, 2.6810e+03, 4.4000e+01,
       1.1000e+02, 1.1000e+01, 2.6670e+03, 3.8800e+02, 1.5520e+03,
       3.6740e+03, 1.7490e+03, 1.3700e+02, 9.1800e+02, 6.4530e+03,
       5.7000e+01, 5.7000e+01, 1.7180e+03, 6.9000e+01, 1.9860e+03,
       8.0000e+02, 1.2600e+02, 2.5000e+01, 1.5985e+04, 4.0800e+02,
       1.8100e+02, 1.3140e+03, 1.7180e+03, 5.3800e+02, 3.5700e+02,
       6.0300e+02, 2.7330e+03, 9.0000e+00, 2.0300e+02, 1.0170e+03,
       8.1240e+03, 4.6000e+01, 5.4000e+01, 1.1000e+01, 4.9600e+02,
       2.8000e+02, 2.2690e+03, 2.7720e+03, 5.8550e+03, 5.1728e+04,
       9.1800e+02, 1.3012e+04, 2.2000e+01, 5.8700e+02, 5.2000e

#  Create Embedding from Glove embeddings and Build model

In [None]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.100d.txt')

for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

f.close()
print(f'Loaded {len(embeddings_index)} word vectors')

# generate embedding matrix by mapping the vocabulary to the pretrained word embeddings:

# create a weight matrix for words in training reviews
embedding_mat = np.zeros((vocab_size, 100))

for word, i in tokenizer.word_index.items():
    embedding_vec = embeddings_index.get(word)
    if embedding_vec is not None:
        embedding_mat[i] = embedding_vec


Loaded 400001 word vectors


In [None]:
# defining model architeture
model=tf.keras.Sequential()

#embedding layer
model.add(Embedding(vocab_size,100,weights=[embedding_matrix],input_length=100,trainable=False)) 

#lstm layer
model.add(LSTM(128,return_sequences=True,dropout=0.2))

#Global Maxpooling
model.add(GlobalMaxPooling1D())

#Dense Layer
model.add(Dense(64,activation='tanh')) 
model.add(Dense(1,activation='sigmoid')) 

#loss function, metrics, optimizer
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.01), 
              loss='binary_crossentropy',metrics=["acc"]) 

#Adding callbacks
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=3)  
checkpoint=ModelCheckpoint('best_model.h5',
                                      monitor='val_acc',
                                      mode='max', save_best_only=True,verbose=1)  

#Print summary of model
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          109848100 
                                                                 
 lstm (LSTM)                 (None, 100, 128)          117248    
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 109,973,669
Trainable params: 125,569
Non-trainable params: 109,848,100
____________________________________

  super(Adam, self).__init__(name, **kwargs)


# Train Latest Model

In [None]:
# training the model:

history = model.fit(padded_train_sequences,
                    np.array(y_train),batch_size=1024,epochs=10,
                    validation_data=(padded_val_sequences,np.array(y_val)),
                    verbose=1,callbacks=[es,checkpoint])

# Evaluating our model on the Test Data

In [None]:
#loading best model
from keras.models import load_model
model = load_model('best_model.h5')

#evaluation 
_,test_acc = model.evaluate(padded_test_sequences,y_test, batch_size=1024)

0.898804783821106


# Comparing our model to other classifiers

In [None]:
# BASELINE Classifier
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier() # clasiifies by guessing the most frequent label
dummy_clf.fit(X_test,y_test)
dummy_clf.predict(X_test)
dummy_clf.score(X_test,y_test)

0.5000077761707025

# Comparing our model to NLTK's TextBlob 

In [None]:
from textblob import TextBlob
def sentiment_calc(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None
def get_sentiment_score(df):
  df['sentiment'] = df['text'].apply(sentiment_calc)

  df.loc[df['sentiment'] >= 0, 'sentiment'] = 1
  df.loc[df['sentiment'] < 0, 'sentiment'] = 0
  correct = 0
  for label,score in zip(df['label'],df['sentiment']):
    if label == score:
      correct += 1
  total = df['sentiment'].count()
  acc = correct/total
  return acc



In [3]:
print(f'Baseline Classifier accuracy: {dummy_clf.score(X_test,y_test)}\n')
print(f'TextBlob Accuracy on Test Data: {get_sentiment_score(test_data)}\n')
print(f'Our Model Accuracy on Test Data: {test_acc}')

Baseline Classifier accuracy: 0.5000077761707025

TextBlob Accuracy on Test Data: 0.6634784366786419

Our Model Accuracy on Test Data: 0.898804783821106
