<a href="https://colab.research.google.com/github/gitakartika/sentiment-analysis-of-movie-reviews/blob/master/Sentiment_Analysis_of_Rotten_Tomatoes_Movie_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.tokenize import TweetTokenizer
import datetime
#import lightgbm as lgb
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, cross_val_score
#from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
pd.set_option('max_colwidth',400)

  import pandas.util.testing as tm


# Reading the input files

In [0]:
train = pd.read_csv('/content/drive/My Drive/WebMining/train.tsv', sep="\t")

In [0]:
train.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what is good for the goose,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is good for the goose,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for the goose,2


In [0]:
train['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

# Cleaning the data

In [0]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
stemmer=SnowballStemmer('english')
lemma=WordNetLemmatizer()
from string import punctuation
import re

In [0]:
def clean_review(review_col):
    review_corpus=[]
    for i in range(0,len(review_col)):
        review=str(review_col[i])
        review=re.sub('[^a-zA-Z]',' ',review)
        
        review=[lemma.lemmatize(w) for w in word_tokenize(str(review).lower())]
        review=' '.join(review)
        review_corpus.append(review)
    return review_corpus

In [0]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
train['clean_review']=clean_review(train.Phrase.values)

In [0]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
0,1,1,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1,a series of escapade demonstrating the adage that what is good for the goose is also good for the gander some of which occasionally amuses but none of which amount to much of a story
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2,a series of escapade demonstrating the adage that what is good for the goose
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series


# Balancing the data by Resampling

In [0]:
from sklearn.utils import resample
train_2 = train[train['Sentiment']==2]
train_1 = train[train['Sentiment']==1]
train_3 = train[train['Sentiment']==3]
train_4 = train[train['Sentiment']==4]
train_5 = train[train['Sentiment']==0]
train_2_sample = resample(train_2,replace=True,n_samples=75000,random_state=123)
train_1_sample = resample(train_1,replace=True,n_samples=75000,random_state=123)
train_3_sample = resample(train_3,replace=True,n_samples=75000,random_state=123)
train_4_sample = resample(train_4,replace=True,n_samples=75000,random_state=123)
train_5_sample = resample(train_5,replace=True,n_samples=75000,random_state=123)

df_upsampled = pd.concat([train_2, train_1_sample,train_3_sample,train_4_sample,train_5_sample])

In [0]:
df_upsampled.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment,clean_review
1,2,1,A series of escapades demonstrating the adage that what is good for the goose,2,a series of escapade demonstrating the adage that what is good for the goose
2,3,1,A series,2,a series
3,4,1,A,2,a
4,5,1,series,2,series
5,6,1,of escapades demonstrating the adage that what is good for the goose,2,of escapade demonstrating the adage that what is good for the goose


In [0]:
df_upsampled[df_upsampled['Sentiment']==1].shape

(75000, 5)

# Data Processing for ML

In [0]:
text = ' '.join(df_upsampled.loc[df_upsampled.Sentiment == 4, 'Phrase'].values)
text_trigrams = [i for i in ngrams(text.split(), 3)]

In [0]:
Counter(text_trigrams).most_common(30)

[(('one', 'of', 'the'), 1644),
 (('of', 'the', 'year'), 832),
 (('of', 'the', 'best'), 677),
 (('of', 'the', 'most'), 612),
 (('is', 'one', 'of'), 407),
 (('One', 'of', 'the'), 370),
 ((',', 'and', 'the'), 333),
 (('the', 'year', "'s"), 326),
 (('It', "'s", 'a'), 323),
 (('the', 'edge', 'of'), 300),
 (('it', "'s", 'a'), 299),
 (('a', 'movie', 'that'), 297),
 (('of', 'your', 'seat'), 273),
 (('the', 'film', 'is'), 267),
 (('the', 'kind', 'of'), 267),
 (('.', 'is', 'a'), 264),
 (('the', 'film', "'s"), 264),
 (('as', 'one', 'of'), 254),
 ((',', 'the', 'film'), 253),
 (('edge', 'of', 'your'), 249),
 ((',', 'this', 'is'), 236),
 (('as', 'well', 'as'), 231),
 ((',', 'it', "'s"), 226),
 (('film', 'that', 'is'), 223),
 (('.', 'It', "'s"), 218),
 (('a', 'film', 'that'), 211),
 ((',', 'funny', ','), 208),
 (('some', 'of', 'the'), 206),
 (('year', "'s", 'best'), 188),
 (('a', 'solid', 'cast'), 178)]

In [0]:
tokenizer = TweetTokenizer()

In [0]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)
full_text = list(df_upsampled['clean_review'].values)
vectorizer.fit(full_text)
df_upsampled_vectorized = vectorizer.transform(df_upsampled['clean_review'])



In [0]:
y = df_upsampled['Sentiment']

# Applying ML algorithm

In [0]:
from keras.utils import to_categorical
X = df_upsampled['clean_review']
Y = to_categorical(df_upsampled['Sentiment'].values)
print(Y)

Using TensorFlow backend.


[[0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]]


# splitting training set into training and validation set

In [0]:

from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=123)

In [0]:
print(X_train.shape,Y_train.shape)
print(X_val.shape,Y_val.shape)

(284686,) (284686, 5)
(94896,) (94896, 5)


In [0]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Total number of words/features

In [0]:
all_words=' '.join(X_train)
all_words=word_tokenize(all_words)
#print(all_words)
dist=FreqDist(all_words)

num_unique_word=len(dist)
num_unique_word
#X_train.head()

13728

# Number of words for each phrase/text

In [0]:
r_len=[]
for text in X_train:
    word=word_tokenize(text)
  #  print(text)
    l=len(word)
    r_len.append(l)
    
MAX_REVIEW_LEN=np.max(r_len)
MAX_REVIEW_LEN

48

In [0]:
max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
num_classes=5

In [0]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

# Tokenizing the words

In [0]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)

In [0]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences

# Sequence Padding

In [0]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
#print(X_train.shape,X_val.shape)

In [0]:
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,GRU,SpatialDropout1D,Bidirectional
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
import matplotlib.pyplot as plt

# CNN+BiLSTM


In [0]:
model1= Sequential()
model1.add(Embedding(max_features,100,input_length=max_words))
model1.add(Conv1D(64,kernel_size=3,padding='same',activation='relu'))
model1.add(MaxPooling1D(pool_size=2))
model1.add(Dropout(0.25))
model1.add(Bidirectional(LSTM(128,return_sequences=True)))
model1.add(Dropout(0.3))
model1.add(Flatten())
model1.add(Dense(128,activation='relu'))
model1.add(Dropout(0.5))
model1.add(Dense(5,activation='softmax'))
model1.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 48, 100)           1372800   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 48, 64)            19264     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 24, 64)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 24, 256)           197632    
_________________________________________________________________
dropout_2 (Dropout)          (None, 24, 256)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6144)             

In [0]:
%%time
model1.fit(X_train, Y_train, validation_data=(X_val, Y_val),epochs=50, batch_size=batch_size, verbose=1)

Train on 284686 samples, validate on 94896 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 2h 55min 47s, sys: 9min 59s, total: 3h 5min 46s
Wall time: 1h 39min 24s


<keras.callbacks.callbacks.History at 0x7f176065cc88>

In [0]:
y_pred=model1.predict_classes(X_val)

In [0]:
Y_val_rev = [np.argmax(y, axis=None, out=None) for y in Y_val]
Y_val_rev[:10]

[1, 3, 2, 1, 4, 1, 4, 1, 1, 0]

In [0]:
accuracy_score(y_pred,Y_val_rev)

0.8478123419322205

# CNN + LSTM

In [0]:
model2= Sequential()
model2.add(Embedding(max_features,100,input_length=max_words))
model2.add(Conv1D(64,kernel_size=3,padding='same',activation='relu'))
model2.add(MaxPooling1D(pool_size=2))
model2.add(Dropout(0.25))
model2.add(LSTM(128,return_sequences=True))
model2.add(Dropout(0.3))
model2.add(Flatten())
model2.add(Dense(128,activation='relu'))
model2.add(Dropout(0.5))
model2.add(Dense(5,activation='softmax'))
model2.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 48, 100)           1372800   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 48, 64)            19264     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 24, 64)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 24, 128)           98816     
_________________________________________________________________
dropout_8 (Dropout)          (None, 24, 128)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 3072)             

In [0]:
%%time
model2.fit(X_train, Y_train, validation_data=(X_val, Y_val),epochs=30, batch_size=batch_size, verbose=1)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 284686 samples, validate on 94896 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
CPU times: user 5h 11min 41s, sys: 16min 22s, total: 5h 28min 3s
Wall time: 3h 3min 4s


<keras.callbacks.callbacks.History at 0x7ff5dedd3ef0>

In [0]:
y_pred=model2.predict_classes(X_val)

In [0]:
accuracy_score(y_pred,Y_val_rev)