In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
import nltk.data
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt


# Seed NumPy's global RNG. This must be a call: assigning to np.random.seed
# would replace the seeding function with the integer 0 and seed nothing.
np.random.seed(0)

In [2]:
# Load the labelled training reviews (tab-separated, header on the first row).
training_data = pd.read_csv(
    '/Users/harikrishnanagarajan/Desktop/Kaggle/SA_using_Word2vec/labeledTrainData.tsv',
    sep='\t',
    header=0,
)

# Preview the first rows.
training_data.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
# Display one raw review (row label 2222) before any cleaning is applied.
training_data["review"][2222]

"Go immediately and rent this movie. It will be be on a bottom shelf in your local video store and will be covered in dust. No one will have touched it in years. It may even be a $.50 special! It's worth ten bucks, I swear! Buy it! There aren't very many films than can compare with this - the celluloid version of that goo that forms at the bottom of a trash can after a few years. Yes, I gave it a '1,' but it really deserves much lower. 1-10 scales were not designed with stuff like this in mind."

In [4]:
# Report the number of missing values per column, a blank separator, and then
# how many reviews carry each sentiment label.
for report in (
    training_data.isna().sum(),
    '\n',
    training_data.sentiment.value_counts(),
):
    print(report)

id           0
sentiment    0
review       0
dtype: int64


1    12500
0    12500
Name: sentiment, dtype: int64


### Cleaning the training data

In [5]:
_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_raw_text(review, stop_words=None):
    """Turn one raw review into a space-joined string of lower-case content words.

    Steps: strip markup with BeautifulSoup, replace every character that is not
    an ASCII letter with a space, lower-case, split on whitespace, and drop
    stop words.

    Parameters
    ----------
    review : str
        Raw review text, possibly containing HTML markup.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stop-word
        list, which is loaded once and reused across calls.

    Returns
    -------
    str
        The remaining words joined by single spaces ("" if nothing remains).
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns about the guess.
    text = BeautifulSoup(review, "html.parser").get_text()

    letters_only = re.sub("[^A-Za-z]", " ", text)

    return " ".join(
        word for word in letters_only.lower().split() if word not in stop_words
    )

In [6]:
# Clean every training review, keeping the original row order.
clean_reviews = [clean_raw_text(raw_review) for raw_review in training_data.review]
    

In [7]:
# Spot-check the cleaned text stored at position 2222 of clean_reviews.
clean_reviews[2222]

'go immediately rent movie bottom shelf local video store covered dust one touched years may even special worth ten bucks swear buy many films compare celluloid version goo forms bottom trash years yes gave really deserves much lower scales designed stuff like mind'

In [8]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Split each cleaned review into a list of word tokens.
tokenized = list(map(word_tokenize, clean_reviews))

# Show the tokens of one review as a sanity check.
print(tokenized[2222])



['go', 'immediately', 'rent', 'movie', 'bottom', 'shelf', 'local', 'video', 'store', 'covered', 'dust', 'one', 'touched', 'years', 'may', 'even', 'special', 'worth', 'ten', 'bucks', 'swear', 'buy', 'many', 'films', 'compare', 'celluloid', 'version', 'goo', 'forms', 'bottom', 'trash', 'years', 'yes', 'gave', 'really', 'deserves', 'much', 'lower', 'scales', 'designed', 'stuff', 'like', 'mind']


In [85]:
# Word2Vec settings: vector dimensionality, context window, and the minimum
# number of occurrences a word needs to be kept in the vocabulary.
feature_size, context_size, min_word = 128, 5, 1

# Train the embeddings on the tokenized reviews.
word_vec = Word2Vec(
    sentences=tokenized,
    vector_size=feature_size,
    window=context_size,
    min_count=min_word,
    epochs=50,
)

In [86]:
# (word, gensim_index) pairs taken from the trained model's vocabulary.
word_vec_unpack = list(word_vec.wv.key_to_index.items())

# Two extra rows beyond the Word2Vec vocabulary; they stay all-zero.
vocab = len(word_vec_unpack) + 2
print(word_vec_unpack[1])

# Size the matrix from feature_size (the vector_size Word2Vec was trained
# with) instead of repeating the literal 128, so the two cannot drift apart.
embedding_matrix = np.zeros((vocab, feature_size))
for word, i in word_vec_unpack:
    # Row i holds the vector of the word whose gensim index is i.
    embedding_matrix[i] = word_vec.wv[word]

# NOTE(review): rows follow gensim's key_to_index numbering (starting at 0).
# Ids from a different vocabulary, e.g. a Keras Tokenizer whose word_index
# starts at 1, will not select the matching row; look vectors up by word.
print(embedding_matrix.shape)

('film', 1)
(74066, 128)


In [87]:
from sklearn.model_selection import train_test_split

# Features are the cleaned review strings; targets are the 0/1 sentiment labels.
X = np.array(clean_reviews)
y = np.array(training_data.sentiment)

# Hold out 20% of the reviews for validation. random_state pins the shuffle so
# a fresh "Restart & Run All" reproduces the same split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [88]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Learn the word -> integer-id vocabulary from all cleaned training reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

# Every review is encoded as exactly this many ids (zero-padded at the end).
sequence_length = 100

X_train_padded, X_valid_padded = (
    pad_sequences(
        tokenizer.texts_to_sequences(split),
        maxlen=sequence_length,
        padding='post',
    )
    for split in (X_train, X_valid)
)

In [89]:
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, GlobalMaxPool1D, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Bidirectional, SimpleRNN, GRU
from keras import utils
import tensorflow as tf
from keras.optimizers import Adam

In [96]:
# Build the pretrained embedding weights in the id space of `tokenizer`, the
# object that produces the model's integer inputs. Its ids differ from
# gensim's key_to_index positions, so each vector is looked up by word; words
# Word2Vec never saw, and padding id 0, keep an all-zero row.
keras_vocab_size = len(tokenizer.word_index) + 1  # +1 because id 0 is padding
aligned_embeddings = np.zeros((keras_vocab_size, feature_size))
for word, token_id in tokenizer.word_index.items():
    if word in word_vec.wv.key_to_index:
        aligned_embeddings[token_id] = word_vec.wv[word]

model = Sequential()

# Embedding initialised from the aligned Word2Vec vectors.
model.add(Embedding(keras_vocab_size, feature_size, weights=[aligned_embeddings], input_length=sequence_length))
# Bidirectional LSTM returning one output per time step ...
model.add(Bidirectional(LSTM(30, activation='relu', return_sequences=True)))
# ... reduced to a single vector by taking the max over time.
model.add(GlobalMaxPool1D())
model.add(Dense(30, activation='relu'))
model.add(Dropout(0.1))
# Single sigmoid unit: probability that the review is positive (label 1).
model.add(Dense(1, activation='sigmoid'))

model.summary()

model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, 100, 128)          9480448   
_________________________________________________________________
bidirectional_23 (Bidirectio (None, 100, 60)           38160     
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 60)                0         
_________________________________________________________________
dense_49 (Dense)             (None, 30)                1830      
_________________________________________________________________
dropout_27 (Dropout)         (None, 30)                0         
_________________________________________________________________
dense_50 (Dense)             (None, 1)                 31        
Total params: 9,520,469
Trainable params: 9,520,469
Non-trainable params: 0
___________________________________________

In [97]:
# Train for 5 epochs in batches of 128, evaluating on the validation split.
model.fit(
    X_train_padded,
    y_train,
    batch_size=128,
    epochs=5,
    validation_data=(X_valid_padded, y_valid),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f801c540dd0>

In [98]:
# Load the unlabelled test reviews (tab-separated).
testing_data = pd.read_csv(
    '/Users/harikrishnanagarajan/Desktop/Kaggle/SA_using_Word2vec/testData.tsv',
    sep='\t',
)

# Preview the first rows.
testing_data.head()

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...


In [99]:
# Count missing values in each column of the test set.
test_null_counts = testing_data.isna().sum()
print(test_null_counts)

id        0
review    0
dtype: int64


In [100]:
# Run the test reviews through the same cleaning function as the training set.
cleaned_test_reviews = list(map(clean_raw_text, testing_data.review))

# Inspect the first cleaned test review.
cleaned_test_reviews[0]

'naturally film main themes mortality nostalgia loss innocence perhaps surprising rated highly older viewers younger ones however craftsmanship completeness film anyone enjoy pace steady constant characters full engaging relationships interactions natural showing need floods tears show emotion screams show fear shouting show dispute violence show anger naturally joyce short story lends film ready made structure perfect polished diamond small changes huston makes inclusion poem fit neatly truly masterpiece tact subtlety overwhelming beauty'

In [101]:
# Encode the test reviews with the already-fitted tokenizer, then pad them to
# the same fixed length used for the training inputs.
test_sequences = tokenizer.texts_to_sequences(cleaned_test_reviews)
X_test_padded = pad_sequences(test_sequences, maxlen=sequence_length, padding='post')

In [108]:
import itertools
# One sigmoid probability per test review; the model's last layer has a
# single unit, so the result has one column.
predictions = model.predict(X_test_padded)

# Flatten and threshold in one vectorised step, and store integer 0/1 labels
# like the training data's `sentiment` column (not floats such as 1.0).
results = (predictions.ravel() > 0.5).astype(int)

output = pd.DataFrame(data={"id": testing_data["id"], "sentiment": results})
output.to_csv('submission.csv', index=False)