In [1]:
import nltk
import re
import tensorflow

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Fetch the NLTK resources used further down: the stop-word corpus read by
# stopwords.words() and the Punkt tokenizer data used by nltk.word_tokenize().
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Load the review corpus; later cells read its MovieReview, ReviewType and
# Genre columns.
# NOTE(review): absolute path (looks like a Colab runtime location) — adjust
# when running elsewhere.
class_corpus = pd.read_csv('/content/ClassCorpus_V2.csv')

In [6]:
# English stop words held in a set so the membership test in
# remove_stop_words is a hash lookup.
stop_words = set(stopwords.words('english'))

In [7]:
def remove_punctuation(text):
    """Replace every character that is not an ASCII letter with a space.

    The argument is passed through ``str()`` first, so non-string values
    are stringified instead of raising.
    """
    as_string = str(text)
    return re.sub('[^a-zA-Z]', ' ', as_string)

def lower_case(text):
    """Return ``text`` converted to lower case via ``str.lower``."""
    lowered = text.lower()
    return lowered

def remove_tags(text):
    """Swap each entity-delimited tag in ``text`` for a fixed placeholder.

    A "tag" here is the shortest run that starts with ``&lt;`` (optionally
    followed by ``/``) and ends with ``&gt;`` on the same line.

    NOTE(review): both the pattern and the placeholder use the HTML entities
    rather than literal angle brackets, so raw ``<tag>`` markup is left
    untouched — confirm this is intended.
    """
    tag_pattern = "&lt;/?.*?&gt;"
    placeholder = " &lt;&gt; "
    return re.sub(tag_pattern, placeholder, text)

def remove_special_chars_and_digits(text):
    """Collapse each run of digits and/or non-word characters into one space.

    Underscores count as word characters and are therefore kept.
    """
    digit_or_nonword_run = "(\\d|\\W)+"
    return re.sub(digit_or_nonword_run, " ", text)

def remove_stop_words(text):
    """Tokenise ``text`` and drop every token found in ``stop_words``.

    Tokenisation is done by ``nltk.word_tokenize``; the surviving tokens are
    joined back together with single spaces. The comparison is exact, so the
    caller is responsible for any case normalisation.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def clean_doc(text: str) -> str:
    """Normalise one raw review into a space-separated string of tokens.

    Stages, in order: stringify, tag replacement, non-letter stripping,
    lower-casing, digit/non-word collapsing, stop-word removal.

    Tag replacement runs first on purpose: ``remove_punctuation`` blanks out
    every non-letter, including the delimiter characters the tag pattern
    needs, so running it earlier leaves ``remove_tags`` nothing to match and
    tag names leak into the vocabulary.

    NOTE(review): the placeholder inserted by ``remove_tags`` contains the
    letters ``lt``/``gt``, which survive the non-letter stripping — confirm
    the placeholder is what is wanted.

    Args:
        text: Raw review. Non-string values are stringified via ``str()``.

    Returns:
        The cleaned text.
    """
    # str() up front keeps the previous tolerance for non-string cells, which
    # used to be provided by remove_punctuation being the first stage.
    text = remove_tags(str(text))
    text = remove_punctuation(text)
    text = lower_case(text)
    text = remove_special_chars_and_digits(text)
    text = remove_stop_words(text)

    return text

# Experiments for Sentiment Analysis

In [8]:
# Run every raw review through clean_doc and store the result in a new column.
class_corpus['cleaned_review'] = class_corpus['MovieReview'].apply(clean_doc)

In [9]:
# Model input: the cleaned review text.
X = class_corpus['cleaned_review']

In [10]:
# Target kept as a one-column DataFrame, the 2-D shape OneHotEncoder expects.
y = class_corpus.loc[:, ['ReviewType']]

In [11]:
# Width of the softmax output layer in build_RNN_model; has to match the
# number of one-hot columns produced for ReviewType.
num_classes = 2

In [12]:
# Encoder that turns the label column into one indicator column per class,
# the target format categorical_crossentropy is trained against below.
model_ohe = OneHotEncoder()

In [13]:
# Encode straight from the source column instead of from `y` itself, so the
# cell is idempotent: re-running it no longer one-hot encodes an array that
# has already been one-hot encoded. fit_transform returns a sparse matrix,
# hence the .toarray().
y = model_ohe.fit_transform(class_corpus[['ReviewType']]).toarray()

In [14]:
# Sanity check: one row per review, one column per class.
y.shape

(190, 2)

In [15]:
# Hold out 20% of the reviews. The fixed random_state makes the split (and so
# every number reported below) repeatable across kernel restarts; stratify=y
# keeps the class proportions the same in both parts, which matters with so
# few rows.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

In [16]:
max_words = 5_000  # vocabulary cap handed to Tokenizer and Embedding
max_len = 5_000    # length every sequence is padded/truncated to

In [17]:
# Word-level tokenizer; num_words limits the indices it emits to the most
# frequent words.
tok = Tokenizer(num_words=max_words)

[`Tokenizer` reference](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer)

In [18]:
# Build the vocabulary from the training split only.
tok.fit_on_texts(X_train)

In [19]:
# Map each training review to its list of integer word indices.
sequences = tok.texts_to_sequences(X_train)

In [20]:
# Pad or truncate every sequence to exactly max_len entries so they stack
# into one 2-D array (padding value 0, which the Embedding layer masks).
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

[`pad_sequences` reference](https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/sequence/pad_sequences)

In [21]:
# Sanity check: (number of training reviews, max_len).
sequences_matrix.shape

(152, 5000)

In [22]:
def build_RNN_model(vocab_size=None, seq_len=None, n_classes=None):
    """Build an uncompiled stacked bidirectional-LSTM text classifier.

    Layers: Embedding(64, zero-masked) -> BiLSTM(64, returns sequences)
    -> BiLSTM(32) -> Dense(64, relu) -> Dense(softmax).

    Args:
        vocab_size: Embedding input dimension. Defaults to the notebook-level
            ``max_words``.
        seq_len: Length of each padded input sequence. Defaults to the
            notebook-level ``max_len``.
        n_classes: Width of the softmax output layer. Defaults to the
            notebook-level ``num_classes``.

    Returns:
        A Keras ``Model``; compiling and fitting are left to the caller.
    """
    # Resolve the fall-backs at call time so a bare build_RNN_model() keeps
    # picking up whatever the notebook globals currently hold.
    vocab_size = max_words if vocab_size is None else vocab_size
    seq_len = max_len if seq_len is None else seq_len
    n_classes = num_classes if n_classes is None else n_classes

    inputs = Input(name='inputs', shape=(seq_len,))
    # The sequence length is already fixed by `inputs`, so the deprecated
    # `input_length` argument is not passed. mask_zero=True makes the LSTMs
    # skip the zero padding.
    layer = Embedding(vocab_size, 64, mask_zero=True)(inputs)
    layer = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3))(layer)
    layer = Bidirectional(LSTM(32, dropout=0.3))(layer)
    layer = Dense(64, name='FC1', activation='relu')(layer)
    layer = Dense(n_classes, name='out_layer', activation='softmax')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [23]:
# Fresh, untrained network for the sentiment task (num_classes is 2 here).
model = build_RNN_model()

In [24]:
# Print layer output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 5000)]            0         
                                                                 
 embedding (Embedding)       (None, 5000, 64)          320000    
                                                                 
 bidirectional (Bidirectiona  (None, 5000, 128)        66048     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 FC1 (Dense)                 (None, 64)                4160      
                                                                 
 out_layer (Dense)           (None, 2)                 130   

In [25]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output.
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train with a validation_split of 0.2 and stop once val_accuracy has not
# improved for 3 epochs. restore_best_weights=True rolls the model back to the
# best epoch when early stopping triggers; without it the model keeps the
# weights from the last epoch run, i.e. `patience` epochs past the best one.
history = model.fit(sequences_matrix, y_train,
                    batch_size=16,
                    epochs=10,
                    validation_split=0.2,
                    callbacks=[EarlyStopping(monitor='val_accuracy',
                                             patience=3,
                                             restore_best_weights=True)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [27]:
# Per-epoch training and validation metrics recorded by fit().
history.history

{'accuracy': [0.5289255976676941, 0.7272727489471436, 1.0, 0.93388432264328],
 'loss': [0.6943893432617188,
  0.6840061545372009,
  0.6348475813865662,
  0.4030504524707794],
 'val_accuracy': [0.5161290168762207,
  0.3870967626571655,
  0.4193548262119293,
  0.5161290168762207],
 'val_loss': [0.6931840777397156,
  0.6950613856315613,
  0.7069066166877747,
  0.9125917553901672]}

# Experiments for Genre Prediction

In [33]:
# Model input for the genre task: the same cleaned review text.
X = class_corpus['cleaned_review']

In [38]:
# Target kept as a one-column DataFrame, the 2-D shape OneHotEncoder expects.
y = class_corpus.loc[:, ['Genre']]

In [39]:
# Class balance check: number of reviews per genre.
y.value_counts()

Genre 
Action    50
Comedy    50
Horror    50
Sci-Fi    40
dtype: int64

In [40]:
# Width of the softmax output layer in build_RNN_model; has to match the
# number of distinct genres.
num_classes = 4

In [42]:
# Encode straight from the source column instead of from `y` itself, so the
# cell is idempotent: re-running it no longer one-hot encodes an array that
# has already been one-hot encoded. fit_transform returns a sparse matrix,
# hence the .toarray().
model_ohe = OneHotEncoder()
y = model_ohe.fit_transform(class_corpus[['Genre']]).toarray()

In [43]:
# Sanity check: one row per review, one column per genre.
y.shape

(190, 4)

In [44]:
# Hold out 20% of the reviews. The fixed random_state makes the split
# repeatable across kernel restarts; stratify=y keeps the genre proportions
# the same in both parts, which matters with so few rows per genre.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

In [45]:
max_words = 5_000  # vocabulary cap handed to Tokenizer and Embedding
max_len = 5_000    # length every sequence is padded/truncated to

In [46]:
# New tokenizer for this experiment, since the train split has changed.
tok = Tokenizer(num_words=max_words)

In [47]:
# Build the vocabulary from the training split only.
tok.fit_on_texts(X_train)

In [48]:
# Map each training review to its list of integer word indices.
sequences = tok.texts_to_sequences(X_train)

In [49]:
# Pad or truncate every sequence to exactly max_len entries so they stack
# into one 2-D array (padding value 0, which the Embedding layer masks).
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [50]:
# Sanity check: (number of training reviews, max_len).
sequences_matrix.shape

(152, 5000)

In [51]:
def build_RNN_model(vocab_size=None, seq_len=None, n_classes=None):
    """Build an uncompiled stacked bidirectional-LSTM text classifier.

    NOTE(review): this re-defines the function already defined in the
    sentiment experiment with the same layers; one parameterised definition
    would be enough for both experiments.

    Layers: Embedding(64, zero-masked) -> BiLSTM(64, returns sequences)
    -> BiLSTM(32) -> Dense(64, relu) -> Dense(softmax).

    Args:
        vocab_size: Embedding input dimension. Defaults to the notebook-level
            ``max_words``.
        seq_len: Length of each padded input sequence. Defaults to the
            notebook-level ``max_len``.
        n_classes: Width of the softmax output layer. Defaults to the
            notebook-level ``num_classes``.

    Returns:
        A Keras ``Model``; compiling and fitting are left to the caller.
    """
    # Resolve the fall-backs at call time so a bare build_RNN_model() keeps
    # picking up whatever the notebook globals currently hold.
    vocab_size = max_words if vocab_size is None else vocab_size
    seq_len = max_len if seq_len is None else seq_len
    n_classes = num_classes if n_classes is None else n_classes

    inputs = Input(name='inputs', shape=(seq_len,))
    # The sequence length is already fixed by `inputs`, so the deprecated
    # `input_length` argument is not passed. mask_zero=True makes the LSTMs
    # skip the zero padding.
    layer = Embedding(vocab_size, 64, mask_zero=True)(inputs)
    layer = Bidirectional(LSTM(64, return_sequences=True, dropout=0.3))(layer)
    layer = Bidirectional(LSTM(32, dropout=0.3))(layer)
    layer = Dense(64, name='FC1', activation='relu')(layer)
    layer = Dense(n_classes, activation='softmax', name='out_layer')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [52]:
# Fresh, untrained network for the genre task (num_classes is 4 here).
model = build_RNN_model()

In [53]:
# Print layer output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 5000)]            0         
                                                                 
 embedding_1 (Embedding)     (None, 5000, 64)          320000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 5000, 128)        66048     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 FC1 (Dense)                 (None, 64)                4160      
                                                                 
 out_layer (Dense)           (None, 4)                 260 

In [54]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output.
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train with a validation_split of 0.2 and stop once val_accuracy has not
# improved for 3 epochs. restore_best_weights=True rolls the model back to the
# best epoch when early stopping triggers; without it the model keeps the
# weights from the last epoch run, i.e. `patience` epochs past the best one.
history = model.fit(sequences_matrix, y_train,
                    batch_size=16,
                    epochs=10,
                    validation_split=0.2,
                    callbacks=[EarlyStopping(monitor='val_accuracy',
                                             patience=3,
                                             restore_best_weights=True)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
