<a href="https://colab.research.google.com/github/durg3sh10/Language_Modelling_using_RNN/blob/main/Language_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Language Modeling

**Instructor**: Pavlos Protopapas<br />


<hr style="height:2pt">

**Imports**

In [None]:
import requests
import re
import os
import zipfile
import collections
import numpy as np
import pandas as pd
import urllib.request
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline
from IPython.core.display import HTML


import tensorflow as tf
from tensorflow import keras
from tensorflow.python.keras import backend as K
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input, Sequential, Model
from tensorflow.keras.layers import Embedding, Bidirectional,LSTM,Dense, TimeDistributed, SimpleRNN
from collections import defaultdict

**Verify Setup**

In [None]:
# Environment sanity check: report library versions, execution mode and the
# devices TensorFlow can see.
# Reference: https://www.tensorflow.org/guide/eager
# TensorFlow's eager execution is an imperative programming environment that
# evaluates operations immediately, without building graphs.

# Uncomment exactly one of these to force eager execution off or on.
#tf.compat.v1.disable_eager_execution()
#tf.compat.v1.enable_eager_execution()

print("tensorflow version", tf.__version__)
print("keras version", tf.keras.__version__)
print("Eager Execution Enabled:", tf.executing_eagerly())

# Get the number of replicas.
# NOTE(review): the strategy is only used here to report the replica count;
# the model cell in this section is not built inside `strategy.scope()`.
strategy = tf.distribute.MirroredStrategy()
print("Number of replicas:", strategy.num_replicas_in_sync)

# Devices visible to this process, then the logical GPU devices.
devices = tf.config.experimental.get_visible_devices()
print("Devices:", devices)
print(tf.config.experimental.list_logical_devices('GPU'))

# Physical devices: GPUs only, then everything.
print("GPU Available: ", tf.config.list_physical_devices('GPU'))
print("All Physical Devices", tf.config.list_physical_devices())

# Better performance with the tf.data API
# Reference: https://www.tensorflow.org/guide/data_performance
# NOTE(review): AUTOTUNE is defined here but not referenced by the cells in this section.
AUTOTUNE = tf.data.experimental.AUTOTUNE

tensorflow version 2.8.2
keras version 2.8.0
Eager Execution Enabled: True
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)
Number of replicas: 1
Devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[LogicalDevice(name='/device:GPU:0', device_type='GPU')]
GPU Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
All Physical Devices [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


___
___


## Language Modelling using RNNs
<br />    



<div class="alert alert-block alert-danger" style="color:black;background-color:#E7F4FA">
    
### **PREPROCESS THE DATASET**
   

Read in the dataset `imdb.csv`. Create a new dataframe by splitting each review into individual sentences. The sentences can be delimited by different characters, such as a period or a question mark (eroteme). Name this column `text` in the new dataframe.
</div>

In [None]:
# Read the reviews CSV straight from a Google Drive download link.
# The file is decoded as latin1; the preview below shows the columns
# `Unnamed: 0`, `text` and `polarity`.
file_path = "https://drive.google.com/uc?id=1QDSIaV4iERVgc3b0xkW0u7EuTyQ8vncm&export=download"
data = pd.read_csv(file_path, encoding='latin1')
# Show the first rows as a quick sanity check.
data.head()

Unnamed: 0,text,polarity
0,"first think another Disney movie, might good, ...",1
1,"Put aside Dr. House repeat missed, Desperate H...",0
2,"big fan Stephen King's work, film made even gr...",1
3,watched horrid thing TV. Needless say one movi...,0
4,truly enjoyed film. acting terrific plot. Jeff...,1


In [None]:
# Split every review into sentences on '.' or '?', then give each sentence its
# own row (the review's original index is repeated for all of its sentences).
# The pattern is a raw string with a plain character class: inside [...] neither
# '.' nor '?' needs escaping, and a '|' there would be matched as a literal pipe
# rather than acting as alternation.
df = pd.DataFrame(data["text"].str.split(r'[.?]'))
df = df.explode('text')
print("Data frame shape:", df.shape)
df.head()

Data frame shape: (150360, 1)


Unnamed: 0,text
0,"first think another Disney movie, might good, ..."
0,"watch it, can't help enjoy it"
0,ages love movie
0,first saw movie 10 8 years later still love i...
0,Christopher Lloyd hilarious perfect part


<div class="alert alert-block alert-danger" style="color:black;background-color:#E7F4FA">

Define a function `clean_data` that takes the new dataframe as input and removes all HTML tags and non-alphabetic characters from the dataframe. Additionally, convert all characters to lower case. Remove all sentences where the number of words is less than 10 or greater than 30. Finally, add the start token `<s>` and the end token `</s>` to every sentence (row) in the dataframe. Return the processed dataframe.
    
</div>

In [None]:
# Your code here
def clean_data(df, min_words=10, max_words=30):
  """Clean sentence text and wrap each kept sentence in `<s>` ... `</s>` markers.

  Steps, applied to the `text` column:
    1. drop missing rows;
    2. replace every HTML tag (anything of the form `<...>`) with a space;
    3. replace every character that is not a letter, digit, whitespace or
       apostrophe with a space (digits and apostrophes are deliberately kept so
       that numbers and contractions such as "can't" survive);
    4. lower-case and split on whitespace;
    5. keep only sentences with `min_words` <= word count <= `max_words`;
    6. re-join the words with single spaces and add the start/end markers.

  Words are counted with a whitespace split, so the runs of spaces left behind
  by steps 2-3 are not counted as words.

  Args:
    df: DataFrame with a `text` column of sentence strings. It is not modified.
    min_words: smallest word count to keep (inclusive).
    max_words: largest word count to keep (inclusive).

  Returns:
    A new DataFrame with a single `text` column and a fresh 0..n-1 index.
  """
  sentences = df["text"].dropna().astype(str)
  # Strip any HTML tag, not only the doubled `<br /><br />` line break.
  sentences = sentences.str.replace(r"<[^>]+>", " ", regex=True)
  sentences = sentences.str.replace(r"[^a-zA-Z0-9\s']", " ", regex=True)
  tokens = sentences.str.lower().str.split()
  in_range = tokens.str.len().between(min_words, max_words)
  wrapped = "<s> " + tokens[in_range].str.join(" ") + " </s>"
  return wrapped.reset_index(drop=True).to_frame(name="text")

# Clean the sentence-level frame and preview the result.
# `df` is rebound to the cleaned frame, so re-running this cell on its own
# would feed already-cleaned text back through `clean_data`.
df = clean_data(df)
df.head()

Unnamed: 0,text
0,<s> first think another disney movie might go...
1,<s> first saw movie 10 8 years later still lo...
2,<s> can't help enjoy movie give 10 10 </s>
3,<s> house repeat missed desperate housewives...
4,<s> never thought i'd say this want 15 minut...


<div class="alert alert-block alert-danger" style="color:black;background-color:#E7F4FA">
    
### **TOKENIZE THE DATASET**


Instantiate a Tokenizer for the dataset using `tensorflow.keras.preprocessing.text.Tokenizer` with a vocabulary size of 5000.

</div>

In [None]:
# Tokenizer limited to a 5000-word vocabulary.
# `filters=''` replaces the tokenizer's default filter string with an empty one;
# presumably this is what lets the `<s>` / `</s>` markers survive tokenization
# (they do appear in the decoded sentences shown further down) - confirm
# against the Keras Tokenizer documentation.
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer  = Tokenizer(num_words = 5000, filters='')

<div class="alert alert-block alert-danger" style="color:black;background-color:#E7F4FA">
Fit the tokenizer on the dataset and get the sequence representation of each sentence.
    
</div>

In [None]:
# Learn the vocabulary from the cleaned sentences.
tokenizer.fit_on_texts(df["text"])
# Word -> integer id mapping learned by the tokenizer (inverted later for decoding).
word = tokenizer.word_index
# Encode every sentence as a list of integer token ids.
sentences = tokenizer.texts_to_sequences(df["text"])

In [None]:
# Decode the first five encoded sentences back to text as a round-trip check.
# Some words from the cleaned frame are absent here (e.g. "housewives" in the
# fourth sentence), i.e. they were dropped by the encoding step.
tokenizer.sequences_to_texts(sentences)[:5]

["<s> first think another disney movie might good it's kids movie </s>",
 '<s> first saw movie 10 8 years later still love it danny glover superb could play part better </s>',
 "<s> can't help enjoy movie give 10 10 </s>",
 '<s> house repeat missed desperate new watch one </s>',
 "<s> never thought i'd say this want 15 minutes fame back </s>"]

In [None]:
# Integer-encoded form of the same five sentences (1 = `<s>`, 2 = `</s>` in the output below).
sentences[:5]

[[1, 21, 32, 72, 844, 4, 120, 8, 7, 294, 4, 2],
 [1,
  21,
  146,
  4,
  173,
  968,
  62,
  178,
  50,
  43,
  14,
  1549,
  3301,
  944,
  28,
  198,
  84,
  57,
  2],
 [1, 109, 241, 306, 4, 125, 173, 173, 2],
 [1, 213, 3302, 1061, 1629, 65, 40, 5, 2],
 [1, 42, 113, 432, 61, 55, 90, 1094, 139, 1798, 56, 2]]

<div class="alert alert-block alert-danger" style="color:black;background-color:#E7F4FA">
    
### **MODELLING THE DATA**
    
**2.3.1** - The first step is to split the dataset into the predictors ($X$) and the response ($Y$). The predictors for each observation (sentence) are all tokens in that sentence _except_ the **last** token. The response for a given sentence is all tokens in that sentence _except_ the **first**. Using `tf.keras.preprocessing.sequence.pad_sequences` post-pad each sequence in $X$ and $Y$ to a length of 30.
    
```
Example:
if token for <s> = 1 and </s> = 2
sentence_i = [1, 48, 2498, 22, 16, 4, 4, 1554, 149, 14, 22, 2]
x_i = [1,  48,   2498, 22, 16, 4, 4,    1554, 149, 14, 22, 0, ..., 0]
y_i = [48, 2498, 22,   16, 4,  4, 1554, 149,  14,  22, 2,  0, ..., 0]
```

</div>    

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Shift each sentence by one token to form (input, target) pairs:
# x drops the last token, y drops the first, so y[t] is the token following x[t].
x = [i[:-1]for i in sentences]
y = [i[1:]for i in sentences]

# Post-pad with zeros to a fixed length of 30.
# `pad_sequences` truncates from the FRONT by default, which would cut the `<s>`
# start marker off any input longer than 30 tokens. Truncating at the end keeps
# the sentence start and matches the post-padding side; x and y remain aligned
# because both are cut on the same side.
X = pad_sequences(x, padding="post", truncating="post", maxlen= 30)
y = pad_sequences(y, padding="post", truncating="post", maxlen= 30)

In [None]:
# Hold out 10% of the sentences for testing.
# A fixed `random_state` pins the shuffle, so the split - and every metric and
# example sentence inspected below - is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, random_state=42)
print("shape of X_train is:",X_train.shape)
print("shape of X_test is:",X_test.shape)
print("shape of y_train is:",y_train.shape)
print("shape of y_test is:",y_test.shape)

shape of X_train is: (55057, 30)
shape of X_test is: (6118, 30)
shape of y_train is: (55057, 30)
shape of y_test is: (6118, 30)


In [None]:
# from tensorflow.keras.utils import to_categorical
# y_test = to_categorical(y_test)
# y_test.shape

<div class="alert alert-block alert-danger" style="color:black;background-color:#E7F4FA">
    
Define a simple RNN model and an LSTM model, each with an embedding layer that has an embedding dimension of 300. The output of each model is a dense layer with the size of the vocabulary and a softmax activation. Using the functional API here may make it easier to reuse parts of the network.
    
</div>

In [None]:
# Hyper-parameters. `hidden_size` is used below as the embedding dimension;
# `vocab_size` sizes both the embedding table and the softmax output.
hidden_size = 300
vocab_size = 5000

# Reset Keras' global state so layer names start fresh on every re-run.
tf.keras.backend.clear_session()

# Input: one padded sequence of token ids per sample.
sentence_input1 = Input(shape=X_train.shape[1:], name='sentence_input')

# Embedding lookup. `mask_zero=True` marks token id 0 (the padding value) as
# masked for the layers that follow.
word_embedding = Embedding(
    input_dim=vocab_size+1,
    output_dim=hidden_size,
    mask_zero=True,
    name='word_embedding',
)(sentence_input1)

# Two stacked SimpleRNN layers of 100 units; both emit one vector per time step.
RNN1 = SimpleRNN(100, return_sequences=True)(word_embedding)
RNN2 = SimpleRNN(100, return_sequences=True)(RNN1)

# Per-time-step softmax over the vocabulary.
output = Dense(vocab_size, activation='softmax')(RNN2)

# Tie the input and output together as the language model.
rnn_model = Model(inputs=sentence_input1, outputs=output)

# Targets are integer ids, hence the sparse cross-entropy loss.
rnn_model.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Layer-by-layer overview.
rnn_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sentence_input (InputLayer)  [(None, 30)]             0         
                                                                 
 word_embedding (Embedding)  (None, 30, 300)           1500300   
                                                                 
 simple_rnn (SimpleRNN)      (None, 30, 100)           40100     
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 30, 100)           20100     
                                                                 
 dense (Dense)               (None, 30, 5000)          505000    
                                                                 
Total params: 2,065,500
Trainable params: 2,065,500
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 300 epochs in batches of 512, with 20% of the training data held
# out for validation. The returned history object is kept for the plots below.
rnn_history = rnn_model.fit(X_train, y_train, epochs=300, batch_size=512, validation_split=0.2)

In [None]:
def _plot_history_metric(model_history, metric, ylabel, title):
  """Draw the train (and, when recorded, validation) curve of one metric on a new figure."""
  history = model_history.history
  # A dedicated figure per call: no global rcParams mutation, and two plots
  # requested from the same cell no longer end up overlaid on one axes.
  fig, ax = plt.subplots(figsize=(10, 8))
  ax.plot(history[metric], label='Train ' + metric, color='#FF9A98')
  val_key = 'val_' + metric
  # Validation curves only exist when the model was fit with validation data.
  if val_key in history:
    ax.plot(history[val_key], label='Validation ' + metric, color='#75B594')
  ax.set_title(title)
  ax.set_xlabel('Epochs')
  ax.set_ylabel(ylabel)
  ax.legend()


def get_accuracy_plots(model_history, title="Language Model Accuracy"):
  """Plot train and validation accuracy per epoch.

  Args:
    model_history: object whose `.history` dict holds an `accuracy` list and,
      optionally, a `val_accuracy` list (as returned by `model.fit`).
    title: figure title.

  Returns:
    None. The figure is rendered by the notebook's matplotlib backend.
  """
  _plot_history_metric(model_history, 'accuracy', 'Accuracy', title)


def get_loss_plots(model_history, title="Language Model Loss"):
  """Plot train and validation loss per epoch.

  Args:
    model_history: object whose `.history` dict holds a `loss` list and,
      optionally, a `val_loss` list (as returned by `model.fit`).
    title: figure title.

  Returns:
    None. The figure is rendered by the notebook's matplotlib backend.
  """
  _plot_history_metric(model_history, 'loss', 'Loss', title)

In [None]:
# Persist the trained model to a `.h5` file.
# NOTE(review): `/content/` is an absolute path (this notebook carries a Colab
# badge); adjust it when running elsewhere.
rnn_model.save("/content/simplernn.h5")

In [None]:
# Reload the saved model from disk; `rnn_model` is rebound to the loaded copy.
# (`keras` is already imported in the imports cell; the import is repeated here.)
from tensorflow import keras
rnn_model = keras.models.load_model('/content/simplernn.h5')

In [None]:
# Train vs. validation loss per epoch.
# `rnn_history` only exists if the fit cell ran in this kernel session; it is
# not restored by loading the saved model.
get_loss_plots(rnn_history)

In [None]:
# Evaluate on the held-out test split. Given the compiled loss and the single
# `accuracy` metric, `score` is expected to be [loss, accuracy] - confirm.
score = rnn_model.evaluate(X_test, y_test)

In [None]:
# Predicted next-token distributions for every position of every test sentence.
# Per the model summary the output is (samples, 30, 5000).
y_pred = rnn_model.predict(X_test)
y_pred.shape

In [None]:
# Invert the tokenizer's word -> id mapping so predicted ids can be decoded.
ids = {index: token for token, index in word.items()}
# Id 0 is the value the sequences are padded with, not an unknown-word id
# (no out-of-vocabulary token is configured on the tokenizer), so label it as padding.
ids[0] = "<PAD>"

In [None]:
# y_pred1 = []
# for i in range(y_pred.shape[0]):
#   y_pred[i].shape
#   y_p = np.argmax(y_pred[i],axis=1)
#   print(y_p.shape)

In [None]:
# Most probable token id at each position of test sentence 1
# (argmax over the vocabulary axis of its per-position predictions).
idxx = np.argmax(y_pred[1],axis=1)
idxx

In [None]:
# Decode the predicted ids for test sentence 1 into words.
ses = [ids[i] for i in idxx]
print(ses)

In [None]:
# Ground-truth target ids for the same test sentence, for comparison with `idxx`.
y_test[1]

In [None]:
# Decode the ground-truth target ids into words to compare against the prediction.
ses1 = [ids[i] for i in y_test[1].tolist()]
print(ses1)

In [None]:
# Shape of the test targets: (number of test sentences, padded length).
y_test.shape