<a href="https://colab.research.google.com/github/dlinnlp2023/material/blob/main/DLinNLP_Day_1_Session_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM & CNN for classification
This notebook describes how to implement LSTM and CNN models for binary text classification using TensorFlow and Keras.

## Install relevant libraries.

In [None]:
!pip install datasets scikit-learn

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/519.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-

## Load dataset
We use the IMDB dataset for sentiment classification.

In [None]:
import random
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Split configuration: fixed seed so the train/validation split is repeatable,
# and the fraction of the training data held out for validation.
RANDOM_SEED = 500
VALIDATION_SIZE = 0.2

imdb = load_dataset("imdb")

# Carve a validation set out of the official training split.
train_split, validation_split = train_test_split(
    imdb['train'],
    test_size=VALIDATION_SIZE,
    random_state=RANDOM_SEED,
)

# Separate raw review texts from their labels for each split.
train_txt, train_lbl = train_split['text'], train_split['label']
val_txt, val_lbl = validation_split['text'], validation_split['label']

# The official test split is used as-is.
test_txt, test_lbl = imdb['test']['text'], imdb['test']['label']

print(f'training set size = {len(train_txt)} | test set size = {len(test_txt)} | validation set size = {len(val_txt)}')


Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

training set size = 20000 | test set size = 25000 | validation set size = 5000


## Vectorise the dataset and build the vocabulary

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization

# Vectorisation settings
MAX_LENGTH = 300        # every review is padded/truncated to this many token ids
MAX_VOCAB_SIZE = 20000  # upper bound on the vocabulary size
BATCH_SIZE = 128        # batch size used while scanning the texts to build the vocabulary

vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB_SIZE,
    output_sequence_length=MAX_LENGTH,
)

# Learn the vocabulary from the training texts only.
text_ds = tf.data.Dataset.from_tensor_slices(train_txt).batch(BATCH_SIZE)
vectorizer.adapt(text_ds)


In [None]:
# Peek at the first five vocabulary entries; in the recorded output, indices 0
# and 1 are '' and '[UNK]', followed by ordinary words.
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'and', 'a']

In [None]:
# Vectorise one sample sentence and display the token ids of its first eight positions.
output = vectorizer([["You are welcome to the RANLP conference"]])
output.numpy()[0][:8]

array([   23,    24,  2368,     6,     2,     1, 14907,     0])

## Download embeddings

In [None]:
!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip

--2023-08-29 23:23:36--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-08-29 23:26:16 (5.16 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [None]:
!unzip -q glove.6B.zip

In [None]:
# List the working directory to check that the GloVe .txt files were extracted.
!ls

drive		   glove.6B.200d.txt  glove.6B.50d.txt	sample_data
glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip


## Create word index and embeddings index

In [None]:
# Map every vocabulary token to its integer id (its position in the vocabulary).
voc = vectorizer.get_vocabulary()
word_index = {token: idx for idx, token in enumerate(voc)}

path_to_glove_file = 'glove.6B.100d.txt'

# Parse the GloVe text file into {word: vector}. Each line holds a word followed
# by its space-separated coefficients.
embeddings_index = {}
# Open explicitly as UTF-8 rather than relying on the platform's default
# encoding, which is not UTF-8 everywhere.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Split off the word once; the remainder of the line is the vector.
        word, coefs = line.split(maxsplit=1)
        # "f" is NumPy's type code for float32.
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


## Build embeddings matrix

In [None]:
# Size of the embedding table: the vocabulary size plus two spare rows.
num_tokens = len(voc) + 2
# Must match the dimensionality of the GloVe vectors that were loaded.
embedding_dim = 100
hits = 0
misses = 0

# Row i of the matrix holds the pre-trained vector of the token whose id is i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this token: its row stays all-zeros.
        # This is expected to include the padding ('') and OOV ('[UNK]') entries.
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 18715 words (1285 misses)


## Create LSTM model

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras import layers

NUM_CLASSES = 2

# Embedding layer initialised from the pre-trained matrix and frozen
# (trainable=False), so the pre-trained vectors are not updated during training.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    name='embeddings'
)

# The tensors are named `model_input` / `model_output` so that Python's built-in
# input() is not shadowed for the rest of the notebook session. The Keras layer
# name "input" is unchanged, so the model summary is the same.
model_input = tf.keras.Input(shape=(None,), dtype="int64", name="input")
x = embedding_layer(model_input)
# return_sequences=True makes the first LSTM emit one vector per time step,
# which is what the second (stacked) LSTM consumes.
x = layers.LSTM(128, name="lstm_1", return_sequences=True)(x)
x = layers.LSTM(128, name="lstm_2")(x)
# Softmax over NUM_CLASSES outputs; pairs with sparse integer labels at training time.
model_output = layers.Dense(NUM_CLASSES, activation="softmax", name="dense_predictions")(x)
model = keras.Model(inputs=model_input, outputs=model_output, name="lstm_model")
model.summary()

Model: "lstm_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, None)]            0         
                                                                 
 embeddings (Embedding)      (None, None, 100)         2000200   
                                                                 
 lstm_1 (LSTM)               (None, None, 128)         117248    
                                                                 
 lstm_2 (LSTM)               (None, 128)               131584    
                                                                 
 dense_predictions (Dense)   (None, 2)                 258       
                                                                 
Total params: 2,249,290
Trainable params: 249,090
Non-trainable params: 2,000,200
_________________________________________________________________


## Train Model

In [None]:
def _vectorise_texts(texts):
    """Run raw strings through the fitted vectorizer and return the token ids as a NumPy array."""
    return vectorizer(np.array([[s] for s in texts])).numpy()


# Features are token-id matrices; labels are integer class ids.
x_train, y_train = _vectorise_texts(train_txt), np.array(train_lbl)
x_val, y_val = _vectorise_texts(val_txt), np.array(val_lbl)

LEARNING_RATE = 0.01
optimiser = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# Alternative optimisers to experiment with:
# optimiser = keras.optimizers.SGD(learning_rate=LEARNING_RATE)
# optimiser = keras.optimizers.RMSprop(learning_rate=LEARNING_RATE)

# Sparse categorical cross-entropy takes the integer labels directly (no one-hot encoding).
model.compile(
    optimizer=optimiser,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=3,
    validation_data=(x_val, y_val),
)

# Evaluate on the held-out test split; scores is [loss, accuracy].
x_test, y_test = _vectorise_texts(test_txt), np.array(test_lbl)
scores = model.evaluate(x_test, y_test, verbose=1)
print("Accuracy: %.2f%%" % (scores[1]*100))

Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 83.58%


## Early Stopping

In [None]:
# Stop training once val_loss has failed to improve by at least `min_delta` for
# `patience` consecutive epochs. restore_best_weights=True asks Keras to roll the
# model back to the weights of the best epoch when training is stopped early,
# instead of keeping the weights of the last (worse) epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    min_delta=0.001,
    restore_best_weights=True,
)
# Note: this continues training the same `model` object that was already fitted
# in the previous section; it does not start from freshly initialised weights.
model.fit(x_train, y_train, batch_size=64, epochs=5, validation_data=(x_val, y_val), callbacks=[callback])

## Inference

In [None]:
# Wrap the vectorizer and the trained classifier in a single model that takes raw strings.
string_input = keras.Input(shape=(1,), dtype="string")
x = vectorizer(string_input)
preds = model(x)
end_to_end_model = keras.Model(inputs=string_input, outputs=preds)

# Each output row holds one probability per class; argmax gives the predicted class id.
probabilities = end_to_end_model.predict([["I like this movie"]])
print(probabilities)
np.argmax(probabilities[0])

[[0.3184405 0.6815595]]


1

# Bi-LSTM

Change the above model to a Bi-LSTM model

In [None]:
# Your code goes here

# CNN
Change the above model to a 1D CNN model.

In [None]:
# Your code goes here