# RNN to Classify Reuters Topics

In this project, I train a recurrent neural network to classify Reuters newswires into 46 topics.

The dataset contains 11,228 newswires from Reuters, labeled with 46 topics. Each wire is encoded as a sequence of word indexes.

In [None]:
%tensorflow_version 2.8.0
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf
tf.autograph.set_verbosity(0)

from __future__ import print_function

!pip install numpy
import numpy as np
import keras
keras.__version__

import random
seed = 32
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `2.8.0`. This will be interpreted as: `2.x`.


TensorFlow is already loaded. Please restart the runtime to change versions.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Loading the Reuters Dataset


In [None]:
# Keras building blocks; several of these names are used by later cells.
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer

# Number of words to consider as features (passed as `num_words` below).
max_words = 10000

print('Loading data...')
# load_data returns a (train, test) pair, each of which is an (inputs, labels) pair.
train_part, test_part = reuters.load_data(test_split=0.2, num_words=max_words)
x_train, y_train = train_part
x_test, y_test = test_part

# Mapping from word to its integer index in the Reuters vocabulary.
word_index = reuters.get_word_index(path="reuters_word_index.json")

Loading data...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json


In [None]:
# Bring every train and test sequence to the same length (`maxlen` entries).
from keras import preprocessing

maxlen = 200
pad_to_maxlen = preprocessing.sequence.pad_sequences

x_train = pad_to_maxlen(x_train, maxlen=maxlen)
x_test = pad_to_maxlen(x_test, maxlen=maxlen)

# Report the resulting array shapes for both splits.
for caption, padded in (('input_train shape:', x_train), ('input_test shape:', x_test)):
    print(caption, padded.shape)

input_train shape: (8982, 200)
input_test shape: (2246, 200)


In [None]:
# Convert the integer topic labels into one-hot encodings.
# `to_categorical` is imported from its public location (`keras.utils`)
# rather than the private `keras.utils.np_utils` module.
from keras.utils import to_categorical

# Without `num_classes`, the width of each encoding is inferred from the largest
# label present in that split, so train and test could end up with different
# widths. Pin it to the 46 topics the models' output layer expects.
num_classes = 46

one_hot_train_labels = to_categorical(y_train, num_classes=num_classes)
one_hot_test_labels = to_categorical(y_test, num_classes=num_classes)

## RNN and LSTM


### Single layer RNN model with 128 output units for 10 epochs

In [None]:
# Train a single-layer SimpleRNN classifier on the padded newswires.
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN

# `embedding_size` was used here without being defined in any cell, so the
# notebook failed with a NameError on a fresh kernel. It is defined explicitly
# now. NOTE(review): the value used for the recorded outputs is not captured in
# the notebook; 128 is a chosen default - adjust if the original is known.
embedding_size = 128

model = Sequential()
# Vocabulary size comes from `max_words`, the same cap used when loading the data.
model.add(Embedding(max_words, embedding_size))
model.add(SimpleRNN(128))
# One softmax output per Reuters topic.
model.add(Dense(46, activation='softmax'))

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc']
)
# Hold out 20% of the training data for validation during fitting.
history = model.fit(
    x_train,
    one_hot_train_labels,
    epochs=10,
    batch_size=128,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Score the trained RNN on the held-out test split (loss followed by the 'acc' metric).
model.evaluate(x=x_test, y=one_hot_test_labels)



[2.6717309951782227, 0.4065004587173462]

### Single layer LSTM model with 128 output units for 20 epochs

In [None]:
# Train a single-layer LSTM classifier on the padded newswires.
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout

# `embedding_size` was used here without being defined in any cell, so the
# notebook failed with a NameError on a fresh kernel. It is defined explicitly
# now. NOTE(review): the value used for the recorded outputs is not captured in
# the notebook; 128 is a chosen default - adjust if the original is known.
embedding_size = 128

model = Sequential()
# Vocabulary size comes from `max_words`, the same cap used when loading the data.
model.add(Embedding(max_words, embedding_size))
model.add(LSTM(128))
# One softmax output per Reuters topic.
model.add(Dense(46, activation='softmax'))

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc']
)
# Hold out 20% of the training data for validation during fitting.
history = model.fit(
    x_train,
    one_hot_train_labels,
    epochs=20,
    batch_size=128,
    validation_split=0.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Evaluating LSTM model on test set
# Evaluates the LSTM model from the cell above; the result shows the loss followed by the 'acc' metric.
model.evaluate(x=x_test, y=one_hot_test_labels)



[1.7637970447540283, 0.6620659232139587]

## Using a Pre-trained Word Embedding




In [None]:
# Training a LSTM model using pretrained embeddings
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip


--2022-06-07 22:02:45--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-06-07 22:02:45--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-06-07 22:02:45--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [None]:
import os

# Parse the 200-dimensional GloVe file into a {word: vector} dictionary.
# The archive is unpacked into the working directory by the download cell, so
# the path is resolved against it instead of a hard-coded absolute location.
path_to_glove_file = os.path.join(os.getcwd(), "glove.6B.200d.txt")

embeddings_index = {}
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...": split off the word, then parse
        # the remaining space-separated floats.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
# Prepare embedding matrix: row r holds the GloVe vector for the word that is
# encoded as token id r in the padded sequences.
num_tokens = 35000
embedding_dim = 200
# reuters.load_data() shifts every word index by `index_from` (default 3), so a
# word whose `word_index` value is i appears in the data as i + 3. Writing the
# vector at row i (as before) attached each vector to the wrong token.
index_from = 3
hits = 0
misses = 0
# Rows that are never filled stay all-zeros. That covers words missing from
# the GloVe index as well as the ids below `index_from`.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    row = i + index_from
    if row >= num_tokens:
        # Outside the matrix; such ids can never be looked up by the layer.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[row] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 24975 words (6004 misses)


In [None]:
from tensorflow.keras.layers import Embedding

# Embedding layer initialised from the prepared matrix and kept frozen
# (trainable=False), so fitting does not update the pre-trained vectors.
pretrained_init = keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=pretrained_init,
    trainable=False,
)


In [None]:
# Train an LSTM classifier on top of the frozen pre-trained embedding layer.
model = Sequential()
# Use the layer built from the GloVe matrix. The previous version added a fresh
# randomly initialised Embedding here, so the pre-trained vectors were never
# used and this experiment merely repeated the plain LSTM model.
model.add(embedding_layer)
model.add(LSTM(128))
# One softmax output per Reuters topic.
model.add(Dense(46, activation='softmax'))

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc']
)
# Hold out 20% of the training data for validation during fitting.
history = model.fit(
    x_train,
    one_hot_train_labels,
    epochs=20,
    batch_size=128,
    validation_split=0.2
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Score the most recently trained model on the held-out test split
# (loss followed by the 'acc' metric).
model.evaluate(x=x_test, y=one_hot_test_labels)



[1.7595865726470947, 0.6634016036987305]