**Advanced Machine Learning Final Project:**
**Group 2**
* Jake Machulcz
* Danielle Stealy
* Bridget Liesman
* Erich Haley

In [3]:
# Import pulled from most recent HW
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, \
  Embedding, TextVectorization, Dropout, Input, GRU

1. Load the AG's News Corpus dataset.

In [None]:
# Code provided by project description to obtain dataset
import tensorflow_datasets as tfds
train_data, test_data = tfds.load(
  'ag_news_subset',
  split = ['train', 'test'],
  batch_size = -1,
  as_supervised=True #Added this line to ensure the code worked
)

# Split the text and labels apart
train_reviews, train_labels = tfds.as_numpy(train_data)
test_reviews, test_labels = tfds.as_numpy(test_data)

In [None]:
# Obtain an overview of the dataset
print("Train reviews:", train_reviews.shape)
print("Train labels:", train_labels.shape)
print("Test reviews:", test_reviews.shape)
print("Test labels:", test_labels.shape)

Train reviews: (120000,)
Train labels: (120000,)
Test reviews: (7600,)
Test labels: (7600,)


2. Create a validation set.

In [13]:
x_train, x_valid, y_train, y_valid = \
    train_test_split(train_reviews, train_labels, test_size=0.3, stratify=train_labels, random_state=1)

print(x_train.shape, x_valid.shape, y_train.shape, y_valid.shape)

(84000,) (36000,) (84000,) (36000,)


In [14]:
x_train_tf = tf.constant(x_train, dtype=tf.string)
x_valid_tf = tf.constant(x_valid, dtype=tf.string)
x_test_tf  = tf.constant(test_reviews, dtype=tf.string)

print(x_train_tf.shape, x_valid_tf.shape, x_test_tf.shape)

(84000,) (36000,) (7600,)


In [15]:
y_train_tf = tf.constant(y_train, dtype=tf.int32)
y_valid_tf = tf.constant(y_valid, dtype=tf.int32)
y_test_tf  = tf.constant(test_labels, dtype=tf.int32)

print(y_train_tf.shape, y_valid_tf.shape, y_test_tf.shape)

(84000,) (36000,) (7600,)


3. Create an integer encoding layer (using TextVectorization) to convert the sequences to (max tokens).  
* Adapt this encoder to the training data

In [16]:
max_tokens = 20000
max_sequence_length = 200
vectorized = TextVectorization(
    max_tokens=max_tokens,  
    output_sequence_length=max_sequence_length
)
vectorized.adapt(x_train_tf)

4. Run a Simple Recurrent Neural Network (RNN).

In [25]:
embedding_dims = 32 # M

model_rnn = Sequential()
model_rnn.add(Input(shape=(), dtype='string'))
model_rnn.add(vectorized)
model_rnn.add(Embedding(max_tokens + 1, embedding_dims))
model_rnn.add(SimpleRNN(16, return_sequences=False)) # K
model_rnn.add(Dense(50, activation='relu'))
model_rnn.add(Dropout(.3))
model_rnn.add(Dense(4, activation='softmax'))
model_rnn.summary()


model_rnn.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model_rnn.fit(x_train_tf, y_train_tf, epochs=20, batch_size=512, 
          validation_data=(x_valid_tf, y_valid_tf))

Epoch 1/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.5095 - loss: 1.0747 - val_accuracy: 0.6876 - val_loss: 0.7958
Epoch 2/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - accuracy: 0.7619 - loss: 0.6734 - val_accuracy: 0.7450 - val_loss: 0.6996
Epoch 3/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.8194 - loss: 0.5579 - val_accuracy: 0.7862 - val_loss: 0.6462
Epoch 4/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 33ms/step - accuracy: 0.8453 - loss: 0.4981 - val_accuracy: 0.7780 - val_loss: 0.6683
Epoch 5/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.8552 - loss: 0.4736 - val_accuracy: 0.7897 - val_loss: 0.6520
Epoch 6/20
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 34ms/step - accuracy: 0.8747 - loss: 0.4244 - val_accuracy: 0.7459 - val_loss: 0.7600
Epoch 7/20
[1m165/165

In [26]:
model_rnn.evaluate(x_test_tf, y_test_tf, verbose=0)

[0.8565855026245117, 0.7853947281837463]