In [None]:
#Danny Hong
#ECE 472 - Deep Learning
#Assignment 5

import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dropout, GlobalMaxPool1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

def get_data(train_csv_file, test_csv_file):
  """Load the train and test CSV files and add `text` and `label` columns.

  Args:
    train_csv_file: Location of the training CSV, handed to `pd.read_csv`.
    test_csv_file: Location of the test CSV, handed to `pd.read_csv`.

  Returns:
    A `(train, test)` tuple of DataFrames. Each one gains a `text` column
    (`Title`, a single space, then `Description`) and a `label` column with
    the `Class Index` values re-encoded by a `LabelEncoder` that was fitted
    on the training frame only.
  """
  train = pd.read_csv(train_csv_file)
  test = pd.read_csv(test_csv_file)
  frames = (train, test)

  # Merge headline and body into one string per row.
  for frame in frames:
    frame['text'] = frame.Title + " " + frame.Description

  # The encoder learns the class set from the training data alone and maps it
  # to consecutive integers starting at 0; the same mapping is then applied to
  # both frames.
  encoder = LabelEncoder()
  encoder.fit(train['Class Index'])
  for frame in frames:
    frame['label'] = encoder.transform(frame['Class Index'])

  return train, test

# Number of target classes; main() uses it as the width of the softmax output layer.
num_classes = 4
# Fraction of the original training rows that main() keeps for training;
# the remaining rows become the validation set.
split_fraction = 0.8

def main():
  """Train a text classifier on AG News and print its test loss and accuracy.

  Reads `train.csv` and `test.csv` from the working directory, holds out the
  trailing `1 - split_fraction` share of the training rows as a validation
  set, fits a `Tokenizer` on the remaining training text, and trains an
  Embedding -> GlobalMaxPool1D -> Dropout -> Dense(softmax) model for 5 epochs.
  """
  # AG News Dataset was downloaded from this link: https://www.kaggle.com/datasets/amananandrai/ag-news-classification-dataset
  # The train and test files were then uploaded into the notebook directory and read in.
  train, test = get_data(train_csv_file = "train.csv", test_csv_file = "test.csv")

  # Hold-out validation (a single split, not k-fold cross validation): the
  # first `split_fraction` of the rows stay in the training set and the rest
  # become the validation set. Rows are taken in file order, without shuffling.
  new_train_size = int(split_fraction * len(train))
  val = train[new_train_size:]
  train = train[:new_train_size]

  # The vocabulary is learned from the training split only, so words that only
  # occur in the validation/test text are dropped when encoding.
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(train.text)

  # Tokenizer word indices start at 1 and pad_sequences fills with 0, so the
  # embedding table needs len(word_index) + 1 rows. Sizing it with
  # len(word_index) alone leaves the largest word index outside the table.
  vocabulary_size = len(tokenizer.word_index) + 1

  val_encoded = tokenizer.texts_to_sequences(val.text)
  train_encoded = tokenizer.texts_to_sequences(train.text)
  test_encoded = tokenizer.texts_to_sequences(test.text)

  # Each split is padded to its own longest sequence; the model accepts
  # variable sequence lengths because GlobalMaxPool1D collapses the time axis.
  x_val, x_train, x_test = pad_sequences(val_encoded), pad_sequences(train_encoded), pad_sequences(test_encoded)
  y_val, y_train, y_test = val.label, train.label, test.label

  model = Sequential()

  model.add(Embedding(input_dim = vocabulary_size, output_dim = 64))
  model.add(GlobalMaxPool1D())
  model.add(Dropout(0.2))
  model.add(Dense(num_classes, activation = tf.nn.softmax))

  model.summary()

  # `decay` is intentionally not passed: 0 is the no-op default in the legacy
  # optimizer, and newer Keras optimizer classes reject the argument.
  optimizer = Adam(learning_rate = 0.001, beta_1 = 0.9, beta_2 = 0.999, epsilon = 1e-08)
  # Labels are integer class ids (not one-hot), hence the sparse loss.
  model.compile(optimizer = optimizer, loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

  model.fit(x_train, y_train, validation_data = (x_val, y_val), batch_size = 64, epochs = 5, verbose = 1)

  # evaluate() returns [loss, accuracy], matching the metrics given to compile().
  score = model.evaluate(x_test, y_test, verbose = 0)
  print('\nTest loss:', score[0])
  print('Test accuracy:', score[1])

# Run the full load / train / evaluate pipeline when this cell (or script) is executed.
main()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 64)          4079616   
                                                                 
 global_max_pooling1d_12 (Gl  (None, 64)               0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_12 (Dropout)        (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 4)                 260       
                                                                 
Total params: 4,079,876
Trainable params: 4,079,876
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Test loss: 0.25974202156066895
Test accuracy: 0.9161841869