## RNN LSTM Text Classification model

### Dependencies and Libraries

In [None]:
import os
import string                           # For removal of punctuation
from collections import Counter
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
# nltk.download('stopwords')
# nltk.download('punkt')

# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('drive/MyDrive/School Work/CS4248/News Labelling Project')

### Reading the data into pandas DataFrames and viewing it

In [None]:
# Locations of the raw training and test CSV files
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'

# The file is read with header=None, so columns are addressed by position:
# column 0 = label, column 1 = text
df = pd.read_csv(train_path, header=None)

print(type(df))

# Report the (rows, columns) shape of the training frame
print(f'Total rows, Total Columns: {df.shape}')
df.sample(5)  # Random rows for a quick visual check

In [None]:
# Count how many training rows carry each label (labels are stored in column 0)
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1, 2, 3, 4]

# Label k is paired with the k-th class name
for label, class_name in zip(label_numbers, classes):
    row_total = (df[0] == label).sum()
    print(f'{class_name}: {row_total}')
print(df[0].value_counts())

### Reading in testing set

In [None]:
# Load the test split; header=None means columns are addressed by position
# (column 0 = label, column 1 = text)
test_df = pd.read_csv(test_path, header=None)

# Report the (rows, columns) shape. Use .shape here: str(test_df) would dump
# the whole frame instead of its dimensions.
print('Total rows, Total Columns: ' + str(test_df.shape))
test_df.sample(5)  # Random rows for a quick visual check

In [None]:
# Count how many test rows carry each label (labels are stored in column 0)
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1, 2, 3, 4]

# Label k is paired with the k-th class name
for label, class_name in zip(label_numbers, classes):
    row_total = (test_df[0] == label).sum()
    print(f'{class_name}: {row_total}')
print(test_df[0].value_counts())

### Count number of unique words in the entire dataset

In [None]:
# Tally word occurrences across a collection of texts
def unique_word_counter(texts):
    """Return a Counter mapping each whitespace-separated word to its total count.

    Every string in ``texts`` is split on whitespace; no lower-casing or
    punctuation stripping is applied. The number of unique words is the
    length of the returned Counter.
    """
    word_totals = Counter()
    for document in texts:
        # Counter.update adds one per occurrence of each token
        word_totals.update(document.split())
    return word_totals

In [None]:
# Tally words over the training text (column 1) and report the vocabulary size
counts = unique_word_counter(df[1])
unique_words_count = len(counts)  # number of distinct whitespace-split words
print(f'Number of unique words: {unique_words_count}')
print('Most Common Words:')
counts.most_common(10)

### Prepare datasets, convert into numpy format for Keras Model

In [None]:
# Column 1 holds the text; convert it to NumPy arrays of strings
X_train = df[1].to_numpy()
X_test = test_df[1].to_numpy()

# One-hot encode the labels in column 0 (one output column per distinct label).
# The dtype is set explicitly because the default dtype of get_dummies depends
# on the installed pandas version.
# NOTE(review): train and test are encoded independently, so this assumes both
# splits contain the same set of label values - confirm.
y_train = pd.get_dummies(df[0], dtype=np.float32).to_numpy()
y_test = pd.get_dummies(test_df[0], dtype=np.float32).to_numpy()
print(type(y_train))
print(type(y_test))

# Sanity-check the shapes of inputs and one-hot targets
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

### Tokenise words into numbers
- Each word is assigned its own integer index; the vocabulary size is capped at the unique-word count computed above
- Inspired by this [YouTube video](https://www.youtube.com/watch?v=kxeyoyrf2cM&ab_channel=PythonEngineer)

In [None]:
# Build the word -> integer index from the training text only;
# num_words is set to the unique-word count computed earlier
tokenizer = Tokenizer(num_words=unique_words_count)
tokenizer.fit_on_texts(X_train)

# Replace each text in both splits with its sequence of integer word indices
X_train, X_test = [
    tokenizer.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_test)
]

### Pad sequences to a common length

In [None]:
# Fixed sequence length that every text will be padded or truncated to
# NOTE(review): the earlier comment here mentioned the figure 65218 without
# saying what it measures - confirm before relying on it.
padding_length = 500

# Count the training sequences longer than padding_length
length_count = sum(1 for sequence in X_train if len(sequence) > padding_length)
print(f'Number of texts > word length: {length_count}')

In [None]:
# Same settings for both splits: pad and truncate at the end of each sequence
pad_options = dict(maxlen=padding_length, padding="post", truncating="post")

X_train = pad_sequences(X_train, **pad_options)
X_test = pad_sequences(X_test, **pad_options)

# Display the padded array shapes
X_train.shape, X_test.shape

In [None]:
# Display the second dimension of X_train (presumably equal to padding_length
# once the padding cell has run - confirm)
X_train.shape[1]

### Train the RNN LSTM model
- We will be embedding the inputs

In [None]:
# from tensorflow.python.client import device_lib
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# if tf.test.gpu_device_name():
#     print('GPU found')
# else:
#     print("No GPU found")
# print(device_lib.list_local_devices())

In [None]:
# Layer stack: embedding -> spatial dropout -> LSTM -> 4-unit softmax output
model = Sequential([
    Embedding(unique_words_count, output_dim=100, input_length=X_train.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(4, activation='softmax'),
])

# Checkpoint to "RNN_LSTM.h5", keeping only the best model by val_accuracy
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "RNN_LSTM.h5",
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)
model.summary()

In [None]:
# One-hot targets, so use categorical cross-entropy; track accuracy
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data here, so the
# checkpoint callback selects on the same data that is reported below -
# consider a separate validation split.
model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=128,
    shuffle=True,
    validation_data=(X_test, y_test),
    callbacks=[model_checkpoint],
)

# Evaluate the in-memory model as it stands after the last epoch
# (the checkpoint file is not reloaded in this cell)
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}    Accuracy: {accuracy}')