## RNN LSTM Text Classification model

### Dependencies and Libraries

In [None]:
import os
import string                           # For removal of punctuation
from collections import Counter
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('drive/MyDrive/School Work/CS4248/News Labelling Project')

### Reading the data into pandas DataFrames and viewing it

In [None]:
# Locations of the raw CSV files (training set and balanced test set)
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'

# The file is read without a header row, so columns are addressed by
# position: column 0 = labels, column 1 = text
df = pd.read_csv(train_path, header=None)
print(type(df))

# Report the (rows, columns) shape of the training frame
print(f'Total rows, Total Columns: {df.shape}')

# Random sample of rows to eyeball the data
df.sample(5)

In [None]:
# Human-readable class names, looked up by (label - 1)
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1, 2, 3, 4]

# Per-class row counts in the training set
for label in label_numbers:
    class_name = classes[label - 1]
    row_count = (df[0] == label).sum()
    print(f'{class_name}: {row_count}')
print(df[0].value_counts())

### Reading in testing set

In [None]:
# Load the test set; it is read without a header row, so columns are
# addressed by position (column 0 = labels, column 1 = text)
test_df = pd.read_csv(test_path, header=None)

# Report the (rows, columns) shape. Concatenating str(test_df) here would
# dump the frame's contents instead of its dimensions.
print('Total rows, Total Columns: ' + str(test_df.shape))
test_df.sample(5)  # Random sample values to see

In [None]:
# Human-readable class names, looked up by (label - 1)
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1, 2, 3, 4]

# Per-class row counts in the test set
for label in label_numbers:
    class_name = classes[label - 1]
    row_count = (test_df[0] == label).sum()
    print(f'{class_name}: {row_count}')
print(test_df[0].value_counts())

### Preprocessing Functions
- Removal of punctuation in Python Strings, [link](https://datagy.io/python-remove-punctuation-from-string/#:~:text=One%20of%20the%20easiest%20ways,maketrans()%20method.)
- Can look at common name removal: 

In [None]:
# Create set of stopwords for use in preprocessing.
# stopwords.words() raises LookupError when the NLTK 'stopwords' corpus has
# not been downloaded yet, so fetch it once and retry in that case.
try:
    stopword_set = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stopword_set = set(stopwords.words('english'))

# Fold to lower case
def to_lower_case(text):
    """Return `text` with every character folded to lower case."""
    lowered = text.lower()
    return lowered

def tokenise_text(text):
    """Split `text` into tokens using NLTK's word tokenizer and return them."""
    return nltk.word_tokenize(text)

def remove_stopwords(text, stopword_set):
    """Drop every whitespace-separated token of `text` found in `stopword_set`.

    The surviving tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed as a side effect.
    """
    kept = [word for word in text.split() if word not in stopword_set]
    return ' '.join(kept)

# Remove all punctuations - Affects words such as U.S.A etc
# Removal of stop words has to be done prior to punctuation removal
def remove_punctuation(text):
    """Delete every character listed in `string.punctuation` from `text`.

    Characters are removed, not replaced by spaces, so "U.S.A" becomes "USA".
    """
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# Prevent concatenation of statistics and names
def replace_hyphens(text):
    """Return `text` with each '-' turned into a single space."""
    return ' '.join(text.split('-'))

# Combine all processes into a single preprocess text function to call on df
def preprocess_text(text):
    """Run the full cleaning pipeline on one document and return the result.

    Steps, in order: hyphens -> spaces, lower-casing, stopword removal,
    NLTK tokenisation (tokens re-joined with spaces), punctuation removal,
    and a second stopword removal on the depunctuated text.
    """
    cleaned = to_lower_case(replace_hyphens(text))
    # First pass: drop stopwords while punctuation is still attached
    cleaned = remove_stopwords(cleaned, stopword_set)
    # Tokenise, then flatten the tokens back into one space-separated string
    cleaned = ' '.join(tokenise_text(cleaned))
    cleaned = remove_punctuation(cleaned)
    # Second pass over the depunctuated text
    return remove_stopwords(cleaned, stopword_set)

In [None]:
# Smoke-test the preprocessing pipeline on a sentence containing
# abbreviations, currency and contractions
test_string = "I was down in the U.S.A a few days ago! Spent $1,340. But i'll be real, don't do it. Isn't it?"
print('Preprocessing test: ')
preprocessed_example = preprocess_text(test_string)
preprocessed_example

### Preprocess all text in the training data

In [None]:
# Clean the text column (column 1) of the training frame
print('Cleaning text...')
cleaned_train_text = df[1].map(preprocess_text)
df[1] = cleaned_train_text
print('Preprocessing done!')
df.sample(10)  # Random sample values to see

### Preprocess text in testing data

In [None]:
# Clean the text column (column 1) of the test frame
print('Cleaning text...')
cleaned_test_text = test_df[1].map(preprocess_text)
test_df[1] = cleaned_test_text
print('Preprocessing done!')
test_df.sample(10)  # Random sample values to see

### Count number of unique words in the entire dataset

In [None]:
# Count number of unique words
def unique_word_counter(texts):
    """Count how often each whitespace-separated word occurs across `texts`.

    Returns a `Counter` mapping word -> number of occurrences.
    """
    word_counts = Counter()
    for document in texts:
        word_counts.update(document.split())
    return word_counts

In [None]:
# Build the word-frequency table for the training text
counts = unique_word_counter(df[1])
# Vocabulary size = number of distinct words seen
unique_words_count = len(counts)
print(f'Number of unique words: {unique_words_count}')
print('Most Common Words:')
counts.most_common(10)

### Prepare datasets, convert into numpy format for Keras Model

In [None]:
# Features are the text column (1); targets are the label column (0).
# Each label is shifted down by one (e.g. 1..4 becomes 0..3).
X_train, y_train = df[1].to_numpy(), df[0].to_numpy() - 1
X_test, y_test = test_df[1].to_numpy(), test_df[0].to_numpy() - 1

X_train.shape, X_test.shape

### Tokenise words into numbers
- Each word will be assigned a specific number, according to how many unique words we have
- Inspired by this [Youtube Video](https://www.youtube.com/watch?v=kxeyoyrf2cM&ab_channel=PythonEngineer)

In [None]:
# Fit the word -> integer vocabulary on the training text only
# NOTE(review): the Keras docs state that only the `num_words - 1` most
# frequent words are kept — confirm that dropping the rarest word is intended
tokenizer = Tokenizer(num_words=unique_words_count)
tokenizer.fit_on_texts(X_train)

# Replace each string by its sequence of integers
X_train, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

### Pad sequences to a common length

In [None]:
# Every sequence is padded or truncated to this many tokens
padding_length = 3000

# Count how many training texts are longer than the padding length and will
# therefore be truncated. (The original note here mentioned 65218 —
# presumably the longest sequence length; TODO confirm.)
length_count = sum(1 for item in X_train if len(item) > padding_length)
# The reported threshold is derived from padding_length so the message can
# no longer disagree with the value actually used in the comparison
print('Number of texts > ' + str(padding_length) + ' word length: ' + str(length_count))

# Pad short sequences at the end and truncate long ones at the end
X_train = pad_sequences(X_train, maxlen=padding_length, padding="post", truncating="post")
X_test = pad_sequences(X_test, maxlen=padding_length, padding="post", truncating="post")

# Ensure padded shape of dimension
X_train.shape, X_test.shape

### Train the RNN LSTM model
- We will be embedding the inputs

In [None]:
# List the devices TensorFlow reports as available
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

# Embedding -> spatial dropout -> LSTM -> 4-way softmax classifier
model = keras.models.Sequential([
    Embedding(unique_words_count, 150, input_length=padding_length),
    SpatialDropout1D(0.2),
    LSTM(150, dropout=0.2, recurrent_dropout=0.2),
    Dense(4, activation='softmax'),
])

# Checkpoint to best_model.h5, keeping only the best model as judged by
# val_accuracy
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_model.h5",
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

model.summary()

In [None]:
# y_train / y_test hold integer class ids (label - 1), not one-hot vectors,
# so the sparse variant of categorical cross-entropy is the matching loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the test set doubles as validation data here, so the
# checkpoint chosen via val_accuracy is selected on the test set — consider
# holding out part of the training data instead.
model.fit(X_train, y_train, epochs=15, validation_data=(X_test, y_test), callbacks=[model_checkpoint])

# Final loss and accuracy on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print('Loss: ' + str(loss) + '    ' + 'Accuracy: ' + str(accuracy))

### Test Out Model

In [None]:
# Re-read the raw test file and draw 30 random rows to try the model on
new_test = pd.read_csv(test_path, header=None)
sample = new_test.sample(n=30)

sample

In [None]:
# Apply the same cleaning pipeline to the sampled rows
print('Cleaning text...')
sample[1] = sample[1].map(preprocess_text)
print('Preprocessing done!')

# Text becomes the model input; labels are shifted down by one so they can
# index into `classes`
sample_X = sample[1].to_numpy()
sample_y = sample[0].to_numpy() - 1

# Translate the numeric labels into class names for display
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
actual_labels = [classes[label] for label in sample_y]
print(actual_labels)

In [None]:
# Encode the sampled text with the already-fitted tokenizer, then
# pad/truncate to padding_length exactly as was done for the training data
sample_X = pad_sequences(
    tokenizer.texts_to_sequences(sample_X),
    maxlen=padding_length,
    padding="post",
    truncating="post",
)

In [None]:
# Predictions
# model.predict expects a batch (one row per text). Feeding it a single 1-D
# row at a time drops the batch dimension, so the token ids of one text are
# interpreted as separate samples. Predict the whole padded sample at once.
probabilities = model.predict(sample_X)

# One row of class scores per text; the arg-max column is the predicted class
predictions = [classes[class_index] for class_index in np.argmax(probabilities, axis=1)]

print(predictions)