## RNN LSTM Text Classificaiton model

### Dependencies and Libraries

In [10]:
import os
import string                           # For removal of punctuation
from collections import Counter
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('drive/MyDrive/School Work/CS4248/News Labelling Project')

### Reading in data into pd dataframes, data viewing

In [11]:
# Read CSV file in
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'
df = pd.read_csv(train_path, header=None)

print(type(df))

# Samples, number of columns, 0 = labels, column 1 = text
print('Total rows, Total Columns: ' + str(df.shape))
df.sample(5) # Random sample values to see

<class 'pandas.core.frame.DataFrame'>
Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
26829,3,The Idea Of Resistance Against The StateJ.G. V...
33596,3,Skeletal Fluorosis: What Could Fluoride Be Doi...
12621,1,Choosing who should be entrusted to lead our n...
22500,3,Massive Fireball Roars Through Suburban Power ...
5069,1,A group of 16-year-old girls skillfully redire...


In [12]:
# Get number of labels for each task
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1,2,3,4]

for label in label_numbers:
    print(classes[label-1] + ': ' + str((df[0] == label).sum()))
print(df[0].value_counts())

Satire: 14047
Hoax: 6942
Propaganda: 17870
Reliable News: 9995
3    17870
1    14047
4     9995
2     6942
Name: 0, dtype: int64


### Reading in testing set

In [13]:
test_df = pd.read_csv(test_path, header=None)

# Samples, number of columns, 0 = labels, column 1 = text
print('Total rows, Total Columns: ' + str(test_df))
test_df.sample(5) # Random sample values to see

Total rows, Total Columns:       0                                                  1
0     1  When so many actors seem content to churn out ...
1     1   In what football insiders are calling an unex...
2     1  In a freak accident following Game 3 of the N....
3     1  North Koreas official news agency announced to...
4     1  The former Alaska Governor Sarah Palin would b...
...  ..                                                ...
2995  4  The Air Force mistakenly gave rival companies ...
2996  4  The United Nations climate chief on Friday cha...
2997  4  River Plate midfielder Diego Buonanotte has un...
2998  4  Lawmakers were on the brink Tuesday of exempti...
2999  4  The Pentagon, which is processing bids on a ne...

[3000 rows x 2 columns]


Unnamed: 0,0,1
64,1,In honor of the 15th anniversary of the theatr...
2921,4,General Electric Co. met the objectives it set...
2766,4,EU governments should stop deepwater drilling ...
2626,4,Defending champion Wolfsburg kept pace with th...
2636,4,"For the past year, I've been warning that the ..."


In [14]:
# Get number of labels for each task
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1,2,3,4]

for label in label_numbers:
    print(classes[label-1] + ': ' + str((test_df[0] == label).sum()))
print(test_df[0].value_counts())

Satire: 750
Hoax: 750
Propaganda: 750
Reliable News: 750
1    750
2    750
3    750
4    750
Name: 0, dtype: int64


### Count number of unique words in the entire dataset

In [15]:
# Count number of unique words
def unique_word_counter(texts):
    count = Counter() # Dictionary type
    # Access an entire string
    for text in texts:
        # Split each string into individual words separated by whitespace
        for word in text.split():
            count[word] += 1
    return count

In [16]:
# Run counter
counts = unique_word_counter(df[1])
unique_words_count = len(counts)
print('Number of unique words: ' + str(unique_words_count))
print('Most Common Words:')
counts.most_common(10)

Number of unique words: 677571
Most Common Words:


[('the', 1356038),
 ('to', 729997),
 ('of', 705981),
 ('and', 637368),
 ('a', 522053),
 ('in', 429077),
 ('that', 324277),
 ('is', 303340),
 ('for', 227190),
 ('on', 179343)]

### Prepare datasets, convert into numpy format for Keras Model

In [17]:
X_train = df[1].to_numpy()
y_train = df[0].to_numpy()

X_test = test_df[1].to_numpy()
y_test = test_df[0].to_numpy()

y_train = pd.get_dummies(df[0]).values
y_test = pd.get_dummies(test_df[0]).values
print(type(y_train))
print(type(y_test))

# y_train = y_train - 1
# y_test = y_test - 1
# temp_y_train = []
# for label in y_train:
#     temp_y_train.append([label])
# y_train = np.array(temp_y_train)
# temp_y_test = []
# for label in y_test:
#     temp_y_test.append([label])
# y_test = np.array(temp_y_test)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(48854,) (3000,)
(48854, 4) (3000, 4)


### Tokenise words into numbers
- Each word will be assigned a specific number, according to how many unique words we have
- Inspired from this [Youtube Video](https://www.youtube.com/watch?v=kxeyoyrf2cM&ab_channel=PythonEngineer)

In [18]:
# Each string is turned into a sequence of integers
tokenizer = Tokenizer(num_words=unique_words_count)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

### Pad sequences to a common length

In [19]:
# Determine minimum number of words in a sequence 65218, 

padding_length = 500

length_count = 0
for item in X_train:
    length = len(item)
    if length > padding_length:
        length_count += 1
print('Number of texts > word length: ' + str(length_count))

Number of texts > word length: 18364


In [20]:
X_train = pad_sequences(X_train, maxlen=padding_length, padding="post", truncating="post")
X_test = pad_sequences(X_test, maxlen=padding_length, padding="post", truncating="post")

# Ensure padded shape of dimension
X_train.shape, X_test.shape

((48854, 500), (3000, 500))

In [21]:
X_train.shape[1]

500

### Train the RNN LSTM model
- We will be embedding the inputs

In [22]:
from tensorflow.python.client import device_lib
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

if tf.test.gpu_device_name():
    print('GPU found')
else:
    print("No GPU found")
print(device_lib.list_local_devices())

No GPU found
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 13402903969182203020
xla_global_id: -1
]


In [23]:
model = Sequential()
model.add(Embedding(unique_words_count, output_dim=100, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))

model_checkpoint = tf.keras.callbacks.ModelCheckpoint("RNN_LSTM.h5", save_best_only=True, monitor='val_accuracy', verbose=1)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          67757100  
                                                                 
 spatial_dropout1d (SpatialD  (None, 500, 100)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 4)                 404       
                                                                 
Total params: 67,837,904
Trainable params: 67,837,904
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit model
model.fit(X_train, y_train, epochs=10, batch_size=128, shuffle=True, validation_data=(X_test, y_test), callbacks=[model_checkpoint])

loss, accuracy = model.evaluate(X_test, y_test)
print('Loss: ' + str(loss) + '    ' + 'Accuracy: ' + str(accuracy))

Epoch 1/10
  4/382 [..............................] - ETA: 15:23 - loss: 1.3745 - accuracy: 0.3672

KeyboardInterrupt: 