## RNN LSTM Text Classificaiton model

### Dependencies and Libraries

In [1]:
import os
import string                           # For removal of punctuation
from collections import Counter
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')
# nltk.download('punkt')

# from google.colab import drive
# drive.mount('/content/drive')
# os.chdir('drive/MyDrive/School Work/CS4248/News Labelling Project')

### Reading in data into pd dataframes, data viewing

In [2]:
# Read CSV file in
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'
df = pd.read_csv(train_path, header=None)

print(type(df))

# Samples, number of columns, 0 = labels, column 1 = text
print('Total rows, Total Columns: ' + str(df.shape))
df.sample(5) # Random sample values to see

<class 'pandas.core.frame.DataFrame'>
Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
13640,1,This week Brooke Alvarez's new book 'Alone In ...
3611,1,It's always hard to decide what to wear when y...
2613,1,Speaking at its annual summit held around the ...
22059,3,Kratom Ban No Longer 9/30: DEA Says Timetable ...
10584,1,Citing its seductive warmth and utter remove f...


In [3]:
# Get number of labels for each task
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1,2,3,4]

for label in label_numbers:
    print(classes[label-1] + ': ' + str((df[0] == label).sum()))
print(df[0].value_counts())

Satire: 14047
Hoax: 6942
Propaganda: 17870
Reliable News: 9995
3    17870
1    14047
4     9995
2     6942
Name: 0, dtype: int64


### Reading in testing set

In [4]:
test_df = pd.read_csv(test_path, header=None)

# Samples, number of columns, 0 = labels, column 1 = text
print('Total rows, Total Columns: ' + str(test_df))
test_df.sample(5) # Random sample values to see

Total rows, Total Columns:       0                                                  1
0     1  When so many actors seem content to churn out ...
1     1   In what football insiders are calling an unex...
2     1  In a freak accident following Game 3 of the N....
3     1  North Koreas official news agency announced to...
4     1  The former Alaska Governor Sarah Palin would b...
...  ..                                                ...
2995  4  The Air Force mistakenly gave rival companies ...
2996  4  The United Nations climate chief on Friday cha...
2997  4  River Plate midfielder Diego Buonanotte has un...
2998  4  Lawmakers were on the brink Tuesday of exempti...
2999  4  The Pentagon, which is processing bids on a ne...

[3000 rows x 2 columns]


Unnamed: 0,0,1
1957,3,"California 'raw milk man' James Stewart, who ..."
1786,3,In the wake of revelations - thanks in every ...
1867,3,A major manufacturer of antibiotic and arseni...
2175,3,In a naked attempt to shred the Constitution'...
889,2,Easy Way to Make 200 Grand A Year This is no j...


In [5]:
# Get number of labels for each task
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1,2,3,4]

for label in label_numbers:
    print(classes[label-1] + ': ' + str((test_df[0] == label).sum()))
print(test_df[0].value_counts())

Satire: 750
Hoax: 750
Propaganda: 750
Reliable News: 750
1    750
2    750
3    750
4    750
Name: 0, dtype: int64


### Preprocessing Functions
- Removal of punctuation in Python Strings, [link](https://datagy.io/python-remove-punctuation-from-string/#:~:text=One%20of%20the%20easiest%20ways,maketrans()%20method.)
- Can look at common name removal: 

In [6]:
# Create set of stopwords for use in preprocessing
stopword_set = set(stopwords.words('english'))
# print(stopword_set)

# Fold to lower case
def to_lower_case(text):
    return text.lower()

def tokenise_text(text):
    tokens = nltk.word_tokenize(text)
    # print(tokens)
    return tokens

def remove_stopwords(text, stopword_set):
    # Split by whitespace
    split_text = text.split()
    new_tokens = []
    for token in split_text:
        if token in stopword_set:
            continue
        new_tokens.append(token)
    # Parse back into text
    return ' '.join(new_tokens)

# Remove all punctuations - Affects words such as U.S.A etc
# Removal of stop words has to be done prior to punctuation removal
def remove_punctuation(text):
    depunctuated_text = text.translate(str.maketrans('','', string.punctuation))
    return depunctuated_text

# Prevent concatenation of statistics and names
def replace_hyphens(text):
    return text.replace('-', ' ')

# Combine all processes into a single preprocess text function to call on df
def preprocess_text(text):
    dehyphenated_text = replace_hyphens(text)
    lowered_text = to_lower_case(dehyphenated_text)
    initial_stopword_pass = remove_stopwords(lowered_text, stopword_set)
    tokens = tokenise_text(initial_stopword_pass)
    tokenised_text = ' '.join(tokens)
    depunctuated_text = remove_punctuation(tokenised_text)
    second_stopword_pass = remove_stopwords(depunctuated_text, stopword_set)
    return second_stopword_pass

In [7]:
# Test text preprocessing model
test_string = "I was down in the U.S.A a few days ago! Spent $1,340. But i'll be real, don't do it. Isn't it?"
print('Preprocessing test: ')
preprocess_text(test_string)

Preprocessing test: 


'usa days ago spent 1340 real'

### Preprocess all text in the training data

In [8]:
print('Cleaning text...')
df[1] = df[1].map(preprocess_text)
print('Preprocessing done!')
df.sample(10) # Random sample values to see

Cleaning text...
Preprocessing done!


Unnamed: 0,0,1
3742,1,marveling aloud many varieties product availab...
44362,4,many consumers heard federal government car al...
32135,3,spurious terrorism indictments follow natos vi...
170,1,securities exchange commission announced tuesd...
32444,3,alan grayson tells congress bad suck house cha...
8444,1,continuing effort destroy roster ultimately ca...
14977,2,black cop cleared shooting white teen wheres o...
38009,3,truth rises coordinated strikes work ways may ...
45666,4,cols 5 6 general motor corp long titanic symbo...
28922,3,americans treated like terroristsyoutube


### Preprocess text in testing data

In [9]:
print('Cleaning text...')
test_df[1] = test_df[1].map(preprocess_text)
print('Preprocessing done!')
test_df.sample(10) # Random sample values to see

Cleaning text...
Preprocessing done!


Unnamed: 0,0,1
2048,3,effort get children drink processed milk conve...
1176,2,watch lois lerners attorney shes innocent one ...
1314,2,watch reagans time choosing 50th anniversary f...
545,1,tempers flared aftermath thursday nights repub...
2775,4,lot talk rules capitol hill lately wednesday r...
1194,2,watch ted cruz identifies four key issues mid ...
2046,3,feeding world mantra monsanto multinational co...
2701,4,ballet blood sport hardly begins describe auda...
2649,4,us government seeing hints adversaries targeti...
1089,2,federal court orders one state reinstate dead ...


### Count number of unique words in the entire dataset

In [10]:
# Count number of unique words
def unique_word_counter(texts):
    count = Counter() # Dictionary type
    # Access an entire string
    for text in texts:
        # Split each string into individual words separated by whitespace
        for word in text.split():
            count[word] += 1
    return count

In [11]:
# Run counter
counts = unique_word_counter(df[1])
unique_words_count = len(counts)
print('Number of unique words: ' + str(unique_words_count))
print('Most Common Words:')
counts.most_common(10)

Number of unique words: 252019
Most Common Words:


[('said', 95152),
 ('us', 78350),
 ('one', 64372),
 ('would', 61931),
 ('people', 58751),
 ('government', 45594),
 ('like', 44459),
 ('new', 43537),
 ('time', 43174),
 ('also', 40434)]

### Prepare datasets, convert into numpy format for Keras Model

In [12]:
X_train = df[1].to_numpy()
y_train = df[0].to_numpy()

X_test = test_df[1].to_numpy()
y_test = test_df[0].to_numpy()

y_train = pd.get_dummies(df[0]).values
y_test = pd.get_dummies(test_df[0]).values
print(type(y_train))
print(type(y_test))

# y_train = y_train - 1
# y_test = y_test - 1
# temp_y_train = []
# for label in y_train:
#     temp_y_train.append([label])
# y_train = np.array(temp_y_train)
# temp_y_test = []
# for label in y_test:
#     temp_y_test.append([label])
# y_test = np.array(temp_y_test)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(48854,) (3000,)
(48854, 4) (3000, 4)


### Tokenise words into numbers
- Each word will be assigned a specific number, according to how many unique words we have
- Inspired from this [Youtube Video](https://www.youtube.com/watch?v=kxeyoyrf2cM&ab_channel=PythonEngineer)

In [13]:
# Each string is turned into a sequence of integers
tokenizer = Tokenizer(num_words=unique_words_count)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

### Pad sequences to a common length

In [14]:
# Determine minimum number of words in a sequence 65218, 

padding_length = 1000

length_count = 0
for item in X_train:
    length = len(item)
    if length > padding_length:
        length_count += 1
print('Number of texts > word length: ' + str(length_count))

Number of texts > word length: 2122


In [15]:
X_train = pad_sequences(X_train, maxlen=padding_length, padding="post", truncating="post")
X_test = pad_sequences(X_test, maxlen=padding_length, padding="post", truncating="post")

# Ensure padded shape of dimension
X_train.shape, X_test.shape

((48854, 1000), (3000, 1000))

In [16]:
X_train.shape[1]

1000

### Train the RNN LSTM model
- We will be embedding the inputs

In [17]:
from tensorflow.python.client import device_lib
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

if tf.test.gpu_device_name():
    print('GPU found')
else:
    print("No GPU found")
print(device_lib.list_local_devices())

No GPU found
[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9236316220281706515
xla_global_id: -1
]


In [18]:
model = Sequential()
model.add(Embedding(unique_words_count, output_dim=100, input_length=X_train.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))

model_checkpoint = tf.keras.callbacks.ModelCheckpoint("best_model.h5", save_best_only=True, monitor='val_accuracy', verbose=1)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         25201900  
                                                                 
 spatial_dropout1d (SpatialD  (None, 1000, 100)        0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 4)                 404       
                                                                 
Total params: 25,282,704
Trainable params: 25,282,704
Non-trainable params: 0
_________________________________________________________________


In [19]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit model
model.fit(X_train, y_train, epochs=10, batch_size=128, shuffle=True, validation_data=(X_test, y_test), callbacks=[model_checkpoint])

loss, accuracy = model.evaluate(X_test, y_test)
print('Loss: ' + str(loss) + '    ' + 'Accuracy: ' + str(accuracy))

Epoch 1/10
  8/382 [..............................] - ETA: 47:03 - loss: 1.3613 - accuracy: 0.3252

KeyboardInterrupt: 

### Test Out Model

In [None]:
new_test = pd.read_csv(test_path, header=None)
sample = new_test.sample(30)

sample

In [None]:
print('Cleaning text...')
sample[1] = sample[1].map(preprocess_text)
print('Preprocessing done!')

sample_X = sample[1].to_numpy()
sample_y = sample[0].to_numpy()
sample_y = sample_y - 1

classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
actual_labels = []
for label in sample_y:
    actual_labels.append(classes[label])
print(actual_labels)

In [None]:
sample_X = tokenizer.texts_to_sequences(sample_X)
sample_X = pad_sequences(sample_X, maxlen=padding_length, padding="post", truncating="post")

In [None]:
# Predictions
predictions = []
for test_instance in sample_X:
    prediction = model.predict(test_instance)
    predictions.append(classes[np.argmax(prediction)])

print(predictions)