## RNN LSTM Text Classification Model

### Dependencies and Libraries

In [36]:
import string                           # For removal of punctuation
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import nltk
from nltk.corpus import stopwords

### Reading the data into pandas DataFrames and viewing it

In [37]:
# Locations of the training and test CSV files
train_path = './raw_data/fulltrain.csv'
test_path = './raw_data/balancedtest.csv'

# Load both files; header=None makes pandas read the first row as data
# and number the columns 0, 1, ...
df, test_df = (pd.read_csv(path, header=None) for path in (train_path, test_path))

print(type(df))

# Shape is (rows, columns); column 0 = labels, column 1 = text
print(f'Total rows, Total Columns: {df.shape}')
df.sample(5)  # Five random rows as a quick look at the data

<class 'pandas.core.frame.DataFrame'>
Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
39153,4,Many of the patients at the Sanjali rehabilita...
16000,2,BLM Protestors Block Truck From Driving Throug...
13972,1,"According to sources around the league, the Ne..."
43616,4,WASHINGTON With the Senate's passage of financ...
29814,3,Agenda 21 and the New World Economy: where hap...


In [38]:
# Count how many training rows carry each label.
# Labels in column 0 are 1-based, so label k maps to classes[k - 1].
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1, 2, 3, 4]

for label in label_numbers:
    print(f'{classes[label - 1]}: {(df[0] == label).sum()}')

Satire: 14047
Hoax: 6942
Propaganda: 17870
Reliable News: 9995


### Preprocessing Functions
- Removing punctuation from Python strings: see [this guide](https://datagy.io/python-remove-punctuation-from-string/#:~:text=One%20of%20the%20easiest%20ways,maketrans()%20method.)

In [41]:
# Create set of stopwords for use in preprocessing.
# The NLTK stopwords corpus is a separate download; on a fresh environment
# stopwords.words() raises LookupError, so fetch the corpus once and retry
# to keep the notebook runnable from a clean kernel.
try:
    stopword_set = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stopword_set = set(stopwords.words('english'))

# Fold to lower case
def to_lower_case(text: str) -> str:
    """Return a lower-cased copy of ``text``.

    Args:
        text: The string to fold.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

def remove_stopwords(text, stopword_set):
    """Return ``text`` with every stopword token removed.

    The text is split on whitespace. A token is dropped when, after
    stripping leading/trailing punctuation and lower-casing, it is a member
    of ``stopword_set``. Inner punctuation is left alone so contractions
    such as "don't" still match their stopword entries. Kept tokens are
    returned unchanged (original case and punctuation) and joined with
    single spaces, so runs of whitespace are collapsed.

    Args:
        text: The string to filter.
        stopword_set: Collection of lower-case stopwords to remove.

    Returns:
        The filtered string; ``''`` if the input is empty or every token
        is a stopword.
    """
    kept_tokens = [
        token
        for token in text.split()
        # Compare a normalised form, but keep the original token in the output
        if token.strip(string.punctuation).lower() not in stopword_set
    ]
    return ' '.join(kept_tokens)

# Remove all punctuations - May affect words such as U.S.A etc
# Removal of stop words has to be done prior to this

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Only the ASCII punctuation characters listed in ``string.punctuation``
    are removed; all other characters, including whitespace, are kept.
    Nothing is printed, so the function can be applied to a whole corpus
    without flooding the cell output.

    Args:
        text: The string to clean.

    Returns:
        The string without punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

{"that'll", 'is', 'were', 'who', 'not', 'more', 'yourself', 'll', 'out', 'myself', 're', 'did', 'when', 'an', 'she', 'than', 'any', 'own', 'mustn', 'into', 'above', 'between', 'then', "won't", 'these', "you'll", 'a', 'for', 'himself', 'very', 'down', 'didn', 'his', 'which', 'ours', 'that', 'needn', 'during', 'again', 'm', "didn't", 'their', 'won', 'it', 'the', 'weren', 'we', 'too', "aren't", 'shan', 'just', 'its', 'my', 'in', 'been', 'o', 'has', 'you', 'doesn', 'do', "couldn't", 'but', 'our', 'against', 't', 'as', 'other', "wouldn't", 'or', 'will', "mightn't", 'further', 'having', 'because', 'before', 'over', "mustn't", 'all', 'whom', 'now', 'doing', 'he', 'about', 'here', 'if', 'of', 'can', 'what', 'this', 'same', 'some', 'by', 'such', "you're", 'themselves', 'below', 'itself', 'those', "don't", 'mightn', 'are', "shouldn't", 'nor', 'yours', 'him', 'have', "you've", "needn't", 's', "haven't", "weren't", "it's", 'ain', 'does', 'while', "should've", 'under', 've', 'there', 'wasn', 'off',