## RNN LSTM Text Classification Model

### Dependencies and Libraries

In [113]:
import string                           # For removal of punctuation
from collections import Counter
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import nltk
from nltk.corpus import stopwords

### Reading the data into pandas DataFrames and viewing it

In [114]:
# Training split: the file is read without a header row, so pandas
# labels the columns 0 and 1
train_path = './raw_data/fulltrain.csv'
df = pd.read_csv(train_path, header=None)

# Test split, read the same way
test_path = './raw_data/balancedtest.csv'
test_df = pd.read_csv(test_path, header=None)

print(type(df))

# Shape is (samples, columns): column 0 = labels, column 1 = text
print('Total rows, Total Columns: ' + str(df.shape))
df.sample(5)  # A few random rows for a quick visual check

<class 'pandas.core.frame.DataFrame'>
Total rows, Total Columns: (48854, 2)


Unnamed: 0,0,1
25715,3,New ground-based laser can extend drone flight...
25535,3,Obamas General Assembly AddressStephen Lendman...
13605,1,Marine biologist and best-selling author of A ...
34137,3,5 Inventions That Herald an Outernet Revolutio...
33966,3,Brandon Turbeville: Prepping on a Budget with ...


In [115]:
# Count how many training rows carry each class label
classes = ['Satire', 'Hoax', 'Propaganda', 'Reliable News']
label_numbers = [1,2,3,4]

# Labels are 1-based, so label k is named by classes[k-1]
for label in label_numbers:
    class_name = classes[label-1]
    class_total = (df[0] == label).sum()
    print(class_name + ': ' + str(class_total))

# The same tallies straight from pandas
label_counts = df[0].value_counts()
print(label_counts)

Satire: 14047
Hoax: 6942
Propaganda: 17870
Reliable News: 9995
3    17870
1    14047
4     9995
2     6942
Name: 0, dtype: int64


### Preprocessing Functions
- Removal of punctuation in Python Strings, [link](https://datagy.io/python-remove-punctuation-from-string/#:~:text=One%20of%20the%20easiest%20ways,maketrans()%20method.)
- Possible further step: removal of common names (no reference linked yet).

In [116]:
# English stopwords from NLTK, held in a set for the membership tests
# performed during preprocessing
stopword_set = {word for word in stopwords.words('english')}

def to_lower_case(text):
    """Return a copy of `text` folded to lower case."""
    lowered = text.lower()
    return lowered

def tokenise_text(text):
    """Tokenise `text` with NLTK's `word_tokenize` and return the tokens."""
    return nltk.word_tokenize(text)

def remove_stopwords(text, stopword_set):
    """Drop every whitespace-separated token of `text` found in `stopword_set`.

    Matching is exact (case-sensitive). The surviving tokens are re-joined
    with single spaces, so runs of whitespace in the input are collapsed.

    Args:
        text: string to filter.
        stopword_set: collection of tokens to discard.

    Returns:
        The filtered text as a single space-separated string.
    """
    kept_tokens = [token for token in text.split() if token not in stopword_set]
    return ' '.join(kept_tokens)

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call, since it never changes and
# remove_punctuation is applied to every document.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


# Remove all punctuations - Affects words such as U.S.A etc
# Removal of stop words has to be done prior to punctuation removal
def remove_punctuation(text):
    """Return `text` with every `string.punctuation` character deleted.

    Characters are removed, not replaced by a space, so "U.S.A" becomes
    "USA" and "1,340" becomes "1340". Only the ASCII punctuation listed in
    `string.punctuation` is affected; other symbols are left untouched.

    A stopword pass should run before this step: contracted stopwords such
    as "don't" would presumably no longer match the stopword list once
    their apostrophe has been stripped.

    Args:
        text: string to clean.

    Returns:
        The string without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

def replace_hyphens(text):
    """Return `text` with each '-' swapped for a single space.

    Done so that hyphenated statistics and names stay separate words
    instead of being concatenated when punctuation is stripped later.
    """
    return ' '.join(text.split('-'))

def preprocess_text(text):
    """Run one raw document through the full cleaning pipeline.

    Stages, in order:
      1. replace hyphens with spaces
      2. fold to lower case
      3. remove stopwords (first pass, before punctuation is touched)
      4. tokenise and re-join the tokens with single spaces
      5. strip punctuation
      6. remove stopwords again (second pass, on the depunctuated tokens)

    Uses the module-level `stopword_set` for both stopword passes.

    Args:
        text: raw document string.

    Returns:
        The cleaned document as a single space-separated string.
    """
    cleaned = to_lower_case(replace_hyphens(text))
    cleaned = remove_stopwords(cleaned, stopword_set)
    cleaned = ' '.join(tokenise_text(cleaned))
    cleaned = remove_punctuation(cleaned)
    return remove_stopwords(cleaned, stopword_set)

In [117]:
# Sanity-check the preprocessing pipeline on a sentence containing an
# abbreviation, a currency amount and several contractions
test_string = "I was down in the U.S.A a few days ago! Spent $1,340. But i'll be real, don't do it. Isn't it?"
print('Preprocessing test: ')
preprocessed_sample = preprocess_text(test_string)
preprocessed_sample

Preprocessing test: 


'usa days ago spent 1340 real'

### Preprocess all text in the training data

In [118]:
print('Cleaning text...')
# Run every entry of the text column through the preprocessing pipeline;
# the raw text in df[1] is overwritten with the cleaned version
cleaned_text = df[1].map(preprocess_text)
df[1] = cleaned_text
print('Preprocessing done!')
df.sample(10)  # Random rows to spot-check the cleaned text

Cleaning text...
Preprocessing done!


Unnamed: 0,0,1
18881,2,obama insists existential threats facing us vi...
22099,3,artificial intelligence researchers want survi...
2845,1,brandishing shotguns semiautomatic pistols mem...
785,1,local dad kevin marshall 29 began voice concer...
16987,2,hillary clintons vp pick exposed supporting is...
1294,1,study published latest issue journal social sc...
31692,3,feds sanitize vaccine injury reporting big tim...
10410,1,discovery biblical scholars say sheds new ligh...
44761,4,mumbai aug 18 xinhua least twenty two people k...
47185,4,gov rick perry recently waded crowded room rao...


### Count the number of unique words in the training data

In [119]:
def unique_word_counter(texts):
    """Tally how often each whitespace-separated word occurs across `texts`.

    Args:
        texts: iterable of strings.

    Returns:
        A `collections.Counter` mapping each word to its number of
        occurrences; its length is the number of distinct words.
    """
    return Counter(word for text in texts for word in text.split())

In [123]:
# Build the word-frequency table over the preprocessed text column
counts = unique_word_counter(df[1])
unique_words_count = len(counts)  # number of distinct words seen
print('Number of unique words: ' + str(unique_words_count))
print('Most Common Words:')
top_words = counts.most_common(10)
top_words

Number of unique words: 252019
Most Common Words:


[('said', 95152),
 ('us', 78350),
 ('one', 64372),
 ('would', 61931),
 ('people', 58751),
 ('government', 45594),
 ('like', 44459),
 ('new', 43537),
 ('time', 43174),
 ('also', 40434)]