# ================ LSTM Model Train & Test Notebook =============== #

In [None]:
# !pip install demoji
# !pip install spacy
# !pip install clean-text

import pandas as pd
import ast
import re
import spacy
import demoji

import numpy as np
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
assert tf.__version__ >= "2.0"

from sklearn.feature_extraction.text import CountVectorizer
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, TextVectorization, InputLayer, Bidirectional
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# nltk.download('stopwords')

## Preprocessing

In [None]:
# Load the caption dump, keep only the columns at positions 1-3 and discard
# incomplete rows. NOTE(review): presumably these are username / captions /
# is_fake (the names used further down) - confirm against the CSV header.
df = (
    pd.read_csv('captions_df.csv')
    .iloc[:, 1:4]
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Generate counts of the number of posts and the number of captions per row
# (a post can have no caption, in which case its entry in the list is None).

# The CSV stores each caption list as its string repr. Values that are already
# lists are passed through so that re-running this cell does not raise.
parsed_captions = df['captions'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Count BEFORE dropping the None entries: otherwise num_of_post can never
# differ from num_of_captions and caption-less posts are silently lost.
df['num_of_post'] = parsed_captions.apply(len)
df['num_of_captions'] = parsed_captions.apply(
    lambda cap_list: sum(cap is not None for cap in cap_list)
)

# Only real captions are kept for the text pipeline.
df['captions'] = parsed_captions.apply(
    lambda cap_list: [cap for cap in cap_list if cap is not None]
)

# One row per caption; a row whose caption list is empty is kept with a NaN caption.
dfe = df.explode('captions').reset_index()

In [None]:
# Visual sanity check of the exploded frame (one row per caption).
display(dfe)

In [None]:
# Vocabulary size of the captions before any cleaning; rows without a caption
# are left out of the count.
raw_captions = dfe['captions'].dropna()

tokenizer = CountVectorizer()
tokenizer.fit_transform(raw_captions)

word_index = tokenizer.vocabulary_
print(f"No. of unique words: {len(word_index)}")

### Clean Up Captions (e.g., remove emojis and newlines)

In [None]:
# Delete every run of non-ASCII characters (anything outside \x00-\x7F) from the captions.
dfe['captions'] = dfe['captions'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

In [None]:
def remove_em(text):
    """Return ``text`` with every emoji reported by ``demoji.findall`` removed."""
    # Iterating the mapping yields its keys, i.e. the emoji found in the input.
    for emoji in demoji.findall(text):
        text = text.replace(emoji, '')
    return text

# Patterns are compiled once here instead of on every clean_text() call.
# Raw strings avoid the invalid-escape warnings of the previous '\[' / '\|'.
_NEWLINES_RE = re.compile(r"(\n)+")
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

# Filled on first use so the NLTK objects are built once, not per caption.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return the cached stopword set, lemmatizer and stemmer, building them on first use."""
    if not _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stopwords'] = set(stopwords.words('english'))
        _CLEAN_TEXT_CACHE['lemmatizer'] = WordNetLemmatizer()
        _CLEAN_TEXT_CACHE['stemmer'] = PorterStemmer()
    return _CLEAN_TEXT_CACHE


def clean_text(text):
    """Normalise one caption for tokenisation.

    Steps, in order: collapse newlines to a space, remove emoji, lowercase,
    turn the separators matched by ``_REPLACE_BY_SPACE_RE`` into spaces, delete
    every character outside ``[0-9a-z #+_]``, drop English stopwords, then
    lemmatize and stem the remaining words.

    Stopwords are removed *before* lemmatizing/stemming: once a word has been
    stemmed (e.g. "this" -> "thi") it no longer matches the stopword list and
    would survive the filter.

    text: a string, or a missing value (None / NaN)

    return: the cleaned string; '' for a missing value
    """
    if pd.isnull(text):
        return ''

    resources = _clean_text_resources()
    stop_words = resources['stopwords']
    lemmatizer = resources['lemmatizer']
    stemmer = resources['stemmer']

    text = _NEWLINES_RE.sub(" ", text)
    text = remove_em(text)
    text = text.lower()
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)

    # Filter on the clean, unstemmed tokens so the stopword lookup actually hits.
    words = [word for word in text.split() if word not in stop_words]
    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words]

    return ' '.join(words)

# Work on a copy so that dfe keeps the captions as they were before clean_text.
lstm_df = dfe.copy()
cleaned_captions = lstm_df['captions'].apply(clean_text)
lstm_df['captions'] = cleaned_captions
# Caption length is measured in whitespace-separated tokens of the cleaned text.
lstm_df['captions_length'] = [len(tokens) for tokens in map(str.split, cleaned_captions)]

In [None]:
# Spot checks: each heading is printed, followed by the rows of that username.
example_accounts = (
    ("EXAMPLE OF AN EMPTY CAPTION POST BY A BOT: ", 'breely_wilkey'),
    ("EXAMPLES OF CAPTION POST BY A BOT: ", 'copy ai'),
    ("EXAMPLES OF CAPTION POST BY A REAL PERSON: ", 'ryanxgo'),
)

for heading, account in example_accounts:
    print(heading)
    display(lstm_df.loc[lstm_df['username'] == account])

print('Shape of lstm_df:', lstm_df.shape)

In [None]:
# Longest cleaned caption, in whitespace-separated tokens (compare with MAX_SEQUENCE_LEN below).
int(lstm_df['captions_length'].max())

## LSTM Model 1

In [None]:
MAX_TOKENS_NUM = 5000  # Maximum vocab size.
MAX_SEQUENCE_LEN = 250  # Sequence length to pad the outputs to.
EMBEDDING_DIMS = 50
SEED = 42  # One seed for the splits, the oversampling and the weight initialisation.

epochs = 5
batch_size = 64

# Make the cell repeatable: seed TensorFlow and draw the oversampling indices
# from a local NumPy generator instead of the unseeded global one.
tf.random.set_seed(SEED)
rng = np.random.default_rng(SEED)

# NOTE(review): the vocabulary is fitted on every caption, including rows that
# end up in the validation and test splits.
tokenizer = Tokenizer(num_words = MAX_TOKENS_NUM, oov_token = '<OOV>')
tokenizer.fit_on_texts(lstm_df['captions'])
word_index = tokenizer.word_index
print(f"No. of unique words: {len(word_index)}")

train_sequences = tokenizer.texts_to_sequences(lstm_df['captions'])
train_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LEN, padding='post', truncating='post')

# Stratified 80/20 split into train+validation / test, then 80/20 again into
# train / validation. random_state matches the Model 2 and Model 3 cells.
X_train_val, X_test, y_train_val, y_test = train_test_split(train_padded, lstm_df['is_fake'], test_size = 0.20, stratify = lstm_df['is_fake'], random_state = SEED)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size = 0.20, stratify = y_train_val, random_state = SEED)

# Oversampling (training split only): draw fake captions with replacement
# until there are as many of them as real captions.
is_fake_train = (y_train == 1).to_numpy()
fake_captions_train = X_train[is_fake_train]
real_captions_train = X_train[~is_fake_train]
fake_label_train = y_train[y_train == 1].to_numpy()
real_label_train = y_train[y_train == 0].to_numpy()

choices = rng.choice(len(fake_captions_train), size=len(real_captions_train))

res_pos_features = fake_captions_train[choices]
res_pos_labels = fake_label_train[choices]

resampled_features = np.concatenate([res_pos_features, real_captions_train], axis=0)
resampled_labels = np.concatenate([res_pos_labels, real_label_train], axis=0)

# Shuffle so the oversampled fake rows are not all at the front.
order = rng.permutation(len(resampled_labels))
resampled_features = resampled_features[order]
resampled_labels = resampled_labels[order]

# One-hot labels: column 0 = not fake, column 1 = fake. The float dtype is
# explicit because newer pandas versions return bool columns by default.
X_train = resampled_features
y_train = pd.get_dummies(resampled_labels, dtype='float32').values
y_valid = pd.get_dummies(y_valid, dtype='float32').values
y_test = pd.get_dummies(y_test, dtype='float32').values

print("==========")
print(f"Length of train set is {len(y_train)}")
print(f"Length of validation set is {len(y_valid)}")
print(f"Length of test set is {len(y_test)}")
print("==========")

model1 = Sequential([
    Embedding(MAX_TOKENS_NUM, EMBEDDING_DIMS),
    LSTM(EMBEDDING_DIMS),
    Dense(2, activation='softmax')
])

# Metric names are pinned so the history keys read by the plotting cell do not
# depend on how many metric objects were created earlier in this kernel.
# class_id=1 makes Recall the recall of the "fake" column; without it the
# metric is computed over both one-hot columns.
# NOTE(review): FalsePositives / TrueNegatives are still counted over both
# one-hot columns, so the "FPR" derived from them is not per-class.
model1.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        tf.keras.metrics.Recall(class_id=1, name='recall'),
        tf.keras.metrics.FalsePositives(name='false_positives'),
        tf.keras.metrics.TrueNegatives(name='true_negatives'),
    ],
)

# batch_size was defined but never handed to fit(), so Keras used its default.
history1 = model1.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data = (X_valid, y_valid))

In [None]:
# Print the Keras summary of Model 1.
model1.summary()

In [None]:
# Test on test set: evaluate Model 1 on X_test / y_test and return the loss and
# compiled metrics as a dict (return_dict=True).
model1.evaluate(X_test, y_test, return_dict = True)

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
model_num = 1
hist = history1.history

# Keras may append a numeric suffix to default metric names (recall_1, ...),
# depending on how many metric objects the kernel has created. Resolve the
# actual keys instead of hard-coding one suffix; validation keys are the same
# names prefixed with "val_".
recall_key = next(k for k in hist if re.fullmatch(r'recall(_\d+)?', k))
fp_key = next(k for k in hist if re.fullmatch(r'false_positives(_\d+)?', k))
tn_key = next(k for k in hist if re.fullmatch(r'true_negatives(_\d+)?', k))

ax1.plot(hist[recall_key])
ax1.plot(hist[f'val_{recall_key}'])
ax1.set_title(f"Model {model_num} Recall")
ax1.set(xlabel="Epochs", ylabel="Recall")
ax1.legend(['recall', 'val_recall'])

ax2.plot(hist['loss'])
ax2.plot(hist['val_loss'])
ax2.set_title(f"Model {model_num} Loss")
ax2.set(xlabel="Epochs", ylabel="Loss")
ax2.legend(['loss', 'val_loss'])

# FPR = FP / (FP + TN) per epoch; NaN instead of a ZeroDivisionError when an
# epoch recorded no negatives at all.
train_fpr = [fp / (fp + tn) if (fp + tn) else float('nan')
             for fp, tn in zip(hist[fp_key], hist[tn_key])]
val_fpr = [fp / (fp + tn) if (fp + tn) else float('nan')
           for fp, tn in zip(hist[f'val_{fp_key}'], hist[f'val_{tn_key}'])]
ax3.plot(train_fpr)
ax3.plot(val_fpr)
ax3.set_title(f"Model {model_num} FPR")
ax3.set(xlabel="Epochs", ylabel="FPR")
ax3.legend(['FPR', 'val_fpr'])

# plt.show() rather than fig.show(): the latter only warns on the non-GUI
# inline backend used by notebooks.
plt.show()

## LSTM Model 2

In [None]:
MAX_TOKENS_NUM = 5000  # Maximum vocab size.
MAX_SEQUENCE_LEN = 250  # Sequence length to pad the outputs to.
EMBEDDING_DIMS = 50
SEED = 42  # One seed for the splits, the oversampling and the weight initialisation.

epochs = 5
batch_size = 64

# Make the cell repeatable: seed TensorFlow and draw the oversampling indices
# from a local NumPy generator instead of the unseeded global one.
tf.random.set_seed(SEED)
rng = np.random.default_rng(SEED)

# NOTE(review): the vocabulary is fitted on every caption, including rows that
# end up in the validation and test splits.
tokenizer = Tokenizer(num_words = MAX_TOKENS_NUM, oov_token = '<OOV>')
tokenizer.fit_on_texts(lstm_df['captions'])
word_index = tokenizer.word_index
print(f"No. of unique words: {len(word_index)}")

train_sequences = tokenizer.texts_to_sequences(lstm_df['captions'])
train_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LEN, padding='post', truncating='post')

# Stratified 80/20 split into train+validation / test, then 80/20 again into
# train / validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(train_padded, lstm_df['is_fake'], test_size = 0.20, stratify = lstm_df['is_fake'], random_state = SEED)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size = 0.20, stratify = y_train_val, random_state = SEED)

# Oversampling (training split only): draw fake captions with replacement
# until there are as many of them as real captions.
is_fake_train = (y_train == 1).to_numpy()
fake_captions_train = X_train[is_fake_train]
real_captions_train = X_train[~is_fake_train]
fake_label_train = y_train[y_train == 1].to_numpy()
real_label_train = y_train[y_train == 0].to_numpy()

choices = rng.choice(len(fake_captions_train), size=len(real_captions_train))

res_pos_features = fake_captions_train[choices]
res_pos_labels = fake_label_train[choices]

resampled_features = np.concatenate([res_pos_features, real_captions_train], axis=0)
resampled_labels = np.concatenate([res_pos_labels, real_label_train], axis=0)

# Shuffle so the oversampled fake rows are not all at the front.
order = rng.permutation(len(resampled_labels))
resampled_features = resampled_features[order]
resampled_labels = resampled_labels[order]

# One-hot labels: column 0 = not fake, column 1 = fake. The float dtype is
# explicit because newer pandas versions return bool columns by default.
X_train = resampled_features
y_train = pd.get_dummies(resampled_labels, dtype='float32').values
y_valid = pd.get_dummies(y_valid, dtype='float32').values
y_test = pd.get_dummies(y_test, dtype='float32').values

model = Sequential([
    Embedding(MAX_TOKENS_NUM, EMBEDDING_DIMS),
    Bidirectional(LSTM(EMBEDDING_DIMS)),
    Dense(EMBEDDING_DIMS, activation='relu'),
    Dense(2, activation='softmax')
])

# Metric names are pinned to the keys the Model 2 plotting cell reads
# ('recall_1', ...), so they no longer depend on how many metric objects were
# created earlier in this kernel. class_id=1 makes Recall the recall of the
# "fake" column; without it the metric is computed over both one-hot columns.
# NOTE(review): FalsePositives / TrueNegatives are still counted over both
# one-hot columns, so the "FPR" derived from them is not per-class.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        tf.keras.metrics.Recall(class_id=1, name='recall_1'),
        tf.keras.metrics.FalsePositives(name='false_positives_1'),
        tf.keras.metrics.TrueNegatives(name='true_negatives_1'),
    ],
)

# batch_size was defined but never handed to fit(), so Keras used its default.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data = (X_valid, y_valid))

In [None]:
# Print the Keras summary of Model 2.
model.summary()

In [None]:
# Test on test set: evaluate Model 2 on X_test / y_test and return the loss and
# compiled metrics as a dict (return_dict=True).
model.evaluate(X_test, y_test, return_dict = True)

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
model_num = 2
hist = history.history

# Keras may append a numeric suffix to default metric names (recall_1, ...),
# depending on how many metric objects the kernel has created. Resolve the
# actual keys instead of hard-coding one suffix; validation keys are the same
# names prefixed with "val_".
recall_key = next(k for k in hist if re.fullmatch(r'recall(_\d+)?', k))
fp_key = next(k for k in hist if re.fullmatch(r'false_positives(_\d+)?', k))
tn_key = next(k for k in hist if re.fullmatch(r'true_negatives(_\d+)?', k))

ax1.plot(hist[recall_key])
ax1.plot(hist[f'val_{recall_key}'])
ax1.set_title(f"Model {model_num} Recall")
ax1.set(xlabel="Epochs", ylabel="Recall")
ax1.legend(['recall', 'val_recall'])

ax2.plot(hist['loss'])
ax2.plot(hist['val_loss'])
ax2.set_title(f"Model {model_num} Loss")
ax2.set(xlabel="Epochs", ylabel="Loss")
ax2.legend(['loss', 'val_loss'])

# FPR = FP / (FP + TN) per epoch; NaN instead of a ZeroDivisionError when an
# epoch recorded no negatives at all.
train_fpr = [fp / (fp + tn) if (fp + tn) else float('nan')
             for fp, tn in zip(hist[fp_key], hist[tn_key])]
val_fpr = [fp / (fp + tn) if (fp + tn) else float('nan')
           for fp, tn in zip(hist[f'val_{fp_key}'], hist[f'val_{tn_key}'])]
ax3.plot(train_fpr)
ax3.plot(val_fpr)
ax3.set_title(f"Model {model_num} FPR")
ax3.set(xlabel="Epochs", ylabel="FPR")
ax3.legend(['FPR', 'val_fpr'])

# plt.show() rather than fig.show(): the latter only warns on the non-GUI
# inline backend used by notebooks.
plt.show()

## LSTM Model 3

In [None]:
MAX_TOKENS_NUM = 5000  # Maximum vocab size.
MAX_SEQUENCE_LEN = 250  # Sequence length to pad the outputs to.
EMBEDDING_DIMS = 50
SEED = 42  # One seed for the splits, the oversampling and the weight initialisation.

epochs = 5
batch_size = 64

# Make the cell repeatable: seed TensorFlow and draw the oversampling indices
# from a local NumPy generator instead of the unseeded global one.
tf.random.set_seed(SEED)
rng = np.random.default_rng(SEED)

# NOTE(review): the vocabulary is fitted on every caption, including rows that
# end up in the validation and test splits.
tokenizer = Tokenizer(num_words = MAX_TOKENS_NUM, oov_token = '<OOV>')
tokenizer.fit_on_texts(lstm_df['captions'])
word_index = tokenizer.word_index
print(f"No. of unique words: {len(word_index)}")

train_sequences = tokenizer.texts_to_sequences(lstm_df['captions'])
train_padded = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LEN, padding='post', truncating='post')

# Stratified 80/20 split into train+validation / test, then 80/20 again into
# train / validation.
X_train_val, X_test, y_train_val, y_test = train_test_split(train_padded, lstm_df['is_fake'], test_size = 0.20, stratify = lstm_df['is_fake'], random_state = SEED)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_val, y_train_val, test_size = 0.20, stratify = y_train_val, random_state = SEED)

# Oversampling (training split only): draw fake captions with replacement
# until there are as many of them as real captions.
is_fake_train = (y_train == 1).to_numpy()
fake_captions_train = X_train[is_fake_train]
real_captions_train = X_train[~is_fake_train]
fake_label_train = y_train[y_train == 1].to_numpy()
real_label_train = y_train[y_train == 0].to_numpy()

choices = rng.choice(len(fake_captions_train), size=len(real_captions_train))

res_pos_features = fake_captions_train[choices]
res_pos_labels = fake_label_train[choices]

resampled_features = np.concatenate([res_pos_features, real_captions_train], axis=0)
resampled_labels = np.concatenate([res_pos_labels, real_label_train], axis=0)

# Shuffle so the oversampled fake rows are not all at the front.
order = rng.permutation(len(resampled_labels))
resampled_features = resampled_features[order]
resampled_labels = resampled_labels[order]

# One-hot labels: column 0 = not fake, column 1 = fake. The float dtype is
# explicit because newer pandas versions return bool columns by default.
X_train = resampled_features
y_train = pd.get_dummies(resampled_labels, dtype='float32').values
y_valid = pd.get_dummies(y_valid, dtype='float32').values
y_test = pd.get_dummies(y_test, dtype='float32').values

model3 = Sequential([
    Embedding(MAX_TOKENS_NUM, EMBEDDING_DIMS),
    Bidirectional(LSTM(EMBEDDING_DIMS)),
    Dense(2, activation='softmax')
])

# Metric names are pinned to the keys the Model 3 plotting cell reads
# ('recall_2', ...), so they no longer depend on how many metric objects were
# created earlier in this kernel. class_id=1 makes Recall the recall of the
# "fake" column; without it the metric is computed over both one-hot columns.
# NOTE(review): FalsePositives / TrueNegatives are still counted over both
# one-hot columns, so the "FPR" derived from them is not per-class.
model3.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=[
        tf.keras.metrics.Recall(class_id=1, name='recall_2'),
        tf.keras.metrics.FalsePositives(name='false_positives_2'),
        tf.keras.metrics.TrueNegatives(name='true_negatives_2'),
    ],
)

# batch_size was defined but never handed to fit(), so Keras used its default.
history3 = model3.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data = (X_valid, y_valid))

In [None]:
# Print the Keras summary of Model 3.
model3.summary()

In [None]:
# Test on test set: evaluate Model 3 on X_test / y_test and return the loss and
# compiled metrics as a dict (return_dict=True).
model3.evaluate(X_test, y_test, return_dict = True)

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
model_num = 3
hist = history3.history

# Keras may append a numeric suffix to default metric names (recall_2, ...),
# depending on how many metric objects the kernel has created. Resolve the
# actual keys instead of hard-coding one suffix; validation keys are the same
# names prefixed with "val_".
recall_key = next(k for k in hist if re.fullmatch(r'recall(_\d+)?', k))
fp_key = next(k for k in hist if re.fullmatch(r'false_positives(_\d+)?', k))
tn_key = next(k for k in hist if re.fullmatch(r'true_negatives(_\d+)?', k))

ax1.plot(hist[recall_key])
ax1.plot(hist[f'val_{recall_key}'])
ax1.set_title(f"Model {model_num} Recall")
ax1.set(xlabel="Epochs", ylabel="Recall")
ax1.legend(['recall', 'val_recall'])

ax2.plot(hist['loss'])
ax2.plot(hist['val_loss'])
ax2.set_title(f"Model {model_num} Loss")
ax2.set(xlabel="Epochs", ylabel="Loss")
ax2.legend(['loss', 'val_loss'])

# FPR = FP / (FP + TN) per epoch; NaN instead of a ZeroDivisionError when an
# epoch recorded no negatives at all.
train_fpr = [fp / (fp + tn) if (fp + tn) else float('nan')
             for fp, tn in zip(hist[fp_key], hist[tn_key])]
val_fpr = [fp / (fp + tn) if (fp + tn) else float('nan')
           for fp, tn in zip(hist[f'val_{fp_key}'], hist[f'val_{tn_key}'])]
ax3.plot(train_fpr)
ax3.plot(val_fpr)
ax3.set_title(f"Model {model_num} FPR")
ax3.set(xlabel="Epochs", ylabel="FPR")
ax3.legend(['FPR', 'val_fpr'])

# plt.show() rather than fig.show(): the latter only warns on the non-GUI
# inline backend used by notebooks.
plt.show()

# Extra Testing

In [None]:
# Index i of the softmax output maps to lab[i] (column 0 = not fake, column 1 = fake).
lab = ['NOT FAKE', 'FAKE']

# Run one hand-picked caption through the same cleaning and tokenisation as the training data.
cleaned_text = clean_text("Happy to meet Governor of South Australia Frances Adamson this afternoon. We reaffirmed the excellent relations between Singapore and Australia, as well as close economic and cultural ties between Singapore and South Australia. South Australia and its capital city of Adelaide have been the torchbearer for innovation and new ideas in areas such as renewable energy, food, education as well as arts and culture. I look forward to further cooperation with South Australia in these areas.")
example = tokenizer.texts_to_sequences([cleaned_text])
# Pad/truncate to the length the models were trained on; the previous
# hard-coded maxlen=30 cut the caption short and did not match training.
example_pad = pad_sequences(example, maxlen=MAX_SEQUENCE_LEN, padding='post', truncating='post')
# `model` is the Model 2 network defined above.
pred = np.argmax(model.predict(example_pad), axis = -1)

for p in pred:
    print(lab[p])

In [None]:
# Invert the tokenizer vocabulary: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_article(text):
    """Turn a sequence of token ids back into a space-separated string.

    text: iterable of integer token ids (e.g. one row of ``train_padded``)

    return: the decoded words joined by single spaces. Id 0 is skipped because
    it is the fill value used for padding, not a vocabulary entry; any other id
    missing from ``reverse_word_index`` is rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(i, '?') for i in text if i != 0)

# Compare a decoded training row with the cleaned caption it was built from.
print(decode_article(train_padded[10]))
print('---')
print(lstm_df['captions'][10])