# Final Project: Fake News Detection

By Felix Daubner - Hochschule der Medien

Module 'Supervised and Unsupervised Learning' - Prof. Dr.-Ing. Johannes Maucher

## Model Training

In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from keras.models import Model
from keras.layers import Embedding, Flatten, Dense, LSTM, Conv1D, Flatten, MaxPooling1D, Dropout, Bidirectional, Input, Concatenate
from sklearn.metrics import classification_report
import pickle
from gensim.models import KeyedVectors
import altair as alt

NUM_WORDS=3000
MAX_SEQUENCE_LEN = 57
NUM_CAT = 20

In [96]:
def prepareFeatures(X):
    X_token = np.array(X["token"].apply(np.asarray))
    X_token = np.array([arr for arr in X_token])

    X_enc = np.array(X.drop(["token"], axis=1).apply(np.array))

    return X_token, X_enc

def prepareTarget(y):
    return np.array(y)

def visualizeHistory(history):

    l, p, v_l, v_p = history.history.keys()

    data = pd.DataFrame({"epoch": history.epoch,
            "loss": history.history[l],
            "val_loss": history.history[v_l],
            "precision": history.history[p],
            "val_precision": history.history[v_p]})
    
    loss_min = min(data["loss"].min(), data["val_loss"].min())
    loss_max = max(data["loss"].max(), data["val_loss"].max())

    precision_min = min(data["precision"].min(), data["val_precision"].min())
    precision_max = max(data["precision"].max(), data["val_precision"].max())

    data_melted = data.melt('epoch', value_vars=['loss', 'val_loss', 'precision', 'val_precision'], var_name='type', value_name='value')
    
    data_loss = data_melted[data_melted["type"].isin(["loss", "val_loss"])]
    loss = alt.Chart(data_loss).mark_line().encode(
        x = "epoch",
        y = alt.Y("value", scale = alt.Scale(domain=[loss_min, loss_max])),
        color = alt.Color("type", legend=alt.Legend(orient="right"))
    ).properties(
        title = "Training and Validation Loss over epochs"
    )

    data_precision = data_melted[data_melted["type"].isin(["precision", "val_precision"])]
    precision = alt.Chart(data_precision).mark_line().encode(
        x = "epoch",
        y = alt.Y("value", scale = alt.Scale(domain=[precision_min, precision_max])),
        color = alt.Color("type", legend=alt.Legend(orient="right"))
    ).properties(
        title = "Training and Validation Precision over epochs"
    )

    return alt.hconcat(loss, precision).resolve_scale(color="independent")


def performanceReport(model, X_train, y_train, X_val, y_val):
    y_pred_train = (model.predict(X_train) > 0.5).astype(int)
    y_pred_val = (model.predict(X_val) > 0.5).astype(int)

    print("\nClassifcation Report of Performance on Training data")
    print(classification_report(y_train, y_pred_train))
    
    print("\n")
    print("* "*10)

    print("\nClassifcation Report of Performance on Validation data")
    print(classification_report(y_val, y_pred_val))

This section contains the model training and feature selection. Different types of models should be trained and then compared to find out which model fits the challenge, to determine whether a political statement was fake-news or true, best. There are three types of models to be compared: MLP, CNN and LSTM. Those models should also vary in terms of hyperparameters like layers, neurons, optimization and else. The best model is evaluated and then optimized in the next section.

### Prepare data for training and validation

In [3]:
data = pd.read_json("data/processed.json", orient="records", lines=True)

In [4]:
data.columns

Index(['statement', 'channel_Instagram', 'channel_Other', 'channel_TV',
       'channel_TikTok', 'channel_X', 'channel_ad', 'channel_article',
       'channel_blog', 'channel_campaign', 'channel_debate',
       'channel_interview', 'channel_lecture', 'channel_mail',
       'channel_podcast', 'channel_presentation', 'channel_press',
       'channel_social media', 'channel_speech', 'channel_talk',
       'channel_video', 'truth', 'token', 'statement_stop', 'token_stop'],
      dtype='object')

Before starting defining the different models, the data is prepared for the training process. The neural network to be trained only takes numpy arrays as input. Thus, the data currently saved as a pandas DataFrame is converted in to a numpy array. In this conversion process, only "token", the encoded channel and issue columns and "truth" are kept meaning column 'statement' is dropped as it can not be used for model training.

After splitting the data into features and target, the features still have to preprared for training by splitting the encoded categorical data from the tokenized and padded statements. The statement data has to be taken care of using an Embedding Layer while a Dense layer is sufficient to handle the encoded categorical data.

In [5]:
X = data.drop(["statement", "statement_stop", "token_stop", "truth"], axis=1)
y = data["truth"]

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.7, random_state=42)

In [7]:
X_train_token, X_train_enc = prepareFeatures(X_train)
X_val_token, X_val_enc = prepareFeatures(X_val)
y_train = prepareTarget(y_train)
y_val = prepareTarget(y_val)

### Prepare data for testing

In [8]:
test = pd.read_json("data/LIAR_processed.json", orient="records", lines=True)

In [9]:
X_test = test.drop(["statement", "statement_stop", "token_stop", "truth"], axis=1)
y_test = test["truth"]

In [10]:
X_test_token, X_test_enc = prepareFeatures(X_test)
y_test = prepareTarget(y_test)

When looking at fake-news detection, it is decided which of accuracy, precision or recall should be optimized. Most often, accuracy is not a good metric as it doesn't include the cost of mis-predicting. That's why either precision or recall should be used.
The worst case at fake-news is when a fake-news is not identified as fake-news. Whereas the other way, a true statement being classified as fake-news does not harm in the same way. Translating this into the terms of this project means a false positive ("a statement which is 'fake' (0) gets classified as 'true' (1)") is worse than a false negative ("a statement which is 'true' (1) gets classified as 'false' (0)"). The metrics focusing on optimizing the false positives is precision.

In the following, four different types models are trained and evaluated. Based on those evaluations, the best model is chosen and optimized until pre-defined metrics reach their peak. The evaluation for the best model are done using all available features of the data. In the following section [Optimiziation](07_evaluation-optimization.ipynb) the most useful features and hyperparameters of the model are chosen until the best model is reached.

### Prepare infrastructure

In [11]:
word2vec = KeyedVectors.load_word2vec_format("wiki-news-300d-1M.vec")

In [12]:
with open("tokenizer/tokenizer.pickle", "rb") as handle:
    tokenizer = pickle.load(handle)

In [13]:
embedding_dim = 300  
word_index = tokenizer.word_index 
num_words = min(len(word_index) + 1, NUM_WORDS)  

embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i < num_words:
        if word in word2vec.key_to_index:
            embedding_vector = word2vec[word]
            embedding_matrix[i] = embedding_vector

In [14]:
text_input = Input(shape=(MAX_SEQUENCE_LEN,), name="text_input")
categorical_input = Input(shape=(NUM_CAT,), name="categorical_input")

In [15]:
emb = Embedding(NUM_WORDS, embedding_dim, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LEN, trainable=False)(text_input)

In [16]:
cat = Dense(32, activation="relu")(categorical_input)

### Feedforward Neural Nerwork

In [17]:
ff_flatten_text = Flatten()(emb)

ff_combined = Concatenate()([ff_flatten_text, cat])
ff_dense1 = Dense(128, activation="relu")(ff_combined)
ff_drop = Dropout(0.3)(ff_dense1)
ff_dense2 = Dense(64, activation="relu")(ff_drop)
ff_output = Dense(1, activation="sigmoid")(ff_dense2)

In [18]:
ff = Model(inputs=[categorical_input, text_input], outputs=ff_output)
ff.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None, 57)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 57, 300)      900000      ['text_input[0][0]']             
                                                                                                  
 categorical_input (InputLayer)  [(None, 20)]        0           []                               
                                                                                                  
 flatten (Flatten)              (None, 17100)        0           ['embedding[0][0]']              
                                                                                              

In [19]:
ff.compile(optimizer="sgd", loss="binary_crossentropy", metrics=[keras.metrics.Precision()])

In [85]:
ff_hist = ff.fit([X_train_enc, X_train_token], y_train, epochs=20, batch_size=128, validation_data=([X_val_enc, X_val_token], y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [86]:
visualizeHistory(ff_hist)

In [98]:
performanceReport(ff, [X_train_enc, X_train_token], y_train, [X_val_enc, X_val_token], y_val)


Classifcation Report of Performance on Training data
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      7000
           1       0.89      0.89      0.89      7109

    accuracy                           0.89     14109
   macro avg       0.89      0.89      0.89     14109
weighted avg       0.89      0.89      0.89     14109



* * * * * * * * * * 

Classifcation Report of Performance on Validation data
              precision    recall  f1-score   support

           0       0.82      0.80      0.81      3078
           1       0.80      0.82      0.81      2969

    accuracy                           0.81      6047
   macro avg       0.81      0.81      0.81      6047
weighted avg       0.81      0.81      0.81      6047



### LSTM

In [21]:
lstm_ = LSTM(64)(emb)

In [22]:
lstm_combined = Concatenate()([lstm_, cat])

In [23]:
lstm_dense1 = Dense(64, activation='relu')(lstm_combined)
lstm_drop1 = Dropout(0.2)(lstm_dense1)
lstm_dense2 = Dense(32, activation='relu')(lstm_drop1)
lstm_drop2 = Dropout(0.4)(lstm_dense2)
lstm_output = Dense(1, activation='sigmoid')(lstm_drop2)

In [24]:
lstm = Model(inputs=[categorical_input, text_input], outputs=lstm_output)
lstm.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None, 57)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 57, 300)      900000      ['text_input[0][0]']             
                                                                                                  
 categorical_input (InputLayer)  [(None, 20)]        0           []                               
                                                                                                  
 lstm (LSTM)                    (None, 64)           93440       ['embedding[0][0]']              
                                                                                            

In [25]:
lstm.compile(optimizer="sgd", loss="binary_crossentropy", metrics=[keras.metrics.Precision()])

In [26]:
lstm_hist = lstm.fit([X_train_enc, X_train_token], y_train, batch_size=128, epochs=20, validation_data=([X_val_enc, X_val_token], y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [102]:
visualizeHistory(lstm_hist)

In [99]:
performanceReport(lstm, [X_train_enc, X_train_token], y_train, [X_val_enc, X_val_token], y_val)


Classifcation Report of Performance on Training data
              precision    recall  f1-score   support

           0       0.70      0.57      0.63      7000
           1       0.64      0.76      0.70      7109

    accuracy                           0.67     14109
   macro avg       0.67      0.66      0.66     14109
weighted avg       0.67      0.67      0.66     14109



* * * * * * * * * * 

Classifcation Report of Performance on Validation data
              precision    recall  f1-score   support

           0       0.70      0.57      0.63      3078
           1       0.63      0.74      0.68      2969

    accuracy                           0.66      6047
   macro avg       0.66      0.66      0.66      6047
weighted avg       0.66      0.66      0.66      6047



### Bi-directional LSTM

In [27]:
blstm_ = Bidirectional(LSTM(128))(emb)

In [28]:
blstm_combined = Concatenate()([blstm_, cat])

In [29]:
blstm_dense1 = Dense(64, activation='relu')(blstm_combined)
blstm_drop1 = Dropout(0.2)(blstm_dense1)
blstm_output = Dense(1, activation='sigmoid')(blstm_drop1)

In [30]:
blstm = Model(inputs=[categorical_input, text_input], outputs=blstm_output)
blstm.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None, 57)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 57, 300)      900000      ['text_input[0][0]']             
                                                                                                  
 categorical_input (InputLayer)  [(None, 20)]        0           []                               
                                                                                                  
 bidirectional (Bidirectional)  (None, 256)          439296      ['embedding[0][0]']              
                                                                                            

In [31]:
blstm.compile(optimizer="sgd", loss="binary_crossentropy", metrics=[keras.metrics.Precision()])

In [32]:
blstm_hist = blstm.fit([X_train_enc, X_train_token], y_train, batch_size=128, epochs=20, validation_data=([X_val_enc, X_val_token], y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [83]:
visualizeHistory(blstm_hist)

In [100]:
performanceReport(blstm, [X_train_enc, X_train_token], y_train, [X_val_enc, X_val_token], y_val)


Classifcation Report of Performance on Training data
              precision    recall  f1-score   support

           0       0.70      0.62      0.66      7000
           1       0.66      0.74      0.70      7109

    accuracy                           0.68     14109
   macro avg       0.68      0.68      0.68     14109
weighted avg       0.68      0.68      0.68     14109



* * * * * * * * * * 

Classifcation Report of Performance on Validation data
              precision    recall  f1-score   support

           0       0.71      0.62      0.66      3078
           1       0.65      0.74      0.69      2969

    accuracy                           0.68      6047
   macro avg       0.68      0.68      0.68      6047
weighted avg       0.68      0.68      0.68      6047



### Convolutional Neural Network

In [33]:
cnn_ = Conv1D(filters=128, kernel_size=5, activation='relu')(emb)
cnn_maxpool = MaxPooling1D(pool_size=5)(cnn_)

In [34]:
cnn_flatten_text = Flatten()(cnn_)

cnn_combined = Concatenate()([cnn_flatten_text, cat])
cnn_flatten = Flatten()(cnn_maxpool)
cnn_dense1 = Dense(128, activation="relu")(cnn_flatten)
cnn_drop = Dropout(0.3)(cnn_dense1)
cnn_output = Dense(1, activation="sigmoid")(cnn_drop)

In [35]:
cnn = Model(inputs=[categorical_input, text_input], outputs=cnn_output)
cnn.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text_input (InputLayer)        [(None, 57)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 57, 300)      900000      ['text_input[0][0]']             
                                                                                                  
 conv1d (Conv1D)                (None, 53, 128)      192128      ['embedding[0][0]']              
                                                                                                  
 max_pooling1d (MaxPooling1D)   (None, 10, 128)      0           ['conv1d[0][0]']                 
                                                                                            

In [36]:
cnn.compile(optimizer="sgd", loss="binary_crossentropy", metrics=[keras.metrics.Precision()])

In [37]:
cnn_hist = cnn.fit([X_train_enc, X_train_token], y_train, batch_size=128, epochs=20, validation_data=([X_val_enc, X_val_token], y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [84]:
visualizeHistory(cnn_hist)

In [101]:
performanceReport(cnn, [X_train_enc, X_train_token], y_train, [X_val_enc, X_val_token], y_val)


Classifcation Report of Performance on Training data
              precision    recall  f1-score   support

           0       0.62      0.59      0.60      7000
           1       0.61      0.64      0.63      7109

    accuracy                           0.62     14109
   macro avg       0.62      0.62      0.62     14109
weighted avg       0.62      0.62      0.62     14109



* * * * * * * * * * 

Classifcation Report of Performance on Validation data
              precision    recall  f1-score   support

           0       0.62      0.57      0.60      3078
           1       0.59      0.64      0.62      2969

    accuracy                           0.61      6047
   macro avg       0.61      0.61      0.61      6047
weighted avg       0.61      0.61      0.61      6047



### Evaluation 