In [17]:
# Imports
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import tensorflow_text as text
from keras.metrics import Recall
from tensorflow import keras
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import csv
from tensorflow.python.keras.metrics import Precision, Recall
from sklearn.metrics import classification_report

# Data processing

In [18]:
csv.field_size_limit(100000000)

#filepath = "dataset/news_cleaned_2018_02_13.csv"
#selected_columns = ["domain", "title", "authors", "type", "content", "url"]

#chunks = []
#for chunk in pd.read_csv(filepath, usecols=selected_columns, chunksize=100000, on_bad_lines='warn', engine='python'):
#    sample = chunk.sample(frac=0.1, random_state=42)
#    chunks.append(sample)

#df = pd.concat(chunks, ignore_index=True)
#train_dataframe, temp_dataframe = train_test_split(df, test_size=0.2, random_state=42) # 80% for training and temp for validation and testing
#validation_dataframe, test_dataframe = train_test_split(temp_dataframe, test_size=0.5, random_state=42) # splitting the temp data into 10% for validation and 10% for testing

#train_dataframe.to_csv("dataset/train_data_final.csv", chunksize=100000)
#validation_dataframe.to_csv("dataset/validation_data_final.csv", chunksize=100000)
#test_dataframe.to_csv("dataset/test_data_final.csv", chunksize=100000)

# get dataset from previously loaded
train_file_path = "dataset/train_data_final.csv"
train_data_chunks = []
for chunk in pd.read_csv(train_file_path, chunksize=100000, on_bad_lines='warn', engine='python'):
    chunk = chunk.dropna(subset=['content', 'type'])
    # adding label column which tells whether the article is reliable (=1) or not (=0)
    chunk["label"] = chunk["type"].apply(lambda x: 1 if str(x).strip().lower() == "reliable" else 0)
    train_data_chunks.append(chunk)

train_data = pd.concat(train_data_chunks, ignore_index=True)

test_file_path = "dataset/test_data_final.csv"
test_data_chunks = []
for chunk in pd.read_csv(test_file_path, chunksize=100000, on_bad_lines='warn', engine='python'):
    chunk = chunk.dropna(subset=['content', 'type'])
    # adding label column which tells whether the article is reliable (=1) or not (=0)
    chunk["label"] = chunk["type"].apply(lambda x: 1 if str(x).strip().lower() == "reliable" else 0)
    test_data_chunks.append(chunk)

test_data = pd.concat(test_data_chunks, ignore_index=True)


In [19]:
print("Sample label distribution:")
print(train_data['label'].value_counts())
print(test_data['label'].value_counts())

Sample label distribution:
label
0    497052
1    152883
Name: count, dtype: int64
label
0    62258
1    19120
Name: count, dtype: int64


# Bert model

In [20]:
# Convert to TensorFlow datasets using raw text
train_dataset = tf.data.Dataset.from_tensor_slices((train_data['content'].tolist(), train_data['label'].tolist()))
test_dataset = tf.data.Dataset.from_tensor_slices((test_data['content'].tolist(), test_data['label'].tolist()))

# Batch datasets
train_dataset = train_dataset.batch(64).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(64).prefetch(tf.data.AUTOTUNE)

In [22]:
bert_preprocess = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
bert_model = hub.KerasLayer('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/2', trainable=True)

text_input = keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
bert_output = bert_model(preprocessed_text)['pooled_output']
dense = tf.keras.layers.Dense(64, activation='relu')(bert_output)
output = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = keras.Model(inputs=[text_input], outputs=[output])

model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy', Precision(), Recall()])
history = model.fit(train_dataset, epochs=5, validation_data=test_dataset)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluate the model

In [23]:
loss, accuracy, precision, recall = model.evaluate(test_dataset)

print(f"Results: \n Test accuracy: {accuracy} \n Test loss: {loss} \n Test precision: {precision} \n Test recall: {recall}")

# Predictions and final metrics
true_labels = test_data['label'].values
# create dataset for processing and to avoid memory error
test_content = tf.data.Dataset.from_tensor_slices(test_data['content'].values)
test_content = test_content.batch(32).prefetch(tf.data.AUTOTUNE)
predictions = model.predict(test_content)
predicted_labels = (predictions > 0.5).astype(int).flatten()

# Calculate precision, recall, and F1 score

print(classification_report(true_labels, predicted_labels))

test_precision = precision_score(true_labels, predicted_labels)
test_recall = recall_score(true_labels, predicted_labels)
test_f1 = f1_score(true_labels, predicted_labels)

print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1}")

Results: 
 Test accuracy: 0.9545700550079346 
 Test loss: 0.14685742557048798 
 Test precision: 0.9371037483215332 
 Test recall: 0.8952885270118713 
 Test F1_Score: 0.9157190256539762
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     62258
           1       0.94      0.87      0.90     19120

    accuracy                           0.95     81378
   macro avg       0.95      0.92      0.94     81378
weighted avg       0.95      0.95      0.95     81378

Test Precision: 0.9353582114830915
Test Recall: 0.8665271966527197
Test F1 Score: 0.8996280509325876


# Validate with LIAR

## Preprocess LIAR

In [None]:
# already converted to csv
liar_test_data_path = "dataset/liar/test.csv"
liar_test_chunks = []
for chunk in pd.read_csv(liar_test_data_path, chunksize=10000, on_bad_lines='warn', engine='python'):
    # labels in liar = true, false, half-true, pants-fire, barely-true, mostly-true
    # only true and mostly-true are true others are false
    # converting labels to binary 1=true, 0=false
    chunk["binary_label"] = chunk["label"].apply(lambda x: 1 if str(x).strip().lower() in ["true", "mostly-true"] else 0)
    liar_test_chunks.append(chunk)

liar_test_data = pd.concat(liar_test_chunks, ignore_index=True)

liar_valid_data_path = "dataset/liar/valid.csv"
liar_valid_chunks = []
for chunk in pd.read_csv(liar_valid_data_path, chunksize=10000, on_bad_lines='warn', engine='python'):
    # labels in liar = true, false, half-true, pants-fire, barely-true, mostly-true
    # only true and mostly-true are true others are false
    # converting labels to binary 1=true, 0=false
    chunk["binary_label"] = chunk["label"].apply(lambda x: 1 if str(x).strip().lower() in ["true", "mostly-true"] else 0)
    liar_valid_chunks.append(chunk)

liar_valid_data = pd.concat(liar_valid_chunks, ignore_index=True)

label_distribution_test = liar_test_data["binary_label"].value_counts(normalize=True) * 100 # counting the distribution
label_distribution_valid = liar_valid_data["binary_label"].value_counts(normalize=True) * 100 # counting the distribution
print(f"Label distribution in liar test set: {label_distribution_test}")
print(f"Label distribution in liar validation set: {label_distribution_valid}")

# Convert to TensorFlow datasets using raw text
liar_valid_dataset = tf.data.Dataset.from_tensor_slices((liar_valid_data['statement'].tolist(), liar_valid_data['binary_label'].tolist()))
# Batch dataset
liar_valid_dataset = liar_valid_dataset.batch(64).prefetch(tf.data.AUTOTUNE)

Label distribution in liar test set: binary_label
0    64.561957
1    35.438043
Name: proportion, dtype: float64
Label distribution in liar validation set: binary_label
0    67.28972
1    32.71028
Name: proportion, dtype: float64


## Evaluate with LIAR

In [28]:
valid_loss, valid_accuracy, valid_precision, valid_recall = model.evaluate(liar_valid_dataset)

print(f"Results: \n Test accuracy: {valid_accuracy} \n Test loss: {valid_loss} \n Test precision: {valid_precision} \n Test recall: {valid_recall}")


# Predictions and final metrics
true_labels = liar_valid_data['binary_label'].values
# create dataset for processing and to avoid memory error
liar_valid_statement = tf.data.Dataset.from_tensor_slices(liar_valid_data['statement'].values)
liar_valid_statement = liar_valid_statement.batch(32).prefetch(tf.data.AUTOTUNE)
predictions = model.predict(liar_valid_statement)
predicted_labels = (predictions > 0.5).astype(int).flatten()

# Calculate precision, recall, and F1 score

print(classification_report(true_labels, predicted_labels))

valid_precision = precision_score(true_labels, predicted_labels)
valid_recall = recall_score(true_labels, predicted_labels)
valid_f1 = f1_score(true_labels, predicted_labels)

print(f"Test Precision: {valid_precision}")
print(f"Test Recall: {valid_recall}")
print(f"Test F1 Score: {valid_f1}")

Results: 
 Test accuracy: 0.6705607771873474 
 Test loss: 1.7774035930633545 
 Test precision: 0.9370768666267395 
 Test recall: 0.8943120837211609
              precision    recall  f1-score   support

           0       0.67      0.99      0.80       864
           1       0.43      0.02      0.04       420

    accuracy                           0.67      1284
   macro avg       0.55      0.50      0.42      1284
weighted avg       0.59      0.67      0.55      1284

Test Precision: 0.42857142857142855
Test Recall: 0.02142857142857143
Test F1 Score: 0.04081632653061224
