<div style='text-align: center;'>
    <span style='font-size: 30px; font-weight: bold;'>
        CM3070 Final Project
    </span>
</div>
<div style='text-align: center;'>
    <span style='font-size: 30px; font-weight: bold;'>
        Final Report
    </span>
</div>

# 1. Introduction

In [1]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import os            # to access operating system functions like file or folder handling

In [2]:
# --------------------------------------------------------------------------------------------------
# Create these folder if they do not exist
# --------------------------------------------------------------------------------------------------
os.makedirs('saved_models', exist_ok=True)       # for storing features, encoders and trained models
os.makedirs('saved_predictions', exist_ok=True)  # for storing predicted probabilities
os.makedirs('user_feedback', exist_ok=True)      # for storing user feedback
os.makedirs('utils', exist_ok=True)              # for storing custom functions
os.makedirs('results', exist_ok=True)            # for storing evaluation results

# 2. Dataset Loading, Preprocessing, and Saving

In [3]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import pandas as pd  # to read CSV files into pandas dataframe

In [4]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Display dataset summary, size and first few rows of the dataset
# --------------------------------------------------------------------------------------------------
def display_dataset_info_size_head(df, df_desc):
    print(f'\033[1m{df_desc} summary\033[0m') 
    df.info()                                                  # display the dataset summary
    print(f'\n\033[1m{df_desc} size:\033[1m', df.shape, '\n')  # display the number of rows and columns of the dataset
    display(df.head(3))                                        # display the first few rows of the dataset

In [5]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Display dataset information in a DataFrame format
# --------------------------------------------------------------------------------------------------
def display_dataframe(headers, data):
    df = pd.DataFrame(data, columns=headers)
    display(df)

## 2.1. Dataset 1 - News Domain

In [6]:
# --------------------------------------------------------------------------------------------------
# Step 1 - Load the 1st dataset and display the structure, shape and first few rows of the dataframe
# --------------------------------------------------------------------------------------------------
csv_file = 'datasets/fake_and_real_news.csv'  # set file name of csv file
df1 = pd.read_csv(csv_file)                   # read csv file (dataset) into dataframe

# Call function - to dislay the dataset summary, size and first few rows of dataset 1
df_desc = 'Dataset 1'
display_dataset_info_size_head(df1, df_desc)

[1mDataset 1 summary[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB

[1mDataset 1 size:[1m (9900, 2) 



Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real


In [7]:
# --------------------------------------------------------------------------------------------------
# Step 2 - Rename name of column 'Text' to 'text',
#          and convert 'Real' to 1 and 'Fake' to 0 in the label column, 
#          and display dataset summary
# --------------------------------------------------------------------------------------------------
# Rename 'Text' to 'text'
df1.rename(columns={'Text': 'text'}, inplace=True)

# Convert 'Real' to 1 and 'Fake' to 0 in 'label' column
df1['label'] = df1['label'].map({'Real': 1, 'Fake': 0})

# Call function - to dislay the dataset summary, size and first few rows of dataset 1
df_desc = 'Dataset 1'
display_dataset_info_size_head(df1, df_desc)

[1mDataset 1 summary[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    9900 non-null   object
 1   label   9900 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 154.8+ KB

[1mDataset 1 size:[1m (9900, 2) 



Unnamed: 0,text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,0
1,U.S. conservative leader optimistic of common ...,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",1


In [8]:
# --------------------------------------------------------------------------------------------------
# Step 3 - Display number of records and percentages of real and fake news of the dataset
# --------------------------------------------------------------------------------------------------
headers = ['Type of news', 'Label', 'Number of records', 'Percentage (%)']
data = [['Real', 1, str(len(df1[df1['label'] == 1])), round(len(df1[df1['label'] == 1]) / len(df1) * 100)]]
data.append (['Fake', 0, str(len(df1[df1['label'] == 0])), round(len(df1[df1['label'] == 0]) / len(df1) * 100)])

# Call function - to create and display dataframe
display_dataframe(headers, data)

Unnamed: 0,Type of news,Label,Number of records,Percentage (%)
0,Real,1,4900,49
1,Fake,0,5000,51


## 2.2. Dataset 2 - Football Domain

In [9]:
# --------------------------------------------------------------------------------------------------
# Step 1 - Load the 2nd datasets (separate real and fake news files), 
#          and display the structure, shape and first few rows of the dataframe
# --------------------------------------------------------------------------------------------------
df2_real = pd.read_csv('datasets/real.csv')  # real news
df2_fake = pd.read_csv('datasets/fake.csv')  # fake news

# For real news dataset
# Call function - to dislay the dataset summary, size and first few rows
df_desc = 'Dataset 2 (Real news)' 
display_dataset_info_size_head(df2_real, df_desc)

# For fake news dataset
# Call function - to dislay the dataset summary, size and first few rows
df_desc = 'Dataset 2 (Fake news)'
display_dataset_info_size_head(df2_fake, df_desc)

[1mDataset 2 (Real news) summary[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21869 entries, 0 to 21868
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   21863 non-null  object
dtypes: object(1)
memory usage: 171.0+ KB

[1mDataset 2 (Real news) size:[1m (21869, 1) 



Unnamed: 0,tweet
0,sun downs technical director: al-ahly respecte...
1,shawky gharib after the tie with enppi: our go...
2,"egyptian sports news today, wednesday 1/25/202..."


[1mDataset 2 (Fake news) summary[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweet   19988 non-null  object
dtypes: object(1)
memory usage: 156.4+ KB

[1mDataset 2 (Fake news) size:[1m (19999, 1) 



Unnamed: 0,tweet
0,"the tongue of his condition, now i saw things ..."
1,"by god the great, after i saw the derby of wyd..."
2,"believe in god, this zamalek fans are as good ..."


In [10]:
# --------------------------------------------------------------------------------------------------
# Step 2 - Remove rows with null values, rename the column name to 'text',
#          and add a column to each dataset (label = 1 for real news and label = 0 for fake news)
# --------------------------------------------------------------------------------------------------
# Drop rows with null values
df2_real = df2_real.dropna()
df2_fake = df2_fake.dropna()

# Rename column names
df2_real = df2_real.rename(columns={'tweet': 'text'})
df2_fake = df2_fake.rename(columns={'tweet': 'text'})

# Add a 'label' column to indicate fake (0) or real (1) news
df2_real['label'] = 1  # Label for real news
df2_fake['label'] = 0  # Label for fake news

# For real news dataset
# Call function - to dislay the dataset summary, size and first few rows
df_desc = 'Dataset 2 (Real news)' 
display_dataset_info_size_head(df2_real, df_desc)

# For fake news dataset
# Call function - to dislay the dataset summary, size and first few rows
df_desc = 'Dataset 2 (Fake news)'
display_dataset_info_size_head(df2_fake, df_desc)

[1mDataset 2 (Real news) summary[0m
<class 'pandas.core.frame.DataFrame'>
Index: 21863 entries, 0 to 21868
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    21863 non-null  object
 1   label   21863 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 512.4+ KB

[1mDataset 2 (Real news) size:[1m (21863, 2) 



Unnamed: 0,text,label
0,sun downs technical director: al-ahly respecte...,1
1,shawky gharib after the tie with enppi: our go...,1
2,"egyptian sports news today, wednesday 1/25/202...",1


[1mDataset 2 (Fake news) summary[0m
<class 'pandas.core.frame.DataFrame'>
Index: 19988 entries, 0 to 19998
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    19988 non-null  object
 1   label   19988 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 468.5+ KB

[1mDataset 2 (Fake news) size:[1m (19988, 2) 



Unnamed: 0,text,label
0,"the tongue of his condition, now i saw things ...",0
1,"by god the great, after i saw the derby of wyd...",0
2,"believe in god, this zamalek fans are as good ...",0


In [11]:
# --------------------------------------------------------------------------------------------------
# Step 3 - Combine the two dataframes into one,
#          and shuffle the combined dataframe to jumble the records
# --------------------------------------------------------------------------------------------------
# Combine the two dataframes into one
df2_combined = pd.concat([df2_fake, df2_real], ignore_index=True)

# Shuffle the combined dataframe to jumble the records and reset the index after shuffling
df2 = df2_combined.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few records of the shuffled DataFrame
df2.head(3)

Unnamed: 0,text,label
0,"praise be to god, zamalek appeared before al-a...",0
1,muhammad marouf is running a match in the prem...,1
2,"the sudanese team, al-merreikh, ignited the ra...",1


In [12]:
# --------------------------------------------------------------------------------------------------
# Step 4 - Display number of records and percentages of real and fake news of the dataset
# --------------------------------------------------------------------------------------------------
headers = ['Type of news', 'Label', 'Number of records', 'Percentage (%)']
data = [['Real', 1, str(len(df2[df2['label'] == 1])), round(len(df2[df2['label'] == 1]) / len(df2) * 100)]]
data.append (['Fake', 0, str(len(df2[df2['label'] == 0])), round(len(df2[df2['label'] == 0]) / len(df2) * 100)])

# Call function - to create and display dataframe
display_dataframe(headers, data)

Unnamed: 0,Type of news,Label,Number of records,Percentage (%)
0,Real,1,21863,52
1,Fake,0,19988,48


## 2.3. Text Preprocessing

In [13]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import re                                 # Python's re module for regular expressions
import nltk                               # Nltk library for text processing
from nltk.tokenize import word_tokenize   # for word tokenization
from nltk.stem import WordNetLemmatizer   # for lemmatization
from nltk.corpus import stopwords         # for stop words removal

In [14]:
# --------------------------------------------------------------------------------------------------
# Create a copy of datasets before preprocessing to keep the original datasets clean
# --------------------------------------------------------------------------------------------------
df1_copy = df1.copy()         
df2_copy = df2.copy()     

In [15]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - For word tokenization, lemmatization and stop words removal
# --------------------------------------------------------------------------------------------------
def token_lemma_stopwords(text):

    # Tokenize the text into words
    tokens = word_tokenize(text)
    
    # Lemmatize each word to its base form
    lemmatizer = WordNetLemmatizer()                        
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Remove stop words from the lemmatized tokens
    stop_words = set(stopwords.words('english'))            
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words]
    
    return filtered_tokens

In [16]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - To clean text (keeping only lowercase letters with only one space between the words
#            without leading and trailing spaces), 
#            and apply word tokenization, lemmatization and stop words removal, 
#            and join the tokens back into single stringsand an
# --------------------------------------------------------------------------------------------------
def clean_text(text):

    # Convert text to lowercase
    text = text.lower()
    
    # Remove non-alphabetical characters (keeping only lowercase letters and spaces)
    text = re.sub(r'[^a-z\s]', '', text)

    # Replace multiple spaces with one space and strip leading and trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Call function - for word tokenization, lemmatization and stop words removal
    text = token_lemma_stopwords(text)

    # join tokens back into single strings
    text_cleaned = ' '.join(text)  

    return text_cleaned

In [17]:
# --------------------------------------------------------------------------------------------------
# Clean the 'text' columns for datasets 1 and 2
# --------------------------------------------------------------------------------------------------
# Call function - to clean the 'text' column and store the cleaned text in a new column 'text_cleaned'
df1_copy['text_cleaned'] = df1_copy['text'].apply(clean_text)
df2_copy['text_cleaned'] = df2_copy['text'].apply(clean_text)

# Display the first few records of the cleaned datasets
print("\033[1mDataset 1\033[0m")
display(df1_copy.head(3))          # dataset 1

print("\033[1mDataset 2\033[0m")   
display(df2_copy.head(3))          # dataset 2

[1mDataset 1[0m


Unnamed: 0,text,label,text_cleaned
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,0,top trump surrogate brutally stab back patheti...
1,U.S. conservative leader optimistic of common ...,1,u conservative leader optimistic common ground...
2,"Trump proposes U.S. tax overhaul, stirs concer...",1,trump proposes u tax overhaul stir concern def...


[1mDataset 2[0m


Unnamed: 0,text,label,text_cleaned
0,"praise be to god, zamalek appeared before al-a...",0,praise god zamalek appeared alahly came catch ...
1,muhammad marouf is running a match in the prem...,1,muhammad marouf running match premier league t...
2,"the sudanese team, al-merreikh, ignited the ra...",1,sudanese team almerreikh ignited ranking group...


## 2.4. Dataset Splitting

In [18]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split   # for splitting 1 dataset into 2 sets
from sklearn.utils import shuffle                      # for shuffling texts and labels together to jumble the records

In [19]:
# --------------------------------------------------------------------------------------------------
# Retrieve the cleaned texts and labels from datasets 1 and 2
# --------------------------------------------------------------------------------------------------
# For dataset 1
X1 = df1_copy['text_cleaned']       
y1 = df1_copy['label']     

# For dataset 2
X2 = df2_copy['text_cleaned']       
y2 = df2_copy['label']     

In [20]:
# --------------------------------------------------------------------------------------------------
# Split datasets 1 and 2, each into 80% training, 10% validation and 10% test sets,
# and display the shapes of the sets
# --------------------------------------------------------------------------------------------------
# X_train X_train --> texts of training set
# X_val   X_val   --> texts of validation set
# X_test  X_test  --> texts of test set
# y_train y_train --> labels 1 (Real) and 0 (Fake) of training set
# X_val   X_val   --> labels 1 (Real) and 0 (Fake) of validation set
# y_test  y_test  --> labels 1 (Real) and 0 (Fake) of test set

# Split datasets into train and temp sets for datasets 1 and 2
X1_train, X1_temp, y1_train, y1_temp = train_test_split(X1, y1, test_size=0.2, random_state=42)
X2_train, X2_temp, y2_train, y2_temp = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Split temp datasets into validation and test sets for datasets 1 and 2
X1_val, X1_test, y1_val, y1_test = train_test_split(X1_temp, y1_temp, test_size=0.5, random_state=42)
X2_val, X2_test, y2_val, y2_test = train_test_split(X2_temp, y2_temp, test_size=0.5, random_state=42)

# Define dataset headers for training, validation, and test sets for datasets 1 and 2
headers = ['Dataset', 'X train (text)', 'y train (label)', 'train %', 
                      'X val (text)', 'y val (label)', 'validation %',
                      'X test (text)', 'y test (label)', 'test %']

# Store dataset shapes for training, validation, and test sets for datasets 1
data = [['Dataset 1', X1_train.shape, y1_train.shape, round(len(X1_train) / len(X1) * 100),
                      X1_val.shape, y1_val.shape, round(len(X1_val) / len(X1) * 100),
                      X1_test.shape, y1_test.shape, round(len(X1_test) / len(X1) * 100)]]

# Store dataset shapes for training, validation, and test sets for dataset 2
data.append (['Dataset 2', X2_train.shape, y2_train.shape, round(len(X2_train) / len(X2) * 100),
                           X2_val.shape, y2_val.shape, round(len(X2_val) / len(X2) * 100),
                           X2_test.shape, y2_test.shape, round(len(X2_test) / len(X2) * 100)])

# Call function - to display shapes of training, validation and test sets for datasets 1 and 2
display_dataframe(headers, data)

Unnamed: 0,Dataset,X train (text),y train (label),train %,X val (text),y val (label),validation %,X test (text),y test (label),test %
0,Dataset 1,"(7920,)","(7920,)",80,"(990,)","(990,)",10,"(990,)","(990,)",10
1,Dataset 2,"(33480,)","(33480,)",80,"(4185,)","(4185,)",10,"(4186,)","(4186,)",10


In [21]:
# --------------------------------------------------------------------------------------------------
# Combine training and validation sets of datasets 1 and 2 together 
# --------------------------------------------------------------------------------------------------
# Combine texts of both training sets together by stacking rows
X_train = pd.concat([X1_train, X2_train], ignore_index=True)
y_train = pd.concat([y1_train, y2_train], ignore_index=True)

# Combine labels of both validation sets together by stacking rows
X_val = pd.concat([X1_val, X2_val], ignore_index=True)
y_val = pd.concat([y1_val, y2_val], ignore_index=True)

# Shuffle the combined texts and labels together to jumble the records
X_train, y_train = shuffle(X_train, y_train, random_state=42)
X_val, y_val = shuffle(X_val, y_val, random_state=42)

# Reset index
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# Display first few rows of texts and labels of training and validation sets
print("\033[1mTraining texts\033[0m")
print(X_train.head(3))
print()
print("\033[1mTraining labels\033[0m")
print(y_train.head(3))
print()
print("\033[1mValidation texts\033[0m")
print(X_val.head(3))
print()
print("\033[1mValidation labels\033[0m")
print(y_val.head(3))

[1mTraining texts[0m
0    way problem fan zamalek turki alsheikh fact cl...
1    course real coach team know khebari better nas...
2    alahly defeated zamalek african final var alah...
Name: text_cleaned, dtype: object

[1mTraining labels[0m
0    0
1    0
2    0
Name: label, dtype: int64

[1mValidation texts[0m
0                      demand continuation league haha
1    haha next week boatman got fat thought sukkary...
2    manchester except swissdad barcelona real beti...
Name: text_cleaned, dtype: object

[1mValidation labels[0m
0    0
1    0
2    0
Name: label, dtype: int64


In [22]:
# --------------------------------------------------------------------------------------------------
# Rename test sets names to follow same convention as training and validation sets
# --------------------------------------------------------------------------------------------------
# Rename for test set 1
X_test_1 = X1_test
y_test_1 = y1_test

# Rename for test set 2
X_test_2 = X2_test
y_test_2 = y2_test

# Reset index for test set 1
X_test_1 = X_test_1.reset_index(drop=True)
y_test_1 = y_test_1.reset_index(drop=True)

# Reset index for test set 2
X_test_2 = X_test_2.reset_index(drop=True)
y_test_2 = y_test_2.reset_index(drop=True)

# Display first few rows of texts and labels of test sets 1 and 2 
print("\033[1mDataset 1 Test texts\033[0m")
print(X_test_1.head(3))
print()
print("\033[1mDataset 1 Test labels\033[0m")
print(y_test_1.head(3))
print()
print("\033[1mDataset 2 Test texts\033[0m")
print(X_test_2.head(3))
print()
print("\033[1mDataset 2 Test labels\033[0m")
print(y_test_2.head(3))
print()

[1mDataset 1 Test texts[0m
0    trump explodes obama drop devastating truth bo...
1    trump explained u position thaad xi south kore...
2    house widens ethic probe include farenthold ca...
Name: text_cleaned, dtype: object

[1mDataset 1 Test labels[0m
0    0
1    1
2    1
Name: label, dtype: int64

[1mDataset 2 Test texts[0m
0    mokhtar mokhtar marvel queirozs choice prelimi...
1    ahmed alfadly egyptian ambassador south africa...
2    hahahahahahahahahahahahahahahahahahahahahahaha...
Name: text_cleaned, dtype: object

[1mDataset 2 Test labels[0m
0    1
1    1
2    0
Name: label, dtype: int64



## 2.5. Save Dataset Storage

In [23]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import joblib   # for saving data, features, models and evaluation results for retrievals across kernels

In [24]:
# --------------------------------------------------------------------------------------------------
# Save training, validation and test sets for retrievals across kernels
# --------------------------------------------------------------------------------------------------
# Save data of training, validation and test setes
joblib.dump(X_train, 'datasets/X_train.pkl')
joblib.dump(X_val, 'datasets/X_val.pkl')
joblib.dump(X_test_1, 'datasets/X_test_1.pkl')
joblib.dump(X_test_2, 'datasets/X_test_2.pkl')

# Save labels of training, validation and test sets
joblib.dump(y_train, 'datasets/y_train.pkl')
joblib.dump(y_val, 'datasets/y_val.pkl')
joblib.dump(y_test_1, 'datasets/y_test_1.pkl')
joblib.dump(y_test_2, 'datasets/y_test_2.pkl')

['datasets/y_test_2.pkl']

# 3. Utility Functions

## 3.1.  Utility Function for Model Evaluation

In [25]:
%%writefile utils/evaluate_model_performance.py

# --------------------------------------------------------------------------------------------------
# FUNCTION - Metrics to evaluate the effectiveness of the trained models,
#            and save this function into the 'utils' folder as a Python file
# --------------------------------------------------------------------------------------------------
import pandas as pd            # to read CSV files into pandas dataframe
from sklearn.metrics import (
    accuracy_score,            # overall correctness
    precision_score,           # correct positives / predicted positives
    recall_score,              # correct positives / actual positives
    f1_score,                  # balance between precision and recall
    classification_report      # detailed per-class performance
)

def evaluate_model_performance(model_num, model_name, model_type, model_desc, actual_labels, predicted_labels):

    print(f'\033[1m\n{model_desc}:\n\033[0m')

    # Calculate metrics
    accuracy = accuracy_score(actual_labels, predicted_labels)
    precision = precision_score(actual_labels, predicted_labels)
    recall = recall_score(actual_labels, predicted_labels)
    f1 = f1_score(actual_labels, predicted_labels)

    # Create a single-row DataFrame with metric names as columns
    results_df = pd.DataFrame({
        'Model': [model_num],
        'Model name': [model_name],
        'Accuracy': [accuracy],
        'Precision': [precision],
        'Recall': [recall],
        'F1 Score': [f1],
        'Model type': [model_type]
    })

    # Style: bold 'Accuracy' column and format all values to 2 decimal places
    styled_df = results_df.style \
        .map(lambda val: 'font-weight: bold', subset=['Accuracy']) \
        .format(precision=2)

    # Display the full table with bolded Accuracy column
    display(styled_df)
    
    # Classification Report
    class_report = classification_report(actual_labels, predicted_labels, target_names=['Real', 'Fake'])
    print(f"Classification Report:\n{class_report}")

    return results_df

Overwriting utils/evaluate_model_performance.py


## 3.2. Utility Function for Storing Evaluation Results

In [26]:
%%writefile utils/store_evaluation_results.py

# --------------------------------------------------------------------------------------------------
# FUNCTION - Store evaluation results for validation and test phases
# --------------------------------------------------------------------------------------------------
import joblib   # for saving data, features, models and evaluation results for retrievals across kernels
def store_evaluation_results(phase, test_set, first_model, model_num, result):

    # Set the location of the file path depending on validation (value 1), test phase (value 2) and test sets (1 or 2)
    if phase == 1:
        file_path = 'results/all_model_results_val.pkl'
    else:
        if test_set == 1:
            file_path = 'results/all_model_results_test_1.pkl'
        else:
            file_path = 'results/all_model_results_test_2.pkl'

    # List to store evaluation results for all models
    # If it is the first model for that phase or test set, intialise list
    # Else (subsequent runs), retrieve the stored list
    if first_model == True:
        all_model_results = []
    else:
        all_model_results = joblib.load(file_path)
        
    # append evaluation results for current model to 
    all_model_results.append(result)

    # Save the results
    joblib.dump(all_model_results, file_path)

Overwriting utils/store_evaluation_results.py


# 4. Feature Engineering and Traditional & Deep Learning Models

In [27]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import joblib   # for saving models for future use and retrieval of models

## 4.1. TF-IDF Vectorisation

In [28]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
# TFIDF for text vectorisation (weight words by importance)
from sklearn.feature_extraction.text import TfidfVectorizer   

In [29]:
# --------------------------------------------------------------------------------------------------
# Convert text data to numerical features using TFIDF Vectoriser, representing text as word
# occurrence counts
# --------------------------------------------------------------------------------------------------
# Define model parameters
NUM_WORDS = 10000

# Transforms text into TF-IDF features, keeping only top max_features words and removing stopwords
vectorizer_tfidf = TfidfVectorizer(max_features=NUM_WORDS, stop_words='english')

# Learn vocabulary from training data
vectorizer_tfidf.fit(X_train)

# Save the TF-IDF vectorizer for future use
joblib.dump(vectorizer_tfidf, "saved_models/vectorizer_tfidf.pkl")

# Transform datasets using the same vocabulary
X_train_tfidf = vectorizer_tfidf.transform(X_train)
X_val_tfidf = vectorizer_tfidf.transform(X_val)
X_test_1_tfidf = vectorizer_tfidf.transform(X_test_1)
X_test_2_tfidf = vectorizer_tfidf.transform(X_test_2)

# Save TF-IDF transformed datasets
joblib.dump(X_train_tfidf, 'datasets/X_train_tfidf.pkl')
joblib.dump(X_val_tfidf, 'datasets/X_val_tfidf.pkl')
joblib.dump(X_test_1_tfidf, 'datasets/X_test_1_tfidf.pkl')
joblib.dump(X_test_2_tfidf, 'datasets/X_test_2_tfidf.pkl')

['datasets/X_test_2_tfidf.pkl']

## 4.2. Tokenisation and Padding for CNN

In [30]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from tensorflow.keras.preprocessing.text import Tokenizer           # Converts text to integer sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences   # Pads sequences to equal length

2025-03-30 13:49:36.354570: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743313776.369581   10807 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743313776.373573   10807 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743313776.384188   10807 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743313776.384208   10807 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743313776.384210   10807 computation_placer.cc:177] computation placer alr

In [31]:
# --------------------------------------------------------------------------------------------------
# Tokenise text, convert it to sequences, and pad them for uniform input in CNN model
# --------------------------------------------------------------------------------------------------
# Define model parameters
NUM_WORDS = 10000
MAX_LEN = 500

# Tokenizer to learn the vocabulary
tokenizer_cnn = Tokenizer(num_words=NUM_WORDS, oov_token="<OOV>")
tokenizer_cnn.fit_on_texts(X_train)

# Save the trained tokenizer
joblib.dump(tokenizer_cnn, "saved_models/tokenizer_cnn.pkl")

# Convert text into sequences
X_train_seq = tokenizer_cnn.texts_to_sequences(X_train)
X_val_seq = tokenizer_cnn.texts_to_sequences(X_val)
X_test_1_seq = tokenizer_cnn.texts_to_sequences(X_test_1)
X_test_2_seq = tokenizer_cnn.texts_to_sequences(X_test_2)

# Pad sequences to ensure equal length
X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_1_padded = pad_sequences(X_test_1_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_2_padded = pad_sequences(X_test_2_seq, maxlen=MAX_LEN, padding='post', truncating='post')

# Save the padded sequences
joblib.dump(X_train_padded, 'datasets/X_train_padded.pkl')
joblib.dump(X_val_padded, 'datasets/X_val_padded.pkl')
joblib.dump(X_test_1_padded, 'datasets/X_test_1_padded.pkl')
joblib.dump(X_test_2_padded, 'datasets/X_test_2_padded.pkl')

['datasets/X_test_2_padded.pkl']

## 4.3. Label Encoding

In [32]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from sklearn.preprocessing import LabelEncoder   # for encoding categorical labels into numerical values

In [33]:
# --------------------------------------------------------------------------------------------------
# Encode categorical labels into numerical values
# --------------------------------------------------------------------------------------------------
# Initialise label encoder
label_encoder = LabelEncoder()

# Fit encoder to training labels
label_encoder.fit(y_train)

# Save the label encoder
joblib.dump(label_encoder, 'saved_models/label_encoder.pkl')

# Encode training, validation and tests labels
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_1_encoded = label_encoder.transform(y_test_1)
y_test_2_encoded = label_encoder.transform(y_test_2)

# Save encoded labels separately
joblib.dump(y_train_encoded, 'datasets/y_train_encoded.pkl')
joblib.dump(y_val_encoded, 'datasets/y_val_encoded.pkl')
joblib.dump(y_test_1_encoded, 'datasets/y_test_1_encoded.pkl')
joblib.dump(y_test_2_encoded, 'datasets/y_test_2_encoded.pkl')

['datasets/y_test_2_encoded.pkl']

## 4.4 Model 1: Naive Bayes (Baseline Model) with TF-IDF Vectoriser - Training and Validation

In [34]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from sklearn.naive_bayes import MultinomialNB                             # Multinomial Naive Bayes model
from importlib import reload                                              # to reload updated Python modules
import utils.evaluate_model_performance                                   # import custom function (needed for reload)
reload(utils.evaluate_model_performance)                                  # reload to reflect any latest changes in file
from utils.evaluate_model_performance import evaluate_model_performance   # custom function to evaluate models
import utils.store_evaluation_results                                     # import custom function (needed for reload)
reload(utils.store_evaluation_results)                                    # reload to reflect any latest changes in file
from utils.store_evaluation_results import store_evaluation_results       # custom function to store evaluation results

In [35]:
# --------------------------------------------------------------------------------------------------
# Step 1 - Train and save a Multinomial Naive Bayes model using TF-IDF features
# --------------------------------------------------------------------------------------------------
# Initialise the Multinomial NB models
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train_tfidf.toarray(), y_train)

# Save the naive bayes model and the TF-IDF vectorizer for future use.
joblib.dump(nb_model, "saved_models/model_nb.pkl")

['saved_models/model_nb.pkl']

In [36]:
# --------------------------------------------------------------------------------------------------
# Step 2 - Generate predictions and evaluate the Naive Bayes model on the validation set,
#          and save predictions for stacking (ensemble) model later.
# --------------------------------------------------------------------------------------------------
# Predict class labels for the validation set
y_pred_val_nb = nb_model.predict(X_val_tfidf.toarray())

# Predict class probabilities for the validation set
y_pred_proba_val_nb = nb_model.predict_proba(X_val_tfidf.toarray())

# Save predicted probabilities for ensemble stacking
joblib.dump(y_pred_proba_val_nb, "saved_predictions/y_pred_proba_val_nb.pkl")

# Call function - to evaluate the effectiveness of the model
model_num = 1
model_name = 'Naive Bayes'
model_type = 'Baseline'
model_desc = f'Model {model_num}: {model_name} ({model_type} model) using TF-IDF Vectoriser\nValidation Set'
result     = evaluate_model_performance(model_num, model_name, model_type, model_desc, y_val, y_pred_val_nb)

# Call function - to store validation results
phase = 1                  # 1 for validation phase / 2 for testing phase
test_set = 0               # test set 1 or 2 for testing phase
first_model = True         # True for first model for that phase / False for subsequent model for the same phase
store_evaluation_results(phase, test_set, first_model, model_num, result)

[1m
Model 1: Naive Bayes (Baseline model) using TF-IDF Vectoriser
Validation Set:
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,1,Naive Bayes,0.93,0.94,0.93,0.94,Baseline


Classification Report:
              precision    recall  f1-score   support

        Real       0.93      0.93      0.93      2479
        Fake       0.94      0.93      0.94      2696

    accuracy                           0.93      5175
   macro avg       0.93      0.93      0.93      5175
weighted avg       0.93      0.93      0.93      5175



## 4.5. Model 2: Logistic Regression (Traditional Machine Learning) with TF-IDF Vectoriser - Training and Validation

In [37]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression   # Statistical model - Logistic Regression
from sklearn.model_selection import GridSearchCV      # Tunes hyperparameters via cross-validation

In [38]:
# --------------------------------------------------------------------------------------------------
# Step 1 - Perform hyperparameter tuning for Logistic Regression using GridSearchCV, 
#          and select the best model, and save it
# --------------------------------------------------------------------------------------------------
# Define a parameter grid for regularization strength 'C'
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2']              # L2 regularization
}

# Initialize the logistic regression model
model_lr = LogisticRegression(max_iter=1000, solver='lbfgs', class_weight='balanced')

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(model_lr, param_grid, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1)
grid_search.fit(X_train_tfidf, y_train)

print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Use the best model from grid search
model_lr_best = grid_search.best_estimator_

# Save the naive bayes model and the TF-IDF vectorizer for future use.
joblib.dump(model_lr_best, "saved_models/model_lr_best.pkl")

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best parameters found: {'C': 10, 'penalty': 'l2'}
Best cross-validation accuracy: 0.99138978054308


['saved_models/model_lr_best.pkl']

In [39]:
# --------------------------------------------------------------------------------------------------
# Step 2 - Generate predictions and evaluate the Logistic Regression model on the validation set,
#          and save predictions for stacking (ensemble) model later.
# --------------------------------------------------------------------------------------------------
# Predict class labels for the validation set
y_pred_val_lr = model_lr_best.predict(X_val_tfidf)

# Predict class probabilities for the validation set
y_pred_proba_val_lr = model_lr_best.predict_proba(X_val_tfidf)

# Save predicted probabilities for ensemble stacking
joblib.dump(y_pred_proba_val_lr, "saved_predictions/y_pred_proba_val_lr.pkl")

# Call function - to evaluate the effectiveness of the model
model_num = 2
model_name = 'Logistic Regression'
model_type = 'Traditional Machine Learning'
model_desc = f'Model {model_num}: {model_name} ({model_type} model) using TF-IDF Vectoriser with Regularization\nValidation Set'
result     = evaluate_model_performance(model_num, model_name, model_type, model_desc, y_val, y_pred_val_lr)

# Call function - to store validation results
phase = 1                  # 1 for validation phase / 2 for testing phase
test_set = 0               # test set 1 or 2 for testing phase
first_model = False        # True for first model for that phase / False for subsequent model for the same phase
store_evaluation_results(phase, test_set, first_model, model_num, result)

[1m
Model 2: Logistic Regression (Traditional Machine Learning model) using TF-IDF Vectoriser with Regularization
Validation Set:
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,2,Logistic Regression,0.96,0.96,0.96,0.96,Traditional Machine Learning


Classification Report:
              precision    recall  f1-score   support

        Real       0.96      0.96      0.96      2479
        Fake       0.96      0.96      0.96      2696

    accuracy                           0.96      5175
   macro avg       0.96      0.96      0.96      5175
weighted avg       0.96      0.96      0.96      5175



## 4.6. Model 3: CNN (Deep Learning Model) - Training and Validation

In [40]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from tensorflow.keras.models import Sequential                                                            # Build CNN model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Dropout, Embedding, GlobalMaxPooling1D   # CNN layers
import numpy as np                                                                                        # for numerical operations

In [41]:
# --------------------------------------------------------------------------------------------------
# Step 1 - Load the preprocessed input data, tokenizer, and label encoder needed 
#          for CNN model training or evaluation
# --------------------------------------------------------------------------------------------------
# Load the trained tokenizer
tokenizer_cnn = joblib.load('saved_models/tokenizer_cnn.pkl')

# Load the padded sequences
X_train_padded = joblib.load('datasets/X_train_padded.pkl')
X_val_padded = joblib.load('datasets/X_val_padded.pkl')

# Load the encoded labels
y_train_encoded = joblib.load('datasets/y_train_encoded.pkl')
y_val_encoded = joblib.load('datasets/y_val_encoded.pkl')

In [42]:
# --------------------------------------------------------------------------------------------------
# Step 2 - Define, compile, train, and save a CNN model for binary text classification
# --------------------------------------------------------------------------------------------------
# Define model parameters
VOCAB_SIZE = 10000
EMBEDDING_DIM = 64

# Create CNN model
model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM))
model_cnn.add(Conv1D(filters=32, kernel_size=5, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))
#model_cnn.add(Conv1D(filters=16, kernel_size=5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dropout(0.3))
model_cnn.add(Dense(32, activation='relu'))
model_cnn.add(Dense(1, activation='sigmoid'))

# Compile model
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
history = model_cnn.fit(
    X_train_padded, y_train_encoded,
    validation_data=(X_val_padded, y_val_encoded),
    epochs=5,
    batch_size=4,
    verbose=1
)

# Save model
model_cnn.save('saved_models/model_cnn.keras')

Epoch 1/5


I0000 00:00:1743313824.255387   10807 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7168 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1
I0000 00:00:1743313825.785694   11198 service.cc:152] XLA service 0x75adf8004df0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743313825.785719   11198 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce GTX 1080, Compute Capability 6.1
2025-03-30 13:50:25.822454: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1743313826.105291   11198 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m   51/10350[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m31s[0m 3ms/step - accuracy: 0.5476 - loss: 0.6883

I0000 00:00:1743313827.717531   11198 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m10350/10350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 3ms/step - accuracy: 0.9116 - loss: 0.2023 - val_accuracy: 0.9573 - val_loss: 0.1040
Epoch 2/5
[1m10350/10350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9672 - loss: 0.0884 - val_accuracy: 0.9625 - val_loss: 0.0980
Epoch 3/5
[1m10350/10350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9797 - loss: 0.0557 - val_accuracy: 0.9592 - val_loss: 0.1029
Epoch 4/5
[1m10350/10350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9858 - loss: 0.0381 - val_accuracy: 0.9606 - val_loss: 0.1061
Epoch 5/5
[1m10350/10350[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 3ms/step - accuracy: 0.9912 - loss: 0.0260 - val_accuracy: 0.9567 - val_loss: 0.1160


In [43]:
# --------------------------------------------------------------------------------------------------
# Step 3 - Generate predictions and evaluate the CNN model on the validation set,
#          and save predictions for stacking (ensemble) model later.
# --------------------------------------------------------------------------------------------------
# Predictions for CNN Model (use padded sequences) for the validation set
y_pred_proba_val_cnn = model_cnn.predict(X_val_padded)                                    # Original: (num_samples, 1)
y_pred_proba_val_cnn = np.column_stack([1 - y_pred_proba_val_cnn, y_pred_proba_val_cnn])  # Ensure follow [p(Fake), p(Real)] explicitly
y_pred_val_cnn = np.argmax(y_pred_proba_val_cnn, axis=1)                                  # Class labels

# Save predicted probabilities for ensemble stacking
joblib.dump(y_pred_proba_val_cnn, "saved_predictions/y_pred_proba_val_cnn.pkl")

# Call function - to evaluate the effectiveness of the model
model_num = 3
model_name = 'CNN'
model_type = 'Deep Learning'
model_desc = f'Model {model_num}: {model_name} ({model_type} model)\nValidation Set'
result     = evaluate_model_performance(model_num, model_name, model_type, model_desc, y_val, y_pred_val_cnn)

# Call function - to store validation results
phase = 1                  # 1 for validation phase / 2 for testing phase
test_set = 0               # test set 1 or 2 for testing phase
first_model = False        # True for first model for that phase / False for subsequent model for the same phase
store_evaluation_results(phase, test_set, first_model, model_num, result)

[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m
Model 3: CNN (Deep Learning model)
Validation Set:
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,3,CNN,0.96,0.97,0.95,0.96,Deep Learning


Classification Report:
              precision    recall  f1-score   support

        Real       0.95      0.97      0.96      2479
        Fake       0.97      0.95      0.96      2696

    accuracy                           0.96      5175
   macro avg       0.96      0.96      0.96      5175
weighted avg       0.96      0.96      0.96      5175



In [44]:
# --------------------------------------------------------------------------------------------------
# Step 4 - Delete the CNN model and call garbage collection to release the memory held by Python
# --------------------------------------------------------------------------------------------------
import gc
import tensorflow as tf

# Delete the model
del model_cnn

# Call garbage collection to free memory
gc.collect()

# Clear session
tf.keras.backend.clear_session()

# 5. Feature Engineering and Transformer-Based Model

# _A different kernel for PyTorch_

### ⚠️ **Note:** 
* Please switch to the **kernel** for **`pytorch`** before running this section.
* This is required for BERT and RoBERTa models using the `transformers` and `torch` libraries.

## 5.1 Retrieval of Training, Validation and Test Sets

In [1]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import joblib                                    # for saving models for future use and retrieval of models

In [2]:
# --------------------------------------------------------------------------------------------------
# Load X and y for training, validation and test sets, and label encoded sets 
# for BERT model training and evaluation
# --------------------------------------------------------------------------------------------------
# Load X and y from training and validation sets
X_train = joblib.load('datasets/X_train.pkl')
y_train = joblib.load('datasets/y_train.pkl')
X_val = joblib.load('datasets/X_val.pkl')
y_val = joblib.load('datasets/y_val.pkl')

# Load X and y from test sets
X_test_1 = joblib.load('datasets/X_test_1.pkl')
y_test_1 = joblib.load('datasets/y_test_1.pkl')
X_test_2 = joblib.load('datasets/X_test_2.pkl')
y_test_2 = joblib.load('datasets/y_test_2.pkl')

# Load the encoded labels
y_train_encoded = joblib.load('datasets/y_train_encoded.pkl')
y_val_encoded = joblib.load('datasets/y_val_encoded.pkl')

## 5.2. Tokenisation for BERT

In [3]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from transformers import BertTokenizer             # Loads pretrained BERT tokenizer
import torch                                       # PyTorch core library for tensors and models
from torch.utils.data import Dataset, DataLoader   # For custom dataset and batching

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# --------------------------------------------------------------------------------------------------
# CLASS - A custom dataset for BERT and RoBERTa using tokenised inputs and labels
# --------------------------------------------------------------------------------------------------
class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [5]:
# --------------------------------------------------------------------------------------------------
# Prepare and save BERT-tokenized training and validation data
# --------------------------------------------------------------------------------------------------
# Load the BERT tokenizer
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

# Ensure X_train and X_val are lists of strings
X_train_clean = X_train.astype(str).tolist()
X_val_clean = X_val.astype(str).tolist()

# Tokenize the data
X_train_encodings_bert = tokenizer_bert(X_train_clean, truncation=True, padding=True, max_length=128)
X_val_encodings_bert = tokenizer_bert(X_val_clean, truncation=True, padding=True, max_length=128)

# Save the BERT tokenizer
tokenizer_bert.save_pretrained("model_bert")

# Initialise training and validation sets with tokenised BERT inputs
train_dataset_bert = NewsDataset(X_train_encodings_bert, y_train)
val_dataset_bert = NewsDataset(X_val_encodings_bert, y_val)

## 5.3. Model 4: BERT - Training and Validation

In [6]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from transformers import BertTokenizer                                    # Tokenise text for BERT
from transformers import BertForSequenceClassification                    # BERT model for classification
from transformers import TrainingArguments, Trainer                       # Hugging Face training tools
import numpy as np                                                        # for numerical operations
from importlib import reload                                              # to reload updated Python modules
import utils.evaluate_model_performance                                   # import module (needed for reload)
reload(utils.evaluate_model_performance)                                  # reload to reflect any latest changes in file
from utils.evaluate_model_performance import evaluate_model_performance   # custom function to evaluate models
import utils.store_evaluation_results                                     # import custom function (needed for reload)
reload(utils.store_evaluation_results)                                    # reload to reflect any latest changes in file
from utils.store_evaluation_results import store_evaluation_results       # custom function to store evaluation results

In [7]:
# --------------------------------------------------------------------------------------------------
# Step 1 - Load BERT model
# --------------------------------------------------------------------------------------------------
model_bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# --------------------------------------------------------------------------------------------------
# Step 2 - FUNCTION - Define a simple accuracy computation for evaluation
# --------------------------------------------------------------------------------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    return {'accuracy': acc}

In [9]:
# --------------------------------------------------------------------------------------------------
# Step 3 - Clear unused GPU memory cache
# --------------------------------------------------------------------------------------------------
torch.cuda.empty_cache()

In [10]:
# --------------------------------------------------------------------------------------------------
# Step 4 - Train, evaluate, and save a BERT text classification model
# --------------------------------------------------------------------------------------------------
# Set training configurations for BERT model
training_args_bert = TrainingArguments(
    output_dir='./bert-fake-news',          # output directory
    eval_strategy='epoch',           # evaluate on each epoch end
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',     # choose best model based on validation loss
    save_total_limit=1,# only keep 1 best model (to save disk)
    report_to=[],
    seed=42
)

# Initialise Trainer to train and evaluate BERT model
trainer_bert = Trainer(
    model=model_bert,
    args=training_args_bert,
    train_dataset=train_dataset_bert,
    eval_dataset=val_dataset_bert,
    compute_metrics=compute_metrics
)

# Train the BERT model
trainer_bert.train()

# Save the BERT model
trainer_bert.save_model("model_bert")

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1771,0.173332,0.962512
2,0.0761,0.165509,0.965797
3,0.0571,0.192833,0.971014


In [11]:
# --------------------------------------------------------------------------------------------------
# Step 5 - Generate predictions and evaluate the BERT model on the validation set
# --------------------------------------------------------------------------------------------------
# Predictions for the validation set
y_pred_proba_val_bert = trainer_bert.predict(val_dataset_bert)
y_pred_val_bert = np.argmax(y_pred_proba_val_bert.predictions, axis=1)

# Save predicted probabilities for ensemble stacking
joblib.dump(y_pred_proba_val_bert.predictions, "saved_predictions/y_pred_proba_val_bert.pkl")

# Call function - to evaluate the effectiveness of the model
model_num = 4
model_name = 'BERT'
model_type = 'Transformer-Based'
model_desc = f'Model {model_num}: {model_name} ({model_type} model)\nValidation Set'
result     = evaluate_model_performance(model_num, model_name, model_type, model_desc, y_val, y_pred_val_bert)

# Call function - to store validation results
phase = 1                  # 1 for validation phase / 2 for testing phase
test_set = 0               # test set 1 or 2 for testing phase
first_model = False        # True for first model for that phase / False for subsequent model for the same phase
store_evaluation_results(phase, test_set, first_model, model_num, result)

[1m
Model 4: BERT (Transformer-Based model)
Validation Set:
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,4,BERT,0.97,0.95,0.99,0.97,Transformer-Based


Classification Report:
              precision    recall  f1-score   support

        Real       0.98      0.94      0.96      2479
        Fake       0.95      0.99      0.97      2696

    accuracy                           0.97      5175
   macro avg       0.97      0.96      0.97      5175
weighted avg       0.97      0.97      0.97      5175



## 5.4. Model 4: BERT - Testing

In [12]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import torch.nn.functional as F                                          # neural network functions (e.g., softmax, loss)
from utils.evaluate_model_performance import evaluate_model_performance  # custom function to evaluate models

In [13]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Generate class probabilities and predicted labels for the BERT model on the test set
# --------------------------------------------------------------------------------------------------
def predict_bert_test_labels(X_test):
    cleaned_inputs = [str(x) if x is not None else "" for x in X_test]  # Ensure all inputs are strings
    model_test_bert.eval()                                              # Set model to evaluation mode
    test_encodings_bert = tokenizer_test_bert(cleaned_inputs,           # Tokenise test inputs
                                              truncation=True, 
                                              padding=True, 
                                              max_length=512, 
                                              return_tensors='pt')  
    with torch.no_grad():                                               # Disable gradient calculation
        outputs_bert = model_test_bert(**test_encodings_bert)           # Get model outputs
    logits_test_bert = outputs_bert.logits                              # Raw model scores
    probabilities_test_bert = F.softmax(logits_test_bert, dim=1)        # Convert to probabilities
    y_pred_proba_test_bert = probabilities_test_bert.numpy()            # Convert to NumPy array
    y_pred_test_bert = logits_test_bert.argmax(dim=1).numpy()           # Get predicted labels
        
    return y_pred_test_bert, y_pred_proba_test_bert

In [14]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Loop through multiple test sets to generate predictions and evaluate model performance
# --------------------------------------------------------------------------------------------------
def evaluate_model_bert_on_test_sets(model_num, model_name, model_type, test_sets):

    test_num = 0

    # Loop through each test set
    for X_test, y_test in test_sets:

        # Call function - to generate class probabilities and predicted labels
        y_pred_test, y_pred_proba_test = predict_bert_test_labels(X_test)
        print()

        # Increment test_num by 1 (for Test 1 or Test 2 sets)
        test_num += 1
        if test_num == 1:
            y_pred_test_1 = y_pred_test
            y_pred_proba_test_1 = y_pred_proba_test
            domain = 'News'
        else:
            y_pred_test_2 = y_pred_test
            y_pred_proba_test_2 = y_pred_proba_test
            domain = 'Football'
        
        # Call function - to evaluate the effectiveness of the model
        model_desc = f'Model {model_num}: {model_name} ({model_type} model)\nTest {test_num} Set ({domain} Domain)'
        result = evaluate_model_performance(model_num, model_name, model_type, model_desc, y_test, y_pred_test)
        print ()

        # Save the results (Test 1 or Test 2 sets)
        # Call function - to store test results
        phase = 2                   # 1 for validation phase / 2 for testing phase
        test_set = test_num         # test set 1 or 2 for testing phase
        first_model = True          # True for first model for that phase / False for subsequent model for the same phase
        store_evaluation_results(phase, test_set, first_model, model_num, result)

    return y_pred_test_1, y_pred_proba_test_1, y_pred_test_2, y_pred_proba_test_2

In [15]:
# --------------------------------------------------------------------------------------------------
# Load the trained BERT model and evaluate its performance on multiple test sets
# --------------------------------------------------------------------------------------------------
# Load the saved BERT model
model_test_bert = BertForSequenceClassification.from_pretrained("model_bert")
tokenizer_test_bert = BertTokenizer.from_pretrained("model_bert")

# Test sets 1 and 2
test_sets = [(X_test_1, y_test_1),   # Test set 1
             (X_test_2, y_test_2)]   # Test set 2

# Call function - to generate predictions and evaluate performance of CNN model
model_num = 4
model_name = 'BERT'
model_type = 'Transformer-Based'
(y_pred_test_1_bert, y_pred_proba_test_1_bert,
 y_pred_test_2_bert, y_pred_proba_test_2_bert) = evaluate_model_bert_on_test_sets(model_num, 
                                                                                  model_name, 
                                                                                  model_type,
                                                                                  test_sets
)

# Save predicted probabilities for ensemble stacking (test sets 1 and 2)
joblib.dump(y_pred_proba_test_1_bert, "saved_predictions/y_pred_proba_test_1_bert.pkl")
joblib.dump(y_pred_proba_test_2_bert, "saved_predictions/y_pred_proba_test_2_bert.pkl")


[1m
Model 4: BERT (Transformer-Based model)
Test 1 Set (News Domain):
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,4,BERT,1.0,0.99,1.0,1.0,Transformer-Based


Classification Report:
              precision    recall  f1-score   support

        Real       1.00      0.99      1.00       508
        Fake       0.99      1.00      1.00       482

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990



[1m
Model 4: BERT (Transformer-Based model)
Test 2 Set (Football Domain):
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,4,BERT,0.96,0.94,0.98,0.96,Transformer-Based


Classification Report:
              precision    recall  f1-score   support

        Real       0.98      0.93      0.95      2004
        Fake       0.94      0.98      0.96      2182

    accuracy                           0.96      4186
   macro avg       0.96      0.96      0.96      4186
weighted avg       0.96      0.96      0.96      4186




['saved_predictions/y_pred_proba_test_2_bert.pkl']

In [16]:
# --------------------------------------------------------------------------------------------------
# Delete the BERT model and tokenizer,
# and call garbage collection to release the memory held by Python
# --------------------------------------------------------------------------------------------------
import gc

# Delete the BERT model and tokenizer
del model_bert
del model_test_bert
del tokenizer_bert
del tokenizer_test_bert

# Call garbage collection to free memory
gc.collect()

# Clear session
torch.cuda.empty_cache()

# 6. Ensemble Model and Validation Accuracy Comparison

### ⚠️ **Note:** 
* Please switch back to the **kernel** for **`TensorFlow`** before continuing this section.
* This is required for the CNN model that uses the `TensorFlow` libraries.

In [1]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import joblib   # for saving models for future use and retrieval of models

## 6.1. Retrieval of Training and Validation Sets

In [2]:
# --------------------------------------------------------------------------------------------------
# Load X and y for training and validation sets and predicted probabilities from other models
# --------------------------------------------------------------------------------------------------
# Load X and y for training and validation sets
X_train = joblib.load('datasets/X_train.pkl')
y_train = joblib.load('datasets/y_train.pkl')
X_val = joblib.load('datasets/X_val.pkl')
y_val = joblib.load('datasets/y_val.pkl')

# Load predicted probabilities for ensemble stacking (validation set)
y_pred_proba_val_nb = joblib.load("saved_predictions/y_pred_proba_val_nb.pkl")            # Model 1 (NB)
y_pred_proba_val_lr = joblib.load("saved_predictions/y_pred_proba_val_lr.pkl")            # Model 2 (LR)
y_pred_proba_val_cnn = joblib.load("saved_predictions/y_pred_proba_val_cnn.pkl")          # Model 3 (CNN)
y_pred_proba_val_bert = joblib.load("saved_predictions/y_pred_proba_val_bert.pkl")        # Model 4 (BERT)

## 6.2. Model 5: Stacking Ensemble (Meta Classifier) - Validation

In [3]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import numpy as np                                                        # for numerical operations
from sklearn.linear_model import LogisticRegression                       # Statistical model - Logistic Regression
from importlib import reload                                              # to reload updated Python modules
import utils.evaluate_model_performance                                   # import module (needed for reload)
reload(utils.evaluate_model_performance)                                  # reload to reflect any latest changes in file
from utils.evaluate_model_performance import evaluate_model_performance   # custom function to evaluate models
import utils.store_evaluation_results                                     # import custom function (needed for reload)
reload(utils.store_evaluation_results)                                    # reload to reflect any latest changes in file
from utils.store_evaluation_results import store_evaluation_results       # custom function to store evaluation results

In [4]:
# --------------------------------------------------------------------------------------------------
# Stack predicted probabilities from multiple base models (Naïve Bayes, Logistic Regression, CNN, 
# and BERT). These probabilities are used as features to train a meta-model (Logistic 
# Regression). The meta-model learns to combine base model predictions for improved performance.
# --------------------------------------------------------------------------------------------------
# Create a new feature matrix with probabilities of class 1 from each base model
X_val_meta = np.column_stack((
    y_pred_proba_val_nb[:, 1],        # Naïve Bayes probabilities
    y_pred_proba_val_lr[:, 1],        # Logistic Regression probabilities
    y_pred_proba_val_cnn[:, 1],       # CNN probabilities
    y_pred_proba_val_bert[:, 1]       # BERT probabilities
))

# Train a Logistic Regression model on the stacked probabilities
model_meta = LogisticRegression()
model_meta.fit(X_val_meta, y_val)

# Save the trained meta-model
joblib.dump(model_meta, "saved_models/model_meta.pkl")

# Evaluate on the validation set to predict and get probabilities using the meta-model
y_pred_meta_val = model_meta.predict(X_val_meta)
y_pred_proba_meta_val = model_meta.predict_proba(X_val_meta)

# Call function - to evaluate the effectiveness of the model
model_num = 5
model_name = 'Stacking Ensemble'
model_type = 'Meta Classifier'
model_desc = f'Model {model_num}: {model_name} ({model_type} model)\nValidation Set'
result     = evaluate_model_performance(model_num, model_name, model_type, model_desc, y_val, y_pred_meta_val)

# Call function - to store validation results
phase = 1                  # 1 for validation phase / 2 for testing phase
test_set = 0               # test set 1 or 2 for testing phase
first_model = False        # True for first model for that phase / False for subsequent model for the same phase
store_evaluation_results(phase, test_set, first_model, model_num, result)

[1m
Model 5: Stacking Ensemble (Meta Classifier model)
Validation Set:
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,5,Stacking Ensemble,0.97,0.97,0.97,0.97,Meta Classifier


Classification Report:
              precision    recall  f1-score   support

        Real       0.97      0.97      0.97      2479
        Fake       0.97      0.97      0.97      2696

    accuracy                           0.97      5175
   macro avg       0.97      0.97      0.97      5175
weighted avg       0.97      0.97      0.97      5175



## 6.3. Validation Scores Comparison for All Models

In [5]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import pandas as pd  # to read CSV files into pandas dataframe

In [6]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Load saved model results on Validation, Test 1 or Test 2 Sets, 
#            combine them into a single DataFrame, style the 'Accuracy' column,
#            and format all values to 2 decimal places, and display the styled table with a header
# --------------------------------------------------------------------------------------------------
def load_and_display_metric_scores(file_path, table_desc):
    # Load saved results
    all_model_results = joblib.load(file_path)

    # Combine into one DataFrame
    final_results_df = pd.concat(all_model_results, ignore_index=True)

    # Sort by 'Model' column in ascending order
    final_results_df = final_results_df.sort_values(by='Model')
    
    # Style: bold 'Accuracy' column and format all values to 2 decimal places
    styled_df = final_results_df.style \
        .map(lambda val: 'font-weight: bold', subset=['Accuracy']) \
        .format(precision=2)

    # Display the full table with bolded Accuracy column
    print(f'\033[1m\n{table_desc}\n\033[0m') 
    display(styled_df)

In [7]:
# --------------------------------------------------------------------------------------------------
# Call function to load and display all model results on Validation set
# --------------------------------------------------------------------------------------------------
file_path = 'results/all_model_results_val.pkl'
table_desc = 'Validation Scores Comparison for All Models'
load_and_display_metric_scores(file_path, table_desc)

[1m
Validation Scores Comparison for All Models
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,1,Naive Bayes,0.93,0.94,0.93,0.94,Baseline
1,2,Logistic Regression,0.96,0.96,0.96,0.96,Traditional Machine Learning
2,3,CNN,0.96,0.97,0.95,0.96,Deep Learning
3,4,BERT,0.97,0.95,0.99,0.97,Transformer-Based
4,5,Stacking Ensemble,0.97,0.97,0.97,0.97,Meta Classifier


# 7. Test Set Evaluation and Accuracy Comparison

## 7.1. Retrieval of Test Sets

In [10]:
# --------------------------------------------------------------------------------------------------
# Load test sets, including padded inputs and encoded labels, for evaluating the Naive Bayes, 
# Logistic Regression, CNN and Ensemble models
# --------------------------------------------------------------------------------------------------
# Load X and y from test sets
X_test_1 = joblib.load('datasets/X_test_1.pkl')
y_test_1 = joblib.load('datasets/y_test_1.pkl')
X_test_2 = joblib.load('datasets/X_test_2.pkl')
y_test_2 = joblib.load('datasets/y_test_2.pkl')

# Load TF-IDF transformed test sets - for models 1 (Naive Bayes) and 2 (Logistic Regression)
X_test_1_tfidf = joblib.load('datasets/X_test_1_tfidf.pkl')
X_test_2_tfidf = joblib.load('datasets/X_test_2_tfidf.pkl')

# Load the padded sequences - for model 3 (CNN)
X_test_1_padded = joblib.load('datasets/X_test_1_padded.pkl')
X_test_2_padded = joblib.load('datasets/X_test_2_padded.pkl')

# Load the encoded labels - for model 3 (CNN)
y_test_1_encoded = joblib.load('datasets/y_test_1_encoded.pkl')
y_test_2_encoded = joblib.load('datasets/y_test_2_encoded.pkl')

# Load predicted probabilities for ensemble stacking (test sets 1 and 2) - for model 6 (Stacking)
y_pred_proba_test_1_bert = joblib.load("saved_predictions/y_pred_proba_test_1_bert.pkl")        # Model 4 (BERT) - Test set 1
y_pred_proba_test_2_bert = joblib.load("saved_predictions/y_pred_proba_test_2_bert.pkl")        # Model 4 (BERT) - Test set 2

## 7.2. General Evaluation Function for Multiple Test Sets

In [11]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from utils.evaluate_model_performance import evaluate_model_performance  # custom function to evaluate models

In [12]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Loop through multiple test sets to generate predictions and evaluate model performance
# --------------------------------------------------------------------------------------------------
def evaluate_model_on_test_sets(model_num, model_name, model_type, model_desc, test_sets):

    test_num = 0

    # Loop through each test set
    for X_test, y_test in test_sets:

        # Call respective functions - to generate class probabilities and predicted labels
        # For model 1 (Naive Bayes)
        if model_num == 1:
            y_pred_test, y_pred_proba_test = predict_nb_test_labels(X_test)

        # For model 2 (Logistic Regression)
        elif model_num == 2:
            y_pred_test, y_pred_proba_test = predict_lr_test_labels(X_test)

        # For model 3 (CNN)
        elif model_num == 3:
            y_pred_test, y_pred_proba_test = predict_cnn_test_labels(X_test)

        # Increment test_num by 1 (for Test 1 or Test 2 sets)
        test_num += 1
        if test_num == 1:
            y_pred_test_1 = y_pred_test
            y_pred_proba_test_1 = y_pred_proba_test
            domain = 'News'
        else:
            y_pred_test_2 = y_pred_test
            y_pred_proba_test_2 = y_pred_proba_test
            domain = 'Football'
        
        # Call function - to evaluate the effectiveness of the model
        model_desc2 = model_desc + f'\nTest {str(test_num)} Set ({domain} Domain)'
        result = evaluate_model_performance(model_num, model_name, model_type, model_desc2, y_test, y_pred_test)
        print ()

        # Save the results (Test 1 or Test 2 sets)
        # Call function - to store test results
        phase = 2                   # 1 for validation phase / 2 for testing phase
        test_set = test_num         # test set 1 or 2 for testing phase
        first_model = False         # True for first model for that phase / False for subsequent model for the same phase
        store_evaluation_results(phase, test_set, first_model, model_num, result)
    
    return y_pred_test_1, y_pred_proba_test_1, y_pred_test_2, y_pred_proba_test_2

## 7.3. Model 1: Naive Bayes - Testing

In [13]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from sklearn.naive_bayes import MultinomialNB         # Multinomial Naive Bayes model

In [14]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Generate class probabilities and predicted labels for the Naive Bayes model 
#            on the test set
# --------------------------------------------------------------------------------------------------
def predict_nb_test_labels(X_test_tfidf):
    y_pred_nb_test = model_test_nb.predict(X_test_tfidf)
    y_pred_proba_nb_test = model_test_nb.predict_proba(X_test_tfidf.toarray())
    return y_pred_nb_test, y_pred_proba_nb_test

In [15]:
# --------------------------------------------------------------------------------------------------
# Load the trained Naive Bayes model and evaluate its performance on multiple test sets
# --------------------------------------------------------------------------------------------------
# Load the saved Naive Bayes model
model_test_nb = joblib.load("saved_models/model_nb.pkl")

# Test sets 1 and 2
test_sets = [(X_test_1_tfidf, y_test_1),   # Test set 1
             (X_test_2_tfidf, y_test_2)]   # Test set 2

# Call function - to generate predictions and evaluate performance of Naive Bayes model
model_num = 1
model_name = 'Naive Bayes'
model_type = 'Baseline'
model_desc = f'Model {model_num}: {model_name} ({model_type} model) using TF-IDF Vectoriser'
y_pred_test_1_nb, y_pred_proba_test_1_nb, y_pred_test_2_nb, y_pred_proba_test_2_nb = evaluate_model_on_test_sets(model_num, 
                                                                                                                 model_name, 
                                                                                                                 model_type,
                                                                                                                 model_desc,
                                                                                                                 test_sets)

[1m
Model 1: Naive Bayes (Baseline model) using TF-IDF Vectoriser
Test 1 Set (News Domain):
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,1,Naive Bayes,0.94,0.99,0.88,0.93,Baseline


Classification Report:
              precision    recall  f1-score   support

        Real       0.90      0.99      0.94       508
        Fake       0.99      0.88      0.93       482

    accuracy                           0.94       990
   macro avg       0.95      0.94      0.94       990
weighted avg       0.94      0.94      0.94       990


[1m
Model 1: Naive Bayes (Baseline model) using TF-IDF Vectoriser
Test 2 Set (Football Domain):
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,1,Naive Bayes,0.93,0.93,0.94,0.93,Baseline


Classification Report:
              precision    recall  f1-score   support

        Real       0.94      0.92      0.93      2004
        Fake       0.93      0.94      0.93      2182

    accuracy                           0.93      4186
   macro avg       0.93      0.93      0.93      4186
weighted avg       0.93      0.93      0.93      4186




## 7.4. Model 2: Logistic Regression - Testing

In [16]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression   # Statistical model - Logistic Regression

In [17]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Generate class probabilities and predicted labels for the Logistic Regression model 
#            on the test set
# --------------------------------------------------------------------------------------------------
def predict_lr_test_labels(X_test_tfidf):
    y_pred_lr_test = model_test_lr_best.predict(X_test_tfidf)
    y_pred_proba_lr_test = model_test_lr_best.predict_proba(X_test_tfidf.toarray())
    return y_pred_lr_test, y_pred_proba_lr_test

In [18]:
# --------------------------------------------------------------------------------------------------
# Load the trained Logistic Regression model and evaluate its performance on multiple test sets
# --------------------------------------------------------------------------------------------------
# Load the saved Logistic Regression model
model_test_lr_best = joblib.load("saved_models/model_lr_best.pkl")

# Test sets 1 and 2
test_sets = [(X_test_1_tfidf, y_test_1),   # Test set 1
             (X_test_2_tfidf, y_test_2)]   # Test set 2

# Call function - to generate predictions and evaluate performance of Logistic Regression model
model_num = 2
model_name = 'Logistic Regression'
model_type = 'Traditional Machine Learning'
model_desc = f'Model {model_num}: {model_name} ({model_type} model) using TF-IDF Vectoriser'
y_pred_test_1_lr, y_pred_proba_test_1_lr, y_pred_test_2_lr, y_pred_proba_test_2_lr = evaluate_model_on_test_sets(model_num, 
                                                                                                                 model_name, 
                                                                                                                 model_type,
                                                                                                                 model_desc,
                                                                                                                 test_sets)

[1m
Model 2: Logistic Regression (Traditional Machine Learning model) using TF-IDF Vectoriser
Test 1 Set (News Domain):
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,2,Logistic Regression,0.99,0.99,0.99,0.99,Traditional Machine Learning


Classification Report:
              precision    recall  f1-score   support

        Real       0.99      0.99      0.99       508
        Fake       0.99      0.99      0.99       482

    accuracy                           0.99       990
   macro avg       0.99      0.99      0.99       990
weighted avg       0.99      0.99      0.99       990


[1m
Model 2: Logistic Regression (Traditional Machine Learning model) using TF-IDF Vectoriser
Test 2 Set (Football Domain):
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,2,Logistic Regression,0.95,0.95,0.95,0.95,Traditional Machine Learning


Classification Report:
              precision    recall  f1-score   support

        Real       0.95      0.95      0.95      2004
        Fake       0.95      0.95      0.95      2182

    accuracy                           0.95      4186
   macro avg       0.95      0.95      0.95      4186
weighted avg       0.95      0.95      0.95      4186




## 7.5. Model 3: CNN - Testing

In [19]:
# --------------------------------------------------------------------------------------------------
# Libraries
# --------------------------------------------------------------------------------------------------
import tensorflow as tf                          # for deep learning models
from tensorflow.keras.models import load_model   # load a saved Keras model
import numpy as np                               # for numerical operations

2025-03-30 15:26:42.862860: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743319602.884887   19178 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743319602.889837   19178 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743319602.902723   19178 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743319602.902751   19178 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1743319602.902754   19178 computation_placer.cc:177] computation placer alr

In [20]:
# --------------------------------------------------------------------------------------------------
# FUNCTION - Generate class probabilities and predicted labels for the CNN model on the test set
# --------------------------------------------------------------------------------------------------
def predict_cnn_test_labels(X_test_padded):
    y_pred_proba_cnn_test = model_test_cnn.predict(X_test_padded)                                # Original: (num_samples, 1)
    y_pred_proba_cnn_test = np.column_stack([1 - y_pred_proba_cnn_test, y_pred_proba_cnn_test])  # Convert to (num_samples, 2)
    y_pred_cnn_test = np.argmax(y_pred_proba_cnn_test, axis=1)                                   # Class labels    
    return y_pred_cnn_test, y_pred_proba_cnn_test

In [21]:
# --------------------------------------------------------------------------------------------------
# Load the trained CNN model and evaluate its performance on multiple test sets
# --------------------------------------------------------------------------------------------------
# Load the saved CNN model
model_test_cnn = load_model("saved_models/model_cnn.keras")

# Test sets 1 and 2
test_sets = [(X_test_1_padded, y_test_1),   # Test set 1
             (X_test_2_padded, y_test_2)]   # Test set 2

# Call function - to generate predictions and evaluate performance of CNN model
model_num = 3
model_name = 'CNN'
model_type = 'Deep Learning'
model_desc = f'Model {model_num}: {model_name} ({model_type} model)'
y_pred_test_1_cnn, y_pred_proba_test_1_cnn, y_pred_test_2_cnn, y_pred_proba_test_2_cnn = evaluate_model_on_test_sets(model_num, 
                                                                                                                     model_name, 
                                                                                                                     model_type,
                                                                                                                     model_desc,
                                                                                                                     test_sets)

# Clear session to release memory
tf.keras.backend.clear_session()

I0000 00:00:1743319605.190024   19178 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7220 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1
I0000 00:00:1743319607.764023   19354 service.cc:152] XLA service 0x759358006050 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1743319607.764045   19354 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce GTX 1080, Compute Capability 6.1
2025-03-30 15:26:47.775359: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1743319607.804159   19354 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m 1/31[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m16s[0m 562ms/step

I0000 00:00:1743319608.246203   19354 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step
[1m
Model 3: CNN (Deep Learning model)
Test 1 Set (News Domain):
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,3,CNN,1.0,1.0,1.0,1.0,Deep Learning


Classification Report:
              precision    recall  f1-score   support

        Real       1.00      1.00      1.00       508
        Fake       1.00      1.00      1.00       482

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990


[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m
Model 3: CNN (Deep Learning model)
Test 2 Set (Football Domain):
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,3,CNN,0.94,0.96,0.94,0.95,Deep Learning


Classification Report:
              precision    recall  f1-score   support

        Real       0.93      0.95      0.94      2004
        Fake       0.96      0.94      0.95      2182

    accuracy                           0.94      4186
   macro avg       0.94      0.94      0.94      4186
weighted avg       0.94      0.94      0.94      4186




In [22]:
# --------------------------------------------------------------------------------------------------
# Step 3 - Delete the CNN test model and call garbage collection to release the memory held by Python
# --------------------------------------------------------------------------------------------------
import gc
import tensorflow as tf

# Delete the CNN test model
del model_test_cnn

# Call garbage collection to free memory
gc.collect()

# Clear session
tf.keras.backend.clear_session()

## 7.6. Model 5: Stacking Ensemble - Testing

In [23]:
# --------------------------------------------------------------------------------------------------
# Stack predicted probabilities from multiple base models (Naïve Bayes, Logistic Regression, CNN, 
# BERT, and RoBERTa). These probabilities are used as features to train a meta-model (Logistic 
# Regression). The meta-model learns to combine base model predictions for improved performance.
# --------------------------------------------------------------------------------------------------
# Create new feature matrices with probabilities of class 1 from each base model
# For Test set 1
X_test_1_meta = np.column_stack((
    y_pred_proba_test_1_nb[:, 1],        # Naïve Bayes probabilities
    y_pred_proba_test_1_lr[:, 1],        # Logistic Regression probabilities
    y_pred_proba_test_1_cnn[:, 1],       # CNN probabilities
    y_pred_proba_test_1_bert[:, 1]       # BERT probabilities
))

# For Test set 2
X_test_2_meta = np.column_stack((
    y_pred_proba_test_2_nb[:, 1],        # Naïve Bayes probabilities
    y_pred_proba_test_2_lr[:, 1],        # Logistic Regression probabilities
    y_pred_proba_test_2_cnn[:, 1],       # CNN probabilities
    y_pred_proba_test_2_bert[:, 1]       # BERT probabilities
))

# Store test meta features in a labelled dictionary
X_test_meta_sets = {
    "Test 1 Dataset": X_test_1_meta,
    "Test 2 Dataset": X_test_2_meta
}

test_num = 0

# Loop through test sets to predict and evaluate model
for label, X_meta in X_test_meta_sets.items():
    # Make predictions on test data using the meta model
    y_pred_test_meta = model_meta.predict(X_meta)

    # Increment test_num
    test_num += 1
    if test_num == 1:
        y_test = y_test_1
    else:
        y_test = y_test_2
    
    # Call function - to evaluate the effectiveness of the model
    model_num = 5
    model_name = 'Stacking Ensemble'
    model_type = 'Meta Classifier'
    model_desc = f'Model {model_num}: {model_name} ({model_type} model)\nTest {str(test_num)} Set'
    result = evaluate_model_performance(model_num, model_name, model_type, model_desc, y_test, y_pred_test_meta)

    # Save the results (Test 1 or Test 2 sets)
    # Call function - to store test results
    phase = 2                   # 1 for validation phase / 2 for testing phase
    test_set = test_num         # test set 1 or 2 for testing phase
    first_model = False         # True for first model for that phase / False for subsequent model for the same phase
    store_evaluation_results(phase, test_set, first_model, model_num, result)

[1m
Model 5: Stacking Ensemble (Meta Classifier model)
Test 1 Set:
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,5,Stacking Ensemble,1.0,1.0,1.0,1.0,Meta Classifier


Classification Report:
              precision    recall  f1-score   support

        Real       1.00      1.00      1.00       508
        Fake       1.00      1.00      1.00       482

    accuracy                           1.00       990
   macro avg       1.00      1.00      1.00       990
weighted avg       1.00      1.00      1.00       990

[1m
Model 5: Stacking Ensemble (Meta Classifier model)
Test 2 Set:
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
0,5,Stacking Ensemble,0.96,0.97,0.95,0.96,Meta Classifier


Classification Report:
              precision    recall  f1-score   support

        Real       0.95      0.97      0.96      2004
        Fake       0.97      0.95      0.96      2182

    accuracy                           0.96      4186
   macro avg       0.96      0.96      0.96      4186
weighted avg       0.96      0.96      0.96      4186



## 7.7. Test Scores Comparison for All Models

In [24]:
# --------------------------------------------------------------------------------------------------
# Load and display all model results on Test 1 and Test 2 sets
# --------------------------------------------------------------------------------------------------
for i in range(2):
    if (i+1 == 1):
        domain = 'News'
    else:
        domain = 'Football'

    # Call function - to load and display all model results on Validation set
    file_path = f'results/all_model_results_test_{i+1}.pkl'
    table_desc = f'Test Scores Comparison for All Models - Test {i+1} Set ({domain} Domain)'
    load_and_display_metric_scores(file_path, table_desc)

[1m
Test Scores Comparison for All Models - Test 1 Set (News Domain)
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
1,1,Naive Bayes,0.94,0.99,0.88,0.93,Baseline
2,2,Logistic Regression,0.99,0.99,0.99,0.99,Traditional Machine Learning
3,3,CNN,1.0,1.0,1.0,1.0,Deep Learning
0,4,BERT,1.0,0.99,1.0,1.0,Transformer-Based
4,5,Stacking Ensemble,1.0,1.0,1.0,1.0,Meta Classifier


[1m
Test Scores Comparison for All Models - Test 2 Set (Football Domain)
[0m


Unnamed: 0,Model,Model name,Accuracy,Precision,Recall,F1 Score,Model type
1,1,Naive Bayes,0.93,0.93,0.94,0.93,Baseline
2,2,Logistic Regression,0.95,0.95,0.95,0.95,Traditional Machine Learning
3,3,CNN,0.94,0.96,0.94,0.95,Deep Learning
0,4,BERT,0.96,0.94,0.98,0.96,Transformer-Based
4,5,Stacking Ensemble,0.96,0.97,0.95,0.96,Meta Classifier
