In [None]:
# Install TensorFlow
!pip install tensorflow

# Install Pandas
!pip install pandas

# Install Hugging Face Transformers
!pip install transformers

# Install NumPy
!pip install numpy


In [4]:
import tensorflow as tf
import pandas as pd
from transformers import (
    DistilBertTokenizer, TFDistilBertForSequenceClassification,
    RobertaTokenizer, TFRobertaForSequenceClassification,
)
import tensorflow as tf
import numpy as np




  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Read the raw validation and training splits, keep only the tweet body and
# its label, and expose the tweet column under the name "text".
val_data = (
    pd.read_csv("./Data/Raw/Constraint_Val.csv")
    .loc[:, ["tweet", "label"]]
    .rename(columns={"tweet": "text"})
)
train_data = (
    pd.read_csv("./Data/Raw/Constraint_Train.csv")
    .loc[:, ["tweet", "label"]]
    .rename(columns={"tweet": "text"})
)
def clean(df):
    """Return a copy of ``df`` with a ``text`` column and boolean labels.

    A ``tweet`` column, when present, is renamed to ``text``. String labels
    are converted with ``'real' -> True`` and ``'fake' -> False``. A label
    column that already has a boolean dtype is kept unchanged, so applying
    the function to its own output is a no-op instead of turning every label
    into NaN.

    Args:
        df: DataFrame with a ``label`` column and a ``tweet`` or ``text``
            column.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        ValueError: if a non-boolean label column contains anything other
            than ``'real'`` or ``'fake'`` (missing values included).
    """
    renamed = df.rename(columns={"tweet": "text"})
    labels = renamed['label']

    # Already converted (e.g. the cell was run twice): nothing left to map.
    if pd.api.types.is_bool_dtype(labels):
        return renamed

    converted = labels.map({'real': True, 'fake': False})

    # Series.map yields NaN for keys absent from the dict; surface those
    # instead of letting NaN labels flow into training.
    unmapped = converted.isna()
    if unmapped.any():
        unexpected = labels[unmapped].unique().tolist()
        raise ValueError(f"Unrecognised label value(s): {unexpected!r}")

    return renamed.assign(label=converted.astype(bool))
# Normalise both splits with the same cleaning routine (validation first).
val_data, train_data = clean(val_data), clean(train_data)

In [3]:
# Render the validation DataFrame as rich notebook output.
display(val_data)

Unnamed: 0,text,label
0,Chinese converting to Islam after realising th...,False
1,11 out of 13 people (from the Diamond Princess...,False
2,"COVID-19 Is Caused By A Bacterium, Not Virus A...",False
3,Mike Pence in RNC speech praises Donald Trump’...,False
4,6/10 Sky's @EdConwaySky explains the latest #C...,True
...,...,...
2135,Donald Trump wrongly claimed that New Zealand ...,False
2136,Current understanding is #COVID19 spreads most...,True
2137,Nothing screams “I am sat around doing fuck al...,False
2138,Birx says COVID-19 outbreak not under control ...,False


In [4]:
# Fetch each ensemble member from the Hugging Face Hub as a (tokenizer, model)
# pair. from_pt=True initialises the TensorFlow model from PyTorch weights.
distilbert_tokenizer, distilbert_model = (
    DistilBertTokenizer.from_pretrained('jojo0616/my_Misinformation_distilbert_model'),
    TFDistilBertForSequenceClassification.from_pretrained('jojo0616/my_Misinformation_distilbert_model', from_pt=True),
)

roberta_tokenizer_v1, roberta_model_v1 = (
    RobertaTokenizer.from_pretrained('vikram71198/distilroberta-base-finetuned-fake-news-detection'),
    TFRobertaForSequenceClassification.from_pretrained('vikram71198/distilroberta-base-finetuned-fake-news-detection', from_pt=True),
)

roberta_tokenizer_v2, roberta_model_v2 = (
    RobertaTokenizer.from_pretrained('hamzab/roberta-fake-news-classification'),
    TFRobertaForSequenceClassification.from_pretrained('hamzab/roberta-fake-news-classification', from_pt=True),
)




All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClas

In [5]:
# Flatten each split into plain Python lists: one of texts, one of labels.
train_text, train_labels = (train_data[column].tolist() for column in ('text', 'label'))
val_text, val_labels = (val_data[column].tolist() for column in ('text', 'label'))

In [6]:
# Extended training function with saving functionality
def train_and_save_model(model, tokenizer, texts, labels, model_save_path, epochs=3,
                         batch_size=8, learning_rate=5e-5, max_length=512,
                         shuffle_seed=None):
    """Fine-tune a sequence-classification model and save it with its tokenizer.

    Args:
        model: Model object providing ``compile``, ``fit`` and
            ``save_pretrained`` (the notebook passes Hugging Face TF models).
        tokenizer: Tokenizer callable matching ``model``; must also provide
            ``save_pretrained``.
        texts: Iterable of raw strings (a single string is treated as one
            example). Tokenization happens inside this function.
        labels: Class labels aligned with ``texts``. Booleans, integers or a
            tensor of either are accepted; they are cast to int32 class ids.
        model_save_path: Directory the model and tokenizer are written to.
        epochs: Number of passes over the data.
        batch_size: Examples per training batch.
        learning_rate: Adam learning rate.
        max_length: Truncation length (in tokens) used when tokenizing.
        shuffle_seed: Optional seed for the shuffle, for reproducible runs.

    Raises:
        ValueError: if ``texts`` is empty.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        raise ValueError("texts must contain at least one example")

    # Tokenize the whole corpus once, padded to the longest encoded sequence.
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="tf", max_length=max_length)

    # The sparse categorical loss works on integer class ids; the notebook
    # supplies booleans, so make the conversion explicit.
    labels = tf.cast(tf.convert_to_tensor(labels), tf.int32)

    # Keras does not shuffle a tf.data.Dataset passed to fit(), so shuffle
    # here (buffer spans the full dataset) before batching.
    dataset = (
        tf.data.Dataset.from_tensor_slices((dict(inputs), labels))
        .shuffle(buffer_size=len(texts), seed=shuffle_seed)
        .batch(batch_size)
    )

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    # from_logits=True: the loss is fed the model's raw outputs.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    # Train the model
    model.fit(dataset, epochs=epochs)

    # Persist model and tokenizer side by side so the directory can be
    # reloaded with from_pretrained().
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

# Output directories, one per ensemble member.
distilbert_save_path = './distilbert_finetuned'
roberta_v1_save_path = './roberta_v1_finetuned'
roberta_v2_save_path = './roberta_v2_finetuned'

# Every model is fine-tuned on the same raw texts and the same label tensor;
# tokenization is done inside train_and_save_model, so raw text is passed.
labels_tensor = tf.convert_to_tensor(train_labels)

train_and_save_model(
    model=distilbert_model,
    tokenizer=distilbert_tokenizer,
    texts=train_text,
    labels=labels_tensor,
    model_save_path=distilbert_save_path,
    epochs=1,
)
train_and_save_model(
    model=roberta_model_v1,
    tokenizer=roberta_tokenizer_v1,
    texts=train_text,
    labels=labels_tensor,
    model_save_path=roberta_v1_save_path,
    epochs=1,
)
train_and_save_model(
    model=roberta_model_v2,
    tokenizer=roberta_tokenizer_v2,
    texts=train_text,
    labels=labels_tensor,
    model_save_path=roberta_v2_save_path,
    epochs=1,
)

print("Models and tokenizers have been saved.")



Models and tokenizers have been saved.


In [5]:
# Reload every fine-tuned ensemble member from its local save directory as a
# (tokenizer, model) pair.

# DistilBERT
distilbert_tokenizer, distilbert_model = (
    DistilBertTokenizer.from_pretrained('./distilbert_finetuned'),
    TFDistilBertForSequenceClassification.from_pretrained('./distilbert_finetuned'),
)

# RoBERTa, first variant
roberta_tokenizer_v1, roberta_model_v1 = (
    RobertaTokenizer.from_pretrained('./roberta_v1_finetuned'),
    TFRobertaForSequenceClassification.from_pretrained('./roberta_v1_finetuned'),
)

# RoBERTa, second variant
roberta_tokenizer_v2, roberta_model_v2 = (
    RobertaTokenizer.from_pretrained('./roberta_v2_finetuned'),
    TFRobertaForSequenceClassification.from_pretrained('./roberta_v2_finetuned'),
)

# Output column for each individual ensemble member, in model order.
_MEMBER_COLUMNS = ('DistilBERTPrediction', 'RoBERTaV1Prediction', 'RoBERTaV2Prediction')


def _softmax_probabilities(tokenizer, model, texts, batch_size):
    """Return softmax class probabilities for ``texts`` as one NumPy array.

    Texts are tokenized and run through ``model`` in chunks of ``batch_size``.
    Each chunk is padded only to its own longest sequence rather than to the
    512-token maximum, which avoids spending compute on padding tokens.
    """
    chunks = []
    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            return_tensors="tf",
            truncation=True,
            padding=True,
            max_length=512,
        )
        logits = model(encoded).logits
        chunks.append(tf.nn.softmax(logits, axis=-1).numpy())
    return np.concatenate(chunks, axis=0)


def ensemble_classify_news_and_evaluate_accuracy(df, batch_size=32):
    """Classify every row with three models plus their averaged ensemble.

    Uses the notebook-level tokenizers and models (``distilbert_*``,
    ``roberta_*_v1``, ``roberta_*_v2``). For each model, class index 1 is
    recorded as ``True`` and any other index as ``False``. The ensemble
    prediction is the argmax of the mean of the three probability arrays.
    The ensemble accuracy against ``df['label']`` is printed.

    Args:
        df: DataFrame with a ``text`` column of strings and a ``label`` column.
        batch_size: Number of texts sent through a model per forward pass.

    Returns:
        A copy of ``df`` with the columns ``DistilBERTPrediction``,
        ``RoBERTaV1Prediction``, ``RoBERTaV2Prediction`` and
        ``EnsemblePrediction`` added. The input frame is left untouched.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Work on a copy so the caller's frame does not silently grow columns.
    result = df.copy()
    texts = result['text'].tolist()

    if not texts:
        # Nothing to classify: add empty columns and skip the models entirely.
        for column in _MEMBER_COLUMNS + ('EnsemblePrediction',):
            result[column] = []
        print("Accuracy: nan")
        return result

    members = (
        (distilbert_tokenizer, distilbert_model),
        (roberta_tokenizer_v1, roberta_model_v1),
        (roberta_tokenizer_v2, roberta_model_v2),
    )
    member_probabilities = [
        _softmax_probabilities(tokenizer, model, texts, batch_size)
        for tokenizer, model in members
    ]

    # Per-model predictions: True where class index 1 has the highest probability.
    for column, probabilities in zip(_MEMBER_COLUMNS, member_probabilities):
        result[column] = probabilities.argmax(axis=-1) == 1

    # Ensemble: average the probabilities from all models, then take the argmax.
    avg_probabilities = sum(member_probabilities) / len(member_probabilities)
    result['EnsemblePrediction'] = avg_probabilities.argmax(axis=-1) == 1

    # Fraction of rows where the ensemble prediction equals the label.
    accuracy = (result['EnsemblePrediction'] == result['label']).mean()
    print(f"Accuracy: {accuracy:.4f}")

    return result




All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at ./distilbert_finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ./roberta_v1_finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at ./roberta_v2_

In [8]:
# Score the validation split with the three-model ensemble; the function prints
# the ensemble accuracy and returns the frame with the prediction columns added.
df_updated = ensemble_classify_news_and_evaluate_accuracy(val_data)
# Render the annotated frame as rich notebook output.
display(df_updated)

Accuracy: 0.9799


Unnamed: 0,text,label,DistilBERTPrediction,RoBERTaV1Prediction,RoBERTaV2Prediction,EnsemblePrediction
0,Chinese converting to Islam after realising th...,False,False,False,False,False
1,11 out of 13 people (from the Diamond Princess...,False,False,False,False,False
2,"COVID-19 Is Caused By A Bacterium, Not Virus A...",False,False,False,False,False
3,Mike Pence in RNC speech praises Donald Trump’...,False,False,False,False,False
4,6/10 Sky's @EdConwaySky explains the latest #C...,True,True,True,True,True
...,...,...,...,...,...,...
2135,Donald Trump wrongly claimed that New Zealand ...,False,False,False,False,False
2136,Current understanding is #COVID19 spreads most...,True,True,True,True,True
2137,Nothing screams “I am sat around doing fuck al...,False,False,False,False,False
2138,Birx says COVID-19 outbreak not under control ...,False,False,True,False,False


In [12]:
# Tag the prediction columns with a "_Text" suffix before exporting.
# rename() returns a new frame that is rebound to df_updated; an in-place
# rename would also change the columns of any other name bound to the same
# DataFrame object (such as the frame that was passed to the evaluator).
df_updated = df_updated.rename(columns={
    'DistilBERTPrediction': 'DistilBERTPrediction_Text',
    'RoBERTaV1Prediction': 'RoBERTaV1Prediction_Text',
    'RoBERTaV2Prediction': 'RoBERTaV2Prediction_Text',
    'EnsemblePrediction': 'EnsemblePrediction_Text'
})
# Write the labeled validation split; the DataFrame index is included as the
# first CSV column (to_csv default).
df_updated.to_csv("./Data/Processed/Constraint_Val_Labeled.csv")