In [1]:
# Import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf




In [2]:
# Load in DataFrame
url = 'C:/Users/lucas/OneDrive - The Pennsylvania State University/DS340W/Fake_News_Detection_340w.csv'
df = pd.read_csv(url, encoding = 'latin1')

In [3]:
# Test/train/validation split
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

print("Train data shape:", train_data.shape)
print("Validation data shape:", val_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (7883, 6)
Validation data shape: (985, 6)
Test data shape: (986, 6)


In [4]:
# Load pre-trained DistilBERT model and tokenizer
disbert_tr = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(disbert_tr)
model = TFDistilBertForSequenceClassification.from_pretrained(disbert_tr)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [5]:
# Tokenize headlines
max_length = 128

def tokenize_headlines(data):
    return tokenizer(
        data["News_Headline"].tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf'
    )

train_tokenized = tokenize_headlines(train_data)
val_tokenized = tokenize_headlines(val_data)
test_tokenized = tokenize_headlines(test_data)

In [6]:
# Convert labels to numerical values
def map_label(label):
    return 1 if label == 'TRUE' else 0

train_labels = train_data['Label'].map(map_label).tolist()
val_labels = val_data['Label'].map(map_label).tolist()
test_labels = test_data['Label'].map(map_label).tolist()

In [7]:
# Prepare TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_tokenized), train_labels)).shuffle(len(train_data)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_tokenized), val_labels)).batch(64)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_tokenized), test_labels)).batch(64)

In [8]:
# Fine-tune the distillBERT model
optimizer = tf.keras.optimizers.Adam(learning_rate=.005)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.fit(train_dataset, validation_data=val_dataset, epochs=1)




<keras.src.callbacks.History at 0x28a87f9b970>

In [9]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.0, Test Accuracy: 1.0
