In [47]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import tensorflow as tf

In [34]:
# Load in DataFrame
url = 'C:/Users/lucas/OneDrive - The Pennsylvania State University/DS340W/Fake_News_Detection_340w.csv'
df = pd.read_csv(url, encoding = 'latin1')
df = df.drop(columns=['Stated_On', 'Date', 'Link_Of_News'])

In [52]:
# Test/train/validation split
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

Y_train = train_data.drop(columns=['News_Headline', 'Source'])
X_train = train_data.drop(columns=['Label'])
Y_val = val_data.drop(columns=['News_Headline', 'Source'])
X_val = val_data.drop(columns=['Label']) 

print("Train data shape:", train_data.shape)
print("Validation_X data shape:", X_val.shape)
print("Validation_Y data shape:", Y_val.shape)
print("Test data shape:", test_data.shape)

Train data shape: (7883, 3)
Validation_X data shape: (985, 2)
Validation_Y data shape: (985, 1)
Test data shape: (986, 3)


In [53]:
# Load pre-trained DistilBERT model and tokenizer
disbert_tr = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(disbert_tr)
model = TFDistilBertForSequenceClassification.from_pretrained(disbert_tr)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [54]:
max_length = 128
# Tokenize headlines
def tokenize_headlines(data):
    return tokenizer(
        data.tolist(),  # Tokenize headlines directly
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf'
    )

train_tokenized = tokenize_headlines(X_train["News_Headline"])
val_tokenized = tokenize_headlines(X_val["News_Headline"])

In [55]:
#Mapping Labels for Ground Truth
def map_label(label):
    return 1 if label == 'TRUE' else 0

train_labels = Y_train['Label'].map(map_label).tolist()
val_labels = Y_val['Label'].map(map_label).tolist()  

In [58]:
# Prepare TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_tokenized), train_labels)).shuffle(len(X_train)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_tokenized), val_labels)).batch(32)

In [59]:
# Compile and fit the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
model.fit(train_dataset, epochs=1)

# Save the model
model.save("distilbert_fine_tuned_model")

























INFO:tensorflow:Assets written to: distilbert_fine_tuned_model\assets


INFO:tensorflow:Assets written to: distilbert_fine_tuned_model\assets


In [60]:
loss, accuracy = model.evaluate(val_dataset)

print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

Validation Loss: 0.0
Validation Accuracy: 1.0


In [9]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.0, Test Accuracy: 1.0
