In [12]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import TFBertForSequenceClassification, BertTokenizer
import tensorflow as tf

In [13]:
# Load in DataFrame
url = "C:/Users/lucas/OneDrive - The Pennsylvania State University/DS340W/WELFake_Dataset.csv"
df = pd.read_csv(url, encoding = 'latin1')
df.head()
df.drop(columns=["Unnamed: 0"])

# Sample an even number of rows for each label
sample_size_per_label = 5000  

# Separate the DataFrame into two based on the labels
df_label_0 = df[df['label'] == 0]
df_label_1 = df[df['label'] == 1]

# Sample from each of these DataFrames
df_label_0_sample = df_label_0.sample(n=sample_size_per_label, random_state=1)
df_label_1_sample = df_label_1.sample(n=sample_size_per_label, random_state=1)

# Concatenate the samples back into a single DataFrame
df_sampled = pd.concat([df_label_0_sample, df_label_1_sample])

# Shuffle the sampled DataFrame to mix the rows from the two labels
df_sampled = df_sampled.sample(frac=1, random_state=1).reset_index(drop=True)



In [14]:
# Test/train/validation split
train_data, val_data = train_test_split(df_sampled, test_size=0.2, random_state=42)

Y_train = pd.DataFrame(train_data['label'])
X_train = train_data[['text', 'title']]
Y_val = pd.DataFrame(val_data['label'])
X_val = val_data[['text', 'title']]


print("Train data shape:", train_data.shape)
print("Validation_X data shape:", X_val.shape)
print("Validation_Y data shape:", Y_val.shape)

Train data shape: (8000, 4)
Validation_X data shape: (2000, 2)
Validation_Y data shape: (2000, 1)


In [16]:
# Load pre-trained BERT model and tokenizer
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
model = TFBertForSequenceClassification.from_pretrained(bert_model_name)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
max_length = 128

def tokenize_headlines(data):
    # Ensure all entries are converted to strings
    text_data = data.astype(str)
    
    return tokenizer(
        text_data.tolist(),  # Convert the text data into a list of strings directly
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf'
    )

# Assuming X_train and X_val DataFrames have a 'text' column
train_tokenized = tokenize_headlines(X_train["text"])
val_tokenized = tokenize_headlines(X_val["text"])


In [18]:
train_labels = Y_train['label'].tolist()
val_labels = Y_val['label'].tolist()

In [19]:
# Prepare TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_tokenized), train_labels)).shuffle(len(X_train)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_tokenized), val_labels)).batch(32)

In [20]:
#Help find the best learning rate
#initial_learning_rate = 0.005
#decay_steps = len(train_dataset)  # Number of steps in one epoch
#decay_rate = 0.9

#def lr_scheduler(epoch, lr):
#    return lr * decay_rate

#scheduler_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)

#Optimization and Evaluation
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Fit the model with learning rate scheduling
model.fit(train_dataset, epochs=1)

# Save the model
model.save("bert_fine_tuned_model")

 16/250 [>.............................] - ETA: 4:28:43 - loss: 1.5137 - accuracy: 0.5254

KeyboardInterrupt: 

In [10]:
loss, accuracy = model.evaluate(val_dataset)

print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

Validation Loss: 0.6935189366340637
Validation Accuracy: 0.492000013589859


In [9]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_dataset)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Test Loss: 0.0, Test Accuracy: 1.0


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define the hyperparameter distributions
param_dist = {
    'learning_rate': uniform(0.001, 0.01),
    'dropout_rate': uniform(0.1, 0.5),
    'num_layers': [1, 2, 3]
}

# Perform random search
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist, n_iter=10, cv=3, n_jobs=-1)
random_search_result = random_search.fit(X_train, Y_train)

# Summarize results
print("Best: %f using %s" % (random_search_result.best_score_, random_search_result.best_params_))


TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator <transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForSequenceClassification object at 0x0000024991DC1960> does not.