In [2]:
import os
# os.listdir returns entries in arbitrary order; sorting keeps the displayed
# listing stable from one run to the next.
sorted(os.listdir('/content/'))

['.config', 'Fake.csv', 'True.csv', 'sample_data']

In [3]:
import pandas as pd
# Load the two source CSVs: one with real articles, one with fake articles.
true_data = pd.read_csv('/content/True.csv')
fake_data = pd.read_csv('/content/Fake.csv')
# Labels: 0 for real and 1 for fake.
true_data['label'] = 0
fake_data['label'] = 1

# Combine the two datasets into a single frame.
data = pd.concat([true_data, fake_data])
# Shuffle the rows. The fixed random_state makes the row order, and anything
# derived from it downstream, reproducible after a kernel restart.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
print(data.head())

                                               title  \
0  ‘ENABLING HILLARY’ Creates Insane Reason to Pr...   
1   Texas Congressman HILARIOUSLY Trolls Trump, S...   
2  Trump tweets mock video of himself tackling, p...   
3  Episode #9 – ON THE QT: ‘Cozy Bears & Eggnog’ ...   
4  Democrats dig in, delay against Dodd-Frank ove...   

                                                text       subject  \
0   ENABLING HILLARY  came to Senator Al Franken ...      politics   
1  Joaquin Castro of Texas is a rising star in th...          News   
2   BRIDGEWATER, N.J. (Reuters) - President Donal...  politicsNews   
3   Here is the FULL Episode #9 of this podcast  ...   Middle-east   
4  WASHINGTON (Reuters) - Democrats in the U.S. H...  politicsNews   

                date  label  
0       Nov 17, 2017      1  
1  November 28, 2016      1  
2      July 2, 2017       0  
3  December 20, 2016      1  
4       May 2, 2017       0  


**Check the data structure**

In [4]:
# Inspect the column index, then narrow the frame to the fields kept from here on.
print(data.columns)
selected_columns = ['title', 'text', 'label']
data = data[selected_columns]

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


**Text processing**

In [5]:
import re

def preprocess_text(text):
  """Normalize a raw text value for downstream processing.

  Steps, in order: every non-word character is replaced by a single space,
  runs of digits are removed, the result is lower-cased and leading/trailing
  whitespace is stripped. Interior runs of spaces are left as they are.

  Args:
    text: The value to clean. Normally a str; None and float NaN (how pandas
      represents an empty CSV cell) are treated as missing, and any other
      non-string value is converted with str().

  Returns:
    The cleaned string, or '' when the input is missing.
  """
  # re.sub raises TypeError on None/NaN, so map missing values to '' up front.
  # `text != text` is true only for NaN.
  if text is None or (isinstance(text, float) and text != text):
    return ''
  if not isinstance(text, str):
    text = str(text)
  text = re.sub(r'\W', ' ', text)
  text = re.sub(r'\d+', '', text)
  text = text.lower().strip()
  return text

# Run the same cleaning routine over both free-text columns.
for column_name in ('text', 'title'):
  data[column_name] = data[column_name].apply(preprocess_text)


**Split data into training and test sets**

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state pins the split for a given row order.
features = data['text']
targets = data['label']
x_train, x_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42
)
print(f'Training set size: {len(x_train)}, Test set size: {len(x_test)}')


Training set size: 35918, Test set size: 8980


**Tokenization and feature extraction**

In [None]:
from transformers import BertTokenizer
# Fetch the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Both splits are encoded with the same options: truncate to at most 512 tokens and pad.
encode_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(x_train.tolist(), **encode_options)
test_encodings = tokenizer(x_test.tolist(), **encode_options)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [None]:
!pip install datasets


**Fine-tune a pre-trained BERT model**

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Relies on train_encodings / test_encodings and y_train / y_test from earlier cells.
def _to_model_inputs(encodings, labels):
    """Collect token ids, attention mask and label values into one column dict.

    The label column is named 'labels' for compatibility with the Trainer.
    """
    return {
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels.values,
    }

train_data = _to_model_inputs(train_encodings, y_train)
test_data = _to_model_inputs(test_encodings, y_test)

# Wrap each column dict in a Hugging Face Dataset.
train_dataset, test_dataset = (
    Dataset.from_dict(columns) for columns in (train_data, test_data)
)

# BERT sequence-classification model from the 'bert-base-uncased' checkpoint, configured for two labels.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Run configuration: output locations first, then the optimisation settings.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
)

# The Trainer is handed the model, its configuration and both datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start fine-tuning.
trainer.train()


**Model Evaluation**

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Run the trainer's model over the test dataset.
predictions = trainer.predict(test_dataset)
# Index of the largest value along the last axis, used as the predicted label.
predicted_labels = predictions.predictions.argmax(-1)

# Overall accuracy against the held-out labels.
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy}")

# Per-class breakdown from scikit-learn's classification report.
print(classification_report(y_test, predicted_labels))
