In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/politifact-dataset-for-fake-news-detection/politifact_real.csv
/kaggle/input/politifact-dataset-for-fake-news-detection/politifact_fake.csv


## Importing the relevant packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm 
from torch.cuda.amp import autocast, GradScaler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


## Importing the dataset

In [3]:
true = pd.read_csv('/kaggle/input/politifact-dataset-for-fake-news-detection/politifact_real.csv')
fake = pd.read_csv('/kaggle/input/politifact-dataset-for-fake-news-detection/politifact_fake.csv')

# Add labels: 1 for fake, 0 for true
true['label'] = 0
fake['label'] = 1

dataset = pd.concat([true, fake], axis=0, ignore_index=False)

## Preprocessing the dataset

In [4]:
print(dataset.isna().sum(), '\n')
print(dataset['label'].value_counts())

id             0
news_url      61
title          0
tweet_ids    255
label          0
dtype: int64 

label
0    624
1    432
Name: count, dtype: int64


In [5]:
dataset.dropna(inplace=True)

In [6]:
dataset['statement'] = 'id: ' + dataset['id'] + ' news_url: ' + dataset['news_url'] + ' title: ' + dataset['title'] + ' tweet_ids: ' + dataset['tweet_ids']

In [7]:
dataset.drop(['id', 'news_url', 'title', 'tweet_ids'], axis=1, inplace=True)
dataset = dataset[['statement', 'label']]
dataset.reset_index(drop=True, inplace=True)

## Tokenization using ROBERTa tokenizer

In [8]:
# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

MAX_LEN = 256

input_ids_list = []
attention_masks_list = []

for text in tqdm(dataset['statement'].values, desc="Tokenizing", unit="text"):
    # Clear the GPU cache to free up memory
    torch.cuda.empty_cache()
    
    # Run garbage collection
#     gc.collect()
    
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    # Append results to lists
    input_ids_list.append(encoded['input_ids'])
    attention_masks_list.append(encoded['attention_mask'])

# Convert lists to tensors and move to GPU if available
input_ids = torch.cat(input_ids_list).to(device)
attention_masks = torch.cat(attention_masks_list).to(device)
labels = torch.tensor(dataset['label'].values).to(device)

print("Tokenization complete!")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokenizing: 100%|██████████| 762/762 [01:30<00:00,  8.40text/s]


Tokenization complete!


## Train and Test split

In [12]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
train_masks, val_masks, _, _ = train_test_split(attention_masks, labels, test_size=0.2, random_state=42)

# Create DataLoader
batch_size = 8

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

## Fine tuning for ROBERTa Model

In [9]:
torch.cuda.device_count()

2

In [10]:
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base', 
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)

epochs = 10
total_steps = len(train_dataloader) * epochs

num_warmup_steps = int(0.1 * total_steps)  # 10% of total steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps
)

## Training and visualization

In [14]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import plotly.graph_objects as go

In [15]:
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

In [16]:
scaler = GradScaler()  # For mixed precision
accumulation_steps = 4  # Number of steps for gradient accumulation

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    model.train()

    total_loss = 0
    correct_predictions = 0
    total_train = 0

    # Create a single progress bar
    pbar = tqdm(total=len(train_dataloader), desc="Training")
    
    for step, batch in enumerate(train_dataloader):
        torch.cuda.empty_cache()
        
        batch = tuple(b.to(device) for b in batch)
        inputs, masks, labels = batch
        
        # Enable autocasting for mixed precision
        with autocast():
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)  # Updated for RoBERTa
            loss = outputs.loss
            
        # Accumulate gradients
        total_loss += loss.item()

        # Calculate training accuracy
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_train += labels.size(0)

        # Scale the loss and call backward() to create the gradients
        scaler.scale(loss).backward()
        
        # Update weights after accumulating gradients
        if (step + 1) % accumulation_steps == 0:
            # Clip gradients
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()  # Clear gradients for the next step
            scheduler.step()  # Step the scheduler

        # Update progress bar
        pbar.update(1)  # Increment progress bar

    pbar.close()  # Close the progress bar after training

    avg_train_loss = total_loss / len(train_dataloader)
    avg_train_accuracy = correct_predictions / total_train
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_accuracy)

    print(f'Average training loss: {avg_train_loss:.2f}')
    print(f'Training accuracy: {avg_train_accuracy:.2f}')
    
    model.eval()
    eval_loss = 0
    correct_predictions_val = 0
    total_val = 0
    predictions_all = []
    true_labels_all = []

    # Evaluate without gradient tracking
    for batch in val_dataloader:
        batch = tuple(b.to(device) for b in batch)
        inputs, masks, labels = batch

        with torch.no_grad():
            # Pass labels for loss calculation
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels) 

        logits = outputs.logits
        eval_loss += outputs.loss.item() 

        # Calculate validation accuracy
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions_val += (predictions == labels).sum().item()
        total_val += labels.size(0)

        predictions_all.extend(predictions.cpu().numpy())
        true_labels_all.extend(labels.cpu().numpy())

    avg_val_loss = eval_loss / len(val_dataloader)
    avg_val_accuracy = correct_predictions_val / total_val
    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_accuracy)

    print(f'Validation Loss: {avg_val_loss:.2f}')
    print(f'Validation Accuracy: {avg_val_accuracy:.2f}')

  scaler = GradScaler()  # For mixed precision


Epoch 1/10


  with autocast():
Training: 100%|██████████| 77/77 [00:10<00:00,  7.25it/s]


Average training loss: 0.70
Training accuracy: 0.50
Validation Loss: 0.69
Validation Accuracy: 0.48
Epoch 2/10


Training: 100%|██████████| 77/77 [00:09<00:00,  7.85it/s]


Average training loss: 0.68
Training accuracy: 0.59
Validation Loss: 0.66
Validation Accuracy: 0.52
Epoch 3/10


Training: 100%|██████████| 77/77 [00:09<00:00,  7.78it/s]


Average training loss: 0.46
Training accuracy: 0.77
Validation Loss: 0.28
Validation Accuracy: 0.88
Epoch 4/10


Training: 100%|██████████| 77/77 [00:09<00:00,  7.73it/s]


Average training loss: 0.20
Training accuracy: 0.93
Validation Loss: 0.30
Validation Accuracy: 0.87
Epoch 5/10


Training: 100%|██████████| 77/77 [00:10<00:00,  7.69it/s]


Average training loss: 0.15
Training accuracy: 0.94
Validation Loss: 0.28
Validation Accuracy: 0.88
Epoch 6/10


Training: 100%|██████████| 77/77 [00:10<00:00,  7.62it/s]


Average training loss: 0.07
Training accuracy: 0.98
Validation Loss: 0.32
Validation Accuracy: 0.92
Epoch 7/10


Training: 100%|██████████| 77/77 [00:10<00:00,  7.61it/s]


Average training loss: 0.06
Training accuracy: 0.98
Validation Loss: 0.32
Validation Accuracy: 0.94
Epoch 8/10


Training: 100%|██████████| 77/77 [00:10<00:00,  7.55it/s]


Average training loss: 0.08
Training accuracy: 0.98
Validation Loss: 0.44
Validation Accuracy: 0.91
Epoch 9/10


Training: 100%|██████████| 77/77 [00:10<00:00,  7.53it/s]


Average training loss: 0.05
Training accuracy: 0.99
Validation Loss: 0.52
Validation Accuracy: 0.91
Epoch 10/10


Training: 100%|██████████| 77/77 [00:10<00:00,  7.56it/s]


Average training loss: 0.03
Training accuracy: 1.00
Validation Loss: 0.54
Validation Accuracy: 0.92


In [17]:
# After the training loop, calculate other metrics
print("\nClassification Report:")
print(classification_report(true_labels_all, predictions_all))

# Calculate ROC-AUC score if the task is binary classification
roc_auc = roc_auc_score(true_labels_all, predictions_all)
print(f'ROC-AUC Score: {roc_auc:.2f}')


Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.97      0.92        74
           1       0.97      0.86      0.91        79

    accuracy                           0.92       153
   macro avg       0.92      0.92      0.91       153
weighted avg       0.92      0.92      0.91       153

ROC-AUC Score: 0.92


In [18]:
fig = go.Figure()

# Loss
fig.add_trace(go.Scatter(x=list(range(1, epochs + 1)), y=train_losses, mode='lines+markers', name='Training Loss'))
fig.add_trace(go.Scatter(x=list(range(1, epochs + 1)), y=val_losses, mode='lines+markers', name='Validation Loss'))

# Accuracy
fig.add_trace(go.Scatter(x=list(range(1, epochs + 1)), y=train_accuracies, mode='lines+markers', name='Training Accuracy'))
fig.add_trace(go.Scatter(x=list(range(1, epochs + 1)), y=val_accuracies, mode='lines+markers', name='Validation Accuracy'))

# Update layout
fig.update_layout(
    title='Training and Validation Loss and Accuracy',
    xaxis_title='Epochs',
    yaxis_title='Loss / Accuracy',
    legend_title='Metrics'
)

fig.show()

In [22]:
import os
import shutil

save_directory = "/kaggle/working/roberta-politifact"
os.makedirs(save_directory, exist_ok=True)

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


zip_file_path = "/kaggle/working/roberta-politifact.zip"
shutil.make_archive(zip_file_path.replace('.zip', ''), 'zip', save_directory)

'/kaggle/working/roberta-politifact.zip'

In [24]:
os.remove('/kaggle/working/robera-politifact.zip')

NotADirectoryError: [Errno 20] Not a directory: '/kaggle/working/robera-politifact.zip'