In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pheme-dataset-for-rumour-detection/dataset.csv


## Importing the packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm  # For progress bar
from torch.cuda.amp import autocast, GradScaler
import gc

# Set GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
dataset = pd.read_csv('/kaggle/input/pheme-dataset-for-rumour-detection/dataset.csv')
dataset

Unnamed: 0,text,is_rumor,user.handle,topic
0,Charlie Hebdo became well known for publishing...,0.0,BBCDanielS,charliehebdo
1,"Now 10 dead in a shooting there today RT ""@BBC...",0.0,robbylevy,charliehebdo
2,@BBCDanielS @BBCWorld I'm guessing this is bei...,0.0,ModerateInAll,charliehebdo
3,@BBCDanielS @BBCWorld why would you mention th...,0.0,GabTarquini,charliehebdo
4,@BBCDanielS @BBCWorld perps identified?,0.0,freethought41,charliehebdo
...,...,...,...,...
62440,@AnonyOps @Xplant So that means its ok to torc...,1.0,RianAlden,ferguson
62441,"@RianAlden not at all, but they need to change...",1.0,Xplant,ferguson
62442,@Xplant @AnonyOps Absoluteky. But it pains me...,1.0,RianAlden,ferguson
62443,@Xplant @AnonyOps I'm curious how many of thes...,1.0,RianAlden,ferguson


## Preprocessing the dataset

In [4]:
dataset.rename(columns={'is_rumor': 'label'}, inplace=True)

In [5]:
print(dataset['label'].value_counts(), '\n')
print(dataset.isna().sum())

label
0.0    48619
1.0    13824
Name: count, dtype: int64 

text               0
label              2
user.handle        2
topic          12777
dtype: int64


In [6]:
dataset.dropna(inplace=True)

In [7]:
label_0 = dataset[dataset['label'] == 0].sample(n=8000, random_state=42)
label_1 = dataset[dataset['label'] == 1]
dataset = pd.concat([label_0, label_1], ignore_index=True)

In [8]:
dataset['statement'] = 'text: ' + dataset['text'] + ' user_handle: ' + dataset['user.handle'] + ' topic: ' + dataset['topic']
dataset.drop(['text', 'user.handle', 'topic'], axis=1, inplace=True)
dataset = dataset[['statement', 'label']]
dataset.reset_index(drop=True, inplace=True)

In [9]:
dataset['label'] = dataset['label'].astype('int64')

## Tokenization using ROBERTa tokenizer

In [10]:
# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

MAX_LEN = 256

input_ids_list = []
attention_masks_list = []

for text in tqdm(dataset['statement'].values, desc="Tokenizing", unit="text"):
    # Clear the GPU cache to free up memory
    torch.cuda.empty_cache()
    
    # Run garbage collection
#     gc.collect()
    
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    # Append results to lists
    input_ids_list.append(encoded['input_ids'])
    attention_masks_list.append(encoded['attention_mask'])

# Convert lists to tensors and move to GPU if available
input_ids = torch.cat(input_ids_list).to(device)
attention_masks = torch.cat(attention_masks_list).to(device)
labels = torch.tensor(dataset['label'].values).to(device)

print("Tokenization complete!")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokenizing: 100%|██████████| 14479/14479 [00:12<00:00, 1156.93text/s]


Tokenization complete!


## Train and Test split

In [11]:
train_inputs, val_inputs, train_labels, val_labels = train_test_split(input_ids, labels, test_size=0.2, random_state=42)
train_masks, val_masks, _, _ = train_test_split(attention_masks, labels, test_size=0.2, random_state=42)

# Create DataLoader
batch_size = 8

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

## Fine tuning for ROBERTa Model

In [12]:
torch.cuda.device_count()

1

In [13]:
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base', 
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 3
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

## Training and visualization

In [15]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import plotly.graph_objects as go

In [16]:
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

In [17]:
scaler = GradScaler()  # For mixed precision
accumulation_steps = 4  # Number of steps for gradient accumulation

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    model.train()

    total_loss = 0
    correct_predictions = 0
    total_train = 0

    # Create a single progress bar
    pbar = tqdm(total=len(train_dataloader), desc="Training")
    
    for step, batch in enumerate(train_dataloader):
        torch.cuda.empty_cache()
        
        batch = tuple(b.to(device) for b in batch)
        inputs, masks, labels = batch
        
        # Enable autocasting for mixed precision
        with autocast():
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels)  # Updated for RoBERTa
            loss = outputs.loss
            
        # Accumulate gradients
        total_loss += loss.item()

        # Calculate training accuracy
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_train += labels.size(0)

        # Scale the loss and call backward() to create the gradients
        scaler.scale(loss).backward()
        
        # Update weights after accumulating gradients
        if (step + 1) % accumulation_steps == 0:
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()  # Clear gradients for the next step

        # Update progress bar
        pbar.update(1)  # Increment progress bar

    pbar.close()  # Close the progress bar after training

    avg_train_loss = total_loss / len(train_dataloader)
    avg_train_accuracy = correct_predictions / total_train
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_accuracy)

    print(f'Average training loss: {avg_train_loss:.2f}')
    print(f'Training accuracy: {avg_train_accuracy:.2f}')
    
    model.eval()
    eval_loss = 0
    correct_predictions_val = 0
    total_val = 0
    predictions_all = []
    true_labels_all = []

    # Evaluate without gradient tracking
    for batch in val_dataloader:
        batch = tuple(b.to(device) for b in batch)
        inputs, masks, labels = batch

        with torch.no_grad():
            # Pass labels for loss calculation
            outputs = model(input_ids=inputs, attention_mask=masks, labels=labels) 

        logits = outputs.logits
        eval_loss += outputs.loss.item() 

        # Calculate validation accuracy
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions_val += (predictions == labels).sum().item()
        total_val += labels.size(0)

        predictions_all.extend(predictions.cpu().numpy())
        true_labels_all.extend(labels.cpu().numpy())

    avg_val_loss = eval_loss / len(val_dataloader)
    avg_val_accuracy = correct_predictions_val / total_val
    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_accuracy)

    print(f'Validation Loss: {avg_val_loss:.2f}')
    print(f'Validation Accuracy: {avg_val_accuracy:.2f}')

  scaler = GradScaler()  # For mixed precision


Epoch 1/3


  with autocast():
Training: 100%|██████████| 1448/1448 [05:34<00:00,  4.33it/s]


Average training loss: 0.48
Training accuracy: 0.72
Validation Loss: 0.32
Validation Accuracy: 0.85
Epoch 2/3


Training: 100%|██████████| 1448/1448 [05:33<00:00,  4.34it/s]


Average training loss: 0.29
Training accuracy: 0.87
Validation Loss: 0.22
Validation Accuracy: 0.91
Epoch 3/3


Training: 100%|██████████| 1448/1448 [05:34<00:00,  4.33it/s]


Average training loss: 0.19
Training accuracy: 0.92
Validation Loss: 0.20
Validation Accuracy: 0.92


## Evaluation

In [18]:
# After the training loop, calculate other metrics
print("\nClassification Report:")
print(classification_report(true_labels_all, predictions_all))

# Calculate ROC-AUC score if the task is binary classification
roc_auc = roc_auc_score(true_labels_all, predictions_all)
print(f'ROC-AUC Score: {roc_auc:.2f}')


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1594
           1       0.91      0.91      0.91      1302

    accuracy                           0.92      2896
   macro avg       0.92      0.92      0.92      2896
weighted avg       0.92      0.92      0.92      2896

ROC-AUC Score: 0.92


In [19]:
fig = go.Figure()

# Loss
fig.add_trace(go.Scatter(x=list(range(1, epochs + 1)), y=train_losses, mode='lines+markers', name='Training Loss'))
fig.add_trace(go.Scatter(x=list(range(1, epochs + 1)), y=val_losses, mode='lines+markers', name='Validation Loss'))

# Accuracy
fig.add_trace(go.Scatter(x=list(range(1, epochs + 1)), y=train_accuracies, mode='lines+markers', name='Training Accuracy'))
fig.add_trace(go.Scatter(x=list(range(1, epochs + 1)), y=val_accuracies, mode='lines+markers', name='Validation Accuracy'))

# Update layout
fig.update_layout(
    title='Training and Validation Loss and Accuracy',
    xaxis_title='Epochs',
    yaxis_title='Loss / Accuracy',
    legend_title='Metrics'
)

fig.show()

In [20]:
import os
import shutil

save_directory = "/kaggle/working/roberta-pheme"
os.makedirs(save_directory, exist_ok=True)

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


zip_file_path = "/kaggle/working/roberta-pheme.zip"
shutil.make_archive(zip_file_path.replace('.zip', ''), 'zip', save_directory)

'/kaggle/working/roberta-pheme.zip'