In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/liar-dataset/test.tsv
/kaggle/input/liar-dataset/README
/kaggle/input/liar-dataset/train.tsv
/kaggle/input/liar-dataset/valid.tsv


## Importing the packages

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm  # For progress bar
from torch.cuda.amp import autocast, GradScaler
import gc

# Set GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


## Importing and preprocessing the dataset

In [3]:
train = pd.read_csv('/kaggle/input/liar-dataset/train.tsv', delimiter='\t')
test = pd.read_csv('/kaggle/input/liar-dataset/test.tsv', delimiter='\t')
valid = pd.read_csv('/kaggle/input/liar-dataset/valid.tsv', delimiter='\t')

In [4]:
print('Train:\n', train.isna().sum(), '\n')
print('Test:\n', test.isna().sum(), '\n')
print('Valid:\n', valid.isna().sum(), '\n')

Train:
 2635.json                                                                                0
false                                                                                    0
Says the Annies List political group supports third-trimester abortions on demand.       0
abortion                                                                                 2
dwayne-bohac                                                                             2
State representative                                                                  2898
Texas                                                                                 2210
republican                                                                               2
0                                                                                        2
1                                                                                        2
0.1                                                                               

In [5]:
def preprocess(df):
    column_names = ["id", "label", "statement", "subjects", "speaker","speaker_job", "state", "party", "barely_true","false", "half_true", "mostly_true", "pants_on_fire", "context"]
    df.columns = column_names
    
    df = df.select_dtypes(exclude=['number'])
    df.fillna('unknown', inplace=True)
    df['statement'] = 'id: ' + df['statement'] + ', context: ' + df['context'] + ', subjects: ' + df['subjects'] + ', speaker: ' + df['speaker'] + ', speaker_job: ' + df['speaker_job'] + ', state: ' + df['state'] + ', party: ' + df['party']
    df.drop(['id', 'subjects', 'speaker', 'speaker_job', 'state', 'party', 'context'], axis=1, inplace=True)
    df['label'] = df['label'].apply(lambda x: 1 if x in ['false', 'pants-on-fire', 'barely-true'] else 0)
    df = df[['statement', 'label']]
    
    return df

train = preprocess(train)
test = preprocess(test)
valid = preprocess(valid)

## Tokenizing the train, test, and valid datasets by ROBERTa Tokenizer

In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

MAX_LEN = 256

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [7]:
def tokenize_data(dataset):
    input_ids_list = []
    attention_masks_list = []

    # Encode the dataset
    for text in tqdm(dataset['statement'].values, desc="Tokenizing", unit="text"):
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        input_ids_list.append(encoded['input_ids'])
        attention_masks_list.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids_list)
    attention_masks = torch.cat(attention_masks_list)

    labels = torch.tensor(dataset['label'].astype('category').cat.codes.values)

    return input_ids, attention_masks, labels

In [8]:
train_input_ids, train_attention_masks, train_labels = tokenize_data(train)
valid_input_ids, valid_attention_masks, valid_labels = tokenize_data(valid)
test_input_ids, test_attention_masks, test_labels = tokenize_data(test)

Tokenizing: 100%|██████████| 10239/10239 [00:09<00:00, 1082.19text/s]
Tokenizing: 100%|██████████| 1283/1283 [00:01<00:00, 1128.56text/s]
Tokenizing: 100%|██████████| 1266/1266 [00:01<00:00, 1167.05text/s]


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_input_ids, train_attention_masks, train_labels = train_input_ids.to(device), train_attention_masks.to(device), train_labels.to(device)
valid_input_ids, valid_attention_masks, valid_labels = valid_input_ids.to(device), valid_attention_masks.to(device), valid_labels.to(device)
test_input_ids, test_attention_masks, test_labels = test_input_ids.to(device), test_attention_masks.to(device), test_labels.to(device)

## Creating dataloaders

In [10]:
batch_size = 4

train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(valid_input_ids, valid_attention_masks, valid_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## ROBERTa model initialization

In [11]:
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base', 
    num_labels=2, 
    output_attentions=False,
    output_hidden_states=False
)

model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [12]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)

epochs = 3
total_steps = len(train_dataloader) * epochs

num_warmup_steps = int(0.1 * total_steps)  # 10% of total steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=total_steps
)

## Training and visualization

In [13]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import plotly.graph_objects as go

In [14]:
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

In [15]:
scaler = GradScaler()  # For mixed precision
accumulation_steps = 4  # Number of steps for gradient accumulation

# Define the total number of iterations for the progress bar
total_steps = len(train_dataloader) + len(val_dataloader)

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    model.train()

    total_loss = 0
    correct_predictions = 0
    total_train = 0

    # Create a single progress bar for the entire epoch
    pbar = tqdm(total=len(train_dataloader), desc="Training")
    
    # Training Phase
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the appropriate device
        batch = tuple(b.to(device) for b in batch)
        inputs, masks, labels = batch
        
        # Ensure labels are of type long (integer)
        labels = labels.long()  # Convert labels to long type if they are not already

        # Enable autocasting for mixed precision
        with autocast():
            outputs = model(input_ids=inputs, attention_mask=masks)  # Updated for RoBERTa
            logits = outputs.logits
            
            # CrossEntropyLoss expects labels to be in the shape [batch_size] and integers
            loss = torch.nn.functional.cross_entropy(logits, labels)  # Use Cross Entropy Loss
            
        # Accumulate loss
        total_loss += loss.item()

        # Calculate training accuracy
        predictions = torch.argmax(logits, dim=-1)
        correct_predictions += (predictions == labels).sum().item()
        total_train += labels.size(0)

        # Scale the loss and call backward() to create the gradients
        scaler.scale(loss).backward()
        
        # Update weights after accumulating gradients
        if (step + 1) % accumulation_steps == 0 or step == len(train_dataloader) - 1:
            # Clip gradients to prevent explosion
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)  # Update model parameters
            scaler.update()         # Update the scaler for the next iteration
            optimizer.zero_grad()   # Clear gradients for the next step

        # Update progress bar for training
        pbar.update(1)  # Increment progress bar
    pbar.close()

    avg_train_loss = total_loss / len(train_dataloader)
    avg_train_accuracy = correct_predictions / total_train
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_accuracy)

    print(f'Average training loss: {avg_train_loss:.2f}')
    print(f'Training accuracy: {avg_train_accuracy:.2f}')

    # Evaluation Phase
    model.eval()
    eval_loss = 0
    correct_predictions_val = 0
    total_val = 0

    # Evaluate without gradient tracking
    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(b.to(device) for b in batch)
            inputs, masks, labels = batch

            # Ensure labels are of type long (integer)
            labels = labels.long()  # Convert labels to long type if they are not already
            
            outputs = model(input_ids=inputs, attention_mask=masks) 
            logits = outputs.logits
            eval_loss += torch.nn.functional.cross_entropy(logits, labels).item()  # Use Cross Entropy Loss

            # Calculate validation accuracy
            predictions = torch.argmax(logits, dim=-1)
            correct_predictions_val += (predictions == labels).sum().item()
            total_val += labels.size(0)

    avg_val_loss = eval_loss / len(val_dataloader)
    avg_val_accuracy = correct_predictions_val / total_val
    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_accuracy)

    print(f'Validation Loss: {avg_val_loss:.2f}')
    print(f'Validation Accuracy: {avg_val_accuracy:.2f}')

  scaler = GradScaler()  # For mixed precision


Epoch 1/3


  with autocast():
Training: 100%|██████████| 2560/2560 [02:52<00:00, 14.82it/s]


Average training loss: 0.66
Training accuracy: 0.62
Validation Loss: 0.67
Validation Accuracy: 0.61
Epoch 2/3


Training: 100%|██████████| 2560/2560 [02:53<00:00, 14.78it/s]


Average training loss: 0.66
Training accuracy: 0.62
Validation Loss: 0.67
Validation Accuracy: 0.61
Epoch 3/3


Training: 100%|██████████| 2560/2560 [02:53<00:00, 14.78it/s]


Average training loss: 0.67
Training accuracy: 0.61
Validation Loss: 0.67
Validation Accuracy: 0.61


In [16]:
criterion = torch.nn.CrossEntropyLoss()

model.eval()  # Set model to evaluation mode
test_loss = 0
correct_predictions_test = 0
total_test = 0
test_predictions_all = []
test_true_labels_all = []

# Evaluate on the test set without gradient tracking
for batch in test_dataloader:
    batch = tuple(b.to(device) for b in batch)
    inputs, masks, labels = batch

    # Ensure labels are long type for CrossEntropyLoss
    labels = labels.long()

    with torch.no_grad():
        # Get model outputs (logits)
        outputs = model(input_ids=inputs, attention_mask=masks)
        
        logits = outputs.logits
        
        # Calculate loss using CrossEntropyLoss
        loss = criterion(logits, labels)
        test_loss += loss.item() 

    # Calculate test accuracy
    predictions = torch.argmax(logits, dim=-1)
    correct_predictions_test += (predictions == labels).sum().item()
    total_test += labels.size(0)

    test_predictions_all.extend(predictions.cpu().numpy())
    test_true_labels_all.extend(labels.cpu().numpy())

# Calculate average test loss and accuracy
avg_test_loss = test_loss / len(test_dataloader)
avg_test_accuracy = correct_predictions_test / total_test

print(f'Test Loss: {avg_test_loss:.2f}')
print(f'Test Accuracy: {avg_test_accuracy:.2f}')

Test Loss: 0.66
Test Accuracy: 0.64


In [17]:
print("\nClassification Report:")
print(classification_report(test_true_labels_all, test_predictions_all))

roc_auc = roc_auc_score(test_true_labels_all, test_predictions_all)
print(f'ROC-AUC Score: {roc_auc:.2f}')


Classification Report:
              precision    recall  f1-score   support

           0       0.64      1.00      0.78       805
           1       0.00      0.00      0.00       461

    accuracy                           0.64      1266
   macro avg       0.32      0.50      0.39      1266
weighted avg       0.40      0.64      0.49      1266

ROC-AUC Score: 0.50


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
import plotly.graph_objs as go

fig = go.Figure()

# Loss traces
fig.add_trace(go.Scatter(
    x=list(range(1, epochs + 1)),y=train_losses,mode='lines+markers',name='Training Loss',line=dict(color='blue', width=2),marker=dict(size=5)
))

fig.add_trace(go.Scatter(
    x=list(range(1, epochs + 1)),y=val_losses,mode='lines+markers',name='Validation Loss',line=dict(color='orange', width=2),marker=dict(size=5)
))

# Accuracy traces
fig.add_trace(go.Scatter(
    x=list(range(1, epochs + 1)),y=train_accuracies,mode='lines+markers',name='Training Accuracy',line=dict(color='green', width=2),marker=dict(size=5)
))

fig.add_trace(go.Scatter(
    x=list(range(1, epochs + 1)),y=val_accuracies,mode='lines+markers',name='Validation Accuracy',line=dict(color='red', width=2),marker=dict(size=5)
))

# Update layout
fig.update_layout(
    title='Training and Validation Loss and Accuracy',xaxis_title='Epochs',yaxis_title='Loss',legend_title='Metrics',yaxis=dict(title='Loss', titlefont=dict(color='blue'), tickfont=dict(color='blue')),yaxis2=dict(title='Accuracy', titlefont=dict(color='green'), tickfont=dict(color='green'), overlaying='y', side='right'),showlegend=True
)

# Update the y-axis for accuracy
fig.update_traces(yaxis='y', selector=dict(name='Training Accuracy'))
fig.update_traces(yaxis='y', selector=dict(name='Validation Accuracy'))

fig.show()


In [19]:
import os
import shutil

save_directory = "/kaggle/working/roberta-liar-binary"
os.makedirs(save_directory, exist_ok=True)

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


zip_file_path = "/kaggle/working/roberta-liar-binary.zip"
shutil.make_archive(zip_file_path.replace('.zip', ''), 'zip', save_directory)

'/kaggle/working/roberta-liar-binary.zip'