In [1]:
import os
import time
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

# ======================= CONFIG ========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Dataset path
liar_dataset_path = "/kaggle/input/fakenewcombined/liar_train_covid_format.csv"
output_base = "/kaggle/working/deberta_liar_model"

# Training parameters
num_epochs = 5
batch_size = 32
max_length = 128
learning_rate = 2e-5
base_model = "microsoft/deberta-base"

print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# =================== HELPER FUNCTIONS ===================
def prepare_dataframe(df):
    """Chuẩn hóa dataframe: chọn 2 cột tweet & label, đổi tên và map label"""
    df = df[['tweet', 'label']].copy()
    df.columns = ['Text', 'Label']
    # Map label: 'real' -> 0, 'fake' -> 1 (theo cách DeBERTa code gốc)
    df['Label'] = df['Label'].map({'real': 0, 'fake': 1})
    return df

def encode_data_with_attention(texts, tokenizer, max_len):
    """Tokenize text với attention mask (theo style DeBERTa gốc)"""
    input_ids_list = []
    attention_masks_list = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids_list.append(encoded_dict['input_ids'])
        attention_masks_list.append(encoded_dict['attention_mask'])
    
    return torch.tensor(input_ids_list), torch.tensor(attention_masks_list)

def flat_accuracy(preds, labels):
    """Tính accuracy (theo style DeBERTa gốc)"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def evaluate_model(model, val_loader):
    """Đánh giá model trên validation set"""
    model.eval()
    all_preds, all_labels = [], []
    eval_loss = 0
    nb_eval_steps = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
            logits = outputs[0]
            
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            
            all_preds.extend(np.argmax(logits, axis=1))
            all_labels.extend(label_ids)
            nb_eval_steps += 1

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, zero_division=0)
    rec = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    return acc, prec, rec, f1

# =================== MAIN TRAINING ===================
print("🚀 Training DeBERTa model on LIAR dataset")

# Load & prepare data
print("📊 Loading and preparing LIAR dataset...")
df = pd.read_csv(liar_dataset_path)
df = prepare_dataframe(df)

print(f"Dataset shape: {df.shape}")
print(f"Label distribution:\n{df['Label'].value_counts()}")

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=seed, stratify=df['Label'])
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

# Initialize tokenizer
print("🔧 Initializing DeBERTa tokenizer...")
tokenizer = DebertaTokenizer.from_pretrained(base_model, do_lower_case=True)

# Encode data
print("🔤 Tokenizing texts...")
train_input_ids, train_attention_masks = encode_data_with_attention(
    train_df['Text'].tolist(), tokenizer, max_length
)
test_input_ids, test_attention_masks = encode_data_with_attention(
    test_df['Text'].tolist(), tokenizer, max_length
)

# Convert labels to tensors
train_labels = torch.tensor(train_df['Label'].values)
test_labels = torch.tensor(test_df['Label'].values)

# Create data loaders
print("📦 Creating data loaders...")
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Initialize model
print("🤖 Initializing DeBERTa model...")
model = DebertaForSequenceClassification.from_pretrained(base_model, num_labels=2)
model.to(device)

# Setup optimizer (theo style DeBERTa gốc)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Training loop
print(f"🏋️‍♂️ Starting training for {num_epochs} epochs...")
train_loss_set = []
best_val_accuracy = 0.0

for epoch in range(num_epochs):
    print(f"\n=== Epoch {epoch + 1}/{num_epochs} ===")
    
    # Training phase
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_loader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
    
    avg_train_loss = tr_loss / nb_tr_steps
    print(f"📈 Train loss: {avg_train_loss:.4f}")
    
    # Validation phase
    print("🔍 Evaluating on test set...")
    acc, prec, rec, f1 = evaluate_model(model, test_loader)
    
    print(f"📊 Test Results:")
    print(f"   Accuracy:  {acc:.4f}")
    print(f"   Precision: {prec:.4f}")
    print(f"   Recall:    {rec:.4f}")
    print(f"   F1 Score:  {f1:.4f}")
    
    # Save best model
    if acc > best_val_accuracy:
        print(f"🎉 New best accuracy: {acc:.4f} (previous: {best_val_accuracy:.4f})")
        best_val_accuracy = acc
        
        # Save model
        os.makedirs(output_base, exist_ok=True)
        model.save_pretrained(output_base)
        tokenizer.save_pretrained(output_base)
        
        # Save state dict (theo style DeBERTa gốc)
        torch.save(model.state_dict(), os.path.join(output_base, 'deberta_liar_best_model.ckpt'))
        print(f"💾 Model saved at: {output_base}")

print(f"\n✅ Training completed!")
print(f"🏆 Best validation accuracy: {best_val_accuracy:.4f}")
print(f"📁 Model saved at: {output_base}")

# Final evaluation
print(f"\n🔬 Final evaluation on test set...")
final_acc, final_prec, final_rec, final_f1 = evaluate_model(model, test_loader)
print(f"📊 Final Test Results:")
print(f"   Accuracy:  {final_acc:.4f}")
print(f"   Precision: {final_prec:.4f}")
print(f"   Recall:    {final_rec:.4f}")
print(f"   F1 Score:  {final_f1:.4f}")

# Save results to CSV
results_df = pd.DataFrame({
    'Dataset': ['LIAR'],
    'Model': ['DeBERTa-base'],
    'Accuracy': [final_acc],
    'Precision': [final_prec],
    'Recall': [final_rec],
    'F1_Score': [final_f1],
    'Best_Val_Accuracy': [best_val_accuracy]
})

results_df.to_csv(os.path.join(output_base, 'liar_deberta_results.csv'), index=False)
print(f"📈 Results saved to: {os.path.join(output_base, 'liar_deberta_results.csv')}")

2025-06-01 13:47:12.058066: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748785632.232254      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748785632.282261      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
GPU: Tesla P100-PCIE-16GB
🚀 Training DeBERTa model on LIAR dataset
📊 Loading and preparing LIAR dataset...
Dataset shape: (10240, 2)
Label distribution:
Label
1    6602
0    3638
Name: count, dtype: int64
Train size: 8192, Test size: 2048
🔧 Initializing DeBERTa tokenizer...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

🔤 Tokenizing texts...
📦 Creating data loaders...
🤖 Initializing DeBERTa model...


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🏋️‍♂️ Starting training for 5 epochs...

=== Epoch 1/5 ===


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

📈 Train loss: 0.6424
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.6445
   Precision: 0.6450
   Recall:    0.9977
   F1 Score:  0.7835
🎉 New best accuracy: 0.6445 (previous: 0.0000)
💾 Model saved at: /kaggle/working/deberta_liar_model

=== Epoch 2/5 ===
📈 Train loss: 0.6040
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.6611
   Precision: 0.6986
   Recall:    0.8341
   F1 Score:  0.7604
🎉 New best accuracy: 0.6611 (previous: 0.6445)
💾 Model saved at: /kaggle/working/deberta_liar_model

=== Epoch 3/5 ===
📈 Train loss: 0.5475
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.6455
   Precision: 0.7060
   Recall:    0.7712
   F1 Score:  0.7371

=== Epoch 4/5 ===
📈 Train loss: 0.4133
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.6533
   Precision: 0.7039
   Recall:    0.7977
   F1 Score:  0.7479

=== Epoch 5/5 ===
📈 Train loss: 0.2351
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.6514
   Precision: 0.7047
   Recall:    0.7902


In [2]:
import os
import time
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

# ======================= CONFIG ========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Dataset path
liar_dataset_path = "/kaggle/input/fakenewcombined/politifact_train.csv"
output_base = "/kaggle/working/deberta_politifact_model"

# Training parameters
num_epochs = 5
batch_size = 32
max_length = 128
learning_rate = 2e-5
base_model = "microsoft/deberta-base"

print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# =================== HELPER FUNCTIONS ===================
def prepare_dataframe(df):
    """Chuẩn hóa dataframe: chọn 2 cột tweet & label, đổi tên và map label"""
    df = df[['tweet', 'label']].copy()
    df.columns = ['Text', 'Label']
    # Map label: 'real' -> 0, 'fake' -> 1 (theo cách DeBERTa code gốc)
    df['Label'] = df['Label'].map({'real': 0, 'fake': 1})
    return df

def encode_data_with_attention(texts, tokenizer, max_len):
    """Tokenize text với attention mask (theo style DeBERTa gốc)"""
    input_ids_list = []
    attention_masks_list = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids_list.append(encoded_dict['input_ids'])
        attention_masks_list.append(encoded_dict['attention_mask'])
    
    return torch.tensor(input_ids_list), torch.tensor(attention_masks_list)

def flat_accuracy(preds, labels):
    """Tính accuracy (theo style DeBERTa gốc)"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def evaluate_model(model, val_loader):
    """Đánh giá model trên validation set"""
    model.eval()
    all_preds, all_labels = [], []
    eval_loss = 0
    nb_eval_steps = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
            logits = outputs[0]
            
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            
            all_preds.extend(np.argmax(logits, axis=1))
            all_labels.extend(label_ids)
            nb_eval_steps += 1

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, zero_division=0)
    rec = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    return acc, prec, rec, f1

# =================== MAIN TRAINING ===================
print("🚀 Training DeBERTa model on LIAR dataset")

# Load & prepare data
print("📊 Loading and preparing Politifact dataset...")
df = pd.read_csv(liar_dataset_path)
df = prepare_dataframe(df)

print(f"Dataset shape: {df.shape}")
print(f"Label distribution:\n{df['Label'].value_counts()}")

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=seed, stratify=df['Label'])
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

# Initialize tokenizer
print("🔧 Initializing DeBERTa tokenizer...")
tokenizer = DebertaTokenizer.from_pretrained(base_model, do_lower_case=True)

# Encode data
print("🔤 Tokenizing texts...")
train_input_ids, train_attention_masks = encode_data_with_attention(
    train_df['Text'].tolist(), tokenizer, max_length
)
test_input_ids, test_attention_masks = encode_data_with_attention(
    test_df['Text'].tolist(), tokenizer, max_length
)

# Convert labels to tensors
train_labels = torch.tensor(train_df['Label'].values)
test_labels = torch.tensor(test_df['Label'].values)

# Create data loaders
print("📦 Creating data loaders...")
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Initialize model
print("🤖 Initializing DeBERTa model...")
model = DebertaForSequenceClassification.from_pretrained(base_model, num_labels=2)
model.to(device)

# Setup optimizer (theo style DeBERTa gốc)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Training loop
print(f"🏋️‍♂️ Starting training for {num_epochs} epochs...")
train_loss_set = []
best_val_accuracy = 0.0

for epoch in range(num_epochs):
    print(f"\n=== Epoch {epoch + 1}/{num_epochs} ===")
    
    # Training phase
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_loader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
    
    avg_train_loss = tr_loss / nb_tr_steps
    print(f"📈 Train loss: {avg_train_loss:.4f}")
    
    # Validation phase
    print("🔍 Evaluating on test set...")
    acc, prec, rec, f1 = evaluate_model(model, test_loader)
    
    print(f"📊 Test Results:")
    print(f"   Accuracy:  {acc:.4f}")
    print(f"   Precision: {prec:.4f}")
    print(f"   Recall:    {rec:.4f}")
    print(f"   F1 Score:  {f1:.4f}")
    
    # Save best model
    if acc > best_val_accuracy:
        print(f"🎉 New best accuracy: {acc:.4f} (previous: {best_val_accuracy:.4f})")
        best_val_accuracy = acc
        
        # Save model
        os.makedirs(output_base, exist_ok=True)
        model.save_pretrained(output_base)
        tokenizer.save_pretrained(output_base)
        
        # Save state dict (theo style DeBERTa gốc)
        torch.save(model.state_dict(), os.path.join(output_base, 'deberta_politifact_best_model.ckpt'))
        print(f"💾 Model saved at: {output_base}")

print(f"\n✅ Training completed!")
print(f"🏆 Best validation accuracy: {best_val_accuracy:.4f}")
print(f"📁 Model saved at: {output_base}")

# Final evaluation
print(f"\n🔬 Final evaluation on test set...")
final_acc, final_prec, final_rec, final_f1 = evaluate_model(model, test_loader)
print(f"📊 Final Test Results:")
print(f"   Accuracy:  {final_acc:.4f}")
print(f"   Precision: {final_prec:.4f}")
print(f"   Recall:    {final_rec:.4f}")
print(f"   F1 Score:  {final_f1:.4f}")

# Save results to CSV
results_df = pd.DataFrame({
    'Dataset': ['LIAR'],
    'Model': ['DeBERTa-base'],
    'Accuracy': [final_acc],
    'Precision': [final_prec],
    'Recall': [final_rec],
    'F1_Score': [final_f1],
    'Best_Val_Accuracy': [best_val_accuracy]
})

results_df.to_csv(os.path.join(output_base, 'liar_deberta_results.csv'), index=False)
print(f"📈 Results saved to: {os.path.join(output_base, 'politifact_deberta_results.csv')}")

Using device: cuda
GPU: Tesla P100-PCIE-16GB
🚀 Training DeBERTa model on LIAR dataset
📊 Loading and preparing Politifact dataset...
Dataset shape: (739, 2)
Label distribution:
Label
0    437
1    302
Name: count, dtype: int64
Train size: 591, Test size: 148
🔧 Initializing DeBERTa tokenizer...
🔤 Tokenizing texts...
📦 Creating data loaders...
🤖 Initializing DeBERTa model...


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🏋️‍♂️ Starting training for 5 epochs...

=== Epoch 1/5 ===
📈 Train loss: 0.6007
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.7230
   Precision: 0.6092
   Recall:    0.8833
   F1 Score:  0.7211
🎉 New best accuracy: 0.7230 (previous: 0.0000)
💾 Model saved at: /kaggle/working/deberta_politifact_model

=== Epoch 2/5 ===
📈 Train loss: 0.4909
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.8041
   Precision: 0.7067
   Recall:    0.8833
   F1 Score:  0.7852
🎉 New best accuracy: 0.8041 (previous: 0.7230)
💾 Model saved at: /kaggle/working/deberta_politifact_model

=== Epoch 3/5 ===
📈 Train loss: 0.4836
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.7973
   Precision: 0.8750
   Recall:    0.5833
   F1 Score:  0.7000

=== Epoch 4/5 ===
📈 Train loss: 0.4322
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.7838
   Precision: 0.6795
   Recall:    0.8833
   F1 Score:  0.7681

=== Epoch 5/5 ===
📈 Train loss: 0.3572
🔍 Evaluating on test set...
📊 Test R

In [3]:
import os
import time
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

# ======================= CONFIG ========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Dataset path
liar_dataset_path = "/kaggle/input/fakenewcombined/gossipcop_train.csv"
output_base = "/kaggle/working/deberta_gossipcop_model"

# Training parameters
num_epochs = 5
batch_size = 32
max_length = 128
learning_rate = 2e-5
base_model = "microsoft/deberta-base"

print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# =================== HELPER FUNCTIONS ===================
def prepare_dataframe(df):
    """Chuẩn hóa dataframe: chọn 2 cột tweet & label, đổi tên và map label"""
    df = df[['tweet', 'label']].copy()
    df.columns = ['Text', 'Label']
    # Map label: 'real' -> 0, 'fake' -> 1 (theo cách DeBERTa code gốc)
    df['Label'] = df['Label'].map({'real': 0, 'fake': 1})
    return df

def encode_data_with_attention(texts, tokenizer, max_len):
    """Tokenize text với attention mask (theo style DeBERTa gốc)"""
    input_ids_list = []
    attention_masks_list = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids_list.append(encoded_dict['input_ids'])
        attention_masks_list.append(encoded_dict['attention_mask'])
    
    return torch.tensor(input_ids_list), torch.tensor(attention_masks_list)

def flat_accuracy(preds, labels):
    """Tính accuracy (theo style DeBERTa gốc)"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def evaluate_model(model, val_loader):
    """Đánh giá model trên validation set"""
    model.eval()
    all_preds, all_labels = [], []
    eval_loss = 0
    nb_eval_steps = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
            logits = outputs[0]
            
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            
            all_preds.extend(np.argmax(logits, axis=1))
            all_labels.extend(label_ids)
            nb_eval_steps += 1

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, zero_division=0)
    rec = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    return acc, prec, rec, f1

# =================== MAIN TRAINING ===================
print("🚀 Training DeBERTa model on gossipcop dataset")

# Load & prepare data
print("📊 Loading and preparing gossipcop dataset...")
df = pd.read_csv(liar_dataset_path)
df = prepare_dataframe(df)

print(f"Dataset shape: {df.shape}")
print(f"Label distribution:\n{df['Label'].value_counts()}")

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=seed, stratify=df['Label'])
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

# Initialize tokenizer
print("🔧 Initializing DeBERTa tokenizer...")
tokenizer = DebertaTokenizer.from_pretrained(base_model, do_lower_case=True)

# Encode data
print("🔤 Tokenizing texts...")
train_input_ids, train_attention_masks = encode_data_with_attention(
    train_df['Text'].tolist(), tokenizer, max_length
)
test_input_ids, test_attention_masks = encode_data_with_attention(
    test_df['Text'].tolist(), tokenizer, max_length
)

# Convert labels to tensors
train_labels = torch.tensor(train_df['Label'].values)
test_labels = torch.tensor(test_df['Label'].values)

# Create data loaders
print("📦 Creating data loaders...")
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Initialize model
print("🤖 Initializing DeBERTa model...")
model = DebertaForSequenceClassification.from_pretrained(base_model, num_labels=2)
model.to(device)

# Setup optimizer (theo style DeBERTa gốc)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Training loop
print(f"🏋️‍♂️ Starting training for {num_epochs} epochs...")
train_loss_set = []
best_val_accuracy = 0.0

for epoch in range(num_epochs):
    print(f"\n=== Epoch {epoch + 1}/{num_epochs} ===")
    
    # Training phase
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_loader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
    
    avg_train_loss = tr_loss / nb_tr_steps
    print(f"📈 Train loss: {avg_train_loss:.4f}")
    
    # Validation phase
    print("🔍 Evaluating on test set...")
    acc, prec, rec, f1 = evaluate_model(model, test_loader)
    
    print(f"📊 Test Results:")
    print(f"   Accuracy:  {acc:.4f}")
    print(f"   Precision: {prec:.4f}")
    print(f"   Recall:    {rec:.4f}")
    print(f"   F1 Score:  {f1:.4f}")
    
    # Save best model
    if acc > best_val_accuracy:
        print(f"🎉 New best accuracy: {acc:.4f} (previous: {best_val_accuracy:.4f})")
        best_val_accuracy = acc
        
        # Save model
        os.makedirs(output_base, exist_ok=True)
        model.save_pretrained(output_base)
        tokenizer.save_pretrained(output_base)
        
        # Save state dict (theo style DeBERTa gốc)
        torch.save(model.state_dict(), os.path.join(output_base, 'deberta_gossipcop_best_model.ckpt'))
        print(f"💾 Model saved at: {output_base}")

print(f"\n✅ Training completed!")
print(f"🏆 Best validation accuracy: {best_val_accuracy:.4f}")
print(f"📁 Model saved at: {output_base}")

# Final evaluation
print(f"\n🔬 Final evaluation on test set...")
final_acc, final_prec, final_rec, final_f1 = evaluate_model(model, test_loader)
print(f"📊 Final Test Results:")
print(f"   Accuracy:  {final_acc:.4f}")
print(f"   Precision: {final_prec:.4f}")
print(f"   Recall:    {final_rec:.4f}")
print(f"   F1 Score:  {final_f1:.4f}")

# Save results to CSV
results_df = pd.DataFrame({
    'Dataset': ['LIAR'],
    'Model': ['DeBERTa-base'],
    'Accuracy': [final_acc],
    'Precision': [final_prec],
    'Recall': [final_rec],
    'F1_Score': [final_f1],
    'Best_Val_Accuracy': [best_val_accuracy]
})

results_df.to_csv(os.path.join(output_base, 'gossipcop_deberta_results.csv'), index=False)
print(f"📈 Results saved to: {os.path.join(output_base, 'gossipcop_deberta_results.csv')}")

Using device: cuda
GPU: Tesla P100-PCIE-16GB
🚀 Training DeBERTa model on gossipcop dataset
📊 Loading and preparing gossipcop dataset...
Dataset shape: (15498, 2)
Label distribution:
Label
0    11772
1     3726
Name: count, dtype: int64
Train size: 12398, Test size: 3100
🔧 Initializing DeBERTa tokenizer...
🔤 Tokenizing texts...
📦 Creating data loaders...
🤖 Initializing DeBERTa model...


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🏋️‍♂️ Starting training for 5 epochs...

=== Epoch 1/5 ===
📈 Train loss: 0.4132
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.8377
   Precision: 0.6434
   Recall:    0.7289
   F1 Score:  0.6834
🎉 New best accuracy: 0.8377 (previous: 0.0000)
💾 Model saved at: /kaggle/working/deberta_gossipcop_model

=== Epoch 2/5 ===
📈 Train loss: 0.2986
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.8658
   Precision: 0.7851
   Recall:    0.6081
   F1 Score:  0.6853
🎉 New best accuracy: 0.8658 (previous: 0.8377)
💾 Model saved at: /kaggle/working/deberta_gossipcop_model

=== Epoch 3/5 ===
📈 Train loss: 0.2268
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.8597
   Precision: 0.7165
   Recall:    0.6886
   F1 Score:  0.7023

=== Epoch 4/5 ===
📈 Train loss: 0.1557
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.8590
   Precision: 0.7098
   Recall:    0.6993
   F1 Score:  0.7045

=== Epoch 5/5 ===
📈 Train loss: 0.1133
🔍 Evaluating on test set...
📊 Test Res

In [4]:
import os
import time
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import DebertaTokenizer, DebertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings("ignore")

# ======================= CONFIG ========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Dataset path
liar_dataset_path = "/kaggle/input/covid-19-fake-news-dataset/Train.csv"
output_base = "/kaggle/working/deberta_covid_model"

# Training parameters
num_epochs = 5
batch_size = 32
max_length = 128
learning_rate = 2e-5
base_model = "microsoft/deberta-base"

print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# =================== HELPER FUNCTIONS ===================
def prepare_dataframe(df):
    """Chuẩn hóa dataframe: chọn 2 cột tweet & label, đổi tên và map label"""
    df = df[['tweet', 'label']].copy()
    df.columns = ['Text', 'Label']
    # Map label: 'real' -> 0, 'fake' -> 1 (theo cách DeBERTa code gốc)
    df['Label'] = df['Label'].map({'real': 0, 'fake': 1})
    return df

def encode_data_with_attention(texts, tokenizer, max_len):
    """Tokenize text với attention mask (theo style DeBERTa gốc)"""
    input_ids_list = []
    attention_masks_list = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids_list.append(encoded_dict['input_ids'])
        attention_masks_list.append(encoded_dict['attention_mask'])
    
    return torch.tensor(input_ids_list), torch.tensor(attention_masks_list)

def flat_accuracy(preds, labels):
    """Tính accuracy (theo style DeBERTa gốc)"""
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def evaluate_model(model, val_loader):
    """Đánh giá model trên validation set"""
    model.eval()
    all_preds, all_labels = [], []
    eval_loss = 0
    nb_eval_steps = 0
    
    with torch.no_grad():
        for batch in val_loader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch
            
            outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
            logits = outputs[0]
            
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            
            all_preds.extend(np.argmax(logits, axis=1))
            all_labels.extend(label_ids)
            nb_eval_steps += 1

    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, zero_division=0)
    rec = recall_score(all_labels, all_preds, zero_division=0)
    f1 = f1_score(all_labels, all_preds, zero_division=0)
    return acc, prec, rec, f1

# =================== MAIN TRAINING ===================
print("🚀 Training DeBERTa model on covid dataset")

# Load & prepare data
print("📊 Loading and preparing covid dataset...")
df = pd.read_csv(liar_dataset_path)
df = prepare_dataframe(df)

print(f"Dataset shape: {df.shape}")
print(f"Label distribution:\n{df['Label'].value_counts()}")

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=seed, stratify=df['Label'])
print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

# Initialize tokenizer
print("🔧 Initializing DeBERTa tokenizer...")
tokenizer = DebertaTokenizer.from_pretrained(base_model, do_lower_case=True)

# Encode data
print("🔤 Tokenizing texts...")
train_input_ids, train_attention_masks = encode_data_with_attention(
    train_df['Text'].tolist(), tokenizer, max_length
)
test_input_ids, test_attention_masks = encode_data_with_attention(
    test_df['Text'].tolist(), tokenizer, max_length
)

# Convert labels to tensors
train_labels = torch.tensor(train_df['Label'].values)
test_labels = torch.tensor(test_df['Label'].values)

# Create data loaders
print("📦 Creating data loaders...")
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Initialize model
print("🤖 Initializing DeBERTa model...")
model = DebertaForSequenceClassification.from_pretrained(base_model, num_labels=2)
model.to(device)

# Setup optimizer (theo style DeBERTa gốc)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Training loop
print(f"🏋️‍♂️ Starting training for {num_epochs} epochs...")
train_loss_set = []
best_val_accuracy = 0.0

for epoch in range(num_epochs):
    print(f"\n=== Epoch {epoch + 1}/{num_epochs} ===")
    
    # Training phase
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    
    for step, batch in enumerate(train_loader):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]
        
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()
        
        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
    
    avg_train_loss = tr_loss / nb_tr_steps
    print(f"📈 Train loss: {avg_train_loss:.4f}")
    
    # Validation phase
    print("🔍 Evaluating on test set...")
    acc, prec, rec, f1 = evaluate_model(model, test_loader)
    
    print(f"📊 Test Results:")
    print(f"   Accuracy:  {acc:.4f}")
    print(f"   Precision: {prec:.4f}")
    print(f"   Recall:    {rec:.4f}")
    print(f"   F1 Score:  {f1:.4f}")
    
    # Save best model
    if acc > best_val_accuracy:
        print(f"🎉 New best accuracy: {acc:.4f} (previous: {best_val_accuracy:.4f})")
        best_val_accuracy = acc
        
        # Save model
        os.makedirs(output_base, exist_ok=True)
        model.save_pretrained(output_base)
        tokenizer.save_pretrained(output_base)
        
        # Save state dict (theo style DeBERTa gốc)
        torch.save(model.state_dict(), os.path.join(output_base, 'deberta_covid_best_model.ckpt'))
        print(f"💾 Model saved at: {output_base}")

print(f"\n✅ Training completed!")
print(f"🏆 Best validation accuracy: {best_val_accuracy:.4f}")
print(f"📁 Model saved at: {output_base}")

# Final evaluation
print(f"\n🔬 Final evaluation on test set...")
final_acc, final_prec, final_rec, final_f1 = evaluate_model(model, test_loader)
print(f"📊 Final Test Results:")
print(f"   Accuracy:  {final_acc:.4f}")
print(f"   Precision: {final_prec:.4f}")
print(f"   Recall:    {final_rec:.4f}")
print(f"   F1 Score:  {final_f1:.4f}")

# Save results to CSV
results_df = pd.DataFrame({
    'Dataset': ['LIAR'],
    'Model': ['DeBERTa-base'],
    'Accuracy': [final_acc],
    'Precision': [final_prec],
    'Recall': [final_rec],
    'F1_Score': [final_f1],
    'Best_Val_Accuracy': [best_val_accuracy]
})

results_df.to_csv(os.path.join(output_base, 'covid_deberta_results.csv'), index=False)
print(f"📈 Results saved to: {os.path.join(output_base, 'covid_deberta_results.csv')}")

Using device: cuda
GPU: Tesla P100-PCIE-16GB
🚀 Training DeBERTa model on covid dataset
📊 Loading and preparing covid dataset...
Dataset shape: (6420, 2)
Label distribution:
Label
0    3360
1    3060
Name: count, dtype: int64
Train size: 5136, Test size: 1284
🔧 Initializing DeBERTa tokenizer...
🔤 Tokenizing texts...
📦 Creating data loaders...
🤖 Initializing DeBERTa model...


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🏋️‍♂️ Starting training for 5 epochs...

=== Epoch 1/5 ===
📈 Train loss: 0.3405
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.9720
   Precision: 0.9721
   Recall:    0.9690
   F1 Score:  0.9705
🎉 New best accuracy: 0.9720 (previous: 0.0000)
💾 Model saved at: /kaggle/working/deberta_covid_model

=== Epoch 2/5 ===
📈 Train loss: 0.0760
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.9782
   Precision: 0.9803
   Recall:    0.9739
   F1 Score:  0.9770
🎉 New best accuracy: 0.9782 (previous: 0.9720)
💾 Model saved at: /kaggle/working/deberta_covid_model

=== Epoch 3/5 ===
📈 Train loss: 0.0369
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.9735
   Precision: 0.9949
   Recall:    0.9493
   F1 Score:  0.9716

=== Epoch 4/5 ===
📈 Train loss: 0.0188
🔍 Evaluating on test set...
📊 Test Results:
   Accuracy:  0.9712
   Precision: 0.9948
   Recall:    0.9444
   F1 Score:  0.9690

=== Epoch 5/5 ===
📈 Train loss: 0.0135
🔍 Evaluating on test set...
📊 Test Results:
  