In [None]:
# Install required libraries
!pip install transformers datasets torch pandas numpy scikit-learn
!pip install onnx onnxruntime  # For model optimization
# Update transformers to the latest version
!pip install --upgrade transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizer
import kagglehub

# Download latest version.
# kagglehub.dataset_download() returns a local path (a str), not a DataFrame,
# so the CSV has to be read explicitly. "creditcard.csv" is the file name used
# by the Kaggle input path elsewhere in this notebook.
dataset_dir = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
csv_path = os.path.join(dataset_dir, "creditcard.csv")

# Load the dataset
data = pd.read_csv(csv_path)


# Verify the data
print("Dataset shape:", data.shape)
print("First few rows of the dataset:")
print(data.head())

# Create synthetic text descriptions
data['text'] = data.apply(lambda x: f"Transaction of ${x['Amount']:.2f} at time {x['Time']:.0f} seconds", axis=1)

# Select relevant columns
data = data[['text', 'Class']].rename(columns={'Class': 'label'})

# Handle class imbalance (optional: undersample non-fraudulent transactions)
fraud = data[data['label'] == 1]
non_fraud = data[data['label'] == 0].sample(n=len(fraud) * 5, random_state=42)  # 1:5 ratio
balanced_data = pd.concat([fraud, non_fraud]).sample(frac=1, random_state=42)

# Split into train and test sets
train_data, test_data = train_test_split(balanced_data, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize function
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padded/truncated to 128 tokens."""
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

AttributeError: 'str' object has no attribute 'shape'

In [None]:
# NOTE: transformers is already upgraded by the install cell at the top of the
# notebook, so it is not reinstalled here.
import math

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, TrainingArguments, Trainer

# Load tokenizer (fallback to distilbert-base-uncased due to local tokenizer incompatibility)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load model (use distilbert-base-uncased due to local model loading failure)
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2  # Binary classification (fraud vs. legitimate)
)

NUM_EPOCHS = 3
PER_DEVICE_BATCH_SIZE = 8

# Warm up over roughly the first 10% of optimizer steps.
# A fixed warmup_steps=500 was longer than the whole run: the checkpoints
# referenced elsewhere in this notebook (checkpoint-148/-296/-444) indicate
# about 444 steps in total, so the learning rate never reached its peak.
n_devices = max(1, torch.cuda.device_count())
steps_per_epoch = math.ceil(len(train_dataset) / (PER_DEVICE_BATCH_SIZE * n_devices))
total_steps = steps_per_epoch * NUM_EPOCHS
warmup_steps = max(1, total_steps // 10)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none'  # Disable WandB logging to reduce overhead
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda pred: {
        'accuracy': (pred.predictions.argmax(-1) == pred.label_ids).mean()
    }
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'train_dataset' is not defined

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Run the trainer over the test split and reduce the logits to class ids
predictions = trainer.predict(test_dataset)
labels = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=-1)

# Per-class precision / recall / F1
report_text = classification_report(
    labels,
    preds,
    target_names=['Legitimate', 'Fraud'],
)
print("Classification Report:")
print(report_text)

# Raw counts of correct and incorrect predictions per class
matrix = confusion_matrix(labels, preds)
print("Confusion Matrix:")
print(matrix)

NameError: name 'trainer' is not defined

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
import numpy as np
import torch

TARGET_RECALL = 0.8

# Get prediction probabilities from the trainer
predictions = trainer.predict(test_dataset)
probs = torch.softmax(torch.tensor(predictions.predictions), dim=-1)[:, 1].numpy()  # Fraud class probabilities
labels = predictions.label_ids

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(labels, probs)

# Find threshold for high recall (e.g., >0.8).
# precision_recall_curve returns thresholds in increasing order with recall
# decreasing, so the first entry already satisfies recall >= target.
# np.argmax(recall >= target) therefore always picked index 0, i.e. the lowest
# threshold. We want the highest threshold that still meets the target.
# recall has one more entry than thresholds (a trailing 0), so drop it first.
eligible = np.flatnonzero(recall[:-1] >= TARGET_RECALL)
optimal_idx = eligible[-1] if eligible.size else 0
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold for ~80% fraud recall: {optimal_threshold:.3f}")

# Apply threshold
preds_adjusted = (probs >= optimal_threshold).astype(int)

# Print adjusted classification report
print("Adjusted Classification Report:")
print(classification_report(labels, preds_adjusted, target_names=['Legitimate', 'Fraud']))

# Print adjusted confusion matrix
print("Adjusted Confusion Matrix:")
print(confusion_matrix(labels, preds_adjusted))

NameError: name 'trainer' is not defined

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
import numpy as np
import torch

TARGET_RECALL = 0.8

# Get prediction probabilities from the trainer
predictions = trainer.predict(test_dataset)
probs = torch.softmax(torch.tensor(predictions.predictions), dim=-1)[:, 1].numpy()  # Fraud class probabilities
labels = predictions.label_ids

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(labels, probs)

# Find threshold for high recall (e.g., >0.8).
# precision_recall_curve returns thresholds in increasing order with recall
# decreasing, so the first entry already satisfies recall >= target.
# np.argmax(recall >= target) therefore always picked index 0, i.e. the lowest
# threshold. We want the highest threshold that still meets the target.
# recall has one more entry than thresholds (a trailing 0), so drop it first.
eligible = np.flatnonzero(recall[:-1] >= TARGET_RECALL)
optimal_idx = eligible[-1] if eligible.size else 0
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold for ~80% fraud recall: {optimal_threshold:.3f}")

# Apply threshold
preds_adjusted = (probs >= optimal_threshold).astype(int)

# Print adjusted classification report
print("Adjusted Classification Report:")
print(classification_report(labels, preds_adjusted, target_names=['Legitimate', 'Fraud']))

# Print adjusted confusion matrix
print("Adjusted Confusion Matrix:")
print(confusion_matrix(labels, preds_adjusted))

NameError: name 'trainer' is not defined

In [None]:
from transformers import Trainer, DistilBertForSequenceClassification, TrainingArguments
import torch
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

import os

# Load best checkpoint (epoch 1, 81.56% accuracy)
BASELINE_CHECKPOINT = '/kaggle/working/results/checkpoint-148'
if os.path.isdir(BASELINE_CHECKPOINT):
    model = DistilBertForSequenceClassification.from_pretrained(BASELINE_CHECKPOINT)
else:
    # When the directory is missing, from_pretrained() interprets the string as
    # a Hub repo id and fails with an HFValidationError. Start from the base
    # model instead and say so explicitly.
    print(f"Checkpoint not found at {BASELINE_CHECKPOINT}; starting from distilbert-base-uncased.")
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )

class WeightedTrainer(Trainer):
    """Trainer that optimizes a class-weighted cross-entropy loss.

    Args:
        class_weights: per-class loss weights, indexed by label id. Defaults to
            (1.0, 10.0): 1.0 for legitimate, 10.0 for fraud.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=(1.0, 10.0), **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = tuple(float(w) for w in class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted loss for one batch.

        Returns the loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        ``num_items_in_batch`` is accepted for signature compatibility and unused.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Build the weights on the logits' own device/dtype so this also works
        # when the model is wrapped (e.g. DataParallel).
        weights = torch.tensor(self.class_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

import math

NUM_EPOCHS = 3
PER_DEVICE_BATCH_SIZE = 8

# Warm up over roughly the first 10% of optimizer steps.
# A fixed warmup_steps=500 was longer than the whole run: the checkpoint names
# used in this notebook (checkpoint-148/-296/-444) indicate about 444 steps in
# total, so the learning rate never reached its peak.
n_devices = max(1, torch.cuda.device_count())
steps_per_epoch = math.ceil(len(train_dataset) / (PER_DEVICE_BATCH_SIZE * n_devices))
total_steps = steps_per_epoch * NUM_EPOCHS
warmup_steps = max(1, total_steps // 10)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/results_weighted',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs_weighted',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none'
)

# Initialize WeightedTrainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(-1) == pred.label_ids).mean()}
)

# Train
trainer.train()

# Evaluate
eval_results = trainer.evaluate()
print("Weighted Loss Evaluation Results:", eval_results)

# Detailed evaluation
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids
print("Weighted Loss Classification Report:")
print(classification_report(labels, preds, target_names=['Legitimate', 'Fraud']))
print("Weighted Loss Confusion Matrix:")
print(confusion_matrix(labels, preds))


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/working/results/checkpoint-148'. Use `repo_type` argument if needed.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve
import numpy as np
import torch

TARGET_RECALL = 0.8

# Get prediction probabilities
predictions = trainer.predict(test_dataset)
probs = torch.softmax(torch.tensor(predictions.predictions), dim=-1)[:, 1].numpy()
labels = predictions.label_ids

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(labels, probs)

# Find threshold for ~80% recall.
# precision_recall_curve returns thresholds in increasing order with recall
# decreasing, so the first entry already satisfies recall >= target.
# np.argmax(recall >= target) therefore always picked index 0, i.e. the lowest
# threshold. We want the highest threshold that still meets the target.
# recall has one more entry than thresholds (a trailing 0), so drop it first.
eligible = np.flatnonzero(recall[:-1] >= TARGET_RECALL)
optimal_idx = eligible[-1] if eligible.size else 0
optimal_threshold = thresholds[optimal_idx]
print(f"Optimal threshold for ~80% fraud recall: {optimal_threshold:.3f}")

# Apply threshold
preds_adjusted = (probs >= optimal_threshold).astype(int)

# Print adjusted metrics
print("Adjusted Classification Report:")
print(classification_report(labels, preds_adjusted, target_names=['Legitimate', 'Fraud']))
print("Adjusted Confusion Matrix:")
print(confusion_matrix(labels, preds_adjusted))

NameError: name 'trainer' is not defined

In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizer

# Load dataset.
# The Kaggle input path only exists inside a Kaggle kernel; elsewhere (e.g.
# Colab) fall back to downloading the same dataset with kagglehub, whose
# dataset_download() returns the local download location.
KAGGLE_CSV_PATH = '/kaggle/input/creditcardfraud/creditcard.csv'
if os.path.exists(KAGGLE_CSV_PATH):
    csv_path = KAGGLE_CSV_PATH
else:
    import kagglehub
    csv_path = os.path.join(kagglehub.dataset_download("mlg-ulb/creditcardfraud"), "creditcard.csv")
data = pd.read_csv(csv_path)

# Create enhanced text: amount, time and the first eight V-features per row
data['text'] = data.apply(
    lambda x: f"Transaction of ${x['Amount']:.2f} at time {x['Time']:.0f} seconds, "
              f"V1={x['V1']:.2f}, V2={x['V2']:.2f}, V3={x['V3']:.2f}, V4={x['V4']:.2f}, "
              f"V5={x['V5']:.2f}, V6={x['V6']:.2f}, V7={x['V7']:.2f}, V8={x['V8']:.2f}",
    axis=1
)
data = data[['text', 'Class']].rename(columns={'Class': 'label'})

# Balance dataset (all fraud rows plus 5x as many sampled non-fraud rows)
fraud = data[data['label'] == 1]
non_fraud = data[data['label'] == 0].sample(n=len(fraud) * 5, random_state=42)
balanced_data = pd.concat([fraud, non_fraud]).sample(frac=1, random_state=42)

# Split
train_data, test_data = train_test_split(balanced_data, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

# Tokenize
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch, padded/truncated to 256 tokens."""
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/creditcardfraud/creditcard.csv'

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# List checkpoints
!ls /kaggle/working/results_weighted

# Load epoch 2 checkpoint
best_checkpoint = '/kaggle/working/results_weighted/checkpoint-296'  # Verify path
model = DistilBertForSequenceClassification.from_pretrained(best_checkpoint)

# Reinitialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(-1) == pred.label_ids).mean()}
)

# Evaluate
eval_results = trainer.evaluate()
print("Best Checkpoint Evaluation Results:", eval_results)

# Detailed evaluation
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids
print("Best Checkpoint Classification Report:")
print(classification_report(labels, preds, target_names=['Legitimate', 'Fraud']))
print("Best Checkpoint Confusion Matrix:")
print(confusion_matrix(labels, preds))


ls: cannot access '/kaggle/working/results_weighted': No such file or directory


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/working/results_weighted/checkpoint-296'. Use `repo_type` argument if needed.

In [None]:
from transformers import Trainer, DistilBertForSequenceClassification, TrainingArguments
import torch
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

import os

# Load best checkpoint (epoch 1, 81.56% accuracy)
BASELINE_CHECKPOINT = '/kaggle/working/results/checkpoint-148'
if os.path.isdir(BASELINE_CHECKPOINT):
    model = DistilBertForSequenceClassification.from_pretrained(BASELINE_CHECKPOINT)
else:
    # When the directory is missing, from_pretrained() interprets the string as
    # a Hub repo id and fails with an HFValidationError. Start from the base
    # model instead and say so explicitly.
    print(f"Checkpoint not found at {BASELINE_CHECKPOINT}; starting from distilbert-base-uncased.")
    model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=2
    )

class WeightedTrainer(Trainer):
    """Trainer that optimizes a class-weighted cross-entropy loss.

    Args:
        class_weights: per-class loss weights, indexed by label id. Defaults to
            (1.0, 10.0): 1.0 for legitimate, 10.0 for fraud.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=(1.0, 10.0), **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = tuple(float(w) for w in class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted loss for one batch.

        Returns the loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        ``num_items_in_batch`` is accepted for signature compatibility and unused.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Build the weights on the logits' own device/dtype so this also works
        # when the model is wrapped (e.g. DataParallel).
        weights = torch.tensor(self.class_weights, dtype=logits.dtype, device=logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

import math

NUM_EPOCHS = 3
PER_DEVICE_BATCH_SIZE = 8

# Warm up over roughly the first 10% of optimizer steps.
# A fixed warmup_steps=500 was longer than the whole run: the checkpoint names
# used in this notebook (checkpoint-148/-296/-444) indicate about 444 steps in
# total, so the learning rate never reached its peak.
n_devices = max(1, torch.cuda.device_count())
steps_per_epoch = math.ceil(len(train_dataset) / (PER_DEVICE_BATCH_SIZE * n_devices))
total_steps = steps_per_epoch * NUM_EPOCHS
warmup_steps = max(1, total_steps // 10)

# Define training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/results_weighted',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs_weighted',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    report_to='none'
)

# Initialize WeightedTrainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda pred: {'accuracy': (pred.predictions.argmax(-1) == pred.label_ids).mean()}
)

# Train
trainer.train()

# Evaluate
eval_results = trainer.evaluate()
print("Weighted Loss Evaluation Results:", eval_results)

# Detailed evaluation
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
labels = predictions.label_ids
print("Weighted Loss Classification Report:")
print(classification_report(labels, preds, target_names=['Legitimate', 'Fraud']))
print("Weighted Loss Confusion Matrix:")
print(confusion_matrix(labels, preds))

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/working/results/checkpoint-148'. Use `repo_type` argument if needed.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, confusion_matrix
import numpy as np
import torch

# Confusion Matrix Heatmap
cm = confusion_matrix(labels, preds)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Legitimate', 'Fraud'], yticklabels=['Legitimate', 'Fraud'])
plt.title('Confusion Matrix (Weighted Loss, Feature-Engineered)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('confusion_matrix.png')
plt.show()

# ROC Curve
probs = torch.softmax(torch.tensor(predictions.predictions), dim=-1)[:, 1].numpy()
fpr, tpr, _ = roc_curve(labels, probs)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.savefig('roc_curve.png')
plt.show()

# Precision-Recall Curve
precision, recall, thresholds = precision_recall_curve(labels, probs)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label='Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.savefig('precision_recall_curve.png')
plt.show()

# Training Loss Plot
# Training loss is logged every `logging_steps`, validation loss once per
# evaluation, so the two series have different lengths. Sub-sampling the
# training losses with a computed stride could produce a series whose length
# did not match the epoch axis (or a zero stride / division by zero on short
# logs). Plot each series against the epoch recorded in its own log entry.
training_log = trainer.state.log_history
train_entries = [log for log in training_log if 'loss' in log]
eval_entries = [log for log in training_log if 'eval_loss' in log]
train_loss = [log['loss'] for log in train_entries]
eval_loss = [log['eval_loss'] for log in eval_entries]
# Entries logged outside a training run may lack 'epoch'; place those at 0.
train_epochs = [log.get('epoch', 0.0) for log in train_entries]
eval_epochs = [log.get('epoch', 0.0) for log in eval_entries]
plt.figure(figsize=(8, 6))
plt.plot(train_epochs, train_loss, label='Training Loss')
plt.plot(eval_epochs, eval_loss, marker='o', label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.savefig('loss_plot.png')
plt.show()


NameError: name 'labels' is not defined

In [None]:
# One hand-written transaction: raw feature values plus the text form the model reads
sample_record = {
    'Time': 0.0,
    'Amount': 149.62,
    'V1': -1.359807,
    'V2': -0.072781,
    'V3': 2.536347,
    'V4': 1.378155,
    'V5': -0.338321,
    'V6': 0.462388,
    'V7': 0.239599,
    'V8': 0.098698,
    'text': 'Transaction of $149.62 at time 0 seconds, V1=-1.36, V2=-0.07, V3=2.54, V4=1.38, V5=-0.34, V6=0.46, V7=0.24, V8=0.10',
}
new_data = pd.DataFrame([sample_record])

# Wrap as a Hugging Face dataset, tokenize, and expose only the model inputs
new_dataset = Dataset.from_pandas(new_data).map(tokenize_function, batched=True)
new_dataset.set_format('torch', columns=['input_ids', 'attention_mask'])

# Predict
predictions = trainer.predict(new_dataset)
logits = torch.tensor(predictions.predictions)
probs = torch.softmax(logits, dim=-1)[:, 1].numpy()
if probs[0] >= 0.5:
    pred = 'Fraud'
else:
    pred = 'Legitimate'
print(f"Prediction: {pred}, Fraud Probability: {probs[0]:.3f}")

NameError: name 'tokenize_function' is not defined

In [None]:
!pip install onnx onnxruntime
from transformers import DistilBertForSequenceClassification
import torch
model = DistilBertForSequenceClassification.from_pretrained('/kaggle/working/results_weighted/checkpoint-444')
dummy_input = torch.zeros(1, 256, dtype=torch.long)
torch.onnx.export(model, dummy_input, "distilbert_fraud.onnx")



HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/working/results_weighted/checkpoint-444'. Use `repo_type` argument if needed.