In [1]:
# Install required libraries
!pip install transformers torch scikit-learn



In [2]:
# Disable WandB logging to avoid API key issues
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "offline"
os.environ["WANDB_WATCH"] = "false"

In [3]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

In [4]:
# Load the dataset
file_path = "/home/fraud_call_cleaned.json"  # Adjust the path if needed
df = pd.read_json(file_path)

In [5]:
# Convert labels to binary format (fraud = 1, normal = 0)
df["label"] = df["label"].map({"fraud": 1, "normal": 0})

In [6]:
# Split data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["transcript"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

In [7]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Define Dataset class for tokenization
class ScamCallDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [9]:
# Create dataset objects
train_dataset = ScamCallDataset(train_texts, train_labels, tokenizer)
test_dataset = ScamCallDataset(test_texts, test_labels, tokenizer)

In [10]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,  # Adjust for more training
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=500,
    load_best_model_at_end=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [12]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [13]:
# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.102,0.088393
2,0.0329,0.085855
3,0.0157,0.100467


TrainOutput(global_step=1779, training_loss=0.04443499290118933, metrics={'train_runtime': 418.8754, 'train_samples_per_second': 33.955, 'train_steps_per_second': 4.247, 'total_flos': 935557135096320.0, 'train_loss': 0.04443499290118933, 'epoch': 3.0})

In [14]:
model.save_pretrained("scam_call_model")
tokenizer.save_pretrained("scam_call_model")

# Zip the model folder for downloading
!zip -r scam_call_model.zip scam_call_model

  adding: scam_call_model/ (stored 0%)
  adding: scam_call_model/config.json (deflated 49%)
  adding: scam_call_model/vocab.txt (deflated 53%)
  adding: scam_call_model/special_tokens_map.json (deflated 42%)
  adding: scam_call_model/tokenizer_config.json (deflated 75%)
  adding: scam_call_model/model.safetensors (deflated 7%)


In [15]:
# Load the trained model
from transformers import pipeline

In [16]:
scam_classifier = pipeline("text-classification", model="scam_call_model", tokenizer="scam_call_model")

Device set to use cuda:0


In [17]:
# Example scam call transcripts for testing
examples = [
    "Hello, I am calling from your bank. Your account has been compromised. Please share your OTP to verify your identity.",
    "Hey, how are you? Let’s meet up for coffee this weekend.",
    "You have won a free trip to Dubai! Click this link now to claim your prize."
]

In [18]:
# Get predictions
predictions = scam_classifier(examples)
for text, pred in zip(examples, predictions):
    print(f"Text: {text}\nPrediction: {pred}\n")

Text: Hello, I am calling from your bank. Your account has been compromised. Please share your OTP to verify your identity.
Prediction: {'label': 'LABEL_1', 'score': 0.9983745813369751}

Text: Hey, how are you? Let’s meet up for coffee this weekend.
Prediction: {'label': 'LABEL_0', 'score': 0.9998738765716553}

Text: You have won a free trip to Dubai! Click this link now to claim your prize.
Prediction: {'label': 'LABEL_1', 'score': 0.9974332451820374}

