## Deep Learning Experiments for Fraud Detection
We use this notebook to run GPU-intensive experiments on Google Colab.

In [3]:
import pandas as pd
email_ds = pd.read_csv('spamassassin.csv', on_bad_lines='skip')
text_ds = pd.read_csv('spam.csv', encoding='latin-1')
call_ds = pd.read_csv('overall_transcript.csv')
url_ds = pd.read_csv('urls.csv')

text_ds = text_ds.sample(frac=1, random_state=42)[['label', 'text']]
email_ds = email_ds.sample(frac=1, random_state=42)[['label', 'text']]
call_ds = call_ds.sample(frac=1, random_state=42)[['label', 'text']]

In [4]:
url_ds = url_ds.sample(frac=1, random_state=42)[['label', 'url']]
url_ds['label'] = url_ds['label'].apply(lambda x: 0 if x.strip() == 'good' else 1)

Training a BERT-Classifier Model!

In [5]:
!pip install transformers
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, top_k_accuracy_score, recall_score
import pandas as pdb



In [6]:
class TextClassificationDataset(Dataset):
  def __init__(self, texts, labels, tokenizer, max_length):
    self.texts = texts
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = self.texts[idx]
    label = self.labels[idx]
    encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
    return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

class BERTClassifier(nn.Module):
  def __init__(self, bert_model_name, num_classes):
    super(BERTClassifier, self).__init__()
    # self.bert = BertModel.from_pretrained(bert_model_name)
    self.bert = AutoModel.from_pretrained('prajjwal1/bert-tiny').to(device)
    self.dropout = nn.Dropout(0.1)
    self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

  def forward(self, input_ids, attention_mask):
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    pooled_output = outputs.pooler_output
    x = self.dropout(pooled_output)
    logits = self.fc(x)
    return logits

In [7]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []

    all_outputs = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
            all_outputs.extend(outputs.cpu().tolist())

    outputs = torch.tensor(all_outputs)
    return accuracy_score(actual_labels, predictions), recall_score(actual_labels, predictions, pos_label=0)

def predict_label(text, model, tokenizer, device, max_length=256, k=1):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
      model.to(device)
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      outputs = torch.softmax(outputs, dim=1)
      print(outputs)
      _, preds = torch.max(outputs, dim=1)
      return preds.cpu().tolist()[0]

In [8]:
# Set up parameters
bert_model_name = 'bert-base-uncased'
num_classes = 2 # change this to be calculated dynamically
max_length = 512
batch_size = 16
num_epochs = 10
learning_rate = 2e-5

texts = list(text_ds['text']) + list(email_ds['text'])
labels = list(text_ds['label']) + list(email_ds['label'])

# texts = list(url_ds['url'])
# labels = list(url_ds['label'])
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.15, random_state=42)
# bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')
train_dataset = TextClassificationDataset(train_texts, train_labels, bert_tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, bert_tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

device = torch.device("cuda")
bert_model = BERTClassifier(bert_model_name, num_classes).to(device)
optimizer = torch.optim.Adam(bert_model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [9]:
for epoch in range(num_epochs):
  print(f"Epoch {epoch + 1}/{num_epochs}")
  train(bert_model, train_dataloader, optimizer, scheduler, device)
  accuracy, recall = evaluate(bert_model, val_dataloader, device)
  print(f"Validation Accuracy: {accuracy:.4f}, recall: {recall:.4f}")

accuracy, recall = evaluate(bert_model, train_dataloader, device)
print(f"Training Accuracy: {accuracy:.4f}, recall: {recall:.4f}")

Epoch 1/10
Validation Accuracy: 0.9444, recall: 0.8556
Epoch 2/10
Validation Accuracy: 0.9675, recall: 0.8770
Epoch 3/10
Validation Accuracy: 0.9704, recall: 0.8743
Epoch 4/10
Validation Accuracy: 0.9817, recall: 0.9439
Epoch 5/10
Validation Accuracy: 0.9840, recall: 0.9599
Epoch 6/10
Validation Accuracy: 0.9870, recall: 0.9652
Epoch 7/10
Validation Accuracy: 0.9834, recall: 0.9412
Epoch 8/10
Validation Accuracy: 0.9864, recall: 0.9626
Epoch 9/10
Validation Accuracy: 0.9864, recall: 0.9599
Epoch 10/10
Validation Accuracy: 0.9864, recall: 0.9599
Training Accuracy: 0.9930, recall: 0.9742


In [14]:
call_texts = list(call_ds['text'])
for i in call_texts:
  i.replace('\\n', '\n')
call_labels = list(call_ds['label'])

train_call_texts, val_call_texts, train_call_labels, val_call_labels = train_test_split(call_texts, call_labels, test_size=0.15, stratify=call_labels)
call_train_dataset = TextClassificationDataset(train_call_texts, train_call_labels, bert_tokenizer, max_length)
call_val_dataset = TextClassificationDataset(val_call_texts, val_call_labels, bert_tokenizer, max_length)
call_train_dataloader = DataLoader(call_train_dataset, batch_size=batch_size, shuffle=True)
call_val_dataloader = DataLoader(call_val_dataset, batch_size=batch_size, shuffle=True)

In [15]:
optimizer = torch.optim.Adam(bert_model.parameters(), lr=learning_rate)
total_steps = (len(call_train_dataloader) + len(train_dataloader)) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

accuracy, recall = evaluate(bert_model, call_val_dataloader, device)
print(f"Validation Accuracy for calls: {accuracy:.4f}, recall: {recall:.4f}")

for epoch in range(num_epochs):
  print(f"Epoch {epoch + 1}/{num_epochs}")
  train(bert_model, call_train_dataloader, optimizer, scheduler, device)
  accuracy, recall = evaluate(bert_model, call_val_dataloader, device)
  print(f"Validation Accuracy: {accuracy:.4f}, recall: {recall:.4f}")

  accuracy, recall = evaluate(bert_model, call_train_dataloader, device)
  print(f"Training Accuracy: {accuracy:.4f}, recall: {recall:.4f}")

Validation Accuracy for emails: 0.2368, recall: 1.0000
Epoch 1/10
Validation Accuracy: 0.7632, recall: 0.0000
Training Accuracy: 0.7617, recall: 0.0000
Epoch 2/10
Validation Accuracy: 0.8947, recall: 1.0000
Training Accuracy: 0.9299, recall: 0.9902
Epoch 3/10
Validation Accuracy: 0.9342, recall: 0.8333
Training Accuracy: 0.9813, recall: 0.9804
Epoch 4/10
Validation Accuracy: 0.9342, recall: 0.8889
Training Accuracy: 0.9907, recall: 0.9902
Epoch 5/10
Validation Accuracy: 0.9474, recall: 0.7778
Training Accuracy: 0.9930, recall: 0.9706
Epoch 6/10
Validation Accuracy: 0.9474, recall: 0.9444
Training Accuracy: 0.9977, recall: 1.0000
Epoch 7/10
Validation Accuracy: 0.9474, recall: 0.7778
Training Accuracy: 1.0000, recall: 1.0000
Epoch 8/10
Validation Accuracy: 0.9342, recall: 0.7222
Training Accuracy: 1.0000, recall: 1.0000
Epoch 9/10
Validation Accuracy: 0.9211, recall: 0.9444
Training Accuracy: 1.0000, recall: 1.0000
Epoch 10/10
Validation Accuracy: 0.9474, recall: 0.7778
Training Accurac

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

def evaluate_roc(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            probabilities = torch.softmax(outputs, dim=1)[:, 1] # Probability of class 1 (assuming binary classification)
            predictions.extend(probabilities.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())

    fpr, tpr, thresholds = roc_curve(actual_labels, predictions)
    roc_auc = auc(fpr, tpr)

    return fpr, tpr, roc_auc


# Evaluate on validation set and get ROC curve data
fpr, tpr, roc_auc = evaluate_roc(bert_model, call_val_dataloader, device)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
predict_label('Speaker 1: Hi! How is your day going?\nSpeaker 2: It\'s going well', bert_model, bert_tokenizer, device)

tensor([[0.0289, 0.9711]], device='cuda:0')


1

In [None]:
bert_model.bert.num_parameters()

4385920

In [None]:
torch.save(bert_model, 'bert_model.pt')

In [None]:
model = torch.load('bert_model.pt', weights_only=False)

## Audio Modeling

In [None]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import torch
import numpy as np
from sklearn.neural_network import MLPClassifier
import json
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
audio_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# prompt: mount google drive

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# audio_ds = pd.read_csv('drive/MyDrive/audio_ds.csv', on_bad_lines='skip')
audio_ds = audio_ds.sample(frac=1, random_state=42)
X = audio_ds.drop('label', axis=1)
y = audio_ds['label']

# Extract groups for the split (source files)
groups = X['source']

print(f"Number of unique source files: {len(groups.unique())}")
print(f"Class distribution: {sum(y)} / {len(y)} = {sum(y) / len(y):.4f}")

# Perform a group-based split to ensure sources appear in only one split
gss = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=42)

# Get indices for the train-test split
train_idx, test_idx = next(gss.split(X, y, groups=groups))
np.random.shuffle(train_idx)
np.random.shuffle(test_idx)
# Apply the split
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Verify we have properly separated sources
train_sources = set(X_train['source'])
test_sources = set(X_test['source'])
overlap = train_sources.intersection(test_sources)
print(f"Sources in training set: {len(train_sources)}")
print(f"Sources in testing set: {len(test_sources)}")
print(f"Overlapping sources: {len(overlap)}")
print(f"Train class distribution: {sum(y_train)} / {len(y_train)} = {sum(y_train) / len(y_train):.4f}")
print(f"Test class distribution: {sum(y_test)} / {len(y_test)} = {sum(y_test) / len(y_test):.4f}")

X_train = X_train.drop('source', axis=1)
X_test = X_test.drop('source', axis=1)

Number of unique source files: 41
Class distribution: 5831 / 8249 = 0.7069
Sources in training set: 34
Sources in testing set: 7
Overlapping sources: 0
Train class distribution: 5115 / 6918 = 0.7394
Test class distribution: 716 / 1331 = 0.5379


In [None]:
class AudioClassificationDataset(Dataset):
  def __init__(self, chunks, labels, processor, max_length=32000):
    self.chunks = chunks
    self.labels = labels
    self.processor = processor
    self.max_length = max_length

  def __len__(self):
    return len(self.chunks)

  def __getitem__(self, idx):
    chunk = self.chunks[idx]
    label = self.labels[idx]
    chunk = np.array(json.loads(chunk))
    inputs = self.processor(chunk, sampling_rate=16000, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length)
    input_values = inputs.input_values.squeeze().to(device)
    return {'input': input_values, 'label': torch.tensor(label)}

class Wave2Vec2Classifier(nn.Module):
  def __init__(self, num_classes):
    super().__init__()
    self.wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)
    self.dropout = nn.Dropout(0.1)
    self.fc = nn.Linear(768, num_classes)

  def forward(self, input):
    outputs = self.wav2vec(input)
    hidden_states = outputs.last_hidden_state.squeeze()
    pooled_output = torch.mean(hidden_states, dim=1)
    x = self.dropout(pooled_output)
    logits = self.fc(x)
    return logits

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        inputs = batch['input'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input=inputs)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []

    all_outputs = []
    with torch.no_grad():
        for batch in data_loader:
            inputs = batch['input'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input=inputs)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
            all_outputs.extend(outputs.cpu().tolist())

    outputs = torch.tensor(all_outputs)
    return accuracy_score(actual_labels, predictions), recall_score(actual_labels, predictions, pos_label=0)

In [None]:
X_train = list(X_train['samples'])
X_test = list(X_test['samples'])

y_train = list(y_train)
y_test = list(y_test)

In [None]:
num_classes = 2
batch_size = 8
num_epochs = 8
learning_rate = 2e-5

train_dataset = AudioClassificationDataset(X_train, y_train, audio_processor)
val_dataset = AudioClassificationDataset(X_test, y_test, audio_processor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, drop_last=True)

device = torch.device("cuda")
model = Wave2Vec2Classifier(num_classes=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 4.12 MiB is free. Process 131237 has 14.73 GiB memory in use. Of the allocated memory 14.12 GiB is allocated by PyTorch, and 493.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
for epoch in range(num_epochs):
  print(f"Epoch {epoch + 1}/{num_epochs}")
  train(model, train_dataloader, optimizer, scheduler, device)
  accuracy, recall = evaluate(model, val_dataloader, device)
  print(f"Validation Accuracy: {accuracy:.4f}, recall: {recall:.4f}")

  # accuracy, recall = evaluate(model, train_dataloader, device)
  # print(f"Training Accuracy: {accuracy:.4f}, recall: {recall:.4f}")

Epoch 1/8


OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 4.12 MiB is free. Process 131237 has 14.73 GiB memory in use. Of the allocated memory 14.05 GiB is allocated by PyTorch, and 565.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
print(accuracy)
print(recall)

0.8787650602409639
0.7899022801302932


In [None]:
torch.save(model.state_dict(), 'audio_model.pth')