In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the CSV into a DataFrame
df = pd.read_csv('sms.csv')

# Preprocess the data
# For simplicity, we'll just focus on 'sms' column and 'amount', 'date_time', 'category' columns
df = df[['sms', 'amount', 'date_time', 'category', 'fees', 'account', 'transaction_type']]

# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Load pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['category'].unique()))

# Tokenize and encode the text data
def tokenize_text(df, tokenizer, max_length):
    input_ids = []
    attention_masks = []

    for sms in df['sms']:
        encoded_sms = tokenizer.encode_plus(
            sms,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded_sms['input_ids'])
        attention_masks.append(encoded_sms['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

max_length = 128
train_inputs, train_masks = tokenize_text(train_df, tokenizer, max_length)
test_inputs, test_masks = tokenize_text(test_df, tokenizer, max_length)

# Convert labels to tensor
train_labels = torch.tensor(train_df['category'].astype('category').cat.codes.values)
test_labels = torch.tensor(test_df['category'].astype('category').cat.codes.values)

# Create DataLoader for training and testing
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Set up GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 3
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Average training loss: {avg_train_loss}')
# Evaluation
model.eval()
eval_accuracy = 0
nb_eval_steps = 0

for batch in tqdm(test_dataloader, desc='Evaluating'):
    batch = tuple(t.to(device) for t in batch)
    with torch.no_grad():
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
        outputs = model(**inputs)
        logits = outputs.logits

    logits = logits.detach().cpu().numpy()
    predictions = np.argmax(logits, axis=1)
    label_ids = batch[2].cpu().numpy()
    
    tmp_eval_accuracy = np.sum(predictions == label_ids) / len(label_ids)
    eval_accuracy += tmp_eval_accuracy

    nb_eval_steps += 1

eval_accuracy = eval_accuracy / nb_eval_steps
print(f'Accuracy: {eval_accuracy}')



  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3:   0%|                                                                                 | 0/1 [00:07<?, ?it/s]


ValueError: Target size (torch.Size([31])) must be the same as input size (torch.Size([31, 2]))