In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

# Seed torch's global RNG so that repeated runs of the notebook are comparable.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 'spam' is the positive class (1) and 'ham' the negative class (0).
LABEL_MAP = {'spam': 1, 'ham': 0}

train_data = pd.read_csv('/kaggle/working/train.csv')
test_data = pd.read_csv('/kaggle/working/test.csv')

# Encode the string categories as integer class ids. Series.map() turns every
# value that is missing from LABEL_MAP into NaN, which is not a valid class id,
# so stop with a clear error instead of carrying NaN labels into training.
for split_name, frame in (('train', train_data), ('test', test_data)):
    encoded = frame['Category'].map(LABEL_MAP)
    unmapped = encoded.isna()
    if unmapped.any():
        bad_values = sorted(frame.loc[unmapped, 'Category'].astype(str).unique())
        raise ValueError(
            f"Unexpected Category values in the {split_name} split: {bad_values}"
        )
    frame['Category'] = encoded.astype('int64')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one message per requested item.

    Args:
        data: Table supporting ``len()`` and ``.iloc[idx]`` whose rows expose a
            ``'Message'`` text field and a ``'Category'`` integer label.
        tokenizer: Callable tokenizer; it is invoked with the text plus the
            keyword arguments shown in ``__getitem__`` and must return a
            mapping containing ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors and a
            scalar ``'labels'`` tensor of dtype ``torch.long``.

        Raises:
            ValueError: If the row's label cannot be converted to an integer
                (for example when it is NaN).
        """
        # Materialise the row once instead of once per column.
        row = self.data.iloc[idx]
        text = row['Message']
        # int() rejects NaN, so a missing label fails here instead of being
        # cast to an arbitrary integer by torch.tensor(..., dtype=torch.long).
        label = int(row['Category'])

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis; flatten() removes it.
        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap both splits; each item is tokenized when it is accessed.
train_dataset, test_dataset = (
    CustomDataset(frame, tokenizer) for frame in (train_data, test_data)
)

# Only the training batches are reshuffled; the test order stays fixed.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# return_dict=False is requested here; the training and evaluation code below
# reads the loss and the logits from the model output by position.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    return_dict=False,
)
model = model.to(device)

# Loss object and optimizer that are handed to train() / evaluate().
criterion = nn.CrossEntropyLoss()
LEARNING_RATE = 2e-5
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# TensorBoard event files are written under ./logs.
writer = SummaryWriter(log_dir='logs')

def train(model, loader, optimizer, criterion, epoch, writer):
    """Run one optimisation pass over ``loader`` and log the epoch metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` that returns a sequence whose first item is the loss
            and whose second item is the logits.
        loader: Iterable of batches. Each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
            It does not need to define ``__len__``.
        optimizer: Optimizer stepped once per batch.
        criterion: Not used; the loss is taken from the model output. Kept so
            existing call sites keep working.
        epoch: Step value passed to ``writer.add_scalar``.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of correctly predicted labels. Both are NaN when the
        loader yields no batches.
    """
    model.train()

    # Move batches to wherever the model's weights live rather than depending
    # on a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    num_batches = 0
    correct = 0
    total = 0

    for batch in loader:
        input_ids = batch['input_ids'].to(target_device)
        attention_mask = batch['attention_mask'].to(target_device)
        labels = batch['labels'].to(target_device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        logits = outputs[1]

        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        # Count batches here so loaders without __len__ are supported too.
        num_batches += 1
        _, predicted = torch.max(logits, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

    # An empty loader previously raised ZeroDivisionError; report NaN instead.
    mean_loss = loss_sum / num_batches if num_batches else float('nan')
    accuracy = correct / total if total else float('nan')

    writer.add_scalar('Loss/train', mean_loss, epoch)
    writer.add_scalar('Accuracy/train', accuracy, epoch)

    return mean_loss, accuracy

def evaluate(model, loader, criterion, epoch, writer):
    """Score ``model`` on ``loader`` without updating weights and log the metrics.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` that returns a sequence whose first item is the loss
            and whose second item is the logits.
        loader: Iterable of batches. Each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
            It does not need to define ``__len__``.
        criterion: Not used; the loss is taken from the model output. Kept so
            existing call sites keep working.
        epoch: Step value passed to ``writer.add_scalar``.
        writer: Object exposing ``add_scalar(tag, value, step)``.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the mean of the per-batch losses and
        the fraction of correctly predicted labels. Both are NaN when the
        loader yields no batches.
    """
    model.eval()

    # Move batches to wherever the model's weights live rather than depending
    # on a notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    loss_sum = 0.0
    num_batches = 0
    correct = 0
    total = 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(target_device)
            attention_mask = batch['attention_mask'].to(target_device)
            labels = batch['labels'].to(target_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            logits = outputs[1]

            loss_sum += loss.item()
            # Count batches here so loaders without __len__ are supported too.
            num_batches += 1
            _, predicted = torch.max(logits, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    # An empty loader previously raised ZeroDivisionError; report NaN instead.
    mean_loss = loss_sum / num_batches if num_batches else float('nan')
    accuracy = correct / total if total else float('nan')

    writer.add_scalar('Loss/test', mean_loss, epoch)
    writer.add_scalar('Accuracy/test', accuracy, epoch)

    return mean_loss, accuracy

# Number of full passes over the training set.
N_EPOCHS = 5

# The writer is closed in `finally` so that it is released even when an epoch
# raises part-way through (e.g. an out-of-memory error or a manual interrupt).
try:
    for epoch in range(N_EPOCHS):
        # `epoch` is 0-based for the logged scalars and printed 1-based below.
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, epoch, writer)
        test_loss, test_acc = evaluate(model, test_loader, criterion, epoch, writer)

        print(f'Epoch: {epoch+1:02}')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
finally:
    writer.close()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch: 01
	Train Loss: 0.092 | Train Acc: 97.53%
	Test Loss: 0.041 | Test Acc: 99.01%
Epoch: 02
	Train Loss: 0.029 | Train Acc: 99.33%
	Test Loss: 0.027 | Test Acc: 99.46%
Epoch: 03
	Train Loss: 0.012 | Train Acc: 99.64%
	Test Loss: 0.028 | Test Acc: 99.19%
Epoch: 04
	Train Loss: 0.007 | Train Acc: 99.82%
	Test Loss: 0.043 | Test Acc: 99.01%
Epoch: 05
	Train Loss: 0.002 | Train Acc: 99.93%
	Test Loss: 0.032 | Test Acc: 99.19%


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw labelled message CSV from the Kaggle input directory.
RAW_CSV_PATH = '/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv'
data = pd.read_csv(RAW_CSV_PATH)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
train_data, test_data = train_test_split(data, random_state=42, test_size=0.2)

# Write each split, without the index column, to the current working directory.
for split_frame, csv_name in ((train_data, 'train.csv'), (test_data, 'test.csv')):
    split_frame.to_csv(csv_name, index=False)


In [9]:
# Sanity check: show which label strings occur in the raw dataset.
RAW_DATA_CSV = "/kaggle/input/spam-text-message-classification/SPAM text message 20170820 - Data.csv"
data = pd.read_csv(RAW_DATA_CSV)

# Distinct values of the 'Category' column, in order of first appearance.
unique_categories = data['Category'].unique()
print(unique_categories)

['ham' 'spam']
