In [5]:
import torch
import nltk
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score
import numpy as np

# Fetch the NLTK "names" corpus so the word lists below can be read
nltk.download('names')

# Read both gendered word lists, lower-casing every entry
male_names = list(map(str.lower, nltk.corpus.names.words('male.txt')))
female_names = list(map(str.lower, nltk.corpus.names.words('female.txt')))

# Concatenate the lists; the labels follow the same order (0 = male, 1 = female)
all_names = [*male_names, *female_names]
all_labels = [0 for _ in male_names] + [1 for _ in female_names]

# Run the labels through a LabelEncoder to obtain the integer class array
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_labels)

# Hold out 20% of the names for testing; the fixed seed keeps the split reproducible
train_names, test_names, train_labels, test_labels = train_test_split(
    all_names,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the 'bert-base-uncased' checkpoint
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a custom PyTorch dataset for the names
class NameClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one name per item for classification.

    Args:
        names: Indexable collection of name strings.
        labels: Indexable collection of integer class labels, aligned
            index-for-index with ``names``.
        tokenizer: Callable invoked as ``tokenizer(name, padding='max_length',
            max_length=..., truncation=True, return_tensors='pt')``; it must
            return a mapping holding ``'input_ids'`` and ``'attention_mask'``
            tensors whose first dimension is a batch axis of size 1.
        max_length: Number of token positions each name is padded or
            truncated to. Defaults to 10, the value previously hard-coded.
    """

    def __init__(self, names, labels, tokenizer, max_length=10):
        self.names = names
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.names)

    def __getitem__(self, idx):
        """Tokenize the name at ``idx`` and return it with its label.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with the leading batch axis removed) and ``'label'`` (a
            ``torch.long`` scalar tensor).
        """
        name = self.names[idx]
        label = self.labels[idx]

        tokenized_data = self.tokenizer(
            name,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors='pt',
        )

        return {
            # squeeze(0) drops the size-1 batch axis so the DataLoader can
            # stack items into (batch, max_length) tensors itself.
            'input_ids': tokenized_data['input_ids'].squeeze(0),
            'attention_mask': tokenized_data['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),
        }


# Batch both splits; only the training split is reshuffled each epoch
train_data_loader = DataLoader(
    NameClassificationDataset(train_names, train_labels, bert_tokenizer),
    batch_size=32,
    shuffle=True,
)
test_data_loader = DataLoader(
    NameClassificationDataset(test_names, test_labels, bert_tokenizer),
    batch_size=32,
    shuffle=False,
)

# Pretrained BERT encoder with a 2-class sequence-classification head
bert_classifier = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
bert_classifier.to(device)

# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly because torch.optim.AdamW defaults
# to eps=1e-8 / weight_decay=0.01, whereas the deprecated class used
# eps=1e-6 / weight_decay=0.0.
optimizer = torch.optim.AdamW(
    bert_classifier.parameters(),
    lr=1e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Fine-tune the classifier on the training split
bert_classifier.train()  # switch the model to training mode
num_epochs = 10

for epoch in range(num_epochs):
    running_loss = 0
    for batch in train_data_loader:
        # Discard gradients accumulated by the previous step
        optimizer.zero_grad()

        # Move every tensor of the batch onto the model's device
        on_device = {key: tensor.to(device) for key, tensor in batch.items()}

        # Passing labels makes the model return the loss with its outputs
        outputs = bert_classifier(
            input_ids=on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
            labels=on_device['label'],
        )

        outputs.loss.backward()  # back-propagate
        optimizer.step()  # apply the parameter update

        running_loss += outputs.loss.item()

    # Report the mean per-batch loss for this epoch
    mean_loss = running_loss / len(train_data_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss:.4f}")


# Score the fine-tuned model on the held-out split
bert_classifier.eval()  # switch the model to evaluation mode
predictions = []
true_labels = []

# No gradients are needed while scoring
with torch.no_grad():
    for batch in test_data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits = bert_classifier(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits

        # The predicted class is the index of the largest logit in each row
        predictions += logits.argmax(dim=-1).tolist()
        true_labels += labels.tolist()

# Fraction of test names whose predicted class equals the true class
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package names to /root/nltk_data...
[nltk_data]   Package names is already up-to-date!
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 0.4166
Epoch 2/10, Loss: 0.2769
Epoch 3/10, Loss: 0.2200
Epoch 4/10, Loss: 0.1768
Epoch 5/10, Loss: 0.1507
Epoch 6/10, Loss: 0.1282
Epoch 7/10, Loss: 0.1098
Epoch 8/10, Loss: 0.0995
Epoch 9/10, Loss: 0.0933
Epoch 10/10, Loss: 0.0904
Test Accuracy: 0.8433
