<a href="https://colab.research.google.com/github/Natural-Language-Processing-YU/M3_Assignment/blob/main/scripts/m3_assignment_part_III.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part III
Using the previous two tutorials, please answer the following using an encorder-decoder approach and an LSTM compared approach. 

Please create a transformer-based classifier for English name classification into male or female.

There are several datasets for name for male or female classification. In subseuqent iterations, this could be expanded to included more classifications. 

Below is the source from NLTK, which only has male and female available but could be used for the purposes of this assignment. 

```
names = nltk.corpus.names
names.fileids()
['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis',
'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel',
'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', ...]
```

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
from nltk.corpus import names
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Step 1: Prepare the dataset
nltk.download('names')
male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Create labeled dataset
data = [(name, 0) for name in male_names] + [(name, 1) for name in female_names]

# Split dataset into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Step 2: Preprocess the data
class NameDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.char2idx = {char: idx + 1 for idx, char in enumerate(set(''.join([name for name, label in data])))}
        self.idx2char = {idx + 1: char for idx, char in enumerate(set(''.join([name for name, label in data])))}
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        name, label = self.data[idx]
        name_encoded = [self.char2idx[char] for char in name]
        return torch.LongTensor(name_encoded), torch.tensor(label)

def collate_fn(batch):
    names, labels = zip(*batch)
    names = pad_sequence(names, batch_first=True)
    labels = torch.stack(labels)
    return names, labels

train_dataset = NameDataset(train_data)
test_dataset = NameDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Step 3: Define the Transformer model architecture
class TransformerClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, num_heads=1, dropout=0.1):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(hidden_size, num_heads, hidden_size, dropout), num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(1, 0, 2)  # [seq_len, batch_size, embedding_dim]
        output = self.transformer(embedded)
        output = output.mean(dim=0)  # Aggregate across sequence length
        output = self.fc(output)
        return output

# Step 4: Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TransformerClassifier(len(train_dataset.char2idx) + 1, 128, 2)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for names, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        names, labels = names.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(names)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * names.size(0)
        
    epoch_loss = running_loss / len(train_dataset)
    print(f'Train Loss: {epoch_loss:.4f}')

# Step 5: Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for names, labels in test_loader:
        names, labels = names.to(device), labels.to(device)
        
        outputs = model(names)
        _, predicted = torch.max(outputs.data, 1)
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy on test set: {accuracy:.4f}')


[nltk_data] Downloading package names to /home/fk-ubuntu/nltk_data...
[nltk_data]   Package names is already up-to-date!
Epoch 1/10:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1/10: 100%|██████████| 100/100 [00:00<00:00, 118.85it/s]


Train Loss: 0.5747


Epoch 2/10: 100%|██████████| 100/100 [00:00<00:00, 476.33it/s]


Train Loss: 0.5308


Epoch 3/10: 100%|██████████| 100/100 [00:00<00:00, 501.94it/s]


Train Loss: 0.5128


Epoch 4/10: 100%|██████████| 100/100 [00:00<00:00, 536.04it/s]


Train Loss: 0.4980


Epoch 5/10: 100%|██████████| 100/100 [00:00<00:00, 472.21it/s]


Train Loss: 0.4968


Epoch 6/10: 100%|██████████| 100/100 [00:00<00:00, 490.45it/s]


Train Loss: 0.4819


Epoch 7/10: 100%|██████████| 100/100 [00:00<00:00, 554.66it/s]


Train Loss: 0.4760


Epoch 8/10: 100%|██████████| 100/100 [00:00<00:00, 554.98it/s]


Train Loss: 0.4720


Epoch 9/10: 100%|██████████| 100/100 [00:00<00:00, 495.74it/s]


Train Loss: 0.4712


Epoch 10/10: 100%|██████████| 100/100 [00:00<00:00, 531.72it/s]


Train Loss: 0.4670
Accuracy on test set: 0.6602


In [7]:
# Step 5: Evaluate the model
model.eval()
correct = 0
total = 0
predictions = []

with torch.no_grad():
    for names, labels in test_loader:
        names, labels = names.to(device), labels.to(device)
        
        outputs = model(names)
        _, predicted = torch.max(outputs.data, 1)
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # Decode names and labels
        decoded_names = []
        for name_tensor in names.cpu():
            decoded_name = ''.join([test_dataset.idx2char[idx.item()] for idx in name_tensor if idx.item() != 0])
            decoded_names.append(decoded_name)
            
        predicted_labels = predicted.cpu().numpy().tolist()
        predictions.extend(list(zip(decoded_names, predicted_labels)))

accuracy = correct / total
print(f'Accuracy on test set: {accuracy:.4f}')

# Display test data input and output
print("\nTest Data Input and Predictions:")
for name, prediction in predictions:
    print(f"Name: {name}, Predicted Label: {'Male' if prediction == 0 else 'Female'}")


Accuracy on test set: 0.6602

Test Data Input and Predictions:
Name: Hersh, Predicted Label: Male
Name: Saxon, Predicted Label: Female
Name: Roselyn, Predicted Label: Male
Name: Karel, Predicted Label: Female
Name: Ariadne, Predicted Label: Female
Name: Hilary, Predicted Label: Female
Name: Sebastien, Predicted Label: Female
Name: Anallese, Predicted Label: Female
Name: Margurite, Predicted Label: Male
Name: Ruthi, Predicted Label: Male
Name: Kelcy, Predicted Label: Female
Name: Tuck, Predicted Label: Male
Name: Bernete, Predicted Label: Male
Name: Rodie, Predicted Label: Female
Name: Wilt, Predicted Label: Male
Name: Chloris, Predicted Label: Male
Name: Normand, Predicted Label: Male
Name: Fawne, Predicted Label: Female
Name: Bethany, Predicted Label: Female
Name: Lianna, Predicted Label: Female
Name: Pasquale, Predicted Label: Male
Name: Clifford, Predicted Label: Male
Name: Letti, Predicted Label: Female
Name: Briggs, Predicted Label: Male
Name: Danya, Predicted Label: Female
Name: 

# References
1. https://arxiv.org/pdf/2102.03692.pdf
2. https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/exercise/13-attention.html
3. https://towardsdatascience.com/deep-learning-gender-from-name-lstm-recurrent-neural-networks-448d64553044
4. https://www.nltk.org/book/ch02.html#sec-lexical-resources