# Part III
Using the previous two tutorials, please answer the following using an encorder-decoder approach and an LSTM compared approach. 

Please create a transformer-based classifier for English name classification into male or female.

There are several datasets for name for male or female classification. In subseuqent iterations, this could be expanded to included more classifications. 

Below is the source from NLTK, which only has male and female available but could be used for the purposes of this assignment. 

```
names = nltk.corpus.names
names.fileids()
['female.txt', 'male.txt']
male_names = names.words('male.txt')
female_names = names.words('female.txt')
[w for w in male_names if w in female_names]
['Abbey', 'Abbie', 'Abby', 'Addie', 'Adrian', 'Adrien', 'Ajay', 'Alex', 'Alexis',
'Alfie', 'Ali', 'Alix', 'Allie', 'Allyn', 'Andie', 'Andrea', 'Andy', 'Angel',
'Angie', 'Ariel', 'Ashley', 'Aubrey', 'Augustine', 'Austin', 'Averil', ...]
```

In [8]:
# !pip install nltk

# My Custom Encoder-Decoder

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from nltk.corpus import names

# Step 1: Prepare the dataset
male_names = names.words('male.txt')
female_names = names.words('female.txt')
names = male_names + female_names
labels = ['male'] * len(male_names) + ['female'] * len(female_names)

# Tokenization
word_to_idx = {}
for name in names:
    for word in name.split():
        word_to_idx.setdefault(word, len(word_to_idx))

# Convert names to sequences
sequences = [[word_to_idx[word] for word in name.split()] for name in names]
max_length = max(len(seq) for seq in sequences)
padded_sequences = [seq + [0] * (max_length - len(seq)) for seq in sequences]

# Convert labels to one-hot encoding
label_dict = {'male': 0, 'female': 1}
labels = torch.tensor([label_dict[label] for label in labels])

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Define a custom dataset
class NamesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X)
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Create data loaders
train_dataset = NamesDataset(X_train, y_train)
val_dataset = NamesDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Define the custom encoder
class CustomEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers):
        super(CustomEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.num_layers = num_layers
        self.attention_layers = nn.ModuleList([
            CustomMultiheadAttention(embedding_dim, num_heads) for _ in range(num_layers)
        ])
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, embedding_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        embedded = self.embedding(x)
        for i in range(self.num_layers):
            # Multi-head self-attention
            attended = self.attention_layers[i](embedded, embedded, embedded)
            # Feedforward layer
            embedded = self.activation(self.linear1(attended))
            embedded = self.linear2(embedded)
        return embedded

# Define the custom Multihead Attention module
class CustomMultiheadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(CustomMultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.query_weight = nn.Parameter(torch.Tensor(embed_dim, embed_dim))
        self.key_weight = nn.Parameter(torch.Tensor(embed_dim, embed_dim))
        self.value_weight = nn.Parameter(torch.Tensor(embed_dim, embed_dim))
        self.output_weight = nn.Parameter(torch.Tensor(embed_dim, embed_dim))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.query_weight)
        nn.init.xavier_uniform_(self.key_weight)
        nn.init.xavier_uniform_(self.value_weight)
        nn.init.xavier_uniform_(self.output_weight)

    def forward(self, query, key, value):

        query = torch.matmul(query, self.query_weight)
        key = torch.matmul(key, self.key_weight)
        value = torch.matmul(value, self.value_weight)

        query = query.view(query.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(key.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(value.size(0), -1, self.num_heads, self.head_dim).transpose(1, 2)

        attention_scores = torch.matmul(query, key.transpose(-2, -1)) * self.scale
        attention_weights = nn.functional.softmax(attention_scores, dim=-1)

        attention_output = torch.matmul(attention_weights, value)

        attention_output = attention_output.transpose(1, 2).contiguous().view(attention_output.size(0), -1, self.embed_dim)
        attention_output = torch.matmul(attention_output, self.output_weight)
        return attention_output

# Define the transformer model with custom encoder
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, num_classes):
        super(TransformerClassifier, self).__init__()
        self.encoder = CustomEncoder(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        encoded = self.encoder(x)
        encoded = encoded.mean(dim=1) 
        output = self.fc(encoded)
        return output

# Instantiate the model
vocab_size = len(word_to_idx)
embedding_dim = 128
num_heads = 4
hidden_dim = 256
num_layers = 2
num_classes = 2

model = TransformerClassifier(vocab_size, embedding_dim, num_heads, hidden_dim, num_layers, num_classes)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
predictions = []
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

         
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {val_loss/len(val_loader):.4f}, Accuracy: {(correct/total)*100:.2f}%")
    # Display test data input and output


Epoch 1/10, Loss: 0.6696, Accuracy: 60.92%
Epoch 2/10, Loss: 0.6717, Accuracy: 60.92%
Epoch 3/10, Loss: 0.6785, Accuracy: 61.11%
Epoch 4/10, Loss: 0.7292, Accuracy: 60.92%
Epoch 5/10, Loss: 0.8724, Accuracy: 55.44%
Epoch 6/10, Loss: 1.1155, Accuracy: 48.84%
Epoch 7/10, Loss: 1.2375, Accuracy: 48.58%
Epoch 8/10, Loss: 1.4974, Accuracy: 48.46%
Epoch 9/10, Loss: 1.6306, Accuracy: 46.82%
Epoch 10/10, Loss: 1.7966, Accuracy: 50.28%


In [10]:
# Validation loop with printing predicted outputs
model.eval()
with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        for idx, name_seq in enumerate(inputs):
            name = ' '.join([word for word, idx in word_to_idx.items() if idx == name_seq.tolist()[0]])
            prediction = 'male' if predicted[idx] == 0 else 'female'
            print(f"Name: {name}, Predicted Label: {prediction}")

Name: Hersh, Predicted Label: male
Name: Saxon, Predicted Label: female
Name: Roselyn, Predicted Label: female
Name: Karel, Predicted Label: female
Name: Ariadne, Predicted Label: female
Name: Hilary, Predicted Label: female
Name: Sebastien, Predicted Label: female
Name: Anallese, Predicted Label: female
Name: Margurite, Predicted Label: male
Name: Ruthi, Predicted Label: female
Name: Kelcy, Predicted Label: female
Name: Tuck, Predicted Label: female
Name: Bernete, Predicted Label: female
Name: Rodie, Predicted Label: female
Name: Wilt, Predicted Label: female
Name: Chloris, Predicted Label: female
Name: Normand, Predicted Label: male
Name: Fawne, Predicted Label: male
Name: Bethany, Predicted Label: male
Name: Lianna, Predicted Label: male
Name: Pasquale, Predicted Label: female
Name: Clifford, Predicted Label: female
Name: Letti, Predicted Label: male
Name: Briggs, Predicted Label: male
Name: Danya, Predicted Label: female
Name: Gregor, Predicted Label: male
Name: Bette, Predicted La

# With LSTM

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import nltk
from nltk.corpus import names
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

# Prepare the dataset
nltk.download('names')
male_names = names.words('male.txt')
female_names = names.words('female.txt')

# Create labeled dataset
data = [(name, 0) for name in male_names] + [(name, 1) for name in female_names]

# Split dataset into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Preprocess the data
class NameDataset(Dataset):
    def __init__(self, data):
        self.data = data
        self.char2idx = {char: idx + 1 for idx, char in enumerate(set(''.join([name for name, label in data])))}
        self.idx2char = {idx + 1: char for idx, char in enumerate(set(''.join([name for name, label in data])))}
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        name, label = self.data[idx]
        name_encoded = [self.char2idx[char] for char in name]
        return torch.LongTensor(name_encoded), torch.tensor(label)

def collate_fn(batch):
    names, labels = zip(*batch)
    names = pad_sequence(names, batch_first=True)
    labels = torch.stack(labels)
    return names, labels

train_dataset = NameDataset(train_data)
test_dataset = NameDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Define the Transformer model architecture
class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers=1, dropout=0.1):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_output, _ = self.lstm(embedded)
        lstm_output = lstm_output[:, -1, :]  # Take the output of the last time step
        output = self.fc(lstm_output)
        return output

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = LSTMClassifier(len(train_dataset.char2idx) + 1, 128, 2)
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for names, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        names, labels = names.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(names)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * names.size(0)
        
    epoch_loss = running_loss / len(train_dataset)
    print(f'Train Loss: {epoch_loss:.4f}')

# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for names, labels in test_loader:
        names, labels = names.to(device), labels.to(device)
        
        outputs = model(names)
        _, predicted = torch.max(outputs.data, 1)
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Accuracy on test set: {accuracy:.4f}')

[nltk_data] Downloading package names to /home/fk-ubuntu/nltk_data...
[nltk_data]   Package names is already up-to-date!
Epoch 1/10: 100%|██████████| 100/100 [00:00<00:00, 454.22it/s]


Train Loss: 0.5335


Epoch 2/10: 100%|██████████| 100/100 [00:00<00:00, 751.11it/s]


Train Loss: 0.4111


Epoch 3/10: 100%|██████████| 100/100 [00:00<00:00, 733.99it/s]


Train Loss: 0.3794


Epoch 4/10: 100%|██████████| 100/100 [00:00<00:00, 750.78it/s]


Train Loss: 0.3529


Epoch 5/10: 100%|██████████| 100/100 [00:00<00:00, 749.73it/s]


Train Loss: 0.3305


Epoch 6/10: 100%|██████████| 100/100 [00:00<00:00, 739.73it/s]


Train Loss: 0.3060


Epoch 7/10: 100%|██████████| 100/100 [00:00<00:00, 710.65it/s]


Train Loss: 0.2847


Epoch 8/10: 100%|██████████| 100/100 [00:00<00:00, 637.02it/s]


Train Loss: 0.2662


Epoch 9/10: 100%|██████████| 100/100 [00:00<00:00, 681.07it/s]


Train Loss: 0.2442


Epoch 10/10: 100%|██████████| 100/100 [00:00<00:00, 751.92it/s]

Train Loss: 0.2208
Accuracy on test set: 0.4292





In [12]:
# Step 5: Evaluate the model
model.eval()
correct = 0
total = 0
predictions = []

with torch.no_grad():
    for names, labels in test_loader:
        names, labels = names.to(device), labels.to(device)
        
        outputs = model(names)
        _, predicted = torch.max(outputs.data, 1)
        
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        
        # Decode names and labels
        decoded_names = []
        for name_tensor in names.cpu():
            decoded_name = ''.join([test_dataset.idx2char[idx.item()] for idx in name_tensor if idx.item() != 0])
            decoded_names.append(decoded_name)
            
        predicted_labels = predicted.cpu().numpy().tolist()
        predictions.extend(list(zip(decoded_names, predicted_labels)))

accuracy = correct / total
print(f'Accuracy on test set: {accuracy:.4f}')

# Display test data input and output
print("\nTest Data Input and Predictions:")
for name, prediction in predictions:
    print(f"Name: {name}, Predicted Label: {'Male' if prediction == 0 else 'Female'}")

Accuracy on test set: 0.4292

Test Data Input and Predictions:
Name: Hersh, Predicted Label: Female
Name: Saxon, Predicted Label: Male
Name: Roselyn, Predicted Label: Male
Name: Karel, Predicted Label: Male
Name: Ariadne, Predicted Label: Male
Name: Hilary, Predicted Label: Female
Name: Sebastien, Predicted Label: Male
Name: Anallese, Predicted Label: Female
Name: Margurite, Predicted Label: Male
Name: Ruthi, Predicted Label: Female
Name: Kelcy, Predicted Label: Female
Name: Tuck, Predicted Label: Male
Name: Bernete, Predicted Label: Male
Name: Rodie, Predicted Label: Male
Name: Wilt, Predicted Label: Male
Name: Chloris, Predicted Label: Male
Name: Normand, Predicted Label: Male
Name: Fawne, Predicted Label: Male
Name: Bethany, Predicted Label: Male
Name: Lianna, Predicted Label: Male
Name: Pasquale, Predicted Label: Male
Name: Clifford, Predicted Label: Male
Name: Letti, Predicted Label: Male
Name: Briggs, Predicted Label: Male
Name: Danya, Predicted Label: Male
Name: Gregor, Predicte

# References
1. https://arxiv.org/pdf/2102.03692.pdf
2. https://alvinntnu.github.io/NTNU_ENC2045_LECTURES/exercise/13-attention.html
3. https://towardsdatascience.com/deep-learning-gender-from-name-lstm-recurrent-neural-networks-448d64553044
4. https://www.nltk.org/book/ch02.html#sec-lexical-resources