In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm.notebook import tqdm

In [None]:
# Fetch the CMU AI-repository list of female first names; wget saves it as ./female.txt.
!wget http://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt

--2023-11-06 05:34:35--  http://www.cs.cmu.edu/afs/cs/project/ai-repository/ai/areas/nlp/corpora/names/female.txt
Resolving www.cs.cmu.edu (www.cs.cmu.edu)... 128.2.42.95
Connecting to www.cs.cmu.edu (www.cs.cmu.edu)|128.2.42.95|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 35751 (35K) [text/plain]
Saving to: ‘female.txt’


2023-11-06 05:34:36 (225 KB/s) - ‘female.txt’ saved [35751/35751]



In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [None]:
# Load the downloaded corpus as a list of raw lines (trailing newlines are kept).
with open('female.txt', 'r') as corpus_file:
    lines = list(corpus_file)
print(lines)

['# List of common female names.\n', '# Copyright (c) January 1991 by Mark Kantrowitz.\n', '# 4987 names\n', '# Thanks to Bill.Ross for about 1000 additional names.\n', '# Version 1.3 (29-MAR-94)\n', '\n', 'Abagael\n', 'Abagail\n', 'Abbe\n', 'Abbey\n', 'Abbi\n', 'Abbie\n', 'Abby\n', 'Abigael\n', 'Abigail\n', 'Abigale\n', 'Abra\n', 'Acacia\n', 'Ada\n', 'Adah\n', 'Adaline\n', 'Adara\n', 'Addie\n', 'Addis\n', 'Adel\n', 'Adela\n', 'Adelaide\n', 'Adele\n', 'Adelice\n', 'Adelina\n', 'Adelind\n', 'Adeline\n', 'Adella\n', 'Adelle\n', 'Adena\n', 'Adey\n', 'Adi\n', 'Adiana\n', 'Adina\n', 'Adora\n', 'Adore\n', 'Adoree\n', 'Adorne\n', 'Adrea\n', 'Adria\n', 'Adriaens\n', 'Adrian\n', 'Adriana\n', 'Adriane\n', 'Adrianna\n', 'Adrianne\n', 'Adrien\n', 'Adriena\n', 'Adrienne\n', 'Aeriel\n', 'Aeriela\n', 'Aeriell\n', 'Ag\n', 'Agace\n', 'Agata\n', 'Agatha\n', 'Agathe\n', 'Aggi\n', 'Aggie\n', 'Aggy\n', 'Agna\n', 'Agnella\n', 'Agnes\n', 'Agnese\n', 'Agnesse\n', 'Agneta\n', 'Agnola\n', 'Agretha\n', 'Aida\n',

In [None]:
# Build the list of training names from the raw corpus lines.
#
# Each line is stripped of surrounding whitespace (so '\n', '\r\n' and a
# missing final newline are all handled) and lower-cased. Header comments
# ('#'), blank lines and anything that is not purely ASCII letters are
# skipped, because every character must map to an index in 0..25 via
# ord(ch) - ord('a').
names = []
for raw_line in lines:
    candidate = raw_line.strip().lower()
    if candidate.startswith('#'):
        continue  # corpus header / comment line
    if not (candidate.isascii() and candidate.isalpha()):
        continue  # blank line, or contains characters outside a-z
    names.append(candidate)

# Sequences are padded to a common length. If that length were 10, the name
# 'tom' would be stored as 'tom' followed by 7 padding slots.
# One extra slot is reserved for the 'EOS' (end of signal) marker, so a
# generated "tom" is really "tom<EOS>".
max_len = max((len(name) for name in names), default=0) + 1
print('Maximum Length : ' + str(max_len))
print(names)

Maximum Length : 14
['abagael', 'abagail', 'abbe', 'abbey', 'abbi', 'abbie', 'abby', 'abigael', 'abigail', 'abigale', 'abra', 'acacia', 'ada', 'adah', 'adaline', 'adara', 'addie', 'addis', 'adel', 'adela', 'adelaide', 'adele', 'adelice', 'adelina', 'adelind', 'adeline', 'adella', 'adelle', 'adena', 'adey', 'adi', 'adiana', 'adina', 'adora', 'adore', 'adoree', 'adorne', 'adrea', 'adria', 'adriaens', 'adrian', 'adriana', 'adriane', 'adrianna', 'adrianne', 'adrien', 'adriena', 'adrienne', 'aeriel', 'aeriela', 'aeriell', 'ag', 'agace', 'agata', 'agatha', 'agathe', 'aggi', 'aggie', 'aggy', 'agna', 'agnella', 'agnes', 'agnese', 'agnesse', 'agneta', 'agnola', 'agretha', 'aida', 'aidan', 'aigneis', 'aila', 'aile', 'ailee', 'aileen', 'ailene', 'ailey', 'aili', 'ailina', 'ailyn', 'aime', 'aimee', 'aimil', 'aina', 'aindrea', 'ainslee', 'ainsley', 'ainslie', 'ajay', 'alaine', 'alameda', 'alana', 'alanah', 'alane', 'alanna', 'alayne', 'alberta', 'albertina', 'albertine', 'albina', 'alecia', 'aleda'

In [None]:
# Lowercase letters have consecutive code points, so subtracting ord('a')
# turns a letter into its 0-based position in the alphabet.
print(*map(ord, 'abz'))
offset_of_c = ord('c') - ord('a')
print('Index of "c" :', offset_of_c)

97 98 122
Index of "c" : 2


In [None]:
class NameDataset(Dataset):
    """Character-level next-character dataset built from lowercase names.

    Every name is encoded as alphabet indices (a -> 0, ..., z -> 25) and
    right-padded with the end-of-signal (EOS) index 26 up to ``max_len``.
    The model input is the padded sequence without its last slot; the target
    is the same sequence shifted left by one position.
    """

    def __init__(self, names, max_len):
        """
        Args:
            names: sequence of lowercase a-z strings.
            max_len: padded sequence length, including the EOS slot.
        """
        self.names = names
        self.max_len = max_len
        self.a_order = ord('a')
        self.z_order = ord('z')
        self.num_classes = 26 + 1 # a-z + include the end of signal

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return the encoded sample for ``self.names[idx]``.

        Returns:
            dict with keys
              'input'    : LongTensor of length max_len - 1,
              'output'   : LongTensor of length max_len - 1 (input shifted by one),
              'length'   : number of characters in the name,
              'original' : the name string itself.
        """
        # Read from the dataset's own list, not from a notebook-level global,
        # so the dataset works for whatever names it was constructed with.
        name = self.names[idx]

        # Start from a buffer filled with the EOS index: [26, 26, ..., 26]
        eos_index = self.num_classes - 1
        padded_name = [eos_index] * self.max_len

        # 'abe' -> [0, 1, 4]
        curr_name = [ord(ch) - self.a_order for ch in name]
        # padded name : [0, 1, 4, 26, 26, 26, ...]
        padded_name[:len(curr_name)] = curr_name

        # Slide the sequence by one step to get (input, target) pairs.
        sample = dict()
        sample['input'] = torch.LongTensor(padded_name[:-1])  # a b e 26 26 ...
        sample['output'] = torch.LongTensor(padded_name[1:])  # b e 26 26 26 ...
        sample['length'] = len(name)
        sample['original'] = name

        return sample

In [None]:
# Mini-batch size used when iterating over the dataset.
batch_size = 64
dataset = NameDataset(names, max_len)
# Pass the variable (not a repeated literal) so changing batch_size above
# actually changes the loader.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Draw one shuffled mini-batch, show each field of its first element,
# then the batch-level tensor shapes.
sample = next(iter(dataloader))
for field in ('input', 'output', 'length', 'original'):
    print(sample[field][0])
print(sample['input'].shape, sample['output'].shape)

tensor([18, 20, 25,  0, 13, 13, 26, 26, 26, 26, 26, 26, 26])
tensor([20, 25,  0, 13, 13, 26, 26, 26, 26, 26, 26, 26, 26])
tensor(6)
suzann
torch.Size([64, 13]) torch.Size([64, 13])


In [None]:
# Reorder the batch from the longest name to the shortest.
# This could be useful with variable lengths.
total_lengths = sample['length']
sort_length, sort_idx = total_lengths.sort(descending=True)
sort_input, sort_output = (sample[key][sort_idx] for key in ('input', 'output'))
sort_original_name = [sample['original'][i] for i in sort_idx]
# print(sort_length)
# print(sort_input)
# print(sort_original_name)

In [None]:
class RNNmodel(nn.Module):
    """Character-level LSTM language model used to generate names.

    Pipeline: character index -> embedding -> single-layer LSTM -> linear
    layer producing one logit per class. The last class index is the
    end-of-signal (EOS) marker.
    """

    def __init__(self,
                 lstm_dim=256,
                 num_classes=None,
                 max_len=None):
        """
        Args:
            lstm_dim: size of both the character embedding and the LSTM
                hidden state.
            num_classes: vocabulary size including EOS. When omitted, the
                notebook-level ``dataset.num_classes`` is used.
            max_len: padded sequence length including the EOS slot. When
                omitted, the notebook-level ``max_len`` is used.
        """
        super(RNNmodel, self).__init__()
        # Resolve the notebook-level defaults when the model is built rather
        # than when the class is defined, so a re-computed dataset/max_len is
        # picked up and the class can be defined without those globals.
        if num_classes is None:
            num_classes = dataset.num_classes
        if max_len is None:
            # The parameter shadows the global of the same name.
            max_len = globals()['max_len']

        self.lstm_dim = lstm_dim
        self.num_classes = num_classes
        self.max_len = max_len
        self.char_embedding = nn.Embedding(num_embeddings=num_classes,
                                           embedding_dim=lstm_dim)
        # nn.Embedding(num_embeddings, embedding_dim)
        # => a table of (num_embeddings) vectors, each (embedding_dim)-dimensional

        self.lstm = nn.LSTM(input_size=lstm_dim,
                            hidden_size=lstm_dim,
                            num_layers=1,
                            batch_first=True,
                            )
        # By default PyTorch's LSTM receives a (Length)x(Batch)x(Dimension) tensor;
        # with batch_first=True it receives a B x L x D tensor instead.

        self.out_linear = nn.Linear(lstm_dim, num_classes)

    def forward(self, sort_input, sort_output=None, sort_length=None):
        """Compute next-character logits for every position.

        Args:
            sort_input: B x L LongTensor of character indices.
            sort_output: unused; kept so callers can pass the targets.
            sort_length: unused; kept so callers can pass the lengths.

        Returns:
            B x L x num_classes tensor of logits.
        """
        ## torch.nn.utils.rnn.pack_padded_sequence is the recommended way to
        ## handle variable lengths; it is deliberately left out here to keep
        ## the example simple.

        lstm_input = self.char_embedding(sort_input)   # B x L x D
        lstm_out, _ = self.lstm(lstm_input)            # B x L x D
        return self.out_linear(lstm_out)               # B x L x num_classes

    @torch.no_grad()  # sampling needs no autograd graph
    def test(self, start_char):
        """Sample a name that begins with ``start_char``.

        Characters are drawn one at a time from the model's predicted
        distribution until EOS is sampled or the name reaches
        ``max_len - 1`` characters (the longest sequence seen in training).

        Args:
            start_char: a single lowercase letter.

        Returns:
            The generated name as a string (always starts with ``start_char``).

        Raises:
            ValueError: if ``start_char`` is not one of the model's letters.
        """
        eos_index = self.num_classes - 1
        start_index = ord(start_char) - ord('a')
        if not 0 <= start_index < eos_index:
            raise ValueError(
                'start_char must be a lowercase letter, got {!r}'.format(start_char))

        # Build tensors on the device the model actually lives on.
        model_device = self.char_embedding.weight.device
        curr_index = torch.tensor([[start_index]], dtype=torch.long,
                                  device=model_device)

        generated_name = [start_char]
        state = None  # None lets the LSTM start from a zero (h, c) state
        max_chars = self.max_len - 1

        while len(generated_name) < max_chars:
            curr_embed = self.char_embedding(curr_index)
            lstm_out, state = self.lstm(curr_embed, state)
            logits = self.out_linear(lstm_out)[0, 0, :]

            sample_next = torch.distributions.Categorical(logits=logits).sample().item()
            if sample_next == eos_index:
                break

            generated_name.append(chr(ord('a') + sample_next))
            curr_index = torch.tensor([[sample_next]], dtype=torch.long,
                                      device=model_device)

        return ''.join(generated_name)


In [None]:
# Shape demo: an embedding turns a (batch, length) grid of indices into
# (batch, length, embedding_dim) vectors; the model then maps the same grid
# to per-position class logits.
embed = nn.Embedding(num_embeddings=10, embedding_dim=64)
index = torch.tensor([[0, 2, 4],
                      [1, 3, 5]], dtype=torch.long)  # batch=2, length=3
print(index)
print(embed(index).size())
model = RNNmodel()
model(index, None, None)


tensor([[0, 2, 4],
        [1, 3, 5]])
torch.Size([2, 3, 64])
lstm_out shape: torch.Size([2, 3, 256])
output shape:  torch.Size([2, 3, 27])


tensor([[[ 0.1587, -0.1036, -0.0233,  0.0709,  0.0006,  0.0240,  0.1768,
           0.0077,  0.0111, -0.1287,  0.0874,  0.0460, -0.0621, -0.0659,
           0.0432,  0.0526,  0.0275,  0.0164, -0.0060, -0.0249,  0.0481,
           0.0010, -0.0263,  0.0498, -0.0647,  0.0189, -0.0413],
         [ 0.0392, -0.0226, -0.0274,  0.0272,  0.0727,  0.0642,  0.1140,
          -0.0487,  0.0154, -0.0379,  0.0908,  0.0371, -0.1874,  0.0232,
           0.0342, -0.0167,  0.0416, -0.1923, -0.0844, -0.0020, -0.0384,
           0.0038, -0.0151, -0.0216, -0.0455, -0.0372,  0.1321],
         [-0.0011, -0.1578,  0.1219, -0.0084, -0.0186,  0.1530, -0.1066,
          -0.1009,  0.0628,  0.1343,  0.0537,  0.1882, -0.1875, -0.0672,
           0.1877, -0.0036,  0.0660, -0.1787, -0.0946, -0.1588, -0.0009,
           0.0936, -0.0436, -0.1231, -0.0588,  0.0142,  0.0290]],

        [[ 0.0669, -0.0831, -0.1083,  0.0769, -0.0404, -0.1208, -0.0057,
          -0.0148,  0.0584, -0.0530,  0.0120, -0.0469,  0.0423, -0.1246,


In [None]:
# Build a fresh model and run one forward pass on the sampled batch (result
# discarded) before moving the model to `device` and creating its optimizer.
model = RNNmodel()
model(*(sample[key] for key in ('input', 'output', 'length')))
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
def train(model, optimizer, sample):
    """Run one optimisation step on a mini-batch and return its loss.

    Args:
        model: module called as ``model(inputs, targets, lengths)`` that
            returns logits of shape (B, T, C).
        optimizer: optimizer over ``model``'s parameters.
        sample: batch dict with 'input' and 'output' (B x T index tensors)
            and 'length' (one length per batch element).

    Returns:
        float: mean cross-entropy over all B*T positions (padding included).
    """
    # Use the device the model actually lives on instead of a notebook-level
    # global, so inputs and parameters can never end up on different devices.
    model_device = next(model.parameters()).device

    optimizer.zero_grad()
    criteria = nn.CrossEntropyLoss()

    # Order the batch from longest to shortest name.
    total_lengths = sample['length']
    sort_length, sort_idx = torch.sort(total_lengths, descending=True)

    sort_input = sample['input'][sort_idx].to(model_device)
    sort_output = sample['output'][sort_idx].to(model_device)
    sort_length = sort_length.to(model_device)

    pred = model(sort_input, sort_output, sort_length) # B T C
    B, T, C = pred.shape

    # sort_input : [[T, O, M], [A, M, Y]], B:2, T:3
    # sort_output(gt): [[O, M, eos], [M, Y, eos]]
    # prediction:      [[O, N, eos], [N, Y, eos]]

    # Flatten batch and time so every position is one classification example.
    curr_loss = criteria(pred.reshape(B * T, C), sort_output.reshape(B * T))

    curr_loss.backward()
    optimizer.step()

    return curr_loss.item()

In [None]:
# Train for a fixed number of epochs; after each epoch report the mean batch
# loss and one name sampled from a random starting letter.
max_epoch = 100
for epoch in tqdm(range(max_epoch)):
    total_loss = 0.0
    for sample in dataloader:
        curr_loss = train(model, optimizer, sample)
        total_loss += curr_loss / len(dataloader)

    # np.random.randint excludes its upper bound, so use ord('z') + 1 to make
    # 'z' a possible starting letter as well.
    start_char = chr(np.random.randint(ord('a'), ord('z') + 1))
    print('[EPOCH {}] TRAIN LOSS: {}, SAMPLED NAME: {}'.format(epoch,
                                                               total_loss,
                                                               model.test(start_char)))



  0%|          | 0/100 [00:00<?, ?it/s]

[EPOCH 0] TRAIN LOSS: 1.9535719492496586, SAMPLED NAME: ymunwenui
[EPOCH 1] TRAIN LOSS: 1.2529693505702875, SAMPLED NAME: jmgpdbae
[EPOCH 2] TRAIN LOSS: 1.1354151964187622, SAMPLED NAME: uboea
[EPOCH 3] TRAIN LOSS: 1.0730929397619686, SAMPLED NAME: jpdeolie
[EPOCH 4] TRAIN LOSS: 1.0306071822459881, SAMPLED NAME: swvra
[EPOCH 5] TRAIN LOSS: 0.9978405053798969, SAMPLED NAME: ha
[EPOCH 6] TRAIN LOSS: 0.9717443203314758, SAMPLED NAME: t
[EPOCH 7] TRAIN LOSS: 0.9501781257299277, SAMPLED NAME: rene
[EPOCH 8] TRAIN LOSS: 0.9314880493359688, SAMPLED NAME: marir
[EPOCH 9] TRAIN LOSS: 0.91518513743694, SAMPLED NAME: pharleq
[EPOCH 10] TRAIN LOSS: 0.9010432156232686, SAMPLED NAME: bra
[EPOCH 11] TRAIN LOSS: 0.8888701911155992, SAMPLED NAME: ronekle
[EPOCH 12] TRAIN LOSS: 0.8779245821329263, SAMPLED NAME: ddodeta
[EPOCH 13] TRAIN LOSS: 0.8679067652959087, SAMPLED NAME: lycania
[EPOCH 14] TRAIN LOSS: 0.8583846703553811, SAMPLED NAME: karien
[EPOCH 15] TRAIN LOSS: 0.849439060840851, SAMPLED NAME: je