In [33]:
import torch

In [34]:
with open('nomes.csv', 'r') as fp:
    raw = fp.read().splitlines()

male_names_list = []
female_names_list = []

# Getting list of all names from the first and third columns of the dataset
for line in raw[1:]:
    cols = line.split(',')
    first_col = cols[0].split('|')
    third_col = cols[2].split('|')
    frequency_female = 0 if cols[3] == '' else int(cols[3])
    frequency_male = 0 if cols[4] == '' else int(cols[4])
    frequency_total = int(cols[5])
    if frequency_total>=700 and third_col != ['']:
        if frequency_male>frequency_female:
            male_names_list.extend(third_col)
        else:
            female_names_list.extend(third_col)
        
        

male_names = list(set(male_names_list))
print(len(male_names))

female_names = list(set(female_names_list))
print(len(female_names))

4793
5758


In [47]:
import random

random.sample(range(100), 10)

[17, 99, 45, 73, 89, 34, 18, 83, 95, 29]

In [58]:
possible_chars = sorted(list(set(''.join(female_names) + '.')))
n_possible_chars = len(possible_chars)
def encode(x):
    return possible_chars.index(x)

X = []
Y = []

n_content_block = 3
for i, name in enumerate(female_names):
    content_block = '...'
    name += '.'
    if i<3:
        print(f'Name: {name}')
    for char in name:
        encoded_content_block = [encode(c) for c in content_block]
        encoded_char = encode(char)
        if i<3:
            print(f'    {content_block} -> {char}   |||   {encoded_content_block} -> {encoded_char}')
        X.append(encoded_content_block)
        Y.append(encoded_char)
        content_block = content_block[1:] + char
    
X = torch.tensor(X)
Y = torch.tensor(Y)

n80th = int(0.8*len(X))
n90th = int(0.9*len(X))

X_train = X[:n80th]
Y_train = Y[:n80th]

X_test = X[n80th:n90th]
Y_test = Y[n80th:n90th]

X_eval = X[n90th:]
Y_eval = Y[n90th:]

Name: JOVENTINA.
    ... -> J   |||   [0, 0, 0] -> 10
    ..J -> O   |||   [0, 0, 10] -> 15
    .JO -> V   |||   [0, 10, 15] -> 22
    JOV -> E   |||   [10, 15, 22] -> 5
    OVE -> N   |||   [15, 22, 5] -> 14
    VEN -> T   |||   [22, 5, 14] -> 20
    ENT -> I   |||   [5, 14, 20] -> 9
    NTI -> N   |||   [14, 20, 9] -> 14
    TIN -> A   |||   [20, 9, 14] -> 1
    INA -> .   |||   [9, 14, 1] -> 0
Name: DORCAS.
    ... -> D   |||   [0, 0, 0] -> 4
    ..D -> O   |||   [0, 0, 4] -> 15
    .DO -> R   |||   [0, 4, 15] -> 18
    DOR -> C   |||   [4, 15, 18] -> 3
    ORC -> A   |||   [15, 18, 3] -> 1
    RCA -> S   |||   [18, 3, 1] -> 19
    CAS -> .   |||   [3, 1, 19] -> 0
Name: JAIDETE.
    ... -> J   |||   [0, 0, 0] -> 10
    ..J -> A   |||   [0, 0, 10] -> 1
    .JA -> I   |||   [0, 10, 1] -> 9
    JAI -> D   |||   [10, 1, 9] -> 4
    AID -> E   |||   [1, 9, 4] -> 5
    IDE -> T   |||   [9, 4, 5] -> 20
    DET -> E   |||   [4, 5, 20] -> 5
    ETE -> .   |||   [5, 20, 5] -> 0


In [59]:
n_dimensions = 4
n_neuros_h1 = 400

V  = torch.randn(n_possible_chars, n_dimensions)
W1 = torch.randn(n_dimensions * n_content_block, n_neuros_h1) * 0.2
b1 = torch.randn(n_neuros_h1) * 0.1
W2 = torch.randn(n_neuros_h1, n_possible_chars) * 0.01
b2 = torch.randn(n_possible_chars) * 0.1

parameters = [V, W1, b1, W2, b2]
for parameter in parameters:
    parameter.requires_grad = True


In [60]:
n_epochs = 1000
learning_rate = 1
loss_fn = torch.nn.CrossEntropyLoss()

for epoch_i in range(n_epochs):
    # Forward Pass
    X_vectorized =  V[X_train].view(len(X_train), -1)
    
    h1 = (X_vectorized @ W1 + b1).tanh()
    output = h1 @ W2 + b2

    loss = loss_fn(output, Y_train) 

    # Backprop
    for parameter in parameters:
        parameter.grad = None
    loss.backward()

    # Update
    for parameter in parameters:
        parameter.data += - learning_rate * parameter.grad
    
    if epoch_i % 100 == 0:
        print(f'epoch: {epoch_i + 1} / {n_epochs}: {loss.item()}')


epoch: 1 / 1000: 3.2682111263275146
epoch: 101 / 1000: 2.1642227172851562
epoch: 201 / 1000: 2.066270589828491
epoch: 301 / 1000: 1.9836918115615845
epoch: 401 / 1000: 1.9301937818527222
epoch: 501 / 1000: 1.8660606145858765
epoch: 601 / 1000: 1.854499101638794
epoch: 701 / 1000: 1.8482017517089844
epoch: 801 / 1000: 1.8086493015289307
epoch: 901 / 1000: 1.8173540830612183


In [55]:
X_vectorized_test =  V[X_test].view(len(X_test), -1)
h1 = (X_vectorized_test @ W1 + b1).tanh()
output = h1 @ W2 + b2
loss = loss_fn(output, Y_test) 
print(f'Test: {loss}')

Test: 1.8254585266113281


In [61]:
n_names = 20
softmax = torch.nn.Softmax()

for i in range(n_names):
    content_block = '...' 
    new_name = ''
    while True:
        start_char_encoded = [encode(c) for c in content_block]
        start_char_vector = V[start_char_encoded].view(-1)

        h1 = (start_char_vector @ W1 + b1).tanh()
        out = h1 @ W2 + b2

        probabilities = softmax(out)
        next_char_index = torch.multinomial(probabilities, num_samples=1, replacement=True).item()
        next_char = possible_chars[next_char_index]

        if next_char == '.':
            break
        else:
            content_block = content_block[1:] + next_char
            new_name += next_char
    print(new_name)

GELY
TACI
GRACE
ZELENE
NILSA
CEL
STEFANE
DILLA
KELLA
ADELA
KELE
JAULENE
ZUANY
RAIANE
TAYADAIONE
ALCE
BERMILERILYN
LUANA
MALINE
NELIZALA


  probabilities = softmax(out)


In [57]:
torch.save(V, 'V.pt')
torch.save(W1, 'W1.pt')
torch.save(b1, 'b1.pt')
torch.save(W2, 'W2.pt')
torch.save(b2, 'b2.pt')
with open('possible_chars.txt', 'w') as fp:
    for possible_char in possible_chars:
        fp.write(possible_char + '\n')