In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import re
from sklearn.model_selection import train_test_split

In [2]:
def manual_update(model, learning_rate):
    with torch.no_grad():
        for param in model.parameters():
            param.grad = None
        for param in model.parameters():
            param.data -= learning_rate * param.grad
            

In [3]:
names = open('/Users/felipeakiomatsuoka/Desktop/NLP/DrugNames.txt', 'r').read().split('\n')
names = [re.sub(r'[^a-zA-Z0-9 ]', '', name) for name in names]
names = [name.lower() for name in names]

In [4]:
chars = sorted(list(set(''.join(names))))
stoi = {ch:i + 1 for i, ch in enumerate(chars)}
stoi['.'] = 0
itos = {i:ch for ch, i in stoi.items()}

print(f"Vocabulary size: {len(chars)}")

Vocabulary size: 37


In [5]:
block_size = 4
batch_size = 32

X, Y = [], []

for name in names:
    context = [0] * block_size
    for ch in name + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]

X = torch.tensor(X)
Y = torch.tensor(Y) 

print(f"X shape: {X.shape}, dtype: {X.dtype}")
print(f"Y shape: {Y.shape}, dtype: {Y.dtype}")

print(X[0])
print(Y[0])


X shape: torch.Size([882771, 4]), dtype: torch.int64
Y shape: torch.Size([882771]), dtype: torch.int64
tensor([0, 0, 0, 0])
tensor(27)


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)    
print(f"X_train shape is: {X_train.shape}")
print(f"X_test shape is: {X_test.shape}")
print(f"Y_train shape is: {Y_train.shape}")
print(f"Y_test shape is: {Y_test.shape}")


train_dataset = TensorDataset(X_train,Y_train)
test_dataset = TensorDataset(X_test,Y_test)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

X_train shape is: torch.Size([706216, 4])
X_test shape is: torch.Size([176555, 4])
Y_train shape is: torch.Size([706216])
Y_test shape is: torch.Size([176555])


In [7]:
class NeuralNetwork(nn.Module):
    def __init__(self, vocab_size, block_size,embedding_dim, n_hidden):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, embedding_dim)
        self.layers = nn.Sequential(
            nn.Linear(block_size * embedding_dim, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
            #nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
            #nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
            #nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
            #nn.Linear(n_hidden, n_hidden, bias=False), nn.BatchNorm1d(n_hidden), nn.Tanh(),
            nn.Linear(n_hidden, vocab_size, bias=False), nn.BatchNorm1d(vocab_size)
        )
        
        with torch.no_grad():
            self.layers[-1].weight.data *= 0.1
            for layer in self.layers[:-1]:
                if isinstance(layer, nn.Linear):
                    layer.weight.data *= 5/3

    def forward(self, x):
        x = self.emb(x)
        x = x.view(x.shape[0], -1)
        for layer in self.layers:
            x = layer(x)
        return x
    


In [8]:
model = NeuralNetwork(len(stoi), block_size, 10, 200)
print(model)

NeuralNetwork(
  (emb): Embedding(38, 10)
  (layers): Sequential(
    (0): Linear(in_features=40, out_features=200, bias=False)
    (1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Tanh()
    (3): Linear(in_features=200, out_features=38, bias=False)
    (4): BatchNorm1d(38, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)


In [9]:
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Number of parameters:", num_params)

Number of parameters: 16456


In [10]:
epochs = 200000
lr = 0.1
e = 1e-8


for epoch in range(epochs):
    ix = torch.randint(0, len(X_train), (batch_size,))
    Xb, Yb = X_train[ix], Y_train[ix]
    Y_pred = model(Xb)
    loss = F.cross_entropy(Y_pred, Yb)
    for p in model.parameters():
        p.grad = None
    loss.backward()
    lr = 0.1 if epoch < 100000 else 0.01 
    for p in model.parameters():
        p.data -= lr * p.grad
    if epoch % 10000 == 0:
        print(f"Epoch: {epoch}, Loss: {loss.item():.4f}")


Epoch: 0, Loss: 3.6664
Epoch: 10000, Loss: 1.2775
Epoch: 20000, Loss: 1.6483
Epoch: 30000, Loss: 1.5610
Epoch: 40000, Loss: 1.2053
Epoch: 50000, Loss: 1.0707
Epoch: 60000, Loss: 0.7556
Epoch: 70000, Loss: 1.5316
Epoch: 80000, Loss: 1.2795
Epoch: 90000, Loss: 1.2239
Epoch: 100000, Loss: 1.1112
Epoch: 110000, Loss: 0.8924
Epoch: 120000, Loss: 1.0001
Epoch: 130000, Loss: 1.3853
Epoch: 140000, Loss: 1.0483
Epoch: 150000, Loss: 1.0810
Epoch: 160000, Loss: 1.2999
Epoch: 170000, Loss: 1.0369
Epoch: 180000, Loss: 1.4115
Epoch: 190000, Loss: 1.0116


In [11]:
@torch.no_grad()
def split_loss(split):
    x,y = {
        'train': (X_train, Y_train),
        'test': (X_test, Y_test)
    }[split]
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(f"{split} loss: {loss.item()}")

split_loss('train')
split_loss('test')

train loss: 1.0221842527389526
test loss: 1.0418405532836914


In [38]:
def generate_text(model, block_size, itos, num_of_words, temperature=1.0):
    for i in range(num_of_words):
        context = [0] * block_size  # Initialize with zeros
        out = []
        while True:
            with torch.no_grad():
                x = torch.tensor([context], dtype=torch.long)
                logits = model(x)
                probs = F.softmax(logits / temperature, dim=-1).squeeze()
                next_char_idx = torch.multinomial(probs, num_samples=1).item()
                context = context[1:] + [next_char_idx]  # Update context
                out.append(next_char_idx)
                if next_char_idx == 0:  # Assuming '\0' is the end token
                    break
        print(''.join([itos[i] for i in out]))


In [39]:

x = torch.tensor([[0] * block_size], dtype=torch.long)
model.eval()
generate_text(model, block_size, itos, num_of_words=10, temperature=1.0)


norphin.
prrostigminix.
cfimopssacandroxyzos.
prednic acid.
sulfate and hydrochloride.
cefazole maleate.
carbonate.
hydrochloride.
valprazole.
methorphenavrin sodex.
