## Basic MultiLayer Perceptron for Text Generation

In [None]:
#Just run this one time if pytorch and cuda is not already installed in the current jupyter kernel 
# !conda install pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch -y

In [4]:
# This is to use some benchmark data
# !pip install torchdata 
# !pip install torchtext

In [1]:
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [2]:
from torchtext.datasets import AG_NEWS
train_iter = iter(AG_NEWS(split='train'))

In [3]:
next(train_iter)

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

First we need to preprocess the dataset. This will consist in:
* Eliminating stop words
* 

In [4]:
words = ''

for i, (tag, text) in enumerate(train_iter):
    words += ' ' + text
    if i == 1000:
        break

In [12]:
words[:1000]

" Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market. Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums. Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday. Oil prices soar to all-time record, posing new menace to US economy (AFP) AFP - Tearaway world oil prices, toppling records and straining wallets, present a new economic menace barely three months before the US presidential election

In [16]:
from gensim.parsing.preprocessing import preprocess_string, strip_tags, remove_stopwords, strip_short, stem_text

# Preprocess text and tokenize
CUSTOM_FILTERS = [strip_tags, remove_stopwords, strip_short, stem_text]
tokens = preprocess_string(words, filters=CUSTOM_FILTERS) #Preprocess without stemming

In [17]:
tokens[0:10]

['carlyl',
 'look',
 'toward',
 'commerci',
 'aerospac',
 '(reuters)',
 'reuter',
 'privat',
 'invest',
 'firm']

In [36]:
from torchtext.vocab import build_vocab_from_iterator
# Build small vocab from tokens

vocab_size = 2000

# This expects list of list of tokens, if i just pass list of tokens it considers the letters as the tokens
vocab = build_vocab_from_iterator([tokens], max_tokens=vocab_size) #Take into account just the 2000 more common words
vocab.set_default_index(vocab_size)

Build text generation dataset

In [29]:
context_size = 4
X, Y = [], []

for i in range(len(tokens) - context_size - 1):
    context = tokens[i:(i + context_size)]
    y = tokens[i + context_size + 1]
    X.append(vocab.lookup_indices(context))
    Y.append(vocab.lookup_indices([y])[0])

X = torch.tensor(X)
Y = torch.tensor(Y)

In [34]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([25628, 4]), torch.int64, torch.Size([25628]), torch.int64)

Ahora crearemos un embbeding aleatorio para cada palabra dentro del vocabulario. Estos embedding se irán actualizando con nuestra red neuronal

In [44]:
emb_dim = 10
# Vocab size + 1 is to take into account the default token index
C = torch.randn((vocab_size + 1, emb_dim))

In [45]:
C[5] #Embedding for token number 5

tensor([ 2.5381, -0.0291,  1.3334, -0.5388,  1.9669,  1.4280, -1.7370,  0.1167,
         0.2961, -0.8868])

In [46]:
emb = C[X] #For each token index in X get the corresponding embedding (Yeah, indexing in torch is that powerful)
emb.shape 

torch.Size([25628, 4, 10])

In [50]:
import torch.nn.functional as F

W1 = 

In [1]:
import torch
x = torch.randn(1, device='cuda')
x.numpy()

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

In [4]:
x = torch.randn(1, device='cpu')
x.cpu().numpy()

array([1.8363967], dtype=float32)

In [9]:
import torch.nn as nn

class SubModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.w1  = nn.Parameter(torch.randn(1, dtype=float))
        self.w2  = nn.Parameter(torch.randn(1, dtype=float))
    
class Model(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.linear = nn.Linear(1,1, bias=False)
        self.sub = SubModel()



len(list(Model().parameters()))

3

In [10]:
f = torch.randn((1, 3))
w = torch.randn(f.shape[1], 3)

torch.mm(f, w)

tensor([[ 0.7358, -0.2159,  3.8975]])

In [24]:
x = torch.ones((2,2))
y = x.view(-1, )
y[-1] = 0
print(x)

tensor([[1., 1.],
        [1., 0.]])
