<a href="https://colab.research.google.com/github/ejdogar/NLP/blob/main/Text_Generation_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import torch
import torch.nn as nn
import keras

In [4]:
docs = ["Hey how are you",
        "Hey whats up",
        "Pakistan lost the match",
        "it is raining",
        "Charles Leclerc will be the F1 champion of 2024",
        "Ronaldo is the best football player",
        "Schumacher was the best driver ever",
        "Lewis Hamilton is a cheat",
        "Sebastian Vettel is struggling"
        ]

In [5]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

In [6]:
tokenizer.fit_on_texts(docs)

In [7]:
sequences = tokenizer.texts_to_sequences(docs)

In [8]:
print(sequences)

[[3, 5, 6, 7], [3, 8, 9], [10, 11, 1, 12], [13, 2, 14], [15, 16, 17, 18, 1, 19, 20, 21, 22], [23, 2, 1, 4, 24, 25], [26, 27, 1, 4, 28, 29], [30, 31, 2, 32, 33], [34, 35, 2, 36]]


In [9]:
word2int = tokenizer.word_index
print(word2int)
vocab_size = len(word2int)
vocab_size

{'the': 1, 'is': 2, 'hey': 3, 'best': 4, 'how': 5, 'are': 6, 'you': 7, 'whats': 8, 'up': 9, 'pakistan': 10, 'lost': 11, 'match': 12, 'it': 13, 'raining': 14, 'charles': 15, 'leclerc': 16, 'will': 17, 'be': 18, 'f1': 19, 'champion': 20, 'of': 21, '2024': 22, 'ronaldo': 23, 'football': 24, 'player': 25, 'schumacher': 26, 'was': 27, 'driver': 28, 'ever': 29, 'lewis': 30, 'hamilton': 31, 'a': 32, 'cheat': 33, 'sebastian': 34, 'vettel': 35, 'struggling': 36}


36

In [10]:
int2word = {idx+1: word for idx, word in enumerate(word2int)}

In [11]:
from keras.utils import pad_sequences
sequences = pad_sequences(sequences, padding="post")
sequences

array([[ 3,  5,  6,  7,  0,  0,  0,  0,  0],
       [ 3,  8,  9,  0,  0,  0,  0,  0,  0],
       [10, 11,  1, 12,  0,  0,  0,  0,  0],
       [13,  2, 14,  0,  0,  0,  0,  0,  0],
       [15, 16, 17, 18,  1, 19, 20, 21, 22],
       [23,  2,  1,  4, 24, 25,  0,  0,  0],
       [26, 27,  1,  4, 28, 29,  0,  0,  0],
       [30, 31,  2, 32, 33,  0,  0,  0,  0],
       [34, 35,  2, 36,  0,  0,  0,  0,  0]], dtype=int32)

In [12]:
maxlength = len(sequences[0])
maxlength

9

In [13]:
input_seq = []
target_seq = []
for i in range(len(sequences)):
  input_seq.append(sequences[i][:-1])
  target_seq.append(sequences[i][1:])

print(f"{input_seq[0]=}")
print(f"{target_seq[0]=}")

input_seq[0]=array([3, 5, 6, 7, 0, 0, 0, 0], dtype=int32)
target_seq[0]=array([5, 6, 7, 0, 0, 0, 0, 0], dtype=int32)


In [14]:
seq_len = maxlength - 1
def one_hot_encoder(sequence, batch_size, seq_len, vocab_size):
  feature = np.zeros((batch_size, seq_len, vocab_size+1), dtype = np.float32)

  for i in range(batch_size):
    for k in range(seq_len):
      feature[i, k, sequence[i][k]] = 1

  return feature

In [15]:
batch_size = len(docs)
input_seq = one_hot_encoder(input_seq, batch_size, seq_len, vocab_size)

In [16]:
input_seq = torch.from_numpy(input_seq)
target_seq = torch.Tensor(target_seq)

  target_seq = torch.Tensor(target_seq)


In [17]:
is_cuda = torch.cuda.is_available()

if is_cuda:
  device = torch.device("cuda")
  print("GPU is available")
else:
  device = torch.device("cpu")
  print("GPU not available, running on CPU...")

GPU not available, running on CPU...


In [18]:
class Model(nn.Module):
  def __init__(self, input_size, output_size, hidden_dim, n_layers):
    super(Model, self).__init__()

    self.hidden_dim = hidden_dim
    self.n_layers = n_layers

    self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first = True)
    self.fc = nn.Linear(hidden_dim, output_size)


  def forward(self, x):
    batch_size = x.size(0)

    hidden = self.init_hidden(batch_size)

    out, hidden = self.rnn(x, hidden)

    out = out.contiguous().view(-1, self.hidden_dim)

    out = self.fc(out)

    return out, hidden


  def init_hidden(self, batch_size):
    hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
    return hidden

In [19]:
model = Model(input_size = vocab_size+1, output_size = vocab_size+1, hidden_dim = 8, n_layers = 1)
model = model.to(device)

In [20]:
n_epochs = 7000
lr = 0.0001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [21]:
input_seq = input_seq.to(device)
print(f"{input_seq.shape=}")
for epoch in range(1, n_epochs+1):
  optimizer.zero_grad()

  out, hidden = model(input_seq)
  out.to(device)
  target_seq.to(device)
  loss = criterion(out, target_seq.view(-1).long())
  loss.backward()
  optimizer.step()

  if epoch%50  == 0:
    print('Epoch: {}/{}......'.format(epoch, n_epochs), end=' ')
    print("Loss : {:.4f}".format(loss.item()))

input_seq.shape=torch.Size([9, 8, 37])
Epoch: 50/7000...... Loss : 3.5674
Epoch: 100/7000...... Loss : 3.5278
Epoch: 150/7000...... Loss : 3.4836
Epoch: 200/7000...... Loss : 3.4345
Epoch: 250/7000...... Loss : 3.3807
Epoch: 300/7000...... Loss : 3.3225
Epoch: 350/7000...... Loss : 3.2608
Epoch: 400/7000...... Loss : 3.1965
Epoch: 450/7000...... Loss : 3.1306
Epoch: 500/7000...... Loss : 3.0642
Epoch: 550/7000...... Loss : 2.9981
Epoch: 600/7000...... Loss : 2.9332
Epoch: 650/7000...... Loss : 2.8701
Epoch: 700/7000...... Loss : 2.8095
Epoch: 750/7000...... Loss : 2.7515
Epoch: 800/7000...... Loss : 2.6966
Epoch: 850/7000...... Loss : 2.6449
Epoch: 900/7000...... Loss : 2.5963
Epoch: 950/7000...... Loss : 2.5508
Epoch: 1000/7000...... Loss : 2.5083
Epoch: 1050/7000...... Loss : 2.4686
Epoch: 1100/7000...... Loss : 2.4315
Epoch: 1150/7000...... Loss : 2.3969
Epoch: 1200/7000...... Loss : 2.3644
Epoch: 1250/7000...... Loss : 2.3339
Epoch: 1300/7000...... Loss : 2.3051
Epoch: 1350/7000...

In [22]:
def predict(model, words):
  sentence = np.array([[word2int[word] for word in words]])
  sentence = one_hot_encoder(sentence, 1, sentence.shape[1], vocab_size)
  sentence = torch.from_numpy(sentence)
  sentence = sentence.to(device)

  out, hidden = model(sentence)

  prob = nn.functional.softmax(out[-1], dim=0).data
  word_ind = torch.max(prob, dim=0)[1].item()

  return int2word[word_ind+1], hidden

In [23]:
def sample(model, out_size, start = "hey"):
  model.eval()

  start = start.lower()

  start = start.split()

  words = [word for word in start]

  size = out_size - len(words)

  print("===============",size)
  for i in range(size):
    o, h = predict(model, words)
    words.append(o)

  return " ".join(words)

In [25]:
sample(model, 7, "charles")



'charles will f1 is is is is'