<a href="https://colab.research.google.com/github/gargi-57/Python_Notebooks/blob/main/Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import re

In [2]:
# Pick the compute device once: CUDA when it is usable, the CPU otherwise,
# and report which one was selected.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  print("GPU is available and being used")
else:
  print("GPU is not available, using CPU instead")

GPU is available and being used


In [3]:
# Load the training corpus as one lower-cased string.
filename="/content/gutenberg.txt"
# A context manager guarantees the file handle is closed once the text is read
# (the bare open(...).read() form leaves closing to the garbage collector).
with open(filename,'r',encoding='utf-8') as corpus_file:
  text = corpus_file.read()
text = text.lower()
# Remove every byte-order-mark character so it does not become a vocabulary entry.
text = re.sub(r'\ufeff','',text)

In [4]:
# Vocabulary: every distinct character of the corpus in sorted order, and a
# lookup table that maps each character to its position in that ordering.
chars = sorted(set(text))
chars_to_int = {ch: idx for idx, ch in enumerate(chars)}

In [5]:
# Corpus length (in characters) and vocabulary size (distinct characters).
n_chars, n_vocab = len(text), len(chars)
print("Total Characters:",n_chars)
print("Total Unique Characters i.e. Vocabulary:",n_vocab)

Total Characters: 412715
Total Unique Characters i.e. Vocabulary: 75


In [6]:
# Show the character-to-index mapping as this cell's output.
chars_to_int

{'\n': 0,
 ' ': 1,
 '!': 2,
 '#': 3,
 '$': 4,
 '%': 5,
 '&': 6,
 '(': 7,
 ')': 8,
 '*': 9,
 ',': 10,
 '-': 11,
 '.': 12,
 '/': 13,
 '0': 14,
 '1': 15,
 '2': 16,
 '3': 17,
 '4': 18,
 '5': 19,
 '6': 20,
 '7': 21,
 '8': 22,
 '9': 23,
 ':': 24,
 ';': 25,
 '=': 26,
 '?': 27,
 '[': 28,
 ']': 29,
 '_': 30,
 'a': 31,
 'b': 32,
 'c': 33,
 'd': 34,
 'e': 35,
 'f': 36,
 'g': 37,
 'h': 38,
 'i': 39,
 'j': 40,
 'k': 41,
 'l': 42,
 'm': 43,
 'n': 44,
 'o': 45,
 'p': 46,
 'q': 47,
 'r': 48,
 's': 49,
 't': 50,
 'u': 51,
 'v': 52,
 'w': 53,
 'x': 54,
 'y': 55,
 'z': 56,
 '£': 57,
 '°': 58,
 'à': 59,
 'â': 60,
 'æ': 61,
 'ç': 62,
 'è': 63,
 'é': 64,
 'ê': 65,
 'ô': 66,
 'ü': 67,
 '—': 68,
 '‘': 69,
 '’': 70,
 '“': 71,
 '”': 72,
 '•': 73,
 '™': 74}

In [7]:
# Illustrate one (input, target) pair: the first ten characters of the corpus
# and the single character that follows them.
seq_i, seq_o = text[:10], text[10]
print(seq_i, "____________________________________", seq_o, sep="\n")

the projec
____________________________________
t


In [8]:
#Dataset Preparation
# Each sample is a window of `seq_length` encoded characters (x) paired with
# the encoded character that immediately follows the window (y).
seq_length = 100

# Encode the corpus once; every window is then a plain slice of this list.
# (Looking each character up again for every window it appears in repeats the
# same dictionary lookup roughly `seq_length` times per character.)
encoded_text = [chars_to_int[char] for char in text]

x = [encoded_text[i:i+seq_length] for i in range(0,n_chars - seq_length,1)]
# Target for window i is the character at position i + seq_length.
y = encoded_text[seq_length:n_chars]
patterns = len(x)
print("Total Patterns for Sentence Generation:",patterns)

Total Patterns for Sentence Generation: 412615


In [10]:
# Targets stay as integer class indices.
data_y = torch.tensor(y)
# Inputs become a float tensor of shape (patterns, seq_length, 1), i.e. one
# feature per time step, with the character indices scaled by the vocabulary size.
data_X = torch.tensor(x,dtype=torch.float32).reshape(patterns,seq_length,1) / float(n_vocab)

In [19]:
units = 256
layers = 1
#Define Model
class Gen_Model(nn.Module):
  """Next-character predictor: LSTM -> dropout -> linear layer over the vocabulary.

  Args:
    hidden_size: LSTM hidden width; defaults to the notebook-level `units`.
    num_layers: number of stacked LSTM layers; defaults to the notebook-level `layers`.
    vocab_size: number of output classes; defaults to the notebook-level `n_vocab`.
    dropout: dropout probability applied before the output layer.

  The defaults are resolved when the model is constructed, so `Gen_Model()`
  keeps working exactly as before, while explicit arguments make the model
  independent of whatever the notebook globals currently hold.
  """
  def __init__(self, hidden_size=None, num_layers=None, vocab_size=None, dropout=0.2):
    super().__init__()
    hidden_size = units if hidden_size is None else hidden_size
    num_layers = layers if num_layers is None else num_layers
    vocab_size = n_vocab if vocab_size is None else vocab_size
    # input_size=1: each time step carries a single scalar (one encoded character).
    self.lstm = nn.LSTM(input_size=1,hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(hidden_size,vocab_size)
  def forward(self,x):
    """Map a (batch, seq_len, 1) tensor to (batch, vocab_size) scores for the next character."""
    x,_ = self.lstm(x)
    # Keep only the LSTM output at the last time step of every sequence.
    x = x[:,-1,:]
    x = self.linear(self.dropout(x))
    return x

In [20]:
# Training hyper-parameters.
n_epochs, batch_size = 5, 120
# Build the network and place its parameters on the selected device.
model = Gen_Model().to(device)

In [21]:
# Cross-entropy summed (not averaged) over the samples of a batch.
loss_fn = nn.CrossEntropyLoss(reduction="sum")
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())
# Shuffled mini-batches over the (input window, next character) pairs.
loader = data.DataLoader(
  data.TensorDataset(data_X,data_y),
  batch_size = batch_size,
  shuffle = True,
)

In [22]:
# Train for `n_epochs`, keep the weights of the epoch with the lowest total
# loss, and save them together with the character mapping.
best_model = None
best_loss = np.inf
for epoch in range(n_epochs):
  model.train()
  for X_batch,y_batch in loader:
    y_pred = model(X_batch.to(device))
    loss= loss_fn(y_pred,y_batch.to(device))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  #Validation
  # NOTE(review): this pass iterates the same `loader` used for training, so
  # the reported value is a training-set loss, not a held-out validation loss.
  model.eval()
  loss = 0.0
  with torch.no_grad():
    for X_batch,y_batch in loader:
      y_pred = model(X_batch.to(device))
      # .item() accumulates a plain Python float instead of a device tensor.
      loss += loss_fn(y_pred,y_batch.to(device)).item()

  if loss < best_loss:
    best_loss = loss
    # state_dict() hands back references to the live parameter tensors, which
    # later optimizer steps overwrite in place; clone them so the snapshot
    # really is the best epoch rather than whatever the model ends up as.
    best_model = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    print("Epoch %d: Cross entropy: %.4f" % (epoch,loss))
torch.save([best_model, chars_to_int],"single-char_prediction.pth")

Epoch 0: Cross entropy: 1137911.3750
Epoch 1: Cross entropy: 1102504.1250
Epoch 2: Cross entropy: 1079620.1250
Epoch 3: Cross entropy: 1049106.6250
Epoch 4: Cross entropy: 1024073.9375


In [23]:
# Draw a random window of `seq_length` characters from the corpus to use as
# the generation prompt, and encode it as a list of character indices.
seq_length = 100
start = np.random.randint(len(text)-seq_length)
prompt = text[start:start+seq_length]
pattern = list(map(chars_to_int.__getitem__, prompt))

In [24]:
# Restore the saved weights and the character mapping from the checkpoint.
# The mapping must be bound to `chars_to_int` (not a misspelled name) so the
# vocabulary size and the reverse lookup below are derived from the checkpoint
# rather than from whatever `chars_to_int` happens to be left in the kernel.
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime.
best_model,chars_to_int = torch.load("single-char_prediction.pth", map_location=device)
n_vocab = len(chars_to_int)
# Reverse lookup: index -> character, used to decode predictions.
int_to_char = {i: c for c, i in chars_to_int.items()}

In [25]:
# Copy the tensors held in `best_model` into the existing `model` instance.
model.load_state_dict(best_model)

<All keys matched successfully>

In [27]:
# Show the seed text that the generation loop will continue from.
print('Prompt:  "%s"' % prompt)

Prompt:  "dy and practical work. they looked
a fine body of men, and have been greatly appreciated by the miss"


In [28]:
# Greedy generation: repeatedly feed the current window to the model, emit the
# highest-scoring character, and slide the window forward by one position.
model.eval()
with torch.no_grad():
  for _ in range(1000):
    # Scale the window the same way as the training inputs and shape it as
    # a single batch of (1, window_length, 1).
    window = np.asarray(pattern).reshape(1, -1, 1) / float(n_vocab)
    window = torch.tensor(window, dtype=torch.float32)
    # Scores over the vocabulary for the next character.
    prediction = model(window.to(device))
    # Pick the arg-max class and decode it back to a character.
    index = prediction.argmax().item()
    print(int_to_char[index],end ="")
    # Drop the oldest character and append the new one for the next step.
    pattern = pattern[1:] + [index]

 of the cornent of the cornent oo the cornent of the cornent oo the coine tf tee the cart of the cornent oo the cornent of the cornent oo the cornent of the cornent oo the cornent of the coine tf tee the cart of the cornent oo the cornent of the cornent oo the cornent of the cornent oo the cornent of the coine tf tee the cart of the cornent oo the cornent of the cornent oo the cornent of the cornent oo the cornent of the coine tf tee the cart of the cornent oo the cornent of the cornent oo the cornent of the cornent oo the cornent of the coine tf tee the cart of the cornent oo the cornent of the cornent oo the cornent of the cornent oo the cornent of the coine tf tee the cart of the cornent oo the cornent of the cornent oo the cornent of the cornent oo the cornent of the coine tf tee the cart of the cornent oo the cornent of the cornent oo the cornent of the cornent oo the cornent of the coine tf tee the cart of the cornent oo the cornent of the cornent oo the cornent of the cornent oo