In [1]:
import torch
import os
import numpy as np
import torchvision as tv
import matplotlib.pyplot as plt
from torchvision import datasets, models, transforms
%matplotlib inline

In [2]:
# Colab-only step: attach Google Drive to the runtime filesystem at
# /content/drive. The dataset CSV read later in the notebook lives under
# this mount point, so this cell has to run before the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Location of the CSV on the mounted Drive. The file has no header row
# (header=None), so pandas labels the columns 0, 1, ...
dataset_path = '/content/drive/MyDrive/Colab Notebooks/dataset.csv'
dataset = pd.read_csv(dataset_path, header=None)

# Quick sanity check: overall size, then the first few rows.
print(dataset.shape, dataset.head(), sep='\n')

# Everything except the last column is the model input (kept as a DataFrame);
# the last column is the target (a Series).
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

(20000, 2)
                      0                     1
0    tkwgnzexww kfuaywb    qhtdkwbuttxhcryvtz
1      pcnaoqmoazhx jdz      m kylnjlyweuxgaw
2  ljavvykffjxochoobbev  igyssvhccgul ellzzbs
3     sjwldsdeeprqwmcjh     pgtiapabbmontj ge
4      bfmunydqia jfsym      zcjrkvanfyxgcpvj


In [4]:
def word_to_numbers(alphabet):
  """Build an encoder from symbols to their 1-based position in ``alphabet``.

  Index 0 is deliberately never assigned to a symbol, so it stays free for
  use as a padding/"no symbol" class.

  Args:
    alphabet: iterable of symbols (e.g. a string of characters).

  Returns:
    A function ``encode(s)`` that returns the list of indices for the
    symbols of ``s``.

  Raises:
    KeyError: (from the returned function) if ``s`` contains a symbol that
      is not part of ``alphabet``.
  """
  # Only real symbols belong in the table. The previous extra entry
  # ``mapper[0] = None`` made the encoder silently emit ``None`` for an input
  # element ``0`` instead of rejecting it.
  mapper = {symbol: position for position, symbol in enumerate(alphabet, start=1)}
  return lambda s: [mapper[l] for l in s]


def numbers_to_word(alphabet):
  """Build a decoder from 1-based indices back to the symbols of ``alphabet``.

  Args:
    alphabet: iterable of string symbols; symbol ``k`` (counting from 1) is
      produced for index ``k``.

  Returns:
    A function ``decode(n)`` that joins the symbols for the indices in ``n``
    into one string. Index 0, which no symbol uses, decodes to the empty
    string.

  Raises:
    KeyError: (from the returned function) for an index outside
      ``0..len(alphabet)``.
  """
  mapper = {position: symbol for position, symbol in enumerate(alphabet, start=1)}
  # Index 0 is the unused/padding class. The previous entry ``mapper[None] = 0``
  # was inverted: 0 could not be decoded at all, and the int value would have
  # broken ``str.join``.
  mapper[0] = ""
  return lambda n: "".join([mapper[i] for i in n])


In [5]:
import string

# Symbols the codec understands: 'a'..'z' followed by a single space.
alphabet = ''.join((string.ascii_lowercase, ' '))
to_num, to_word = word_to_numbers(alphabet), numbers_to_word(alphabet)

# Round-trip smoke test: encode a word and decode it straight back.
test = to_num("hello")
print(test, to_word(test))

[8, 5, 12, 12, 15] hello


In [7]:
# Encode every input / target string as index lists and keep only the first
# 10 symbols of each, so all rows end up with the same length.
X_ = list(map(lambda text: to_num(text)[:10], X[0].tolist()))
y_ = list(map(lambda text: to_num(text)[:10], y.tolist()))


In [8]:
from sklearn.model_selection import train_test_split

# A fixed random_state makes the 85/15 partition identical on every
# "Restart & Run All"; without it the reported accuracy is measured on a
# different test set each run.
X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, test_size=0.15, random_state=42
)

# Convert the index lists to int64 tensors (same dtype as the legacy
# torch.LongTensor constructor produced).
X_train = torch.tensor(X_train, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Sanity check: rows x sequence length for each split.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

torch.Size([17000, 10])
torch.Size([3000, 10])
torch.Size([17000, 10])
torch.Size([3000, 10])


In [17]:
class Network(torch.nn.Module):
    """Per-symbol sequence model: embedding -> single-layer RNN -> vocabulary logits.

    Args:
        vocab_size: number of distinct symbol indices; used both as the
            embedding table size and as the number of output classes.
        embed_dim: width of each symbol embedding.
        hidden_dim: width of the RNN hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, embed_dim)
        self.rnn = torch.nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, inp):
        """Compute logits for one encoded sequence.

        Args:
            inp: 1-D integer tensor of symbol indices, shape ``(seq_len,)``.

        Returns:
            Tensor of shape ``(seq_len, 1, vocab_size)`` holding the logits
            for every position.

        Raises:
            ValueError: if ``inp`` is not 1-D.
        """
        if inp.dim() != 1:
            raise ValueError(
                "Network.forward expects a 1-D tensor of symbol indices, "
                "got shape {}".format(tuple(inp.shape))
            )

        embedded = self.embed(inp)  # (seq_len, embed_dim)

        # The RNN is batch_first, i.e. it reads (batch, seq, feature). The batch
        # axis therefore has to go in FRONT: (1, seq_len, embed_dim). Inserting
        # it at position 1 instead turns every symbol into its own length-1
        # sequence, so the hidden state never flows from one symbol to the next.
        out, _ = self.rnn(embedded.unsqueeze(0))  # (1, seq_len, hidden_dim)

        # Callers index the result as (seq_len, 1, vocab_size); swap the axes
        # back so that layout is preserved.
        return self.linear(out.transpose(0, 1))


In [18]:
# One class / embedding row per alphabet symbol, plus index 0, which the
# 1-based symbol encoding never assigns.
vocab_size = 1 + len(alphabet)
embed_dim, hidden_dim = 28, 128

model = Network(vocab_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim)

In [19]:
# Per-position classification loss over the vocabulary logits.
loss_fn = torch.nn.CrossEntropyLoss()
# Adam over every parameter of the model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [24]:
# ---- Training: one pass over the training set, one sequence per step ----
# train_loss / train_passed are running totals; their ratio is the mean loss
# per sequence and can be inspected after the cell has run.
train_loss = 0.
train_passed = 0
model.train()
for X_batch, y_batch in zip(X_train, y_train):
  optimizer.zero_grad()
  # Call the module itself rather than .forward() so registered hooks run.
  answers = model(X_batch)            # (seq_len, 1, vocab)
  # CrossEntropyLoss wants the class axis second: (seq_len, vocab, 1) logits
  # against (seq_len, 1) targets.
  answers = answers.transpose(1, 2)
  y_batch = y_batch.unsqueeze(1)
  loss = loss_fn(answers, y_batch)
  train_loss += loss.item()
  loss.backward()
  optimizer.step()
  train_passed += 1

# ---- Evaluation: per-symbol accuracy on the held-out sequences ----
model.eval()
matches, total = 0, 0
# No gradients are needed for scoring; skipping graph construction saves
# time and memory.
with torch.no_grad():
  for X_batch, original in zip(X_test, y_test):
    answers = model(X_batch)
    # argmax over the logits picks the same class as argmax over their
    # softmax, so the softmax is not computed.
    batch_out = answers.argmax(dim=2).squeeze(1)
    matches += torch.eq(batch_out, original).sum().item()
    total += torch.numel(batch_out)

accuracy = matches / total
print('Accuracy: {:4.2f}%'.format(accuracy * 100))


Accuracy: 100.00%


In [25]:
# Decode a hand-made control string that was not taken from the dataset.
control = "khoorczruog"
x_ = torch.LongTensor(to_num(control))

# Set inference mode explicitly so this cell does not depend on what an
# earlier cell left the model in, and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
  answers = model(x_)                 # (seq_len, 1, vocab)

# argmax over the logits equals argmax over their softmax.
batch_out = answers.argmax(dim=2).squeeze(1)
print(to_word(batch_out.tolist()))

hello world
