In [17]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# the imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import torch
from torch.utils.data import Dataset,DataLoader
from pathlib import Path

from scripts.transliteration_tokenizers import create_source_target_tokenizers
from scripts.data_utils import TransliterationDataset,pad_collate
from scripts.models import Simple_seq2seq,Attention_seq2seq
#from scripts.attention_seq2seq import Attention_seq2seq
from scripts.train_utils import masked_loss, masked_accuracy

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# --- Directory layout, rooted at the kernel's current working directory ---
cur_dir = Path.cwd()
data_dir = cur_dir.joinpath("data")
raw_data_dir = data_dir.joinpath("raw_data")
proc_data_dir = data_dir.joinpath("processed_data")

# --- Files read from the raw data directory ---
sample_file = raw_data_dir.joinpath("sample.tsv")
dev_file = raw_data_dir.joinpath("te.translit.sampled.dev.tsv")
test_file = raw_data_dir.joinpath("te.translit.sampled.test.tsv")

# --- Files read from the processed data directory ---
train_file = proc_data_dir.joinpath("train_clean.tsv")

# One weighted / max / repeat file per split.
weighted_sample_file = proc_data_dir.joinpath("weighted_sample.tsv")
max_sample_file = proc_data_dir.joinpath("max_sample.tsv")
repeat_sample_file = proc_data_dir.joinpath("repeat_sample.tsv")

weighted_dev_file = proc_data_dir.joinpath("weighted_dev.tsv")
max_dev_file = proc_data_dir.joinpath("max_dev.tsv")
repeat_dev_file = proc_data_dir.joinpath("repeat_dev.tsv")

weighted_train_file = proc_data_dir.joinpath("weighted_train.tsv")
max_train_file = proc_data_dir.joinpath("max_train.tsv")
repeat_train_file = proc_data_dir.joinpath("repeat_train.tsv")

weighted_test_file = proc_data_dir.joinpath("weighted_test.tsv")
max_test_file = proc_data_dir.joinpath("max_test.tsv")
repeat_test_file = proc_data_dir.joinpath("repeat_test.tsv")

# Corpus text files handed to the tokenizer builder.
target_corpus_file = proc_data_dir.joinpath("target_corpus.txt")
source_corpus_file = proc_data_dir.joinpath("source_corpus.txt")

In [3]:
# Build the source/target tokenizers from the two corpus files. The two 128s are
# passed straight through to create_source_target_tokenizers (presumably the
# source and target vocabulary sizes -- TODO confirm against the helper).
source_tokenizer, target_tokenizer = create_source_target_tokenizers(source_corpus_file,target_corpus_file, 128,128)
# Padding id on the target side; passed to masked_loss / masked_accuracy later on.
pad_id = target_tokenizer.padding['pad_id']
# Seed PyTorch's RNG before constructing the model so that any randomly
# initialised weights -- and therefore the predictions and loss displayed
# further down -- are repeatable on a fresh Restart & Run All.
torch.manual_seed(0)
# The meaning of 64 and 128 is defined by Simple_seq2seq (not visible here).
model = Simple_seq2seq(64, 128,source_tokenizer, target_tokenizer)

In [4]:
# Wrap the "max" sample file in a dataset and batch it three rows at a time.
max_dataset = TransliterationDataset(
    max_sample_file,
    source_tokenizer,
    target_tokenizer,
)
max_sample_loader = DataLoader(
    max_dataset,
    batch_size=3,
    collate_fn=pad_collate,
    drop_last=False,
)

# Discard the first batch and keep the second one for inspection.
max_sample_iter = iter(max_sample_loader)
next(max_sample_iter)
max_batch = next(max_sample_iter)
max_batch

(tensor([[ 1, 31, 10, 55, 34, 54, 81,  2],
         [ 1, 31, 35, 32, 24,  2,  0,  0],
         [ 1, 31, 69, 32,  2,  0,  0,  0]]),
 tensor([[  1,   6,   4,  21,  55,  19,  44,  79,  92,   2],
         [  1,   6, 104,  86,   2,   0,   0,   0,   0,   0],
         [  1,   6, 104,   4,   2,   0,   0,   0,   0,   0]]),
 [8, 6, 5])

In [5]:
# Decode the first element of the batch with the source tokenizer; the
# split/join removes all whitespace from each decoded string.
[
    "".join(decoded.split())
    for decoded in source_tokenizer.decode_batch(max_batch[0].tolist())
]

['angikarinchaaru', 'anthamu', 'anatham']

In [6]:
# Decode the second element of the batch with the target tokenizer; the
# split/join removes all whitespace from each decoded string.
[
    "".join(decoded.split())
    for decoded in target_tokenizer.decode_batch(max_batch[1].tolist())
]

['అంగీకరించారు', 'అంతము', 'అంతం']

In [7]:
# Feed the whole batch tuple to the model, then report the shapes of the two
# id tensors, the number of entries in the third batch element, and the shape
# of the model output.
max_out = model(max_batch)
print(
    max_batch[0].shape,
    max_batch[1].shape,
    len(max_batch[2]),
    max_out.shape,
)

torch.Size([3, 8]) torch.Size([3, 10]) 3 torch.Size([3, 9, 128])


In [8]:
# Pick the arg-max id along the last axis of the model output, decode those ids
# with the target tokenizer, and remove all whitespace from each string.
[
    "".join(decoded.split())
    for decoded in target_tokenizer.decode_batch(max_out.argmax(dim=-1).tolist())
]

['ైద్ేనికిలిలిఫుంది', 'సంద్లికుగసుసుసుసు', 'ైద్లికుచేచేచేచేసు']

In [9]:
# Show (loss, accuracy) for this batch; pad_id is handed to both helpers.
(
    masked_loss(max_out, max_batch[1], pad_id),
    masked_accuracy(max_out, max_batch[1], pad_id),
)

(tensor(4.8207, grad_fn=<NllLossBackward>), tensor(0.))

In [10]:
# Rebuild the source/target tokenizers from the two corpus files so this cell
# does not depend on tokenizers created elsewhere. The two 128s are passed
# straight through to create_source_target_tokenizers (presumably the source
# and target vocabulary sizes -- TODO confirm against the helper).
source_tokenizer, target_tokenizer = create_source_target_tokenizers(source_corpus_file,target_corpus_file, 128,128)
# Padding id on the target side; passed to masked_loss / masked_accuracy later on.
pad_id = target_tokenizer.padding['pad_id']
# Seed PyTorch's RNG before constructing the model so that any randomly
# initialised weights -- and therefore the predictions and loss displayed
# further down -- are repeatable on a fresh Restart & Run All.
torch.manual_seed(0)
# The meaning of 64 and 128 is defined by Attention_seq2seq (not visible here).
model = Attention_seq2seq(64, 128,source_tokenizer, target_tokenizer)

In [11]:
# Wrap the "max" sample file in a dataset and batch it three rows at a time.
max_dataset = TransliterationDataset(
    max_sample_file,
    source_tokenizer,
    target_tokenizer,
)
max_sample_loader = DataLoader(
    max_dataset,
    batch_size=3,
    collate_fn=pad_collate,
    drop_last=False,
)

# Discard the first batch and keep the second one for inspection.
max_sample_iter = iter(max_sample_loader)
next(max_sample_iter)
max_batch = next(max_sample_iter)
max_batch

(tensor([[ 1, 31, 10, 55, 34, 54, 81,  2],
         [ 1, 31, 35, 32, 24,  2,  0,  0],
         [ 1, 31, 69, 32,  2,  0,  0,  0]]),
 tensor([[  1,   6,   4,  21,  55,  19,  44,  79,  92,   2],
         [  1,   6, 104,  86,   2,   0,   0,   0,   0,   0],
         [  1,   6, 104,   4,   2,   0,   0,   0,   0,   0]]),
 [8, 6, 5])

In [12]:
# Decode the first element of the batch with the source tokenizer; the
# split/join removes all whitespace from each decoded string.
[
    "".join(decoded.split())
    for decoded in source_tokenizer.decode_batch(max_batch[0].tolist())
]

['angikarinchaaru', 'anthamu', 'anatham']

In [13]:
# Decode the second element of the batch with the target tokenizer; the
# split/join removes all whitespace from each decoded string.
[
    "".join(decoded.split())
    for decoded in target_tokenizer.decode_batch(max_batch[1].tolist())
]

['అంగీకరించారు', 'అంతము', 'అంతం']

In [14]:
# Feed the whole batch tuple to the model, then report the shapes of the two
# id tensors, the number of entries in the third batch element, and the shape
# of the model output.
max_out = model(max_batch)
print(
    max_batch[0].shape,
    max_batch[1].shape,
    len(max_batch[2]),
    max_out.shape,
)

torch.Size([3, 8]) torch.Size([3, 10]) 3 torch.Size([3, 9, 128])


In [15]:
# Pick the arg-max id along the last axis of the model output, decode those ids
# with the target tokenizer, and remove all whitespace from each string.
[
    "".join(decoded.split())
    for decoded in target_tokenizer.decode_batch(max_out.argmax(dim=-1).tolist())
]

['ఊించఎఎఎడుఎఎ', 'ంపుంంండకాకాంప్', 'ాాంాాద్ంంండ']

In [16]:
# Show (loss, accuracy) for this batch; pad_id is handed to both helpers.
(
    masked_loss(max_out, max_batch[1], pad_id),
    masked_accuracy(max_out, max_batch[1], pad_id),
)

(tensor(4.8986, grad_fn=<NllLossBackward>), tensor(0.0588))