In [1]:
import torch
from torch.utils.data import Dataset,DataLoader
from pathlib import Path
import pandas as pd

from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence,pad_packed_sequence
from scripts.transliteration_tokenizers import create_source_target_tokenizers
from scripts.data_utils import TransliterationDataset,pad_collate

# Choose the compute device once: CUDA when torch reports it as available, CPU otherwise.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
# Directory layout, anchored at the notebook's current working directory:
#   <cwd>/data/raw_data        and        <cwd>/data/processed_data
cur_dir = Path.cwd()
data_dir = cur_dir.joinpath("data")
raw_data_dir = data_dir.joinpath("raw_data")
proc_data_dir = data_dir.joinpath("processed_data")

In [3]:
# --- Unprocessed inputs -------------------------------------------------------
# Note: only the train file is read from the processed directory; the sample,
# dev and test files come from the raw directory.
sample_file = raw_data_dir.joinpath("sample.tsv")
train_file = proc_data_dir.joinpath("train_clean.tsv")
dev_file = raw_data_dir.joinpath("te.translit.sampled.dev.tsv")
test_file = raw_data_dir.joinpath("te.translit.sampled.test.tsv")

# --- Processed variants: one weighted / max / repeat file per split ------------
# Sample split
weighted_sample_file = proc_data_dir.joinpath("weighted_sample.tsv")
max_sample_file = proc_data_dir.joinpath("max_sample.tsv")
repeat_sample_file = proc_data_dir.joinpath("repeat_sample.tsv")

# Dev split
weighted_dev_file = proc_data_dir.joinpath("weighted_dev.tsv")
max_dev_file = proc_data_dir.joinpath("max_dev.tsv")
repeat_dev_file = proc_data_dir.joinpath("repeat_dev.tsv")

# Train split
weighted_train_file = proc_data_dir.joinpath("weighted_train.tsv")
max_train_file = proc_data_dir.joinpath("max_train.tsv")
repeat_train_file = proc_data_dir.joinpath("repeat_train.tsv")

# Test split
weighted_test_file = proc_data_dir.joinpath("weighted_test.tsv")
max_test_file = proc_data_dir.joinpath("max_test.tsv")
repeat_test_file = proc_data_dir.joinpath("repeat_test.tsv")

# --- Plain-text corpora passed to the tokenizer factory -------------------------
target_corpus_file = proc_data_dir.joinpath("target_corpus.txt")
source_corpus_file = proc_data_dir.joinpath("source_corpus.txt")

In [4]:
# Create the source/target tokenizer pair from the two corpus files.
# NOTE(review): the two trailing 128s are presumably the source and target
# vocabulary sizes — confirm against scripts.transliteration_tokenizers.
source_tokenizer, target_tokenizer = create_source_target_tokenizers(
    source_corpus_file,
    target_corpus_file,
    128,
    128,
)

In [5]:
# Dataset over the "max" sample file; display one item to inspect its
# (source ids, target ids) structure.
max_dataset = TransliterationDataset(
    max_sample_file,
    source_tokenizer,
    target_tokenizer,
)
max_dataset[9]

([1, 31, 10, 42, 14, 34, 54, 12, 51, 2],
 [1, 6, 4, 21, 55, 19, 44, 79, 71, 81, 2])

In [6]:
# Dataset over the "weighted" sample file; the displayed item carries a third,
# floating-point element in addition to the source and target ids.
weighted_dataset = TransliterationDataset(
    weighted_sample_file,
    source_tokenizer,
    target_tokenizer,
)
weighted_dataset[9]

([1, 31, 35, 32, 24, 2], [1, 6, 104, 86, 2], 0.6)

In [7]:
# Dataset over the "repeat" sample file; display one item to inspect its
# (source ids, target ids) structure.
repeat_dataset = TransliterationDataset(
    repeat_sample_file,
    source_tokenizer,
    target_tokenizer,
)
repeat_dataset[9]

([1, 31, 14, 32, 2], [1, 6, 4, 19, 4, 2])

In [8]:
# Map the target ids of example 7 back to tokens, discard the first and last
# token (the leading/trailing ids — presumably start/end markers), and join the
# remainder into a readable string.
''.join(list(map(target_tokenizer.id_to_token, max_dataset[7][1]))[1:-1])

'అంగీకరించడం'

In [9]:
# Batch the "max" dataset three items at a time, keeping a short final batch,
# and let pad_collate assemble each batch. Then pull and display the first batch.
max_sample_loader = DataLoader(
    dataset=max_dataset,
    batch_size=3,
    drop_last=False,
    collate_fn=pad_collate,
)
max_sample_iter = iter(max_sample_loader)
max_batch = next(max_sample_iter)
max_batch

(tensor([[  1, 127,  34, 111,   2],
         [  1,  31,  14,  32,   2],
         [  1,  31,  14,   4,   2]]),
 tensor([[ 1,  6,  4, 28, 93, 66,  2],
         [ 1,  6,  4, 19,  4,  2,  0],
         [ 1,  6,  4, 19,  2,  0,  0]]),
 [5, 5, 5])

In [10]:
# Batch the "weighted" dataset three items at a time, keeping a short final
# batch, and let pad_collate assemble each batch. Then pull and display the
# first batch (it includes a trailing tuple of per-item floats).
weighted_sample_loader = DataLoader(
    dataset=weighted_dataset,
    batch_size=3,
    drop_last=False,
    collate_fn=pad_collate,
)
weighted_sample_iter = iter(weighted_sample_loader)
weighted_batch = next(weighted_sample_iter)
weighted_batch

(tensor([[  1,  31,  14,   4,   2],
         [  1,  32,  14,   4,   2],
         [  1, 127,  32,   2,   0]]),
 tensor([[  1,   6,   4,  19,   2],
         [  1,   6,   4,  19,   2],
         [  1,   6, 104,   4,   2]]),
 [5, 5, 4],
 (0.33299999999999996, 0.25, 0.75))

In [11]:
# Batch the "repeat" dataset three items at a time, keeping a short final batch,
# and let pad_collate assemble each batch. Then pull and display the first batch.
repeat_sample_loader = DataLoader(
    dataset=repeat_dataset,
    batch_size=3,
    drop_last=False,
    collate_fn=pad_collate,
)
repeat_sample_iter = iter(repeat_sample_loader)
repeat_batch = next(repeat_sample_iter)
repeat_batch

(tensor([[  1, 127,  34, 111,   2],
         [  1,  32,  14,   4,   2],
         [  1, 127,  32,   2,   0]]),
 tensor([[  1,   6,   4,  28,  93,  66,   2],
         [  1,   6,   4,  19,   2,   0,   0],
         [  1,   6, 104,   4,   2,   0,   0]]),
 [5, 5, 4])