# Simple RNN ML Model

## Tokenizer - Character based tokenization
## Encoding - One-Hot


In [14]:
from datasets import load_dataset
from transformers import AutoTokenizer
from torch.utils import data

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def encode(examples):
    return tokenizer(examples["text"], truncation=True, max_length=20, padding="max_length")

dataset = load_dataset('wikitext', 'wikitext-2-v1')['train']
dataset = dataset.filter(lambda x: len(x) != 0, input_columns='text') \
                    .map(encode, batched=True)


dataset.set_format("torch", ["input_ids", "token_type_ids", "attention_mask"])
dataloader = data.DataLoader(dataset, batch_size=30)
print(len(dataset))
print(dataset[0])

                    
for batch in iter(dataloader):
    X = batch['input_ids']
    print(tokenizer.decode(X[0]))

Found cached dataset wikitext (/home/fritzprix/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/fritzprix/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-edadd597d2b188b4.arrow


  0%|          | 0/24 [00:00<?, ?ba/s]

23767
{'input_ids': tensor([  101,  1027, 11748,  4801,  4360, 11906,  3523,  1027,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}
[CLS] = valkyria chronicles iii = [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] two manga adaptations were produced, following each of the game's main female protagonists imca [SEP]
[CLS] most of the equipment, arms, and machinery at the little rock arsenal was removed to east [SEP]
[CLS] in 1873, the building was renamed little rock barracks and used as a barracks for married officers [SEP]
[CLS] barker continued to attend evening classes at the croydon art school between the 1920s and the 1940s, [SEP]
[CLS] child < unk > in picture and verse ( by m. k. < unk [SEP]
[CLS] flower fairies of the seasons ; < u

In [1]:
from model import WikiDataset

data = WikiDataset(n_steps=10)


Found cached dataset wikitext (/home/fritzprix/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/fritzprix/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-bdf99227b8fab1b7.arrow
Loading cached processed dataset at /home/fritzprix/.cache/huggingface/datasets/wikitext/wikitext-2-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126/cache-270be629ba8ebf39.arrow


{'ø', ':', 'ต', 'ô', 'ه', 'z', '¥', 'ä', '戦', 'გ', '⅓', 'ị', 'M', '8', 'ë', 'ს', '7', '$', 'ė', 'G', ']', '+', 'O', 'ń', 'с', 'ิ', '‑', 'プ', 'Á', 'n', '・', '×', 'ç', 'Z', '³', 'ṣ', 'ュ', 'ơ', '⅔', 'ầ', '6', 'ễ', 'b', 'š', '@', 'L', '4', 'ั', '\n', 'E', 'ī', 'Ö', '`', "'", '#', 'С', '5', 'ö', 'ร', '€', '¡', '2', 'A', 'ย', 'ص', 'Î', '〈', 'ʿ', '.', 'Y', 'ن', 'ヴ', '☉', '§', '3', '჻', 'r', 'É', 'T', 'о', 'キ', 'κ', 'в', 'Ā', 'ト', 'е', 'â', '/', 'B', '₤', 'å', 'ó', 'č', '9', 'ú', '°', 'µ', 'W', '殻', 'é', '→', '≤', 'ł', '½', 'k', 'Þ', 'ś', 'ṯ', 'ل', 'ス', 'x', 'ม', 'ă', 'J', '=', '…', 'à', 'ệ', 'ắ', 'ū', '’', 'т', 'წ', 'ล', '\ufeff', 'ვ', 'ê', 'ჯ', 'β', 'Í', 'ų', 'ã', 'ო', 't', '′', '\\', '%', 'û', '動', 'y', 'w', 'ō', '⁄', 'o', '1', 'я', 'ძ', 'C', 'ṃ', 'ì', 'ā', 'რ', '±', '[', '&', 'Q', '—', 'q', ';', '–', 'Æ', 'ا', '£', 'დ', 'ỳ', 'ṅ', '†', '火', '|', 'P', 'ก', 'p', 'ử', '>', 'g', 'I', 'v', 'u', 'ッ', '″', 'í', 'X', '攻', 'F', 'D', 'а', 'm', '"', 'H', '～', '♯', 'ư', 'ḥ', 'Ł', 'ấ', '機', 'j', '♭', 'l

In [2]:
data[0]

NameError: name 'data' is not defined