In [19]:
import re

In [5]:
import datasets

In [93]:
from tqdm import tqdm

In [7]:
ds_train = datasets.load_from_disk("./train.hf")

In [10]:
ds_train

Dataset({
    features: ['text'],
    num_rows: 989529
})

In [139]:
ds_train["text"][:10]

['Hmm. ###>Hmm. _',
 'Mundurlah, bajingan. ###>Back the fuck up!',
 'Rebecca... Pikirkanlah baik-baik. ###>Rebecca have a good think about it.',
 "Kau salah. ###>You're wrong.",
 'Brook, apa kau ada perkataan untuk dirimu? ###>Brook, what do you have to say for yourself?',
 'Katakan. ###>Tell me.',
 "Kita mulai dengan cambuk pendek. ###>We'll start with the riding crop.",
 'Oh, aku tahu itu. ###>Oh, I know it.',
 "Tidak bisa . ###>I... can't.",
 'Dan apa yg dia selipkan di bawah meja membuktikan kekuatan bahwa itu benar. ###>And what he slid under the table convinced the powers that be it was true.']

In [None]:
tokens_to_find = [
    "##Feel",
    "@",
]
for sentence in ds_train["text"]:
    for tk in tokens_to_find:
        if tk in sentence:
            print(sentence)

In [138]:
vocab = {"id": set(), "eng": set()}

In [142]:
regex_pattern = re.compile(r'^[a-zA-Z0-9\s.,?!;:\'\"-]+$')
def is_valid_sample(s):
    return bool(regex_pattern.match(s))

In [140]:
clean_data = [sentence for sentence in ds_train["text"] if is_valid_sample(sentence)]

In [141]:
len(clean_data)

0

In [108]:
# tokenizer 

def tokenize(s):
    id, eng = [*s.split("###>")]
    #print("id:", id)
    #print("eng:", eng)
    
    ret = {"id": list(), "eng": list()}
    regex = r'(\s|,|!|\?|\.|:|;|\'|\"|“|”|‘|’|\(|\)|\[|\]|\{|\})'
    
    # id
    tks = re.split(regex, id.strip()) # split with various symbols (excluding hyphens)
    for tk in tks:
        if tk.replace(' ', '') != '':
            ret["id"].append(tk)
    # eng
    tks = re.split(regex, eng.strip())
    for tk in tks:
        if tk.replace(' ', '') != '':
            ret["eng"].append(tk)
    
    return ret

test = "Brook, apa kau ada perkataan untuk dirimu? ###>Brook, what do you have to say for yourself?"
tokenize(test)

{'id': ['Brook',
  ',',
  'apa',
  'kau',
  'ada',
  'perkataan',
  'untuk',
  'dirimu',
  '?'],
 'eng': ['Brook',
  ',',
  'what',
  'do',
  'you',
  'have',
  'to',
  'say',
  'for',
  'yourself',
  '?']}

In [116]:
# test the tokenizer

test_sentences = [
    "Selamat pagi! Apa kabar? #Tanya###>Good morning! How are you? #Question",
    "Saya suka bermain sepak bola, terutama di akhir pekan. @Sport###>I love playing soccer, especially on weekends. @Football",
    "Kita harus pergi ke pasar sekarang! #Urgent###>We have to go to the market now! #Important",
    "Buku ini sangat menarik, dan saya tidak bisa berhenti membacanya. %Menarik###>This book is very interesting, and I can't stop reading it. %Interesting",
    "Apa pendapatmu tentang film itu? *Pikirkan###>What do you think about that movie? *Think about it",
    "Hati-hati! Jalan licin, terutama saat hujan. #HatiHati###>Be careful! The road is slippery, especially when it rains. #Caution",
    "Dia berkata, 'Saya akan datang besok.' #Janji###>He said, 'I will come tomorrow.' #Promise",
    "Saya tidak suka makanan pedas; lebih baik yang manis. #Pilih###>I don't like spicy food; I prefer sweet. #Choice",
    "Mengapa kamu terlambat? Apakah ada masalah? @Tanya###>Why are you late? Is there a problem? @Question",
    "Bisa kita bicara nanti? Saya sibuk saat ini. @Sibuk###>Can we talk later? I'm busy right now. @Busy",
    "Cinta itu aneh, bukan? Terkadang, sulit untuk dipahami. *Cinta###>Love is strange, isn't it? Sometimes, it's hard to understand. *Love",
    "Dia membeli apel, jeruk, dan pisang. %Belanja###>She bought apples, oranges, and bananas. %Shopping",
    "Wow! Itu luar biasa, bukan? #Wow###>Wow! That's amazing, isn't it? #Amazing",
    "Kamu tidak bisa pergi begitu saja; kita harus membahas ini. @Bahas###>You can't just leave; we need to discuss this. @Discuss",
    "Setiap orang berhak atas pendapatnya, bukan? *Pendapat###>Everyone has the right to their opinion, right? *Opinion"
]

for test in test_sentences:
    out = tokenize(test)
    print(out)
    print()

{'id': ['Selamat', 'pagi', '!', 'Apa', 'kabar', '?', '#Tanya'], 'eng': ['Good', 'morning', '!', 'How', 'are', 'you', '?', '#Question']}

{'id': ['Saya', 'suka', 'bermain', 'sepak', 'bola', ',', 'terutama', 'di', 'akhir', 'pekan', '.', '@Sport'], 'eng': ['I', 'love', 'playing', 'soccer', ',', 'especially', 'on', 'weekends', '.', '@Football']}

{'id': ['Kita', 'harus', 'pergi', 'ke', 'pasar', 'sekarang', '!', '#Urgent'], 'eng': ['We', 'have', 'to', 'go', 'to', 'the', 'market', 'now', '!', '#Important']}

{'id': ['Buku', 'ini', 'sangat', 'menarik', ',', 'dan', 'saya', 'tidak', 'bisa', 'berhenti', 'membacanya', '.', '%Menarik'], 'eng': ['This', 'book', 'is', 'very', 'interesting', ',', 'and', 'I', 'can', "'", 't', 'stop', 'reading', 'it', '.', '%Interesting']}

{'id': ['Apa', 'pendapatmu', 'tentang', 'film', 'itu', '?', '*Pikirkan'], 'eng': ['What', 'do', 'you', 'think', 'about', 'that', 'movie', '?', '*Think', 'about', 'it']}

{'id': ['Hati-hati', '!', 'Jalan', 'licin', ',', 'terutama', '

In [None]:
for x in tqdm(ds_train["text"]):
    tks = tokenize(x)
    for t in tks["id"]:
        vocab["id"].add(t)
    for t in tks["eng"]:
        if "#" in t:
            print(t)
        vocab["eng"].add(t)

In [99]:
vocab["id"] = sorted(set(vocab["id"]))
vocab["eng"] = sorted(set(vocab["eng"]))

In [None]:
vocab["eng"][:50]

In [189]:
ds_clean = Dataset.load_from_disk("./train_clean.hf")

In [190]:
ds_clean[0]

{'text': 'Mundurlah, bajingan. ###>Back the fuck up!'}

In [197]:
ds_enc = Dataset.load_from_disk("./train_encoded.hf")["text"]

In [198]:
ds_enc[:5]

[{'eng': [7321, 100075, 73516, 103053, 0], 'id': [44829, 5, 73544, 1892]},
 {'eng': [41906, 75949, 56394, 74605, 100306, 56538, 79193, 1569],
  'id': [54130, 1892, 1892, 1892, 51455, 73486, 1892]},
 {'eng': [55770, 2, 91096, 105604, 1569], 'id': [32653, 122144, 1892]},
 {'eng': [9833,
   5,
   104739,
   68663,
   105834,
   75949,
   100823,
   93840,
   72869,
   105904,
   3516],
  'id': [12396, 5, 72306, 93431, 70724, 117645, 132271, 84233, 3694]},
 {'eng': [49941, 82806, 1569], 'id': [32561, 1892]}]

In [169]:
from utils.coder import Coder

In [173]:
coder_id = Coder(vocab["id"])
coder_eng = Coder(vocab["eng"])

In [178]:
coder_eng.decode([0])

['!']

In [183]:
for x in ds_enc[:20]["text"]:
    id, eng = x["id"], x["eng"]

    print(id, eng)

[44829, 5, 73544, 1892] [7321, 100075, 73516, 103053, 0]
[54130, 1892, 1892, 1892, 51455, 73486, 1892] [41906, 75949, 56394, 74605, 100306, 56538, 79193, 1569]
[32653, 122144, 1892] [55770, 2, 91096, 105604, 1569]
[12396, 5, 72306, 93431, 70724, 117645, 132271, 84233, 3694] [9833, 5, 104739, 68663, 105834, 75949, 100823, 93840, 72869, 105904, 3516]
[32561, 1892] [49941, 82806, 1569]
[34662, 111279, 81411, 78894, 115673, 1892] [54100, 2, 81375, 97587, 105243, 100075, 92820, 66254, 1569]
[47044, 5, 71276, 127642, 91428, 1892] [37108, 5, 25228, 80157, 79193, 1569]
[64681, 77539, 1892] [25228, 1569, 1569, 1569, 62503, 2, 99256, 1569]
[17059, 72306, 133818, 81865, 123862, 81770, 74154, 102266, 103708, 94786, 73452, 91428, 74643, 1892] [5452, 104739, 76000, 96133, 102415, 100075, 99294, 65481, 100075, 89336, 100035, 59849, 79193, 104253, 101748, 1569]
[52023, 0] [25099, 0]
[68634, 5, 133346, 5, 133346, 1892] [54555, 5, 104938, 5, 104938, 1569]
[26980, 91428, 1892] [14320, 79193, 86715, 1569]

In [185]:
for x in ds_enc[:20]["text"]:
    id, eng = x["id"], x["eng"]
    

    print(coder_id.decode(id), coder_eng.decode(eng))



['Mundurlah', ',', 'bajingan', '.'] ['Back', 'the', 'fuck', 'up', '!']
['Rebecca', '.', '.', '.', 'Pikirkanlah', 'baik-baik', '.'] ['Rebecca', 'have', 'a', 'good', 'think', 'about', 'it', '.']
['Kau', 'salah', '.'] ['You', "'", 're', 'wrong', '.']
['Brook', ',', 'apa', 'kau', 'ada', 'perkataan', 'untuk', 'dirimu', '?'] ['Brook', ',', 'what', 'do', 'you', 'have', 'to', 'say', 'for', 'yourself', '?']
['Katakan', '.'] ['Tell', 'me', '.']
['Kita', 'mulai', 'dengan', 'cambuk', 'pendek', '.'] ['We', "'", 'll', 'start', 'with', 'the', 'riding', 'crop', '.']
['Oh', ',', 'aku', 'tahu', 'itu', '.'] ['Oh', ',', 'I', 'know', 'it', '.']
['Tidak', 'bisa', '.'] ['I', '.', '.', '.', 'can', "'", 't', '.']
['Dan', 'apa', 'yg', 'dia', 'selipkan', 'di', 'bawah', 'meja', 'membuktikan', 'kekuatan', 'bahwa', 'itu', 'benar', '.'] ['And', 'what', 'he', 'slid', 'under', 'the', 'table', 'convinced', 'the', 'powers', 'that', 'be', 'it', 'was', 'true', '.']
['Pondok', '!'] ['Hut', '!']
['Whoa', ',', 'whoa', ',', '

In [221]:
coder_id.decode([0]), coder_eng.decode([0])

(['!'], ['!'])

In [191]:
for i, x in enumerate(ds_enc[:20]["text"]):
    id, eng = x["id"], x["eng"]

    print(id == coder_id.encode(coder_id.decode(id)),
          id == ds_clean["text"]["id"][i]
         )


    print(eng == coder_eng.encode(coder_eng.decode(eng)))


TypeError: list indices must be integers or slices, not str

In [211]:
from dataloader import ENGtoID
import torch
import torchvision

In [212]:
from torchvision.transforms import ToTensor

In [213]:
dataset = ENGtoID(valid=False, transform=ToTensor())

In [214]:
dataset[i]

KeyError: 'eng'

In [218]:
from torch.utils.data import Dataset as torchDataset

from datasets import Dataset as dsDataset

class ENGtoID(torchDataset):
    def __init__(self, valid=False, transform=None):

        self.transform = transform

        dataset_path = "./valid_encoded.hf" if valid else "./train_encoded.hf"
        self.data = dsDataset.load_from_disk(dataset_path)["text"]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        x, y = self.data[i]["eng"], self.data[i]["id"]
        if self.transform:
            x = self.transform(x)
        return x, y

dataset = ENGtoID(valid=False, transform=ToTensor())

dataset[0]

TypeError: pic should be PIL Image or ndarray. Got <class 'list'>

In [224]:
z = [5, 6, 7, 8]
torch.tensor(z), torch.Tensor(z)

(tensor([5, 6, 7, 8]), tensor([5., 6., 7., 8.]))