# model0
- following the tutorial found on: https://huggingface.co/blog/how-to-train



### preprocessing

In [14]:
# open file IMDB_dataset.csv and save it as txt file
import csv 
csv_file = 'IMDB_dataset.csv'
txt_file = 'IMDB_dataset.txt'

with open('data/IMDB_dataset.txt', 'w') as output_file:
    with open('data/IMDB_dataset.csv', 'r') as input_file:
        for line in input_file:
            [ output_file.write(" ".join(row)+'\n') for row in csv.reader(input_file)]


In [32]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path("./data/").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

# Save files to disk
tokenizer.save_model(".", "m0_models/imdb_tokenizer")






['./models/imdb_tokenizer-vocab.json', './models/imdb_tokenizer-merges.txt']

### 2. Train a tokenizer

In [41]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    "./m0_models/imdb_tokenizer-vocab.json",
    "./m0_models/imdb_tokenizer-merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(
    tokenizer.encode("Hi what are you doing?"), "\n",
    tokenizer.encode("Hi what are you doing?").tokens,
)
# Encoding(num_tokens=7, ...)
# tokens: ['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']


Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]) 
 ['<s>', 'Hi', 'Ġwhat', 'Ġare', 'Ġyou', 'Ġdoing', '?', '</s>']


### 3. Train a language model from scratch

In [46]:
from torch.utils.data import Dataset

class EsperantoDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./m0_models/imdb_tokenizer-vocab.json",
            "./m0_models/imdb_tokenizer-merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/").glob("*-eval.txt") if evaluate else Path("./data/").glob("*-train.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])


In [45]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./models/EsperBERTo-small",
    tokenizer="./models/EsperBERTo-small"
)

# The sun <mask>.
# =>

result = fill_mask("La suno <mask>.")

# {'score': 0.2526160776615143, 'sequence': '<s> La suno brilis.</s>', 'token': 10820}
# {'score': 0.0999930202960968, 'sequence': '<s> La suno lumis.</s>', 'token': 23833}
# {'score': 0.04382849484682083, 'sequence': '<s> La suno brilas.</s>', 'token': 15006}
# {'score': 0.026011141017079353, 'sequence': '<s> La suno falas.</s>', 'token': 7392}
# {'score': 0.016859788447618484, 'sequence': '<s> La suno pasis.</s>', 'token': 4552}


  from .autonotebook import tqdm as notebook_tqdm


OSError: Can't load the configuration of './models/EsperBERTo-small'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure './models/EsperBERTo-small' is the correct path to a directory containing a config.json file