# 🦕 Tokenizer

#### 📚 Libraries
Import libraries and configure the environment.

In [12]:
# NLP
import re
from unidecode import unidecode

# Data
import json
from datasets import load_dataset

# DL
import torch

#### 📂 Data
Load the data and take a look at the first few rows.

In [2]:
dataset = load_dataset("dewithsan/secop_corpus_clean")

In [3]:
corpus_df = dataset["train"].to_pandas()
corpus_df.head(5)

Unnamed: 0,id_doc,doc_text
0,266671326,SOLICITUD CERTIFICACIÓN DE \nINSUFICIENCIA ...
1,321522708,ADENDA Página 1 \n \n ADENDA No. 1 \n \nPe...
2,291869951,\n \n \n \nSISTEMA ESTRATÉGICO DE TRANSPORTE...
3,291901564,CERTIFICACION DE INSUFICIENCIA\nVIGENTE\nDESDE...
4,304566990,ANE XO Nro. 2 \nOBLIGACIONES DE LA POLICÍA NAC...


#### 🔑 Tokenizer
Tokenize the text data.

In [4]:
corpus_text = "\n".join(corpus_df["doc_text"])
print(f"Length of corpus: {len(corpus_text):,} characters")

Length of corpus: 133,947,131 characters


In [5]:
# Extra spaces
corpus_text_clean = corpus_text.replace("\n", " ").replace("\r", " ")
corpus_text_clean = re.sub(r" +", " ", corpus_text_clean)

# Normalization
pat = r'[^\w\s!"·$%&/()=?¿\\|@#+,\.-^\*;:_\[\]\{\} !¡¿?,\.@#$%^&\*]'
corpus_text_clean = re.sub(pat, "", corpus_text_clean)
corpus_text_clean = corpus_text_clean.lower()
corpus_text_clean = unidecode(corpus_text_clean)

In [6]:
chars = sorted(list(set(corpus_text_clean)))
print(f"Number of unique characters: {len(chars)}")
print("".join(chars))

Number of unique characters: 96
	
 !"#$%&'()*+,./0123456789:;<=>?@ABCDEFGHILMNOPRSTUVXYZ[\]^_abcdefghijklmnopqrstuvwxyz{|}


In [7]:
stoi = {char: i for i, char in enumerate(chars, start=1)}
itos = {i: char for i, char in enumerate(chars, start=1)}
itos[0] = "[UNK]"


def encode(text: str, stoi: dict = stoi) -> list:
    """Encode text to a list of integers."""
    return [stoi[char] if char in stoi else 0 for char in text]


def decode(integers: list, itos: dict = itos) -> str:
    """Decode list of integers to text."""
    return "".join([itos[i] for i in integers])

In [8]:
hey = "Hola, mundo"
print(encode(hey))
print(decode(encode(hey)))

[48, 82, 79, 68, 21, 9, 80, 88, 81, 71, 82]
Hola, mundo


In [9]:
data = torch.tensor(encode(corpus_text_clean), dtype=torch.long)

In [10]:
print(data.shape, data.dtype)
print(data[:100])

torch.Size([127545940]) torch.int64
tensor([ 9, 86, 82, 79, 76, 70, 76, 87, 88, 71,  9, 70, 72, 85, 87, 76, 73, 76,
        70, 68, 70, 76, 82, 81,  9, 71, 72,  9, 76, 81, 86, 88, 73, 76, 70, 76,
        72, 81, 70, 76, 68,  9, 82,  9, 76, 81, 72, 91, 76, 86, 87, 72, 81, 70,
        76, 68,  9, 71, 72,  9, 83, 72, 85, 86, 82, 81, 68, 79,  9, 70, 82, 71,
        76, 74, 82, 34,  9, 87, 68, 75, 73, 26, 30,  9, 89, 72, 85, 86, 76, 82,
        81, 34,  9, 25,  9, 22, 24,  9, 83, 68])


Train validation split.

In [11]:
train_size = int(len(data) * 0.90)
train_data = data[:train_size]
val_data = data[train_size:]

print(f"Train size: {train_data.shape[0]:,} characters")
print(f"Validation size: {val_data.shape[0]:,} characters")

Train size: 114,791,346 characters
Validation size: 12,754,594 characters


Save data to disc

In [14]:
torch.save(train_data, "data/train_data.pt")
torch.save(val_data, "data/val_data.pt")

In [15]:
# Save stoi and itos as json
encoder_dict = {"stoi": stoi, "itos": itos}
with open("data/encoder_dict.json", "w") as f:
    json.dump(encoder_dict, f)