# Dataset Generation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import logging
import os
import random
import sys
import warnings

import numpy as np
import orjson
import torch
from tqdm import tqdm

ROOT_FOLDER = os.path.join(".", "..")
if ROOT_FOLDER not in sys.path:
    sys.path.insert(0, ROOT_FOLDER)


# from environment import Environment, EnvSettings

warnings.filterwarnings("ignore")
logging.disable(logging.WARNING)
torch.backends.cudnn.deterministic = True
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

## Utils

In [3]:
def set_seed(seed: int = 420):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)


def save_json(path: str, data: dict):
    with open(path, "wb") as f:
        f.write(
            orjson.dumps(
                data,
                option=orjson.OPT_SORT_KEYS
                + orjson.OPT_SERIALIZE_NUMPY
                + orjson.OPT_INDENT_2,
            )
        )

## 1. Single number

In [4]:
def generate_number_digit(
    n: int = 500,
    min_length: int = 3,
    max_length: int = 10,
    max_number_length: int = 3,
    seed: int = 420,
) -> list[str]:
    letters = "abcdefghijklmnopqrstuvwxyz"
    symbols = "!@$%^&*()_+№;:?*()-=|\\/`~/., "
    chars = symbols + letters + letters.upper()
    digits = "0123456789"

    results = []
    set_seed(seed)
    for _ in tqdm(range(n)):
        lst = random.choices(chars, k=random.randint(min_length, max_length))
        number_length = random.randint(1, max_number_length)

        insert_idx = random.randint(0, len(lst) - number_length)

        lst[insert_idx : insert_idx + number_length] = random.choices(
            digits, k=number_length
        )

        results.append("".join(lst))

    return results


single_number_data = generate_number_digit()

100%|██████████| 500/500 [00:00<00:00, 515777.67it/s]


In [5]:
single_number_data[:10]

['039',
 'G19',
 'MAYk79l',
 '525',
 'Ef@654*',
 'Wvh28',
 'klTMK9',
 'e(|LT075№%',
 'H(-F=u148T',
 '.~v№oB41']

In [6]:
save_json(
    "../../data/single_number.json", {"regex": r"\d+", "examples": single_number_data}
)

## 2. Word from subset of symbols

In [7]:
def generate_word(
    n: int = 500,
    min_length: int = 6,
    max_length: int = 20,
    min_word_length: int = 3,
    max_word_length: int = 6,
    seed: int = 420,
) -> list[str]:
    word_letters = "cat"

    letters = "bdefghijklmnopqrsuvwxyz"
    symbols = "!@$%^&*()_+№;:?*()-=|\\/`~/.,"
    digits = "0123456789"
    non_letters = symbols + digits
    chars = non_letters + letters + letters.upper()
    chars += " " * (len(chars) // 4)  # make space more frequent

    results = []
    set_seed(seed)
    for _ in tqdm(range(n)):
        lst = random.choices(chars, k=random.randint(min_length, max_length))
        word_length = random.randint(min_word_length, max_word_length)

        insert_idx = random.randint(0, len(lst) - word_length)

        word = random.choices(word_letters, k=word_length)
        if insert_idx != 0:
            word.insert(0, " ")
            insert_idx -= 1

        if insert_idx + word_length < len(lst):
            word.append(" ")

        lst[insert_idx : insert_idx + word_length] = word

        results.append("".join(lst))

    return results


word_data = generate_word()

100%|██████████| 500/500 [00:00<00:00, 270356.07it/s]


In [8]:
word_data[:10]

[' tctcc №',
 '0 7 M p cca fF',
 'r ttacaa $M|Y=KLxo:^w',
 'u tccta K h\\.  7',
 ' ccca \\~U',
 ' =m 8SYL:gr ttctc ;o',
 ' aacta O) Q8',
 ' k i  EU7Y.O aaac vvr',
 '  5% ttaaat %',
 'ctcaat 8 O0knF']

In [9]:
save_json("../../data/word.json", {"regex": r"[cat]+", "examples": word_data})

## 3. Simple email

In [10]:
def generate_email(
    n: int = 500,
    min_body_length: int = 4,
    max_body_length: int = 8,
    min_domain_main_length: int = 3,
    max_domain_main_length: int = 4,
    min_domain_sub_length: int = 2,
    max_domain_sub_length: int = 5,
    max_length: int = 35,
    seed: int = 420,
) -> list[str]:
    letters = "abcdefghijklmnopqrstuvwxyz"
    letters += letters.upper()
    symbols = "!@$%^&*()_+№;:?*()-=|\\/`~/.,"
    digits = "0123456789"
    non_letters = symbols + digits
    chars = non_letters + letters
    chars += " " * (len(chars) // 4)  # make space more frequent

    results = []
    set_seed(seed)
    for _ in tqdm(range(n)):
        body_length = random.randint(min_body_length, max_body_length)
        domain_main_length = random.randint(
            min_domain_main_length, max_domain_main_length
        )
        domain_sub_length = random.randint(min_domain_sub_length, max_domain_sub_length)

        email_length = body_length + domain_main_length + domain_sub_length + 2
        lst = random.choices(chars, k=random.randint(email_length, max_length))

        insert_idx = random.randint(0, len(lst) - email_length)

        body = random.choices(letters, k=body_length)
        domain_main = random.choices(letters, k=domain_main_length)
        domain_sub = random.choices(letters, k=domain_sub_length)
        email = body + ["@"] + domain_main + ["."] + domain_sub
        if insert_idx != 0:
            email.insert(0, " ")
            insert_idx -= 1

        if insert_idx + len(email) < len(lst):
            email.append(" ")

        lst[insert_idx : insert_idx + len(email)] = email

        results.append("".join(lst))

    return results


email_data = generate_email()

100%|██████████| 500/500 [00:00<00:00, 135195.46it/s]


In [11]:
email_data[:10]

['+KJ%8 $W?9 CXSk@hhao.gKaY ',
 'Kxp?&xm Po!Kq HfODwcmd@CHK.oeluP',
 ' bSYK?h hfQhgnC@YiW.fcNZS',
 'z*   TAnhR@buVO.rIr v!8 C  ',
 '*Sa N MSxsucV@IOf.SSBmi vB.wek-x,q',
 'ZZSf@UJUT.sU   iHjD E(~h',
 '8~  HXG; eAhi@bEqn.Gd  9',
 'ireEQMm@JBEt.BiHi  ',
 '% oyKV@UWr.VxCM 0   ys-` :Ya P ,7+',
 'I1 LeRwjIu@iIO.djw KL;']

In [12]:
save_json("../../data/email.json", {"regex": r"\w+@\w+\.\w+", "examples": email_data})