This notebook takes a text input and intersperses randomly selected junk tokens throughout it. The junk is inserted at randomly chosen boundaries between words.

There are two junk regimes to choose from. In the first regime (`junk_type="pseudo"`), the junk is randomly chosen from a set of fake Latin/Greek-sounding pseudowords.

In the second regime (`junk_type="mash"`), the junk is chosen from "complete junk": random strings of characters, essentially "keyboard mashing". These are random letters, numbers, underscores, hyphens, etc., all mixed together in a string.

In [None]:
# Mount Google Drive at /content/drive (Colab-only import); the CSV paths used
# further down in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

In [None]:

# Junk vocabularies.
#
# Each vocabulary is written as a set literal (so accidental duplicates collapse)
# and then SORTED before being frozen into a tuple. Sorting matters: the iteration
# order of a set of strings depends on Python's per-process string hash
# randomization, so a bare ``tuple({...})`` would map the same random index to a
# different token after every kernel restart and make seeded runs irreproducible.

# "Keyboard mash" tokens: letters, digits, underscores and hyphens mixed together.
JUNK_TOKENS_NUMERIC_MASH = tuple(sorted({
    "123", "456", "789", "101112", "333",
    "elt", "aaaa", "bbbbb",
    "yzyzyzyz", "gorggorggorg", "99zzyyaa",
    "qxjx92a__pllqqq", "55m-ghq", "aaslkfd",
    "qmplxz", "338", "qvnnxa",
    "zlrp33", "mash_09", "rxxjpl", "qlqllqq",
    "prax_7", "qxjx92a", "pllqqq",
    "aaslkfdjdfj", "tmpshd", "nseg",
    "retetetet", "yuiyuiyui", "ebg", "38u7zz",
    "zz__aa", "vm00p", "rlrllrllr",
    "zzqqmm22", "vvrraaxx", "nno0ppll",
    "brrtzz44", "klxklxklx", "mprqzz09",
    "xxyyzz8899", "qqqqpplm", "zzrraaqp",
    "zmgxpl01", "gxpqrrn", "lmqqzz3",
    "arx9z9z9", "zzoomm11", "jhqwxp77",
    "fzznmm22", "xxlwwkq3", "pzxq88lm",
    "zzzzjqpa", "mqx9nnq", "xplmrr00",
    "kqqz0z0z", "rrxq99aa", "pllxxwq",
    "dqzmmf91", "rnggll22", "wqxpzzl",
    "zlkqp55", "mmzqrr8", "vxvxvz22",
    "plmxy7q", "qzzoxxk", "rqm99zzy",
    "lzzppq14", "xgqr00qs", "mzznplw4",
    "znqx88aa", "qqzmrr02", "pplqz1z1"
}))


# Fake Latin/Greek-sounding pseudowords.
JUNK_TOKENS_PSEUDO = tuple(sorted({
    "amet",
    "consectetur", "adipiscing", "thalion", "meridos", "kalethon", "doryne",
    "pelaxis", "morathos", "serion", "valethis", "threnon", "kaesira",
    "lorinon", "barathor", "melidon", "tarikos", "pharion", "silethra",
    "orinthos", "varenes", "kalithra", "morinae", "lycrate", "helinon",
    "dramis", "solithon", "perakis", "minorae", "tarinon", "silathos",
    "verion", "korathe", "periostra", "kaligenis", "dorathium", "seraphele",
    "thalionis", "varenthos", "melidonae", "lorinthum", "xenathra",
    "phorinos", "veracalix", "mithareon", "torivalis", "dorithra",
    "luminarae", "heliostrum", "parathion", "sarimora", "kalithium",
    "morathion",
    "thalimoros",
    "xenarithos",
    "moradrix",
    "kalitheron",
    "perionax",
    "dorimethis",
    "varalithos",
    "serinthae",
    "melaxiora",
    "phorandis",
    "loratheon",
    "kaliserum",
    "verathrix",
    "moradion",
    "dorexila",
    "sariphon",
    "thalexior",
    "minarethus",
    "helidrax",
    "torinathae",

}))

# Real Greek letter names plus "lorem"/"ipsum". Not referenced by add_junk in the
# cells visible in this notebook.
JUNK_TOKENS_LATIN_GREEK = tuple(sorted({"alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
    "sigma", "pi", "psi", "phi", "kappa", "omega", "lorem", "ipsum"}))

# Unseeded generator that add_junk falls back to when no rng is passed; pass a
# seeded generator explicitly whenever the result has to be reproducible.
_DEFAULT_RNG = np.random.default_rng()


In [None]:
# Sanity check: report how many tokens each junk vocabulary holds.
for vocabulary in (JUNK_TOKENS_NUMERIC_MASH, JUNK_TOKENS_PSEUDO):
    print(len(vocabulary))

73
73


In [None]:
def add_junk(
    text,
    junk_type = "mash",
    p = 0.25,
    rng = None,
    min_junk = 1,
    max_junk = 3,
):
    """
    Insert random junk tokens between the words of ``text``.

    ``text`` is split on whitespace. Each of the ``n - 1`` boundaries between
    consecutive words is selected independently with probability ``p``; every
    selected boundary receives between ``min_junk`` and ``max_junk`` (inclusive)
    tokens drawn uniformly, with replacement, from the chosen vocabulary.

    Parameters
    ----------
    text : str
        Input text. Empty text is returned unchanged.
    junk_type : str
        ``"mash"`` (JUNK_TOKENS_NUMERIC_MASH) or ``"pseudo"``
        (JUNK_TOKENS_PSEUDO). Case and surrounding whitespace are ignored.
    p : float
        Per-boundary insertion probability. ``p <= 0`` disables insertion.
    rng : numpy.random.Generator, optional
        Source of randomness; the module-level ``_DEFAULT_RNG`` is used when
        omitted. Pass a seeded generator for reproducible output.
    min_junk, max_junk : int
        Inclusive bounds on the number of junk tokens per selected boundary.

    Returns
    -------
    str
        The original ``text`` object, untouched, when nothing is inserted
        (empty text, fewer than two words, ``p <= 0``, or no boundary selected).
        Otherwise the words and junk tokens joined by single spaces, so runs of
        whitespace in the input are collapsed in that case.

    Raises
    ------
    ValueError
        If ``junk_type`` is not recognised, or if the junk bounds are not
        ``0 <= min_junk <= max_junk``.
    """
    jt = junk_type.strip().lower()
    # Explicit exceptions instead of `assert`: assertions are stripped under
    # `python -O`, which would let an unknown type silently fall back to "pseudo".
    if jt == "mash":
        junk_tokens = JUNK_TOKENS_NUMERIC_MASH
    elif jt == "pseudo":
        junk_tokens = JUNK_TOKENS_PSEUDO
    else:
        raise ValueError(
            f"junk_type must be 'mash' or 'pseudo', got {junk_type!r}"
        )

    # Negative or inverted bounds would otherwise surface later as an unrelated
    # IndexError/TypeError while assembling the output.
    if min_junk < 0 or max_junk < min_junk:
        raise ValueError(
            f"need 0 <= min_junk <= max_junk, got min_junk={min_junk}, max_junk={max_junk}"
        )

    if not text:
        return text

    words = text.split()
    # n words have n - 1 boundaries between them
    n_boundaries = len(words) - 1

    if n_boundaries < 1 or p <= 0.0:
        return text

    if rng is None:
        rng = _DEFAULT_RNG

    # NOTE: the three draws below (mask, counts, token indices) are made in this
    # fixed order so that a given seeded generator always yields the same result.
    boundary_mask = rng.random(n_boundaries) < p
    n_insert_boundaries = int(boundary_mask.sum())

    if n_insert_boundaries == 0:
        return text

    # for each selected boundary, decide how many junk tokens to insert
    counts_per_boundary = rng.integers(
        min_junk, max_junk + 1, size=n_insert_boundaries
    )
    total_junk = int(counts_per_boundary.sum())

    # one vocabulary index per junk token, consumed left to right
    junk_indices = rng.integers(0, len(junk_tokens), size=total_junk).tolist()

    counts = iter(counts_per_boundary.tolist())
    new_words = []
    junk_pos = 0  # next unread position in junk_indices

    for i, word in enumerate(words):
        new_words.append(word)

        # the last word has no boundary after it
        if i < n_boundaries and boundary_mask[i]:
            n_here = next(counts)
            new_words.extend(
                junk_tokens[idx]
                for idx in junk_indices[junk_pos:junk_pos + n_here]
            )
            junk_pos += n_here

    return " ".join(new_words)


In [None]:
# Demo: "pseudo" junk, 5 to 10 tokens at each selected boundary, each boundary
# selected with probability 0.2. No rng is passed, so the shared default generator is used.
add_junk(
    "this is my favourite game of all time and i miss it dearly i made most of my best friends playing this game",
    junk_type="pseudo",
    p=0.2,
    min_junk=5,
    max_junk=10,
)

'this is my favourite game of all seraphele morathion morinae varenes perakis minorae time and i miss it dearly i made most of my best kaesira meridos morathos helinon heliostrum torivalis luminarae melidon dorathium melidonae friends playing this game'

In [None]:
# Load GoEmotions-test.csv from the mounted Drive.
df = pd.read_csv("/content/drive/MyDrive/GoEmotions-test.csv")

In [None]:
# Fixed 10-row sample (random_state=42). reset_index() renumbers the rows from 0
# and keeps the previous row labels as an extra column.
df_small = (
    df
    .sample(n=10, random_state=42)
    .reset_index()
)

In [None]:
# Add default ("mash") junk to the small sample at p = 0.5, from a fixed seed.
rng = np.random.default_rng(2025)

df_small["NoisyText"] = [
    add_junk(text, rng=rng, p=0.50) for text in df_small["Text"].astype(str)
]


In [None]:
# Show the noised text of the row labelled 3.
print(df_small.loc[3, "NoisyText"])

best ebg sub ever


In [None]:
%%time
# Four "mash" noise levels (default junk_type). Each variant uses its own
# generator seeded with 2025.
rng1, rng2, rng3, rng4 = (np.random.default_rng(2025) for _ in range(4))

# p = 0.10, at most 1 junk token per selected boundary
df_noise_10_1 = df.assign(
    NoisyText=[
        add_junk(text, p=0.10, max_junk=1, rng=rng1)
        for text in df["Text"].astype(str)
    ]
)

# p = 0.20, at most 2 junk tokens per selected boundary
df_noise_20_2 = df.assign(
    NoisyText=[
        add_junk(text, p=0.20, max_junk=2, rng=rng2)
        for text in df["Text"].astype(str)
    ]
)

# p = 0.30, at most 2 junk tokens per selected boundary
df_noise_30_2 = df.assign(
    NoisyText=[
        add_junk(text, p=0.30, max_junk=2, rng=rng3)
        for text in df["Text"].astype(str)
    ]
)

# p = 0.40, at most 3 junk tokens per selected boundary
df_noise_40_3 = df.assign(
    NoisyText=[
        add_junk(text, p=0.40, max_junk=3, rng=rng4)
        for text in df["Text"].astype(str)
    ]
)


CPU times: user 118 ms, sys: 0 ns, total: 118 ms
Wall time: 125 ms


In [None]:
%%time
# Four "pseudo" noise levels, mirroring the mash settings. Each variant uses its
# own generator seeded with 2025.
rng5, rng6, rng7, rng8 = (np.random.default_rng(2025) for _ in range(4))

# p = 0.10, at most 1 junk token per selected boundary
df_noise_pseudo_10_1 = df.assign(
    NoisyText=[
        add_junk(text, junk_type="pseudo", p=0.10, max_junk=1, rng=rng5)
        for text in df["Text"].astype(str)
    ]
)

# p = 0.20, at most 2 junk tokens per selected boundary
df_noise_pseudo_20_2 = df.assign(
    NoisyText=[
        add_junk(text, junk_type="pseudo", p=0.20, max_junk=2, rng=rng6)
        for text in df["Text"].astype(str)
    ]
)

# p = 0.30, at most 2 junk tokens per selected boundary
df_noise_pseudo_30_2 = df.assign(
    NoisyText=[
        add_junk(text, junk_type="pseudo", p=0.30, max_junk=2, rng=rng7)
        for text in df["Text"].astype(str)
    ]
)

# p = 0.40, at most 3 junk tokens per selected boundary
df_noise_pseudo_40_3 = df.assign(
    NoisyText=[
        add_junk(text, junk_type="pseudo", p=0.40, max_junk=3, rng=rng8)
        for text in df["Text"].astype(str)
    ]
)

CPU times: user 77.6 ms, sys: 3.16 ms, total: 80.8 ms
Wall time: 79.2 ms


In [None]:
%%time
# Two extra "pseudo" variants with at most one junk token per selected boundary
# (p = 0.05 and p = 0.2). Each variant uses its own generator seeded with 2025.
rng9 = np.random.default_rng(2025)
rng10 = np.random.default_rng(2025)

df_noise_pseudo_5_1 = df.copy()
df_noise_pseudo_5_1["NoisyText"] = [
    add_junk(t, p=0.05, max_junk=1, junk_type = "pseudo", rng=rng9)
    for t in df_noise_pseudo_5_1["Text"].astype(str).values
]

df_noise_pseudo_20_1 = df.copy()
df_noise_pseudo_20_1["NoisyText"] = [
    add_junk(t, p=0.2, max_junk=1, junk_type = "pseudo", rng=rng10)
    # Read from this frame's own "Text" column; the earlier version iterated
    # df_noise_pseudo_5_1 here (copy-paste slip).
    for t in df_noise_pseudo_20_1["Text"].astype(str).values
]



CPU times: user 53.7 ms, sys: 5.45 ms, total: 59.1 ms
Wall time: 90.2 ms


In [None]:
%%time
# Two extra "mash" variants with at most one junk token per selected boundary
# (p = 0.05 and p = 0.2). Each variant uses its own generator seeded with 2025.
rng11 = np.random.default_rng(2025)
rng12 = np.random.default_rng(2025)

df_noise_mash_5_1 = df.copy()
df_noise_mash_5_1["NoisyText"] = [
    add_junk(t, p=0.05, max_junk=1, junk_type = "mash", rng=rng11)
    for t in df_noise_mash_5_1["Text"].astype(str).values
]

df_noise_mash_20_1 = df.copy()
df_noise_mash_20_1["NoisyText"] = [
    add_junk(t, p=0.2, max_junk=1, junk_type = "mash", rng=rng12)
    # Read from this frame's own "Text" column; the earlier version iterated
    # df_noise_mash_5_1 here (copy-paste slip).
    for t in df_noise_mash_20_1["Text"].astype(str).values
]

CPU times: user 32.7 ms, sys: 0 ns, total: 32.7 ms
Wall time: 33.4 ms


In [None]:
# Write the four "mash" variants to Drive, one CSV per noise level.
for frame, out_path in (
    (df_noise_10_1, '/content/drive/MyDrive/Colab_data/test_noise_mash_10_1.csv'),
    (df_noise_20_2, '/content/drive/MyDrive/Colab_data/test_noise_mash_20_2.csv'),
    (df_noise_30_2, '/content/drive/MyDrive/Colab_data/test_noise_mash_30_2.csv'),
    (df_noise_40_3, '/content/drive/MyDrive/Colab_data/test_noise_mash_40_3.csv'),
):
    frame.to_csv(out_path)

In [None]:
# Write the four "pseudo" variants to Drive, one CSV per noise level.
for frame, out_path in (
    (df_noise_pseudo_10_1, '/content/drive/MyDrive/Colab_data/test_noise_pseudo_10_1.csv'),
    (df_noise_pseudo_20_2, '/content/drive/MyDrive/Colab_data/test_noise_pseudo_20_2.csv'),
    (df_noise_pseudo_30_2, '/content/drive/MyDrive/Colab_data/test_noise_pseudo_30_2.csv'),
    (df_noise_pseudo_40_3, '/content/drive/MyDrive/Colab_data/test_noise_pseudo_40_3.csv'),
):
    frame.to_csv(out_path)

In [None]:
# Write the two single-token "pseudo" variants to Drive.
for frame, out_path in (
    (df_noise_pseudo_5_1, '/content/drive/MyDrive/Colab_data/test_noise_pseudo_5_1.csv'),
    (df_noise_pseudo_20_1, '/content/drive/MyDrive/Colab_data/test_noise_pseudo_20_1.csv'),
):
    frame.to_csv(out_path)

In [None]:
# Write the two single-token "mash" variants to Drive.
for frame, out_path in (
    (df_noise_mash_5_1, '/content/drive/MyDrive/Colab_data/test_noise_mash_5_1.csv'),
    (df_noise_mash_20_1, '/content/drive/MyDrive/Colab_data/test_noise_mash_20_1.csv'),
):
    frame.to_csv(out_path)