In [1]:
import os
import math
import pandas as pd
import random

# Directory that holds the wordlist_<lang>.txt inputs and the cached .pkl samples.
data_folder = "data"
# Shared seed: given to random.seed() and used as random_state for DataFrame.sample().
seed = 2023

In [2]:
# One wordlist path per language code: <data_folder>/wordlist_<code>.txt
language_codes = ("de", "en", "fr", "it", "pl", "sv")
wordlist_files = [f"{data_folder}/wordlist_{code}.txt" for code in language_codes]

In [3]:
# Read every wordlist into its own DataFrame (one word per line) and tag each
# row with the language code parsed from the filename.
wordlist_dfs = []
for filename in wordlist_files:
    # Decode explicitly instead of relying on the platform default encoding,
    # which differs between machines.
    # NOTE(review): assumes the wordlist files are UTF-8 encoded - confirm.
    with open(filename, "r", encoding="utf-8") as f:
        words = f.read().splitlines()
    # The file is no longer needed once the lines are in memory.
    words_df = pd.DataFrame({"word": words})
    # The two characters after the last "_" are the language code,
    # e.g. ".../wordlist_de.txt" -> "de".
    words_df["language"] = filename.split("_")[-1][0:2]
    wordlist_dfs.append(words_df)

In [4]:
# Stack the per-language frames into one corpus, renumbering rows from 0.
corpus_df = pd.concat(wordlist_dfs, ignore_index=True)

In [5]:
# Preview the combined corpus (word + language columns); the notebook renders a truncated view.
corpus_df

Unnamed: 0,word,language
0,a,de
1,aal,de
2,aale,de
3,aalen,de
4,aalend,de
...,...,...
3922530,zons,sv
3922531,zoo,sv
3922532,zoologisk,sv
3922533,zoologiska,sv


In [6]:
# Keep only entries longer than one character (this also drops empty strings).
# The vectorised .str.len() avoids calling a Python lambda once per row.
corpus_df = corpus_df[corpus_df["word"].str.len() > 1].reset_index(drop=True)

In [7]:
# Number of English rows currently in the corpus.
corpus_df.loc[corpus_df["language"] == "en"].shape[0]

80615

In [8]:
# Obtain the training data for the language model: a random sample of all
# English words except a fixed-size hold-out that is not sampled. The result is
# cached as a pickle so later runs load the same sample instead of re-drawing it.
english_train_pickle_file = f"{data_folder}/english_train.pkl"
if os.path.isfile(english_train_pickle_file):
    print(f"loading in english_train from {english_train_pickle_file}")
    # NOTE: the cached file is used as-is - delete it after changing `seed` or
    # the wordlists, otherwise a stale sample is loaded.
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    english_train = pd.read_pickle(english_train_pickle_file)
else:
    print(f"creating english_train dataframe with seed {seed}")
    # Seed the `random` module; the pandas sample below is made reproducible
    # through its own random_state argument.
    random.seed(seed)

    # Filter the English rows once and reuse them for the count and the sample.
    english_words = corpus_df[corpus_df["language"] == "en"]
    n_holdout = 10000  # English words that are NOT sampled for training
    n_words = len(english_words) - n_holdout
    if n_words <= 0:
        # Fail with a clear message instead of a negative/empty sample size.
        raise ValueError(
            f"need more than {n_holdout} english words to build a training set, "
            f"found {len(english_words)}"
        )

    # sample english words from the corpus
    english_train = english_words.sample(n_words, random_state=seed)
    english_train = english_train.reset_index(drop=True)

    print(f"saving english_train to {english_train_pickle_file}")
    # save data for later
    english_train.to_pickle(english_train_pickle_file)

loading in english_train from data/english_train.pkl


In [9]:
# Preview the English training sample (loaded from the pickle cache or freshly drawn above).
english_train

Unnamed: 0,word,language
0,knots,en
1,stalemating,en
2,whoops,en
3,implantation,en
4,levers,en
...,...,...
70610,forcefulness,en
70611,fat,en
70612,creakier,en
70613,ramming,en


In [10]:
# Exclude every row whose word appears in the language-model training sample,
# renumber the remaining rows and show how many words are left per language.
# NOTE(review): the match is on the word text only, so rows of other languages
# with the same spelling are excluded as well - confirm this is intended.
cond = corpus_df["word"].isin(english_train["word"])
corpus_df = corpus_df.loc[~cond].reset_index(drop=True)
corpus_df["language"].value_counts()

language
it    1860115
pl    1513862
fr     190643
de     185253
sv      74944
en      10000
Name: count, dtype: int64

In [11]:
# Obtain the sample of words for the anomaly detection task: all English words
# still in the corpus plus up to `n_remaining` non-English words, split evenly
# across the non-English languages. The result is cached as a pickle.
corpus_sample_pickle_file = f"{data_folder}/corpus_sample.pkl"
if os.path.isfile(corpus_sample_pickle_file):
    print(f"loading in corpus_sample_df from {corpus_sample_pickle_file}")
    # NOTE: the cached file is used as-is - delete it after changing `seed` or
    # the corpus, otherwise a stale sample is loaded.
    # NOTE: unpickling can execute arbitrary code; only load cache files that
    # this notebook wrote itself.
    corpus_sample_df = pd.read_pickle(corpus_sample_pickle_file)
else:
    print(f"creating corpus_sample_df dataframe with seed {seed}")
    # Seed the `random` module; the pandas samples below are made reproducible
    # through their own random_state argument.
    random.seed(seed)
    # Expected number of English words. It is not enforced: every English row
    # still in corpus_df is taken below.
    n_english = 10000
    n_remaining = 10000
    # sampling non-english words
    languages = corpus_df["language"].unique()
    # Divide by the languages that are actually sampled instead of assuming
    # that "en" is always one of `languages`.
    non_english_languages = [lang for lang in languages if lang != "en"]
    if not non_english_languages:
        raise ValueError("corpus_df contains no non-english words to sample from")
    # Integer division: if n_remaining is not evenly divisible, slightly fewer
    # than n_remaining words are drawn in total.
    words_per_language = n_remaining // len(non_english_languages)
    non_english_df = pd.concat(
        [
            corpus_df[corpus_df["language"] == lang].sample(
                words_per_language, random_state=seed
            )
            for lang in non_english_languages
        ]
    )
    # take the remaining english words
    english_df = corpus_df[corpus_df["language"] == "en"]
    corpus_sample_df = pd.concat([non_english_df, english_df]).reset_index(drop=True)

    print(f"saving corpus_sample_df to {corpus_sample_pickle_file}")
    # save data for later
    corpus_sample_df.to_pickle(corpus_sample_pickle_file)

loading in corpus_sample_df from data/corpus_sample.pkl


In [12]:
# Preview the anomaly-detection sample (non-English rows first, English rows last).
corpus_sample_df

Unnamed: 0,word,language
0,abblendet,de
1,bestechendes,de
2,narrensicheren,de
3,inakzeptable,de
4,abbestelle,de
...,...,...
19995,zillion,en
19996,zincked,en
19997,zines,en
19998,zingers,en


In [13]:
# Rows per language in the final sample, largest group first.
corpus_sample_df["language"].value_counts(sort=True, ascending=False)

language
en    10000
de     2000
fr     2000
it     2000
pl     2000
sv     2000
Name: count, dtype: int64