In [1]:
import pandas as pd
import random


# The model currently reads the first (1 - test_percent) fraction of the data.csv file for training and the
# remaining test_percent fraction for testing. The current data.csv files are shuffled, and each word is
# repeated according to its frequency in the text, so the test portion at the end may contain words that
# also appear in the training portion. This function takes an existing data.csv file as input and splits it
# into a training and test set, ensuring that the words in the test set are not in the training set.
def create_traintest_split(
    path_to_data: str, test_percent: float = 0.05, random_seed: int = None
) -> tuple:
    """
    Build a dataset whose trailing test portion shares no words with its
    training portion.

    The CSV is read as a single headerless column of words. A random sample
    of the *unique* words is held out for testing; every occurrence of the
    remaining words forms the (shuffled) training portion. The held-out
    words are then appended to the end of the training portion, once each
    (they are not repeated according to their frequency).

    Args:
        path_to_data (str): Path to the CSV file containing the dataset,
            one word per row and no header row.
        test_percent (float): Fraction (between 0 and 1) of the unique words
            to hold out for testing.
        random_seed (int, optional): Seed for the global ``random`` module
            generator. Any integer, including 0, makes the result
            reproducible. When None the generator is left as it is.

    Returns:
        Tuple[List[str], float]: The new dataset (shuffled training words
        followed by the test words) and the fraction of that dataset that
        is training data.

    Raises:
        ValueError: If ``test_percent`` is outside [0, 1] or the file
            contains no words.
    """
    if not 0 <= test_percent <= 1:
        raise ValueError(
            f"test_percent must be between 0 and 1, got {test_percent!r}"
        )

    # Compare against None explicitly: a plain truthiness check would treat a
    # seed of 0 as "no seed" and silently give non-reproducible output.
    if random_seed is not None:
        random.seed(random_seed)

    # Read every cell as a literal string. With pandas' default NA handling,
    # real words such as "nan", "null" or "NA" are parsed as missing values
    # and then dropped by value_counts().
    word_counts = pd.read_csv(
        path_to_data,
        header=None,
        names=["word"],
        dtype=str,
        keep_default_na=False,
    ).word.value_counts()

    if word_counts.empty:
        raise ValueError(f"No words found in {path_to_data!r}")

    # Hold out a random sample of the unique words for testing
    test_size = int(len(word_counts) * test_percent)
    test_words = random.sample(list(word_counts.index), k=test_size)

    # Keep only the words that were not held out, then expand each one back
    # to as many copies as it had in the original file
    train_counts = word_counts[~word_counts.index.isin(test_words)]
    train_list = train_counts.index.repeat(train_counts.to_numpy()).tolist()

    # Shuffle the training words and append the test words at the end
    random.shuffle(train_list)
    full_dataset = train_list + test_words

    # Fraction of the new dataset that is training data
    train_test_split = len(train_list) / len(full_dataset)

    return full_dataset, train_test_split

In [2]:
# Hold out 5% of the unique words for testing; the fixed seed keeps the split reproducible.
new_dataset, train_test_split = create_traintest_split(
    path_to_data="data/kidwords_5000000_020724.csv",
    test_percent=0.05,
    random_seed=43,
)

In [3]:
# Fraction of the new dataset that is training data (training entries / all entries).
train_test_split

0.9998767049204427

In [4]:
# Total number of entries: the repeated training words plus one entry per held-out test word.
len(new_dataset)

4850153

In [5]:
# Verify the train_test_split ratio: the test and train portion sizes implied by the
# ratio should add up to the length of the new dataset.
# NOTE(review): each term is truncated by int(), so floating-point rounding could make
# this sum come out one short of len(new_dataset) — TODO confirm that is acceptable.
int((1 - train_test_split) * len(new_dataset)) + int(
    train_test_split * len(new_dataset)
)

4850153

In [6]:
# Peek at the first ten entries; the dataset starts with the shuffled training words.
new_dataset[:10]

['has', 'knows', 'an', 'the', 'another', 'he', 'water', 'if', 'face', 'behind']