# Sample a dataset from a (multilingual) corpus

In [None]:
import os
import random
import re
from contextlib import ExitStack

# To create your own samples, download the corpora from the OPUS website: https://opus.nlpl.eu/
# Number of line indices drawn when a new sample is generated.
sample_size = 1000
# Directory holding the corpus files; the index file and the sample files are written here too.
dataset_path_base = "path-to-opus-datasets/Europarl-v7/en-es"
# Corpus file name per language; "<LANG>" is replaced by each entry of `languages`.
dataset_filename_pattern = "Europarl-v7.en-es.<LANG>"
# Output file name per language; "<LANG>" is replaced the same way.
sample_filename_pattern = "Europarl-v7.en-es.sample.01.<LANG>.txt"
# File storing the sampled line indices so that every language uses the same lines.
index_filename = "Europarl-v7.en-es.sample_indices.01.txt"
# Language codes to process; one corpus file is expected per code.
languages = ["en", "es"]

In [None]:
# Draw a random sample of line indices and hand it back in ascending order
def generate_random_indices(valid_lines, sample_size):
    """Pick `sample_size` distinct entries of `valid_lines` at random.

    Args:
        valid_lines: Sequence of candidate line indices.
        sample_size: Number of entries to draw.

    Returns:
        A new list with the drawn entries, sorted in ascending order.

    Raises:
        ValueError: If `sample_size` is larger than `len(valid_lines)` or
            negative (raised by `random.sample`).
    """
    chosen = random.sample(valid_lines, sample_size)
    chosen.sort()
    return chosen

# Persist the sampled indices, one per line
def save_indices(indices, file_path):
    """Write `indices` to `file_path` as text, one value per line.

    The values are separated by newlines; no newline is written after the
    last value. An existing file at `file_path` is overwritten.

    Args:
        indices: Iterable of values to store; each one is converted with `str`.
        file_path: Path of the file to write.
    """
    with open(file_path, 'w') as out:
        serialized = '\n'.join(str(value) for value in indices)
        out.write(serialized)

# Function to load indices
def load_indices(file_path):
    """Read line indices from `file_path`, one integer per line.

    Blank or whitespace-only lines (for example a trailing empty line left
    by an editor) are skipped instead of aborting the load.

    Args:
        file_path: Path of a text file holding one integer per line.

    Returns:
        The integers in file order.

    Raises:
        ValueError: If a non-blank line is not a valid integer.
    """
    with open(file_path, 'r') as file:
        # int() tolerates surrounding whitespace, so only empty lines need filtering
        return [int(line) for line in file if line.strip()]

# Function to check if a line contains alphabet characters
def is_valid_line(line):
    """Return True if `line` contains at least one letter.

    Letters are detected with `str.isalpha`, which is Unicode-aware, so lines
    written in non-Latin scripts (Greek, Cyrillic, ...) count as valid too.
    Lines made up only of digits, punctuation or whitespace are rejected.

    Args:
        line: Text of a single corpus line.

    Returns:
        bool: True when any character of `line` is alphabetic.
    """
    return any(char.isalpha() for char in line)

# Check and collect valid lines that are valid across all languages
def get_valid_lines_across_languages(languages, dataset_path_base, dataset_filename_pattern):
    """Return the 0-based indices of lines that are valid in every language.

    The corpus files of all languages are read in lockstep; a line index is
    kept only if `is_valid_line` accepts that line in each file. Reading
    stops as soon as the shortest file is exhausted.

    Args:
        languages: Language codes; each one replaces "<LANG>" in the pattern.
        dataset_path_base: Directory containing the corpus files.
        dataset_filename_pattern: File name template containing "<LANG>".

    Returns:
        Ascending list of line indices valid across all files. The list is
        empty when `languages` is empty.

    Raises:
        OSError: If one of the corpus files cannot be opened.
    """
    file_paths = [
        os.path.join(dataset_path_base, dataset_filename_pattern.replace("<LANG>", lang))
        for lang in languages
    ]

    # ExitStack closes every file opened so far, even if a later open() fails
    with ExitStack() as stack:
        files = [stack.enter_context(open(path, 'r', encoding="utf-8")) for path in file_paths]
        # zip() walks the files in parallel and ends with the shortest one;
        # with no files at all it yields nothing instead of looping forever
        return [
            index
            for index, lines in enumerate(zip(*files))
            if all(is_valid_line(line) for line in lines)
        ]

In [None]:
# Reuse the stored sample indices when an index file is present; otherwise create and store them
index_file = os.path.join(dataset_path_base, index_filename)
if os.path.exists(index_file):
    # An index file is already there: load the indices it holds
    random_line_numbers = load_indices(index_file)
else:
    # No index file yet: collect the lines that are valid in every language
    valid_lines = get_valid_lines_across_languages(languages, dataset_path_base, dataset_filename_pattern)

    # Sampling needs at least `sample_size` candidate lines
    if sample_size > len(valid_lines):
        raise ValueError(f"Not enough valid lines to sample. Only found {len(valid_lines)} valid lines.")

    # Draw the sample and store it for later runs
    random_line_numbers = generate_random_indices(valid_lines, sample_size)
    save_indices(random_line_numbers, index_file)

In [None]:
# Process each language using the shared indices
# The set gives O(1) membership tests; the largest index tells us when to stop reading.
wanted_indices = set(random_line_numbers)
last_wanted_index = max(wanted_indices, default=-1)

for language_code in languages:
    input_file = os.path.join(dataset_path_base, dataset_filename_pattern.replace("<LANG>", language_code))
    output_file = os.path.join(dataset_path_base, sample_filename_pattern.replace("<LANG>", language_code))

    # Stream the corpus and keep only the sampled lines, so the whole file
    # never has to be held in memory
    sampled_lines = {}
    with open(input_file, 'r', encoding="utf-8") as file:
        for i, line in enumerate(file):
            if i > last_wanted_index:
                break
            if i in wanted_indices:
                sampled_lines[i] = line

    # Fail before touching the output file if the indices do not fit this corpus
    # (indices are expected to be non-negative 0-based line numbers)
    missing = wanted_indices - sampled_lines.keys()
    if missing:
        raise IndexError(
            f"{input_file} has no line for sampled indices {sorted(missing)[:5]} "
            f"({len(missing)} missing in total); the index file may be stale"
        )

    # Write the lines in the order given by the index list
    with open(output_file, 'w', encoding="utf-8") as output:
        output.writelines(sampled_lines[i] for i in random_line_numbers)

    # Report the number of lines actually written, which can differ from
    # sample_size when the indices were loaded from an existing index file
    print(f"Sample of {len(random_line_numbers)} lines has been written to {output_file}")

print("Sample generation finished for all languages")