# In [1]:
import os
from pathlib import Path
import random
import re
from typing import List, Tuple

def create_blank(text: str, blank_ratio: float = 0.3) -> Tuple[str, List[str]]:
    """Replace a random subset of the words in *text* with underscore blanks.

    The text is split on whitespace, so a "word" is any whitespace-delimited
    token (attached punctuation is part of the word). Each selected word is
    replaced by a run of underscores of the same length, and the words are
    re-joined with single spaces.

    Args:
        text: The source text to blank out.
        blank_ratio: Fraction of words to blank. The resulting count is
            truncated towards zero and clamped to ``[0, number of words]``,
            so out-of-range ratios blank nothing or everything instead of
            raising.

    Returns:
        A ``(blanked_text, solutions)`` tuple, where ``solutions`` holds the
        removed words in the order their blanks appear in ``blanked_text``.
    """
    words = text.split()

    # Clamp so random.sample never sees a negative or oversized sample size.
    num_blanks = max(0, min(len(words), int(len(words) * blank_ratio)))

    # Sort the sampled positions so the solutions follow reading order and
    # can be matched to the blanks from left to right.
    indices = sorted(random.sample(range(len(words)), num_blanks))

    blanked_words = [words[i] for i in indices]
    for i in indices:
        words[i] = '_' * len(words[i])

    return ' '.join(words), blanked_words

def process_files(input_dir: str, output_dir: str) -> None:
    """Generate fill-in-the-blank exercises for every ``*.txt`` file in a directory.

    For each text file directly inside *input_dir* (no recursion), two files
    are written:

    * ``<output_dir>/blanks/<stem>_blanks.txt`` - the text with blanks.
    * ``<output_dir>/solutions/<stem>_solutions.txt`` - the removed words,
      one per line.

    Both output sub-directories are created if they do not exist. Existing
    output files with the same names are overwritten.

    Args:
        input_dir: Directory containing the source ``.txt`` files.
        output_dir: Directory under which ``blanks/`` and ``solutions/``
            are created.
    """
    blanks_dir = Path(output_dir) / 'blanks'
    solutions_dir = Path(output_dir) / 'solutions'

    for dir_path in (blanks_dir, solutions_dir):
        dir_path.mkdir(parents=True, exist_ok=True)

    # Sorted so files are processed in a stable order regardless of the
    # filesystem's directory ordering.
    for file_path in sorted(Path(input_dir).glob('*.txt')):
        # Explicit UTF-8 keeps reading/writing independent of the platform's
        # default locale encoding.
        text = file_path.read_text(encoding='utf-8')
        blanked_text, solutions = create_blank(text)

        blanks_file = blanks_dir / f"{file_path.stem}_blanks.txt"
        solutions_file = solutions_dir / f"{file_path.stem}_solutions.txt"
        blanks_file.write_text(blanked_text, encoding='utf-8')
        solutions_file.write_text('\n'.join(solutions), encoding='utf-8')

# Script entry point: only runs when the file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # Both paths are relative to the current working directory. Because
    # process_files() adds its own 'blanks' and 'solutions' sub-directories,
    # the results end up in ./blanks/blanks and ./blanks/solutions.
    process_files('../rawdata', './blanks')