## Split the generated file into multiple individual txt files

In [2]:
import os
import re
import unicodedata

# Path to the input file; it is opened as UTF-8 text and read in full
INPUT_FILE = 'data/generated.csv'

In [3]:
# Load the whole input file into memory as UTF-8 text
with open(INPUT_FILE, encoding='utf-8') as file:
    content = file.read()

# Cut the text at every literal [INST] tag; parts[0] holds whatever
# precedes the first tag
parts = content.split('[INST]')

# Function to sanitize and create a filename from text
def create_filename(text):
    """Build an ASCII alphanumeric filename stem from the start of *text*.

    The first five whitespace-separated words of *text* are joined and
    every character outside ``a-z``, ``A-Z`` and ``0-9`` is removed.
    Accented letters are reduced to their base letter first (``'í'`` ->
    ``'i'``, ``'ñ'`` -> ``'n'``) rather than being dropped, so non-English
    text keeps readable names.

    Args:
        text: The text to derive the name from.

    Returns:
        The sanitized stem, without extension. It is an empty string when
        the first five words contain no letters or digits.
    """
    first_five_words = ' '.join(text.split()[:5])
    # NFKD splits an accented letter into base letter + combining mark; the
    # ASCII filter below then strips only the mark and keeps the base letter.
    decomposed = unicodedata.normalize('NFKD', first_five_words)
    sanitized = re.sub(r'[^a-zA-Z0-9]+', '', decomposed)
    return sanitized

# Process each part except the first one, since it's before the first [INST]

# (directory, case-folded filename) pairs already written during this run.
# open(..., 'w') truncates an existing file, so without this bookkeeping two
# parts of the same age whose text starts with the same five words would
# silently overwrite each other. Tracking per run (rather than checking the
# disk) keeps the names deterministic when the script is re-run.
used_names = set()

for part in parts[1:]:
    # Extract the age; the pattern is searched across the whole part.
    # Parts that do not mention an age are skipped.
    age_match = re.search(r'cuando tenía (\d+) años', part)
    if not age_match:
        continue
    age = age_match.group(1)

    # Create a directory for the age if it doesn't exist
    directory = f'./data/generated/{age}_years'
    os.makedirs(directory, exist_ok=True)

    # Extract the content after the first ']' in the part. When there is no
    # ']' at all, find() returns -1, so content_start is 0 and the whole
    # part is kept.
    content_start = part.find(']') + 1
    file_content = part[content_start:].strip()

    # Name the file after the first five words of its content; fall back to
    # a fixed stem so an empty result does not produce the hidden file '.txt'.
    stem = create_filename(file_content) or 'untitled'

    # Disambiguate repeated names with a numeric suffix. Names are compared
    # case-folded so they also stay distinct on case-insensitive filesystems.
    filename = f"{stem}.txt"
    duplicate_index = 2
    while (directory, filename.casefold()) in used_names:
        filename = f"{stem}_{duplicate_index}.txt"
        duplicate_index += 1
    used_names.add((directory, filename.casefold()))

    # Write the content to the file in the appropriate directory
    with open(os.path.join(directory, filename), 'w', encoding='utf-8') as output_file:
        output_file.write(file_content)
