In [7]:
import pandas as pd
from pathlib import Path

def sample_training_data(
    input_path: str,
    output_path: str,
    sample_size: int,
    random_state: int = 42
) -> None:
    """Load a CSV dataset, draw a random sample of rows, and save it as CSV.

    The sample is drawn without replacement via ``DataFrame.sample`` and is
    written without the index column. Missing parent directories of
    ``output_path`` are created.

    Args:
        input_path: Path of the CSV file to sample from.
        output_path: Path of the CSV file to write the sample to.
        sample_size: Number of rows to draw; must be between 1 and the
            number of rows in the dataset (inclusive).
        random_state: Seed passed to ``DataFrame.sample`` so the same rows
            are selected on every run.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If ``sample_size`` is not positive, if the dataset has
            no rows (including a file with no content at all), or if
            ``sample_size`` exceeds the number of rows.
    """

    input_path = Path(input_path)
    output_path = Path(output_path)

    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    # Reject an invalid size before parsing: this check does not depend on
    # the data, so there is no reason to load a large file first.
    if sample_size <= 0:
        raise ValueError("sample_size must be a positive integer.")

    # Load dataset
    try:
        df = pd.read_csv(input_path)
    except pd.errors.EmptyDataError as err:
        # A file with no header and no rows makes pandas raise instead of
        # returning an empty frame; report it like any other empty dataset.
        raise ValueError("The input dataset is empty.") from err

    if df.empty:
        raise ValueError("The input dataset is empty.")

    row_count = len(df)
    if sample_size > row_count:
        raise ValueError(
            f"sample_size ({sample_size}) is larger than dataset size ({row_count})"
        )

    # Sample rows
    sample_df = df.sample(n=sample_size, random_state=random_state)

    # Ensure output directory exists
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save sample
    sample_df.to_csv(output_path, index=False)

    print(f"Saved sample ({sample_size} rows) to: {output_path}")


if __name__ == "__main__":
    # Number of rows to draw; also embedded in the output file name.
    sample_size = 30

    # Single source of truth for the dataset location. The output file is
    # derived from it so the two paths cannot drift apart.
    input_path = r"C:\Users\dnnxl\Documents\GitHub\llm-param-search\dataset\FEINA_test_split_train.csv"
    source = Path(input_path)
    # Same location and extension as the input, with "_<sample_size>"
    # appended to the file stem.
    output_path = str(source.with_name(f"{source.stem}_{sample_size}{source.suffix}"))

    sample_training_data(
        input_path=input_path,
        output_path=output_path,
        sample_size=sample_size,
        random_state=42,
    )


Saved sample (30 rows) to: C:\Users\dnnxl\Documents\GitHub\llm-param-search\dataset\FEINA_test_split_train_30.csv
