## Datasets for each Knowledge Profile

In [None]:
# Identifiers of the knowledge profiles for which generated data is exported.
knowledge_profiles = ["0000", "1000", "0110", "1110", "0111"]

# Source directories of the two generation strategies ("method" and "simple").
base_dir_method = "generation/generation_method/correct_responses/temp_1"
base_dir_simple = "generation/generation_simple/correct_responses/temp_1"

# Export one CSV per profile and strategy into the current working directory.
# NOTE(review): create_mixed_datasets later reads these files from
# `input_directory`; confirm the working directory matches it.
for knowledge_profile in knowledge_profiles:
    method_csv = f"{knowledge_profile}_generated_data.csv"
    simple_csv = f"{knowledge_profile}_simple_generated_data.csv"

    # read_synthetic_data is defined elsewhere; presumably it returns a
    # pandas DataFrame (the result is written with `.to_csv`).
    df_method = read_synthetic_data(base_dir_method, knowledge_profile)
    df_method.to_csv(method_csv, index=False)

    df_simple = read_synthetic_data(base_dir_simple, knowledge_profile)
    df_simple.to_csv(simple_csv, index=False)

## Mixed Generated Datasets

In [None]:
def create_mixed_datasets(size=500, generation='simple'):
    """Write one shuffled CSV mixing equal-sized samples from every profile.

    For each entry of the module-level ``knowledge_profiles`` a per-profile
    CSV is read from ``input_directory``. ``size // len(knowledge_profiles)``
    rows are drawn from each file without replacement, the samples are
    concatenated, shuffled, and written to ``output_directory``.

    Args:
        size: Target total number of rows. The per-profile share uses floor
            division, so the output has fewer than ``size`` rows when ``size``
            is not a multiple of the number of profiles. The file name always
            carries ``size`` as given.
        generation: ``'simple'`` reads ``<profile>_simple_generated_data.csv``
            and writes ``simple_mixed_data_<size>.csv``; any other value reads
            ``<profile>_generated_data.csv`` and writes
            ``mixed_data_<size>.csv``.

    Raises:
        ValueError: If ``knowledge_profiles`` is empty. pandas also raises
            ``ValueError`` when a profile file holds fewer rows than the
            per-profile share.
        FileNotFoundError: If a per-profile input file does not exist.
    """
    if not knowledge_profiles:
        raise ValueError("knowledge_profiles is empty; nothing to mix")

    # The naming scheme depends only on `generation`, so resolve it once
    # instead of on every loop iteration.
    prefix = "simple_" if generation == 'simple' else ""
    output_file = os.path.join(output_directory, f"{prefix}mixed_data_{size}.csv")

    # Load data for each knowledge profile
    data_frames = {}
    for profile in knowledge_profiles:
        file_path = os.path.join(input_directory, f"{profile}_{prefix}generated_data.csv")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")
        data_frames[profile] = pd.read_csv(file_path)

    samples_per_profile = size // len(knowledge_profiles)

    # Sample equal numbers of rows from each profile. Seeds are drawn from the
    # global `random` state, so results are only reproducible if the caller
    # seeds `random` beforehand.
    mixed_data = pd.concat([
        df.sample(n=samples_per_profile, random_state=random.randint(0, 1000), replace=False)
        for df in data_frames.values()
    ])

    # Shuffle so rows of the same profile are not stored contiguously.
    mixed_data = mixed_data.sample(frac=1, random_state=random.randint(0, 1000)).reset_index(drop=True)
    mixed_data.to_csv(output_file, index=False)


In [None]:
# Directory the per-profile CSVs are read from and the mixed CSVs written to;
# both are read as module-level names by create_mixed_datasets.
input_directory = "augmented_datasets"
output_directory = "augmented_datasets"

# Create one mixed dataset per generation strategy. The strategy must be passed
# as `generation`, the parameter name in create_mixed_datasets' signature; any
# other keyword raises TypeError.
create_mixed_datasets(generation='method')
create_mixed_datasets(generation='simple')

## Create Generated + Original Datasets

In [None]:
# Sizes of the samples drawn from the original data, and the size of the
# generated (mixed) dataset that every combined dataset is built on.
original_sizes = [100, 200, 300, 400]
BASE_SIZE = 500

train_path = "path/to/original/data/train.csv"
# NOTE(review): create_mixed_datasets writes mixed_data_<size>.csv into
# `output_directory` ("augmented_datasets"), but it is read from
# "generated_datasets" here; confirm the file is moved or fix the path.
augmented_path = f"generated_datasets/mixed_data_{BASE_SIZE}.csv"

# Neither input depends on the sample size, so load and prepare them once
# instead of on every loop iteration.
train_df = pd.read_csv(train_path)

# Rename the generated dataset's columns so they line up with the original
# training data when concatenated (target names presumably match the columns
# of the original CSV; verify, including the trailing spaces).
augmented_df = pd.read_csv(augmented_path).rename(
    columns={
        "text": "CONTRAPOSITION task",
        "rubric1": "Statement of what should be proven: A proof by contraposition of an implication consists in showing that if x rational, then x^2 is rational. ",
        "rubric2": "Correct assumption: x is rational [Assumption] ",
        "rubric3": "Correct proof reasoning",
        "rubric4": "Proof conclusion: By contraposition, if x^2 is irrational, then x is irrational."
    }
)

for size in original_sizes:
    output_path = f"generated_datasets/generated_original_{size}.csv"

    # Fixed seeds keep both the drawn sample and the final row order
    # reproducible across runs.
    sampled_df = train_df.sample(n=size, random_state=42)

    combined_df = pd.concat([augmented_df, sampled_df], ignore_index=True)
    shuffled_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

    shuffled_df.to_csv(output_path, index=False)