In [34]:
from google.colab import files
import pandas as pd
import numpy as np

def split_dataset_by_organism_and_sequence_fraction(
    csv_path, test_frac=0.3, random_state=42,
    train_out='train_dataset.csv', test_out='test_dataset.csv'
):
    """
    Split a dataset into organism-disjoint training and test sets.

    Organisms are shuffled (seeded by `random_state`) and moved into the test
    set one at a time until the test set holds at least `test_frac` of all
    rows. Because whole organisms are moved, the achieved fraction is
    typically somewhat above the target rather than exactly equal to it.

    Both splits are written to disk and a short summary is printed.

    Parameters:
    - csv_path: Path to the input CSV file; must contain an 'organism' column.
    - test_frac: Target fraction of rows for the test set, between 0 and 1.
    - random_state: Seed for the organism shuffle (reproducibility).
    - train_out: Path to save the training CSV.
    - test_out: Path to save the test CSV.

    Returns:
    - train_set, test_set: DataFrames of the training and test splits.

    Raises:
    - ValueError: If `test_frac` is not within [0, 1].
    - KeyError: If the input CSV has no 'organism' column.
    """
    # Written as a negated chained comparison so that NaN is rejected too.
    if not 0 <= test_frac <= 1:
        raise ValueError(f"test_frac must be between 0 and 1, got {test_frac!r}")

    df = pd.read_csv(csv_path)
    if 'organism' not in df.columns:
        raise KeyError("input CSV has no 'organism' column")

    # Count sequences per organism, then shuffle the organism order.
    # value_counts() skips missing organisms, so rows without an organism are
    # never selected for the test set and end up in the training set.
    organism_counts = df['organism'].value_counts().reset_index()
    organism_counts.columns = ['organism', 'count']
    organism_counts = organism_counts.sample(frac=1, random_state=random_state)

    total_sequences = len(df)
    target_rows = test_frac * total_sequences

    # An organism goes to the test set while the rows collected *before* it
    # are still below the target; the last one picked may overshoot.
    counts = organism_counts['count'].to_numpy()
    rows_before = np.cumsum(counts) - counts
    organism_names = organism_counts['organism'].to_numpy()
    test_organisms = organism_names[rows_before < target_rows].tolist()

    # Split based on selected organisms
    in_test = df['organism'].isin(test_organisms)
    test_set = df[in_test].copy()
    train_set = df[~in_test].copy()

    # Save
    train_set.to_csv(train_out, index=False)
    test_set.to_csv(test_out, index=False)

    # Guard the ratio so an empty input reports 0.00 instead of crashing.
    actual_frac = len(test_set) / total_sequences if total_sequences else 0.0

    print(f"✅ Train set: {len(train_set)} rows, {train_set['organism'].nunique()} organisms")
    print(f"✅ Test set:  {len(test_set)} rows, {test_set['organism'].nunique()} organisms")
    print(f"🎯 Target test fraction: {test_frac:.2f} | Actual: {actual_frac:.2f}")

    return train_set, test_set



In [93]:
import pandas as pd

def split_by_id_reference(full_data_path, train_id_path, id_column='ID',
                          train_out='train_features.csv', test_out='test_features.csv'):
    """
    Partition a full dataset into train/test using the IDs of a reference train set.

    Rows of the full dataset whose `id_column` value occurs in the reference
    file form the training split; all remaining rows form the test split.
    IDs are compared as the values pandas parses from each CSV. Both splits
    are written to disk and a short summary is printed.

    Parameters:
    - full_data_path: CSV with the full dataset; must contain `id_column`.
    - train_id_path: CSV with the reference training set; must contain `id_column`.
    - id_column: The name of the ID column (default 'ID').
    - train_out: Path to save training split.
    - test_out: Path to save testing split.

    Returns:
    - train_df, test_df: DataFrames of the training and testing splits.
    """
    # Read the complete feature table first, then the reference IDs.
    all_rows = pd.read_csv(full_data_path)
    reference = pd.read_csv(train_id_path)
    known_ids = reference[id_column].unique()

    # One membership mask drives both halves of the partition.
    is_train = all_rows[id_column].isin(known_ids)
    train_part = all_rows.loc[is_train].copy()
    test_part = all_rows.loc[~is_train].copy()

    # Persist both splits without the DataFrame index.
    train_part.to_csv(train_out, index=False)
    test_part.to_csv(test_out, index=False)

    n_train = len(train_part)
    n_test = len(test_part)
    print(f"✅ Training features saved to: {train_out} ({n_train} rows)")
    print(f"✅ Testing features saved to: {test_out} ({n_test} rows)")
    return train_part, test_part
