In [None]:
# Fetch the decomposer CoT dataset from the Hugging Face hub and expose its
# 'train' split as a pandas DataFrame for the resampling steps below.
from datasets import load_dataset

ds = load_dataset("d4nieldev/qpl-decomposer-cot-ds")
train_split = ds['train']
df = train_split.to_pandas()
df

In [None]:
# Convert 'Top' to 'TopSort'
# Write through .loc so only the 'op' cell of the matching rows is touched;
# the other columns of those rows are left exactly as loaded.
df.loc[df['op'] == 'Top', 'op'] = 'TopSort'
print(df['op'].value_counts())

In [None]:
# Resampling configuration for balancing the dataset by operator
import pandas as pd

# Multiplier applied to the row count of each 'op' group:
#   < 1  -> the group is down-sampled
#   == 1 -> the group is kept as is
#   > 1  -> the group is up-sampled (rows get repeated)
ratios = {
    "Scan": 0.3,
    "Aggregate": 1,
    "Sort": 1,
    "Join": 1,
    "Filter": 2,
    "TopSort": 1,
    "Except": 2,
    "Intersect": 2,
    "Union": 5,
}

def sample_group(group, ratio_map=None, random_state=None):
    """Over- or under-sample the rows of a single operator group.

    Parameters
    ----------
    group : pd.DataFrame
        Rows that share one value in the ``op`` column; the ratio is looked
        up from the first row's ``op``.
    ratio_map : dict, optional
        Mapping from ``op`` value to resampling ratio. When omitted, the
        module-level ``ratios`` dict is used. Operators missing from the
        mapping get a ratio of 1.0.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the random draws can be made
        repeatable. ``None`` (the default) keeps the unseeded behaviour.

    Returns
    -------
    pd.DataFrame
        * ratio < 1: a random fraction ``ratio`` of the rows.
        * ratio > 1: ``round(len(group) * ratio)`` rows, built from whole
          copies of the group plus a random draw (without replacement) for
          the remainder.
        * ratio == 1, or an empty group: ``group`` itself, unchanged.
    """
    # An empty group has no first row to read the operator from.
    if group.empty:
        return group

    if ratio_map is None:
        ratio_map = ratios

    tag = group['op'].iloc[0]
    ratio = ratio_map.get(tag, 1.0)

    if ratio < 1:
        return group.sample(frac=ratio, random_state=random_state)

    if ratio > 1:
        n = len(group)
        target = int(round(n * ratio))
        # Whole copies keep every original row; only the remainder is random.
        full_repeats, rest = divmod(target, n)
        parts = [group] * full_repeats
        if rest > 0:
            parts.append(group.sample(n=rest, random_state=random_state))
        return pd.concat(parts)

    return group

# Seed NumPy's global RNG before any sampling happens. DataFrame.sample falls
# back to that global state when it is called without a random_state, so this
# makes both the per-group resampling and the final shuffle repeatable.
import numpy as np

SEED = 42
np.random.seed(SEED)

# Resample every operator group, stack the results, then shuffle the rows.
sampled_groups = pd.concat([sample_group(g) for _, g in df.groupby('op')])
balanced_df = sampled_groups.sample(frac=1).reset_index(drop=True)
print(balanced_df['op'].value_counts())

In [None]:
# Sanity check: every original row must still occur in the balanced frame.
# The left merge tags each row of df with '_merge' == 'both' when a matching
# row exists in balanced_df. 'Scan' rows are exempt because that operator is
# down-sampled (ratio 0.3), so some of them are expected to be missing.
merged = df.merge(balanced_df.drop_duplicates(), how='left', indicator=True)
all_present = ((merged['_merge'] == 'both') | (merged['op'] == 'Scan')).all()
print(all_present)
# Fail loudly so a broken resampling cannot reach the cells that follow.
assert all_present, "balanced_df is missing original rows of a non-'Scan' operator"

In [None]:
from datasets import Dataset

# Swap the resampled frame in as the 'train' split and publish the result.
# NOTE(review): the target repo id is the same one the dataset was loaded
# from, so this overwrites the source data; running the notebook again would
# resample the already-resampled split. Confirm that this is intended.
balanced_train = Dataset.from_pandas(balanced_df)
ds['train'] = balanced_train
ds.push_to_hub("d4nieldev/qpl-decomposer-cot-ds")