In [1]:
import pandas as pd

In [None]:
# Define each dataframe size
group_sizes = {
    'dreamt_train1': 20,
    'dreamt_train2': 20,
    'dreamt_train3': 20,
    'dreamt_train4': 10,
    'dreamt_validation': 20,
    'dreamt_test': 10
}

dreamt_filename = './all_dreamt.csv'
chunk_size = 10 ** 5

# Output CSV file paths
output_files = {group: f"{group}.csv" for group in group_sizes}

# Track whether we've written the header yet
first_write = {group: True for group in group_sizes}

# To keep track of PERSON_IDs and group assignments
seen_ids = []
id_to_group = {}

# Assign each new PERSON_ID to a group
def assign_group(person_id):
    if person_id not in id_to_group:
        seen_ids.append(person_id)
        idx = len(seen_ids) - 1
        cumulative = 0
        for group, size in group_sizes.items():
            if idx < cumulative + size:
                id_to_group[person_id] = group
                break
            cumulative += size
    return id_to_group[person_id]

for chunk in pd.read_csv(dreamt_filename, chunksize=chunk_size):
    # Assign group to each row based on PERSON_ID
    chunk['GROUP'] = chunk['PERSON_ID'].map(assign_group)

    # Write rows to corresponding group CSV
    for group in group_sizes:
        filtered = chunk[chunk['GROUP'] == group].drop(columns='GROUP')
        if not filtered.empty:
            filtered.to_csv(
                output_files[group],
                mode='a',
                index=False,
                header=first_write[group]
            )
            first_write[group] = False  # Only write header once

In [10]:
for i in range(1, 5):
    dreamt_train = pd.read_csv('./dreamt_train' + str(i) + '.csv')
    print(f"dreamt_train{i} ids: {dreamt_train['PERSON_ID'].unique}")
    dreamt_train_slice = (dreamt_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))
    print(f"dreamt_train{i}.shape: {dreamt_train_slice.shape}")
    dreamt_train_slice.to_csv('./dreamt_train' + str(i) + '.csv',mode='w+', index=False)

dreamt_train1 ids: <bound method Series.unique of 0            1
1            1
2            1
3            1
4            1
            ..
40932239    20
40932240    20
40932241    20
40932242    20
40932243    20
Name: PERSON_ID, Length: 40932244, dtype: int64>


  dreamt_train_slice = (dreamt_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))


dreamt_train1.shape: (639586, 6)
dreamt_train2 ids: <bound method Series.unique of 0           21
1           21
2           21
3           21
4           21
            ..
41270607    40
41270608    40
41270609    40
41270610    40
41270611    40
Name: PERSON_ID, Length: 41270612, dtype: int64>


  dreamt_train_slice = (dreamt_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))


dreamt_train2.shape: (644873, 6)
dreamt_train3 ids: <bound method Series.unique of 0           41
1           41
2           41
3           41
4           41
            ..
40174479    60
40174480    60
40174481    60
40174482    60
40174483    60
Name: PERSON_ID, Length: 40174484, dtype: int64>


  dreamt_train_slice = (dreamt_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))


dreamt_train3.shape: (627746, 6)
dreamt_train4 ids: <bound method Series.unique of 0           61
1           61
2           61
3           61
4           61
            ..
20783877    70
20783878    70
20783879    70
20783880    70
20783881    70
Name: PERSON_ID, Length: 20783882, dtype: int64>


  dreamt_train_slice = (dreamt_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))


dreamt_train4.shape: (324758, 6)


In [11]:
dreamt_validation = pd.read_csv('./dreamt_validation.csv')
print(f"dreamt_validation ids: {dreamt_validation['PERSON_ID'].unique}")

dreamt_validation ids: <bound method Series.unique of 0           71
1           71
2           71
3           71
4           71
            ..
40312719    90
40312720    90
40312721    90
40312722    90
40312723    90
Name: PERSON_ID, Length: 40312724, dtype: int64>


In [12]:
dreamt_validation_slice = (dreamt_validation.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))
dreamt_validation_slice.shape

  dreamt_validation_slice = (dreamt_validation.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))


(629906, 6)

In [None]:
dreamt_validation_slice.to_csv('./dreamt_validation.csv', mode='w+', index=False)

In [14]:
dreamt_test = pd.read_csv('./dreamt_test.csv')
print(f"dreamt_test ids: {dreamt_test['PERSON_ID'].unique}")

dreamt_test ids: <bound method Series.unique of 0            91
1            91
2            91
3            91
4            91
           ... 
20057477    100
20057478    100
20057479    100
20057480    100
20057481    100
Name: PERSON_ID, Length: 20057482, dtype: int64>


In [15]:
dreamt_test_slice = (dreamt_test.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))
dreamt_test_slice.shape

  dreamt_test_slice = (dreamt_test.groupby('PERSON_ID').apply(lambda group: group.iloc[::64]).reset_index(drop=True))


(313408, 6)

In [None]:
dreamt_test_slice.to_csv('./dreamt_test.csv', mode='w+', index=False)