In [1]:
import pandas as pd

In [4]:
capture_filename = './all_capture24.csv'
chunk_size = 10000

# Group sizes to split the dataset (how many persons are stored in each group)
group_sizes = {
    'capture_train1': 15,
    'capture_train2': 15,
    'capture_train3': 15,
    'capture_train4': 15,
    'capture_train5': 15,
    'capture_train6': 15,
    'capture_train7': 16,
    'capture_validation1': 15,
    'capture_validation2': 15,
    'capture_test': 15
}

# Output CSV file paths
output_files = {group: f"{group}.csv" for group in group_sizes}

# Track whether we've written the header yet
first_write = {group: True for group in group_sizes}

# To keep track of PERSON_IDs and group assignments
seen_ids = []
id_to_group = {}

# Assign each new PERSON_ID to a group
def assign_group(person_id):
    if person_id not in id_to_group:
        seen_ids.append(person_id)
        idx = len(seen_ids) - 1
        cumulative = 0
        for group, size in group_sizes.items():
            if idx < cumulative + size:
                id_to_group[person_id] = group
                break
            cumulative += size
    return id_to_group[person_id]

for chunk in pd.read_csv(capture_filename, chunksize=chunk_size):
    # Assign group to each row based on PERSON_ID
    chunk['GROUP'] = chunk['PERSON_ID'].map(assign_group)

    # Write rows to corresponding group CSV
    for group in group_sizes:
        filtered = chunk[chunk['GROUP'] == group].drop(columns='GROUP')
        if not filtered.empty:
            filtered.to_csv(
                output_files[group],
                mode='a',
                index=False,
                header=first_write[group]
            )
            first_write[group] = False  # Only write header once

In [6]:
for i in range(1, 8):
    capture_train = pd.read_csv('./capture_train' + str(i) + '.csv')
    print(f"capture_train{i} ids: {capture_train['PERSON_ID'].unique}")
    capture_train_slice = (capture_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))
    print(f"capture_train{i}.shape: {capture_train_slice.shape}")
    capture_train_slice.to_csv('./capture_train' + str(i) + '.csv',mode='w+', index=False)

capture_train1 ids: <bound method Series.unique of 0           152
1           152
2           152
3           152
4           152
           ... 
94243337    166
94243338    166
94243339    166
94243340    166
94243341    166
Name: PERSON_ID, Length: 94243342, dtype: int64>


  capture_train_slice = (capture_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_train1.shape: (942441, 6)
capture_train2 ids: <bound method Series.unique of 0           167
1           167
2           167
3           167
4           167
           ... 
94045365    181
94045366    181
94045367    181
94045368    181
94045369    181
Name: PERSON_ID, Length: 94045370, dtype: int64>


  capture_train_slice = (capture_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_train2.shape: (940459, 6)
capture_train3 ids: <bound method Series.unique of 0           182
1           182
2           182
3           182
4           182
           ... 
89716788    196
89716789    196
89716790    196
89716791    196
89716792    196
Name: PERSON_ID, Length: 89716793, dtype: int64>


  capture_train_slice = (capture_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_train3.shape: (897174, 6)
capture_train4 ids: <bound method Series.unique of 0           197
1           197
2           197
3           197
4           197
           ... 
93954116    211
93954117    211
93954118    211
93954119    211
93954120    211
Name: PERSON_ID, Length: 93954121, dtype: int64>


  capture_train_slice = (capture_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_train4.shape: (939548, 6)
capture_train5 ids: <bound method Series.unique of 0           212
1           212
2           212
3           212
4           212
           ... 
92488305    226
92488306    226
92488307    226
92488308    226
92488309    226
Name: PERSON_ID, Length: 92488310, dtype: int64>


  capture_train_slice = (capture_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_train5.shape: (924890, 6)
capture_train6 ids: <bound method Series.unique of 0           227
1           227
2           227
3           227
4           227
           ... 
90759042    241
90759043    241
90759044    241
90759045    241
90759046    241
Name: PERSON_ID, Length: 90759047, dtype: int64>


  capture_train_slice = (capture_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_train6.shape: (907598, 6)
capture_train7 ids: <bound method Series.unique of 0           242
1           242
2           242
3           242
4           242
           ... 
97924705    257
97924706    257
97924707    257
97924708    257
97924709    257
Name: PERSON_ID, Length: 97924710, dtype: int64>


  capture_train_slice = (capture_train.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_train7.shape: (979257, 6)


In [2]:
for i in range(1, 3):
    capture_validation = pd.read_csv('./capture_validation' + str(i) + '.csv')
    print(f"capture_validation{i} ids: {capture_validation['PERSON_ID'].unique}")
    capture_validation_slice = (capture_validation.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))
    print(f"capture_validation{i}.shape: {capture_validation_slice.shape}")
    capture_validation_slice.to_csv('./capture_validation' + str(i) + '.csv',mode='w+', index=False)

capture_validation1 ids: <bound method Series.unique of 0           258
1           258
2           258
3           258
4           258
           ... 
90025319    272
90025320    272
90025321    272
90025322    272
90025323    272
Name: PERSON_ID, Length: 90025324, dtype: int64>


  capture_validation_slice = (capture_validation.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_validation1.shape: (900258, 6)
capture_validation2 ids: <bound method Series.unique of 0           273
1           273
2           273
3           273
4           273
           ... 
92154220    287
92154221    287
92154222    287
92154223    287
92154224    287
Name: PERSON_ID, Length: 92154225, dtype: int64>


  capture_validation_slice = (capture_validation.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_validation2.shape: (921549, 6)


In [3]:
capture_test = pd.read_csv('./capture_test.csv')
print(f"capture_test ids: {capture_test['PERSON_ID'].unique}")
capture_test_slice = (capture_test.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))
print(f"capture_test.shape: {capture_test_slice.shape}")
capture_test_slice.to_csv('./capture_test.csv',mode='w+', index=False)

capture_test ids: <bound method Series.unique of 0           288
1           288
2           288
3           288
4           288
           ... 
86440635    302
86440636    302
86440637    302
86440638    302
86440639    302
Name: PERSON_ID, Length: 86440640, dtype: int64>


  capture_test_slice = (capture_test.groupby('PERSON_ID').apply(lambda group: group.iloc[::100]).reset_index(drop=True))


capture_test.shape: (864415, 6)
