In [1]:
import os
import numpy as np
import pandas as pd
from ieeg.auth import Session

from get_iEEG_data import *
from iEEG_helper_functions import *

In [2]:
# Read the multi-dataset batch table and start every row with the
# has_multi_sampling_rate flag set to False
df = pd.read_csv("../../Data/multi_dataset_batches.csv").assign(
    has_multi_sampling_rate=False
)
df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,batch,has_multi_sampling_rate
0,193,3,282.888818,"[256, 256, 256]",256,72419.537447,1,False
1,181,2,163.567852,"[512, 512]",512,83746.74,1,False
2,137,3,188.772956,"[512, 512, 256]",256,84569.2175,1,False
3,195,3,261.462102,"[256, 256, 512]",256,105005.468189,1,False
4,148,2,189.895812,"[1024, 512]",512,109615.733611,1,False
5,156,3,161.961359,"[1024, 512, 512]",512,119491.976389,1,False
6,147,2,177.036235,"[1024, 512]",512,126679.248055,1,False
7,179,2,258.874397,"[512, 512]",512,132543.691111,1,False
8,194,3,260.575113,"[512, 512, 512]",512,133414.457733,1,False
9,167,2,261.566072,"[512, 512]",512,133921.828889,1,False


In [3]:
# Implement a function that takes in a stringified list of integers and returns a list of integers
def string_to_list(string):
    """
    Convert a stringified list of integers to a list of integers

    Parameters
    ----------
    string: str
        A stringified list of integers, e.g. "[512, 512, 256]".
        Whitespace around the brackets is ignored and an empty list
        ("[]") is accepted.

    Returns
    -------
    list
        A list of integers (empty when the string holds no items)

    Raises
    ------
    ValueError
        If any comma-separated item is not a valid integer literal
    """
    # Strip outer whitespace first; otherwise strip("[]") would leave the
    # brackets in place whenever they are not the very first/last characters.
    inner = string.strip().strip("[]")
    # "[]" leaves nothing to parse; int("") would raise, so return early.
    if not inner.strip():
        return []
    # int() tolerates the whitespace left around each item by the split.
    return [int(item) for item in inner.split(",")]

In [4]:
# Flag each row whose sampling_rate list holds more than one distinct rate.
# The whole column is recomputed in one assignment, so re-running this cell
# always yields the same flags regardless of the column's previous contents.
df["has_multi_sampling_rate"] = df["sampling_rate"].map(
    lambda rates: len(set(string_to_list(rates))) > 1
)

In [5]:
# Display the dataframe to inspect the updated has_multi_sampling_rate flags
df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,batch,has_multi_sampling_rate
0,193,3,282.888818,"[256, 256, 256]",256,72419.537447,1,False
1,181,2,163.567852,"[512, 512]",512,83746.74,1,False
2,137,3,188.772956,"[512, 512, 256]",256,84569.2175,1,True
3,195,3,261.462102,"[256, 256, 512]",256,105005.468189,1,True
4,148,2,189.895812,"[1024, 512]",512,109615.733611,1,True
5,156,3,161.961359,"[1024, 512, 512]",512,119491.976389,1,True
6,147,2,177.036235,"[1024, 512]",512,126679.248055,1,True
7,179,2,258.874397,"[512, 512]",512,132543.691111,1,False
8,194,3,260.575113,"[512, 512, 512]",512,133414.457733,1,False
9,167,2,261.566072,"[512, 512]",512,133921.828889,1,False


In [6]:
# Keep only the rows flagged as having multiple sampling rates
# (has_multi_sampling_rate is True), renumber the index from 0 and
# drop the batch column
is_multi_rate = df["has_multi_sampling_rate"]
multiple_sample_rate_df = (
    df[is_multi_rate]
    .reset_index(drop=True)
    .drop(columns=["batch"])
)
multiple_sample_rate_df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,has_multi_sampling_rate
0,137,3,188.772956,"[512, 512, 256]",256,84569.2175,True
1,195,3,261.462102,"[256, 256, 512]",256,105005.468189,True
2,148,2,189.895812,"[1024, 512]",512,109615.733611,True
3,156,3,161.961359,"[1024, 512, 512]",512,119491.976389,True
4,147,2,177.036235,"[1024, 512]",512,126679.248055,True
5,214,2,311.207215,"[1024, 512]",512,206271.032951,True
6,153,2,305.785224,"[512, 1024]",512,227762.6725,True
7,213,2,558.426826,"[1024, 512]",512,369247.993412,True
8,215,4,235.29497,"[2048, 1024, 2048, 1024]",1024,412838.037587,True
9,149,4,523.779698,"[512, 1024, 1024, 1024]",512,525169.350833,True


In [7]:
# Keep only the rows with a single sampling rate
# (has_multi_sampling_rate is False), renumber the index from 0 and
# drop the batch column
is_single_rate = ~df["has_multi_sampling_rate"]  # boolean column, so ~ negates it
single_sample_rate_df = (
    df[is_single_rate]
    .reset_index(drop=True)
    .drop(columns=["batch"])
)
single_sample_rate_df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,has_multi_sampling_rate
0,193,3,282.888818,"[256, 256, 256]",256,72419.537447,False
1,181,2,163.567852,"[512, 512]",512,83746.74,False
2,179,2,258.874397,"[512, 512]",512,132543.691111,False
3,194,3,260.575113,"[512, 512, 512]",512,133414.457733,False
4,167,2,261.566072,"[512, 512]",512,133921.828889,False
5,216,2,288.169321,"[512, 512]",512,147542.692112,False
6,159,2,291.381685,"[512, 512]",512,149187.422778,False
7,140,2,148.518412,"[1024, 1024]",1024,152082.854166,False
8,201,2,306.61172,"[512, 512]",512,156985.200807,False
9,152,2,385.824318,"[512, 512]",512,197542.050833,False


In [8]:
def assign_batches(df, column, num_batches=4):
    """
    Greedily split rows into consecutive batches of roughly equal total size.

    Rows are sorted by ``column`` in ascending order and walked once. A new
    batch is started whenever adding the next row would push the running
    total of the current batch above ``total / num_batches``, until the last
    batch is reached; every remaining row then goes into that last batch.

    Parameters
    ----------
    df: pandas.DataFrame
        Input table. It is not modified.
    column: str
        Name of the numeric column holding each row's size.
    num_batches: int, optional
        Maximum number of batches to create (default 4).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by ``column`` with an integer ``batch``
        column numbered from 1.

    Raises
    ------
    ValueError
        If ``num_batches`` is smaller than 1.
    """
    if num_batches < 1:
        raise ValueError("num_batches must be at least 1")

    # sort_values returns a new frame, so the caller's dataframe is untouched
    sorted_df = df.sort_values(by=column)
    target_per_batch = sorted_df[column].sum() / num_batches

    # Create the column up front so it exists (as integers) even for empty input
    sorted_df["batch"] = 0

    batch_ids = []
    current_batch = 1
    current_sum = 0
    batch_is_empty = True
    for size in sorted_df[column]:
        # Never advance past an empty batch: a single row larger than the
        # target would otherwise leave batch 1 unused and start numbering at 2.
        if (
            not batch_is_empty
            and current_sum + size > target_per_batch
            and current_batch < num_batches
        ):
            current_batch += 1
            current_sum = 0
        batch_ids.append(current_batch)
        current_sum += size
        batch_is_empty = False

    # Assign positionally (not by index label) so duplicate index labels
    # cannot cause one row's batch to overwrite another's.
    if batch_ids:
        sorted_df["batch"] = batch_ids

    return sorted_df


# Split the single-sampling-rate datasets into batches balanced on size_estimate
single_sample_rate_df = assign_batches(
    single_sample_rate_df,
    column="size_estimate",
)

In [9]:
# Display the single-sampling-rate datasets with their assigned batch numbers
single_sample_rate_df

Unnamed: 0,hup_id,num_datasets,total_hours,sampling_rate,min_sampling_rate,size_estimate,has_multi_sampling_rate,batch
0,193,3,282.888818,"[256, 256, 256]",256,72419.537447,False,1
1,181,2,163.567852,"[512, 512]",512,83746.74,False,1
2,179,2,258.874397,"[512, 512]",512,132543.691111,False,1
3,194,3,260.575113,"[512, 512, 512]",512,133414.457733,False,1
4,167,2,261.566072,"[512, 512]",512,133921.828889,False,1
5,216,2,288.169321,"[512, 512]",512,147542.692112,False,2
6,159,2,291.381685,"[512, 512]",512,149187.422778,False,2
7,140,2,148.518412,"[1024, 1024]",1024,152082.854166,False,2
8,201,2,306.61172,"[512, 512]",512,156985.200807,False,2
9,152,2,385.824318,"[512, 512]",512,197542.050833,False,3
