## IMPORT

In [1]:
import os
import pandas as pd

## HELPER FUNCTIONS

In [2]:
def get_filtered_filenames(folder_path, prefix='task_sets_500'):
  filtered_files = []
  for file in os.listdir(folder_path):
    if file.startswith(prefix) and file.endswith('.csv'):
      filtered_files.append('./csv/' + file)
  return filtered_files

In [3]:
def preprocess_data(file_names):
  dfs = []
  for file_name in file_names:
    df = pd.read_csv(file_name)  
    df = df.dropna()
    assert not df.isnull().any().any(), "DataFrame contains NaN values"
    dfs.append(df)
  
  combined_df = pd.concat(dfs, ignore_index=True)  
  combined_df.drop_duplicates(inplace=True)  
  combined_df["thm1"] = combined_df["thm1"].astype(bool)
  combined_df["thm2"] = combined_df["thm2"].astype(bool)
  combined_df["thm3"] = combined_df["thm3"].astype(bool)
  # combined_df = combined_df.sample(n=14000)
  return combined_df

## DATA PREPROCESSING

In [4]:
FILEPATH = '/Users/jiwoo/Desktop/generalized-sporadic/csv/'
PREFIX = ['task_sets_500', 'task_sets_550', 'task_sets_600', 'task_sets_650', 'task_sets_700', 'task_sets_750', 'task_sets_800', 'task_sets_850','task_sets_900','task_sets_950', 'task_sets_975']

In [5]:
for pre in PREFIX:
  filenames = get_filtered_filenames(FILEPATH)
  df = preprocess_data(filenames)
  print("------------------------------{}-------------------------------".format(pre))
  print(df.describe())
  df.to_csv(pre + '.csv', index=False)

------------------------------task_sets_500-------------------------------
          num_tasks         t_max   utilization
count  14000.000000  14000.000000  14000.000000
mean       4.493000    357.467286      0.496839
std        0.994567    433.861478      0.001842
min        3.000000      1.000000      0.485008
25%        4.000000     91.000000      0.496034
50%        4.000000    214.000000      0.497255
75%        5.000000    449.000000      0.498139
max       10.000000   2995.000000      0.499806
------------------------------task_sets_550-------------------------------
          num_tasks         t_max   utilization
count  14000.000000  14000.000000  14000.000000
mean       4.492714    357.562929      0.496836
std        0.994063    433.277987      0.001843
min        3.000000      1.000000      0.485008
25%        4.000000     91.000000      0.496025
50%        4.000000    215.000000      0.497253
75%        5.000000    450.000000      0.498136
max       10.000000   3001.000000 