## IMPORT

In [1]:
import os
import pandas as pd

## HELPER FUNCTIONS

In [2]:
def get_filtered_filenames(folder_path, prefix):
  """Return the paths of the CSV files in ``folder_path`` that start with ``prefix``.

  Args:
    folder_path: Directory to scan (non-recursive).
    prefix: Leading part of the file name that a file must have to be kept.

  Returns:
    A list of paths, each ``folder_path`` joined with a matching file name,
    sorted by file name. Only names ending in ``.csv`` are included. The list
    is empty when nothing matches.

  Raises:
    OSError: Propagated from ``os.listdir`` if ``folder_path`` cannot be listed.
  """
  # Build every path from folder_path itself. The previous hard-coded './csv/'
  # prefix only pointed at the listed files when the current working directory
  # happened to contain a 'csv' folder with the same content.
  # os.listdir returns entries in arbitrary order, so sort for a stable result.
  return [
    os.path.join(folder_path, name)
    for name in sorted(os.listdir(folder_path))
    if name.startswith(prefix) and name.endswith('.csv')
  ]

In [3]:
def preprocess_data(file_names, n_samples=11000, random_state=None):
  """Load, clean, merge and subsample a collection of CSV files.

  Each file is read with pandas and rows containing any missing value are
  dropped. The per-file frames are concatenated, exact duplicate rows are
  removed, the ``thm1``/``thm2``/``thm3`` columns are cast to ``bool`` and
  ``n_samples`` rows are drawn at random without replacement.

  Args:
    file_names: Iterable of CSV paths to load.
    n_samples: Number of rows to draw. Defaults to 11000 (the previously
      hard-coded size). Pass ``None`` to get every cleaned row, unshuffled.
    random_state: Seed forwarded to ``DataFrame.sample``. The default ``None``
      keeps the draw unseeded; pass an int for a reproducible sample.

  Returns:
    A ``pandas.DataFrame`` holding the sampled (or, with ``n_samples=None``,
    all) cleaned rows. Row labels are those assigned by the concatenation.

  Raises:
    ValueError: If ``file_names`` is empty, or if fewer than ``n_samples``
      rows remain after cleaning.
    KeyError: If one of ``thm1``, ``thm2``, ``thm3`` is missing.
  """
  file_names = list(file_names)
  if not file_names:
    # pd.concat would raise an opaque "No objects to concatenate" here.
    raise ValueError("preprocess_data received no input files")

  # dropna() guarantees the frame has no missing values, so no further
  # null check is needed per file.
  frames = [pd.read_csv(file_name).dropna() for file_name in file_names]

  # Chained, non-inplace transforms: each step returns a fresh frame, which
  # avoids assigning columns onto the result of an in-place drop.
  # NOTE(review): astype(bool) turns every non-empty string into True; this
  # assumes the thm columns are parsed as booleans or 0/1 numbers - confirm.
  combined_df = (
    pd.concat(frames, ignore_index=True)
    .drop_duplicates()
    .astype({"thm1": bool, "thm2": bool, "thm3": bool})
  )

  if n_samples is None:
    return combined_df

  if n_samples > len(combined_df):
    raise ValueError(
      "Cannot sample {} rows: only {} rows remain after cleaning".format(
        n_samples, len(combined_df)
      )
    )
  return combined_df.sample(n=n_samples, random_state=random_state)

## DATA PREPROCESSING

In [4]:
# Directory that is scanned for the input CSV files.
FILEPATH = '/Users/jiwoo/Desktop/generalized-sporadic/csv/'

# File-name prefixes to process; each one yields its own output file.
PREFIX = [
  'task_sets_500',
  'task_sets_550',
  'task_sets_600',
  'task_sets_650',
  'task_sets_700',
  'task_sets_750',
  'task_sets_800',
  'task_sets_850',
  'task_sets_900',
  'task_sets_950',
  'task_sets_975',
]

In [5]:
# Produce one preprocessed dataset per prefix and save it as a CSV file.
for pre in PREFIX:
  # Collect the CSV files under FILEPATH whose names start with this prefix.
  filenames = get_filtered_filenames(FILEPATH, pre)
  # Clean, merge and subsample those files into a single DataFrame.
  df = preprocess_data(filenames)
  # Banner line so each prefix's summary is easy to find in the cell output.
  print("------------------------------{}-------------------------------".format(pre))
  # Summary statistics of the resulting frame.
  print(df.describe())
  # Written relative to the current working directory as '<prefix>_test.csv';
  # index=False leaves the row labels out of the file.
  df.to_csv(pre + '_test.csv', index=False)

------------------------------task_sets_500-------------------------------
          num_tasks         t_max   utilization
count  11000.000000  11000.000000  11000.000000
mean       4.497818    357.279818      0.496844
std        0.996400    434.532377      0.001843
min        3.000000      1.000000      0.485008
25%        4.000000     90.000000      0.496044
50%        4.000000    213.000000      0.497258
75%        5.000000    449.000000      0.498141
max        9.000000   3001.000000      0.499806
------------------------------task_sets_550-------------------------------
          num_tasks         t_max   utilization
count  11000.000000  11000.000000  11000.000000
mean       4.847000    433.192364      0.546550
std        1.013389    525.141416      0.001875
min        3.000000      1.000000      0.534800
25%        4.000000    110.000000      0.545682
50%        5.000000    260.000000      0.546930
75%        5.000000    550.000000      0.547867
max       11.000000   3706.000000 