### Supplementary program for clean_data.csv

##### GOAL: (1) shrink clean_data.csv to a smaller subset with the same number of rows per class; (2) produce train.csv and validation.csv

In [38]:
import os
import pandas as pd
import numpy as np
import random


In [39]:
# Seed passed as `random_state` further down in the notebook.
SEED = 52

# Input file (the cleaned data set) and the two files this notebook writes.
DATA_PATH = "./dataset/processed/clean_data.csv"
TRAIN_EXPORT_PATH = "./dataset/processed/train.csv"
TEST_EXPORT_PATH = "./dataset/processed/validation.csv"

# Row budget for each split and the number of classes the rows are spread over.
NUM_CLASS = 12
NUM_TRAIN = 600
NUM_TEST = 120
NUM_TOTAL = NUM_TRAIN + NUM_TEST

# Per-class quotas: each budget floor-divided by the class count.
NUM_PER_CLASS, NUM_PER_TRAIN, NUM_PER_TEST = (
    budget // NUM_CLASS for budget in (NUM_TOTAL, NUM_TRAIN, NUM_TEST)
)

print(NUM_PER_CLASS, NUM_PER_TEST, NUM_PER_TRAIN)

60 10 50


In [40]:
# Class labels as strings: a catch-all '<4' bucket followed by every half step
# from 4.0 up to and including 9.0 (8/2 ... 18/2).
CLASS_LIST = ['<4'] + [str(half_steps / 2) for half_steps in range(8, 19)]
print(CLASS_LIST)

['<4', '4.0', '4.5', '5.0', '5.5', '6.0', '6.5', '7.0', '7.5', '8.0', '8.5', '9.0']


In [41]:
# Read the cleaned data. "band" is forced to str so its values stay comparable
# with the string labels in CLASS_LIST even when chunked type inference would
# otherwise parse purely numeric stretches of the column as floats.
df = pd.read_csv(DATA_PATH, dtype={"band": str})

# Draw a class-balanced random subset: NUM_PER_CLASS rows per band, of which the
# first NUM_PER_TRAIN go to the training split and the rest to the validation
# split. The draw is seeded so that re-running the cell yields the same files.
train_parts = []
test_parts = []

df_group = df.groupby("band")
for c in CLASS_LIST:
    # get_group raises KeyError if the band label is absent from the data.
    df_sample = df_group.get_group(c)

    # Fail with a message naming the band instead of pandas' generic
    # "larger sample than population" error.
    if len(df_sample) < NUM_PER_CLASS:
        raise ValueError(
            f"band {c!r} has only {len(df_sample)} rows, "
            f"but {NUM_PER_CLASS} are required"
        )

    samples = df_sample.sample(NUM_PER_CLASS, random_state=SEED, ignore_index=True)

    train_sample = samples.iloc[:NUM_PER_TRAIN]
    test_sample = samples.iloc[NUM_PER_TRAIN:]

    train_parts.append(train_sample)
    test_parts.append(test_sample)

# Concatenate once instead of growing a DataFrame inside the loop, which would
# re-copy all accumulated rows on every iteration.
train_data = pd.concat(train_parts, axis=0, ignore_index=True)
test_data = pd.concat(test_parts, axis=0, ignore_index=True)

# Shuffle both splits so the rows are no longer ordered by band.
train_data = train_data.sample(frac=1, random_state=SEED, ignore_index=True)
test_data = test_data.sample(frac=1, random_state=SEED, ignore_index=True)

# Export both splits as UTF-8 CSV files, overwriting any existing file and
# omitting the index column.
train_data.to_csv(TRAIN_EXPORT_PATH, index=False, encoding='utf-8', mode='w')
test_data.to_csv(TEST_EXPORT_PATH, index=False, encoding='utf-8', mode='w')


