### Splits

In [1]:
import pandas as pd
import os

In [2]:
# Build the path to the source CSV, which is expected under
# <working directory>/processed_datasets/.
current_folder = os.getcwd()
database_name = 'non_hri_data_cues.csv'
database_path = os.path.join(current_folder, 'processed_datasets', database_name)

# Load the whole table into memory.
database = pd.read_csv(filepath_or_buffer=database_path)

### Random 10%

In [3]:
# Hold out a random 10% of all rows as the test set (fixed seed, so the
# draw is reproducible), and keep everything else for train/validation.
test_set = database.sample(frac=0.1, random_state=42)
remaining_data = database.drop(index=test_set.index)

# Validation is 10% of the *remaining* rows (roughly 9% of the full table);
# whatever is left after that becomes the training set.
val_set = remaining_data.sample(frac=0.1, random_state=42)
train_set = remaining_data.drop(index=val_set.index)

# All three splits are written next to the source CSV.
output_dir = os.path.join(current_folder, 'processed_datasets')
test_set_path = os.path.join(output_dir, 'non_hri_data_test.csv')
val_set_path = os.path.join(output_dir, 'non_hri_data_val.csv')
train_set_path = os.path.join(output_dir, 'non_hri_data_train.csv')

# Persist the splits without the pandas index column.
for split_frame, split_path in (
    (test_set, test_set_path),
    (val_set, val_set_path),
    (train_set, train_set_path),
):
    split_frame.to_csv(split_path, index=False)

      

### Balanced

In [4]:
# Each label contributes the same number of rows to the test set and to the
# validation set: one third of 10% of the rows currently in `database`.
# NOTE: this cell rebinds `database` to the rows left over after sampling, so
# re-running it without reloading the CSV samples from the reduced table.
total_samples = len(database)
samples_per_label = int(0.1 * total_samples / 3)


def _draw_per_label(frame, n_rows):
    """Return one seeded random draw of n_rows rows for labels 0, 1 and 2, in that order.

    `DataFrame.sample` raises ValueError when a label has fewer than n_rows rows.
    """
    return [
        frame.loc[frame['label'] == label].sample(n=n_rows, random_state=42)
        for label in (0, 1, 2)
    ]


def _shuffled(frame):
    """Return the rows of frame in a seeded random order with a fresh 0..n-1 index."""
    return frame.sample(frac=1, random_state=42).reset_index(drop=True)


# --- balanced test set ---
df_test_label_0, df_test_label_1, df_test_label_2 = _draw_per_label(database, samples_per_label)
balanced_test = _shuffled(pd.concat([df_test_label_0, df_test_label_1, df_test_label_2]))

# Take the test rows out of the pool before drawing the validation rows.
for drawn in (df_test_label_0, df_test_label_1, df_test_label_2):
    database = database.drop(drawn.index)

# --- balanced validation set (same per-label size as the test set) ---
df_val_label_0, df_val_label_1, df_val_label_2 = _draw_per_label(database, samples_per_label)
val_set = _shuffled(pd.concat([df_val_label_0, df_val_label_1, df_val_label_2]))

# Take the validation rows out of the pool as well.
for drawn in (df_val_label_0, df_val_label_1, df_val_label_2):
    database = database.drop(drawn.index)

# --- training set: every row not drawn above (not re-balanced, not shuffled) ---
train_set = database

# Write the three balanced splits next to the source CSV, without the index column.
balanced_dir = os.path.join(current_folder, 'processed_datasets')

balanced_test_path = os.path.join(balanced_dir, 'non_hri_data_balanced_test.csv')
balanced_test.to_csv(balanced_test_path, index=False)

val_set_path = os.path.join(balanced_dir, 'non_hri_data_balanced_val.csv')
val_set.to_csv(val_set_path, index=False)

train_set_path = os.path.join(balanced_dir, 'non_hri_data_balanced_train.csv')
train_set.to_csv(train_set_path, index=False)
