# Split Dataset

## Split the dataset into Train, Valid and Test sets
- Train : Valid : Test = 70 : 15 : 15

In [1]:
import configparser
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path

# Load the preprocessing settings. ConfigParser.read() silently skips files it
# cannot open and returns the list of files it actually parsed, so an empty
# result means config.ini was not found or not readable.
config = configparser.ConfigParser()
if not config.read("config.ini"):
    raise FileNotFoundError("Could not read configuration file: config.ini")

preprocessing_config = config["data_preprocessing"]

# SectionProxy.getint() returns None for a missing option, which would be passed
# on as random_state=None and make the split different on every run. Fail
# loudly instead so the split stays reproducible.
random_seed = preprocessing_config.getint("random_seed")
if random_seed is None:
    raise KeyError("random_seed")

label_file = Path(preprocessing_config["label_file"])
dataset_dir = Path(preprocessing_config["dataset_dir"])

# header=None: the first row of the label file is read as data, and the two
# columns are named explicitly.
dataset_df = pd.read_csv(label_file, header=None, names=["record_name", "label"])

# First carve out the test set (15% of the whole dataset), stratified by label.
temp_df, test_df = train_test_split(
    dataset_df, test_size=0.15, stratify=dataset_df["label"], random_state=random_seed
)

# The remainder is 85% of the dataset, so taking 0.15 / 0.85 of it yields a
# validation set that is 15% of the whole dataset; what is left is the train set.
train_df, valid_df = train_test_split(
    temp_df, test_size=0.15 / 0.85, stratify=temp_df["label"], random_state=random_seed
)

# Sanity check: the three subsets together must account for every row.
assert len(train_df) + len(valid_df) + len(test_df) == len(dataset_df)

# Save the split dataset. Make sure the output directory exists first, and
# sort by index so each file keeps the row order of the label file.
dataset_dir.mkdir(parents=True, exist_ok=True)
train_df.sort_index().to_csv(dataset_dir / "train.csv", index=False)
valid_df.sort_index().to_csv(dataset_dir / "valid.csv", index=False)
test_df.sort_index().to_csv(dataset_dir / "test.csv", index=False)

## Analyze the label distribution

In [2]:
def calculate_label_stats(subset_df):
    """Summarize how often each label occurs in a subset.

    Args:
        subset_df: DataFrame that has a "label" column.

    Returns:
        A DataFrame indexed by label value with two columns: "count" (number
        of rows carrying that label) and "proportion" (that count as a
        fraction of all counted rows).
    """
    labels = subset_df["label"]

    return pd.DataFrame(
        {
            "count": labels.value_counts(),
            "proportion": labels.value_counts(normalize=True),
        }
    )


# Compute the label statistics for the full dataset and for each subset.
dataset_label_stats = calculate_label_stats(dataset_df)
train_label_stats = calculate_label_stats(train_df)
valid_label_stats = calculate_label_stats(valid_df)
test_label_stats = calculate_label_stats(test_df)

# Print each table under its heading. Every heading after the first starts
# with a newline so the sections are separated by a blank line.
label_stats_report = [
    ("Dataset:", dataset_label_stats),
    ("\nTrain set:", train_label_stats),
    ("\nValid set:", valid_label_stats),
    ("\nTest set:", test_label_stats),
]
for heading, stats_table in label_stats_report:
    print(heading)
    print(stats_table)

Dataset:
       count  proportion
label                   
N       5076    0.595216
O       2415    0.283185
A        758    0.088884
~        279    0.032716

Train set:
       count  proportion
label                   
N       3552    0.595174
O       1691    0.283345
A        530    0.088807
~        195    0.032674

Valid set:
       count  proportion
label                   
N        762    0.595313
O        362    0.282813
A        114    0.089063
~         42    0.032813

Test set:
       count  proportion
label                   
N        762    0.595313
O        362    0.282813
A        114    0.089063
~         42    0.032813
