# Inter Dataset Balancer

This notebook reads every CSV file under the input directory and balances the datasets against each other, one split (train, validation or test) at a time.
For each split, it takes the smallest number of samples per class found across all datasets that have that split (classes a dataset does not contain are ignored).
Then, it removes samples from every dataset that has more samples per class than that minimum.

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from dataset_processor import BalanceToMinimumClass

In [2]:
# Mapping from standardized activity code to its human-readable label
standartized_codes = {
    0: "sit",
    1: "stand",
    2: "walk",
    3: "stair up",
    4: "stair down",
    5: "run",
    6: "stair up and down",
}
# Codes of the classes to balance (iterating a dict yields its keys, in insertion order)
class_codes = list(standartized_codes)

# Name of the column that holds the class code of each sample
class_to_balance = "standard activity code"

# Input: directory tree containing the CSV files to be balanced
root_dir = Path("../data/standartized_balanced")
# Output: directory tree where the balanced CSV files are written
# NOTE(review): the notebook title says "inter" dataset balancing, but this
# folder is named "intra" -- confirm the name is intentional before renaming.
output_dir = Path("../data/standartized_intra_balanced")

In [3]:
# Smallest non-zero class count seen so far for each split.
# A split for which no sample is found keeps `np.inf` as its minimum.
split_min = {"train": np.inf, "validation": np.inf, "test": np.inf}

# Read all CSVs from all datasets
for f in root_dir.rglob("*.csv"):
    # Get the split name, based on file name (train, validation or test).
    # It does not depend on the class, so it is resolved once per file.
    split_name = f.stem
    # A CSV that is not one of the known splits cannot be accounted for:
    # report it and move on instead of failing later with a bare KeyError.
    if split_name not in split_min:
        print(f"Skipping {f}: '{split_name}' is not one of {list(split_min)}")
        continue
    # Only the class column is needed to count samples, so load just that column
    labels = pd.read_csv(f, usecols=[class_to_balance])[class_to_balance]
    # For each class `c`
    for c in class_codes:
        # Number of elements from class `c` (plain int, as `len` would return)
        numel = int((labels == c).sum())
        # If the dataset does not have any element from class `c`, skip it
        if numel > 0:
            # Update the minimum
            split_min[split_name] = min(split_min[split_name], numel)

# Create one balancer per split, each configured with that split's minimum class count
split_balancer = {
    name: BalanceToMinimumClass(
        class_column=class_to_balance, min_value=min_count, random_state=0
    )
    for name, min_count in split_min.items()
}

# Dump some information
print("Minimum class count in each split (from all files):")
print(split_min)

Minimum class count in each split (from all files):
{'train': 231, 'validation': 68, 'test': 24}


In [4]:
# Balance every CSV of every dataset and write the result under `output_dir`.
# The paths are sorted so files are always processed in the same order:
# directory enumeration order is filesystem-dependent, and the balancers are
# seeded -- presumably their sampling depends on call order (TODO confirm).
for f in sorted(root_dir.rglob("*.csv")):
    # Get the split name, based on file name (train, validation or test)
    split_name = f.stem
    # A CSV without a matching balancer cannot be processed: report it and
    # move on instead of failing with a bare KeyError.
    if split_name not in split_balancer:
        print(f"Skipping {f}: no balancer for split '{split_name}'")
        continue
    # Get the dataset name, based on the parent folder name
    # NOTE(review): only the immediate parent folder is used, so two CSVs of
    # the same split under equally named folders would overwrite each other --
    # confirm the layout is always <root_dir>/<dataset>/<split>.csv.
    dataset_name = f.parent.name
    # Read dataframe
    df = pd.read_csv(f)
    # Balance the dataframe (based on the minimum class count of that split)
    df = split_balancer[split_name](df)
    # Create the output filename
    output_fname = output_dir / dataset_name / f"{split_name}.csv"
    # Create the output directory (if it does not exist)
    output_fname.parent.mkdir(exist_ok=True, parents=True)
    # Save the dataframe (without the pandas index column)
    df.to_csv(output_fname, index=False)