In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# =========================
# Configuration
# =========================
# Input CSVs: one file per class.
NORMAL_PATH = "../data/merge/openstack_normal.csv"
ABNORMAL_PATH = "../data/merge/openstack_abnormal.csv"

# Directory that receives train.csv / val.csv / test.csv.
OUTPUT_DIR = "../data/split"
# Seed handed to every train_test_split call so the split is repeatable.
RANDOM_STATE = 42

# Fraction of the full dataset that goes to each split.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.2
TEST_RATIO = 0.1


def validate_split_ratios(train_ratio, val_ratio, test_ratio, tol=1e-9):
    """Fail fast when the split ratios are inconsistent.

    The splitting code derives the hold-out fraction as ``1 - TRAIN_RATIO``
    and then divides that hold-out by ``VAL_RATIO / (VAL_RATIO + TEST_RATIO)``,
    so ratios that do not sum to 1 would be silently renormalised and the
    saved files would not match the configured proportions.

    Args:
        train_ratio: Fraction of rows for the training split.
        val_ratio: Fraction of rows for the validation split.
        test_ratio: Fraction of rows for the test split.
        tol: Absolute tolerance on the sum, to absorb floating-point error
            (0.7 + 0.2 + 0.1 is not exactly 1.0 in binary floating point).

    Raises:
        ValueError: If any ratio is outside the open interval (0, 1) or the
            three ratios do not sum to 1 within ``tol``.
    """
    ratios = {
        "TRAIN_RATIO": train_ratio,
        "VAL_RATIO": val_ratio,
        "TEST_RATIO": test_ratio,
    }
    for ratio_name, value in ratios.items():
        if not 0 < value < 1:
            raise ValueError(f"{ratio_name} must be in (0, 1), got {value!r}")

    total = sum(ratios.values())
    if abs(total - 1.0) > tol:
        raise ValueError(f"split ratios must sum to 1, got {total!r}")


validate_split_ratios(TRAIN_RATIO, VAL_RATIO, TEST_RATIO)

# Create the output directory up front; exist_ok makes re-running the cell safe.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# =========================
# Load data
# =========================
# One CSV per class; both are read in full with pandas' default parsing.
normal_df, abnormal_df = pd.read_csv(NORMAL_PATH), pd.read_csv(ABNORMAL_PATH)


In [5]:

# Tag each source frame with its class id (0 = normal, 1 = abnormal).
# The "Label" column is written into the loaded frames in place.
normal_df["Label"], abnormal_df["Label"] = 0, 1

# Stack both classes into one frame; ignore_index gives a fresh 0..n-1 index.
df = pd.concat((normal_df, abnormal_df), ignore_index=True)

# Report dataset size, then absolute and relative class frequencies.
print("Total samples:", df.shape[0])
print("Label distribution:")
print(
    df["Label"].value_counts(),
    df["Label"].value_counts(normalize=True),
    sep="\n",
)

# =========================
# Split train / temp
# =========================
# Everything that is not training data goes into a temporary pool that the
# next step divides into validation and test. Stratifying on "Label" keeps
# the class proportions the same on both sides of the split.
holdout_fraction = 1 - TRAIN_RATIO

train_df, temp_df = train_test_split(
    df,
    stratify=df["Label"],
    test_size=holdout_fraction,
    random_state=RANDOM_STATE,
)

# =========================
# Split val / test
# =========================
# Share of the temporary pool that becomes validation data: VAL_RATIO is
# expressed relative to the full dataset, so rescale it to the pool size.
val_ratio_adjusted = VAL_RATIO / (VAL_RATIO + TEST_RATIO)
# The remainder of the pool is the test set.
test_share_of_temp = 1 - val_ratio_adjusted

val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df["Label"],
    test_size=test_share_of_temp,
    random_state=RANDOM_STATE,
)

Total samples: 207636
Label distribution:
Label
0    189202
1     18434
Name: count, dtype: int64
Label
0    0.91122
1    0.08878
Name: proportion, dtype: float64


In [6]:

# =========================
# Save
# =========================
# Write each split to its own CSV under OUTPUT_DIR, without the index column.
for split_df, target in (
    (train_df, f"{OUTPUT_DIR}/train.csv"),
    (val_df, f"{OUTPUT_DIR}/val.csv"),
    (test_df, f"{OUTPUT_DIR}/test.csv"),
):
    split_df.to_csv(target, index=False)

# =========================
# Final check
# =========================
def check(name, df):
    """Print the class balance of one split.

    Emits a blank line followed by ``name``, then the absolute counts and the
    relative frequencies of the values in ``df["Label"]``.

    Args:
        name: Heading printed above the statistics.
        df: Frame with a "Label" column. This is the argument, not the
            notebook-level combined frame of the same name.

    Returns:
        None. The function only prints.
    """
    print(f"\n{name}")
    # First pass prints raw counts, second pass prints proportions.
    for as_fraction in (False, True):
        print(df["Label"].value_counts(normalize=as_fraction))

# Print the class balance of every split, in train / val / test order.
for split_name, split_frame in (
    ("TRAIN", train_df),
    ("VAL", val_df),
    ("TEST", test_df),
):
    check(split_name, split_frame)



TRAIN
Label
0    132441
1     12904
Name: count, dtype: int64
Label
0    0.911218
1    0.088782
Name: proportion, dtype: float64

VAL
Label
0    37840
1     3687
Name: count, dtype: int64
Label
0    0.911214
1    0.088786
Name: proportion, dtype: float64

TEST
Label
0    18921
1     1843
Name: count, dtype: int64
Label
0    0.911241
1    0.088759
Name: proportion, dtype: float64
