In [1]:
import polars as pl

In [2]:
df = pl.read_csv("./csv_data/molten_aluminum.csv")

In [4]:
len(df)

4925624

In [5]:
# split_data.ipynb에 추가할 코드
import polars as pl
from datetime import datetime

# 1. 데이터 스키마 확인
df_sample = pl.scan_csv("./csv_data/molten_aluminum.csv").head(5).collect()
print("컬럼:", df_sample.columns)
print("\n샘플 데이터:")
print(df_sample)

# 2. 시간 기반 split (권장)
df = pl.scan_csv("./csv_data/molten_aluminum.csv")

# 시간 파싱 및 정렬
df_sorted = df.with_columns(
    pl.col("time").str.to_datetime("%Y-%m-%d %H:%M:%S%.3f", strict=False)
).sort("time")

# 시간 범위 확인
time_range = df_sorted.select([
    pl.col("time").min().alias("start_time"),
    pl.col("time").max().alias("end_time")
]).collect()

print(f"\n시간 범위: {time_range['start_time'][0]} ~ {time_range['end_time'][0]}")

# 3. Split (70:15:15)
total = df_sorted.select(pl.count()).collect().item()
train_end = int(total * 0.7)
val_end = int(total * 0.85)

df_train = df_sorted.head(train_end).collect()
df_val = df_sorted.slice(train_end, val_end - train_end).collect()
df_test = df_sorted.tail(total - val_end).collect()

# 저장
df_train.write_csv("./csv_data/molten_aluminum_train.csv")
df_val.write_csv("./csv_data/molten_aluminum_val.csv")
df_test.write_csv("./csv_data/molten_aluminum_test.csv")

print(f"\n✅ Split 완료!")
print(f"Train: {len(df_train):,} rows ({len(df_train)/total*100:.1f}%)")
print(f"Val: {len(df_val):,} rows ({len(df_val)/total*100:.1f}%)")
print(f"Test: {len(df_test):,} rows ({len(df_test)/total*100:.1f}%)")

컬럼: ['time', 'curr', 'currR', 'currS', 'currT', 'Ground', 'PT100', 'Vibra', 'Volt', 'VoltR', 'VoltS', 'VoltT']

샘플 데이터:
shape: (5, 12)
┌─────────────────────────┬───────┬───────┬───────┬───┬───────┬───────┬───────┬───────┐
│ time                    ┆ curr  ┆ currR ┆ currS ┆ … ┆ Volt  ┆ VoltR ┆ VoltS ┆ VoltT │
│ ---                     ┆ ---   ┆ ---   ┆ ---   ┆   ┆ ---   ┆ ---   ┆ ---   ┆ ---   │
│ str                     ┆ f64   ┆ f64   ┆ f64   ┆   ┆ f64   ┆ f64   ┆ f64   ┆ f64   │
╞═════════════════════════╪═══════╪═══════╪═══════╪═══╪═══════╪═══════╪═══════╪═══════╡
│ 2021-04-01T00:00:00.200 ┆ 200.0 ┆ 222.1 ┆ 224.4 ┆ … ┆ 219.4 ┆ 221.0 ┆ 218.4 ┆ 221.9 │
│ 2021-04-01T00:00:00.300 ┆ 300.0 ┆ 222.0 ┆ 224.4 ┆ … ┆ 219.4 ┆ 220.9 ┆ 218.4 ┆ 221.9 │
│ 2021-04-01T00:00:00.400 ┆ 400.0 ┆ 222.0 ┆ 224.5 ┆ … ┆ 219.5 ┆ 220.9 ┆ 218.5 ┆ 221.9 │
│ 2021-04-01T00:00:00.500 ┆ 500.0 ┆ 221.9 ┆ 224.3 ┆ … ┆ 219.4 ┆ 220.8 ┆ 218.4 ┆ 221.8 │
│ 2021-04-01T00:00:00.600 ┆ 600.0 ┆ 221.9 ┆ 224.2 ┆ … ┆ 219.4 ┆ 220.8 ┆ 2

(Deprecated in version 0.20.5)
  total = df_sorted.select(pl.count()).collect().item()



✅ Split 완료!
Train: 3,447,936 rows (70.0%)
Val: 738,844 rows (15.0%)
Test: 738,844 rows (15.0%)
