In [2]:
import argparse

from pathlib import Path
import os
import numpy as np
import pandas as pd

import mlflow

In [20]:
TARGET_COL = "I_f"

NUMERIC_COLS = [
    "I_y",
    "PF",
    "e_PF",
    "d_if",
]

CAT_NOM_COLS = [
]

CAT_ORD_COLS = [
]

In [21]:
# Define Arguments for this step

class MyArgs:
    def __init__(self, /, **kwargs):
        self.__dict__.update(kwargs)

args = MyArgs(
            raw_data = "../../data/", 
            train_data = "/tmp/prep/train",
            val_data = "/tmp/prep/val",
            test_data = "/tmp/prep/test",
            )

os.makedirs(args.train_data, exist_ok = True)
os.makedirs(args.val_data, exist_ok = True)
os.makedirs(args.test_data, exist_ok = True)


In [28]:

def main(args):
    '''Read, split, and save datasets'''

    # ------------ Reading Data ------------ #
    # -------------------------------------- #

    print("mounted_path files: ")
    arr = os.listdir(args.raw_data)
    print(arr)

    data = pd.read_csv((Path(args.raw_data) / 'data.csv'))
    data = data[NUMERIC_COLS + CAT_NOM_COLS + CAT_ORD_COLS + [TARGET_COL]]

    # ------------- Split Data ------------- #
    # -------------------------------------- #

    # Split data into train, val and test datasets

    random_data = np.random.rand(len(data))

    msk_train = random_data < 0.7
    msk_val = (random_data >= 0.7) & (random_data < 0.85)
    msk_test = random_data >= 0.85

    train = data[msk_train]
    val = data[msk_val]
    test = data[msk_test]

    mlflow.log_metric('train size', train.shape[0])
    mlflow.log_metric('val size', val.shape[0])
    mlflow.log_metric('test size', test.shape[0])

    train.to_parquet((Path(args.train_data) / "train.parquet"))
    val.to_parquet((Path(args.val_data) / "val.parquet"))
    test.to_parquet((Path(args.test_data) / "test.parquet"))


In [29]:
mlflow.start_run()

lines = [
    f"Raw data path: {args.raw_data}",
    f"Train dataset output path: {args.train_data}",
    f"Val dataset output path: {args.val_data}",
    f"Test dataset path: {args.test_data}",

]

for line in lines:
    print(line)

main(args)

mlflow.end_run()

Raw data path: ../../data/
Train dataset output path: /tmp/prep/train
Val dataset output path: /tmp/prep/val
Test dataset path: /tmp/prep/test
mounted_path files: 
['data.csv', 'taxi-batch.csv', 'taxi-data.csv', 'taxi-request.json']
       I_y    PF  e_PF  d_if   I_f
5     5.51  0.95  0.22  0.42  1.60
24    5.53  0.76  0.18  0.46  1.42
30    4.54  0.71  0.32  0.15  1.40
56    4.84  0.65  0.20  0.50  1.85
61    4.02  0.66  0.25  0.48  1.54
...    ...   ...   ...   ...   ...
7342  5.75  0.66  0.35  0.70  1.36
7345  4.66  0.89  0.26  0.71  1.57
7346  3.11  0.97  0.30  0.73  1.78
7349  4.55  0.66  0.11  0.46  1.24
7360  4.66  0.83  0.05  0.42  1.83

[1099 rows x 5 columns]


In [10]:
ls "/tmp/prep/train" 

 Volume in drive D has no label.
 Volume Serial Number is 9E07-C812

 Directory of d:\tmp\prep\train

29/12/2024  15:19    <DIR>          .
29/12/2024  15:17    <DIR>          ..
29/12/2024  15:19            58,345 train.parquet
               1 File(s)         58,345 bytes
               2 Dir(s)  667,786,452,992 bytes free
