0. **Define** a schema for the data.
1. **Extract** all data from CSV files.
2. **Transform** data into inputs/targets.
3. **Split** inputs/targets into train/test sets.
4. **Sample** inputs/targets from the training sets to build small datasets for the tests.
5. **Load** the inputs/targets into parquet files.

# IMPORTS

In [1]:
import polars as pl
from sklearn.model_selection import train_test_split

# CONFIGS

## Schema

In [2]:
# Column name -> polars dtype for the hourly CSV.
# Applied with `DataFrame.cast` right after the file is read, so every column
# listed here is converted from its inferred dtype to the one given below.
# NOTE(review): `dteday` is cast to Datetime, yet every displayed row holds a
# midnight timestamp and the hour is stored separately in `hr` -- confirm
# whether `pl.Date` would be a better fit before changing it.
SCHEMA = {
    "instant": pl.UInt32,
    "dteday": pl.Datetime,
    "season": pl.UInt8,
    "yr": pl.UInt8,
    "mnth": pl.UInt8,
    "hr": pl.UInt8,
    "holiday": pl.Boolean,
    "weekday": pl.UInt8,
    "workingday": pl.Boolean,
    "weathersit": pl.UInt8,
    "temp": pl.Float32,
    "atemp": pl.Float32,
    "hum": pl.Float32,
    "windspeed": pl.Float32,
    "casual": pl.UInt32,
    "registered": pl.UInt32,
    "cnt": pl.UInt32,
}

## Extract

In [3]:
# Relative path of the source CSV read by the extract step.
HOUR_FILE = "../data/hour.csv"

## Transform

In [4]:
# NOTE(review): INDEX_COL is not referenced by any cell visible in this
# notebook -- confirm whether it is still needed.
INDEX_COL = "instant"
# Column removed from the inputs frame and kept alone as the targets frame.
TARGET_COL = "cnt"

## Split

In [5]:
# Forwarded to `train_test_split(shuffle=...)`; kept off so rows are split in
# their original order.
SHUFFLE = False  # time-sensitive
# Forwarded to `train_test_split(test_size=...)`: share of rows held out for
# the test set.
TEST_SIZE = 0.2

## Sample

In [6]:
# Number of trailing rows taken from the training frames to build the samples.
SAMPLE_SIZE = 2000

## Load

In [7]:
# Destinations of the train/test splits written by the load step.
INPUTS_TRAIN_FILE = "../data/inputs_train.parquet"
INPUTS_TEST_FILE = "../data/inputs_test.parquet"
TARGETS_TRAIN_FILE = "../data/targets_train.parquet"
TARGETS_TEST_FILE = "../data/targets_test.parquet"
# Destinations of the training samples; note they live under `tests/`.
INPUTS_SAMPLE_FILE = "../tests/data/inputs_sample.parquet"
TARGETS_SAMPLE_FILE = "../tests/data/targets_sample.parquet"

# EXTRACT

In [8]:
# Read the CSV with automatic date parsing, then coerce every column to the
# dtype declared in SCHEMA.
hour = pl.read_csv(HOUR_FILE, try_parse_dates=True)
hour = hour.cast(SCHEMA)
hour

instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
u32,datetime[μs],u8,u8,u8,u8,bool,u8,bool,u8,f32,f32,f32,f32,u32,u32,u32
1,2011-01-01 00:00:00,1,0,1,0,false,6,false,1,0.24,0.2879,0.81,0.0,3,13,16
2,2011-01-01 00:00:00,1,0,1,1,false,6,false,1,0.22,0.2727,0.8,0.0,8,32,40
3,2011-01-01 00:00:00,1,0,1,2,false,6,false,1,0.22,0.2727,0.8,0.0,5,27,32
4,2011-01-01 00:00:00,1,0,1,3,false,6,false,1,0.24,0.2879,0.75,0.0,3,10,13
5,2011-01-01 00:00:00,1,0,1,4,false,6,false,1,0.24,0.2879,0.75,0.0,0,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
17375,2012-12-31 00:00:00,1,1,12,19,false,1,true,2,0.26,0.2576,0.6,0.1642,11,108,119
17376,2012-12-31 00:00:00,1,1,12,20,false,1,true,2,0.26,0.2576,0.6,0.1642,8,81,89
17377,2012-12-31 00:00:00,1,1,12,21,false,1,true,1,0.26,0.2576,0.6,0.1642,7,83,90
17378,2012-12-31 00:00:00,1,1,12,22,false,1,true,1,0.26,0.2727,0.56,0.1343,13,48,61


# TRANSFORM

In [9]:
# Inputs are every column of `hour` except the target column.
# NOTE(review): `casual` and `registered` stay in the inputs, and in every row
# displayed in this notebook they add up to `cnt` -- looks like target
# leakage; confirm with downstream consumers before dropping them, since the
# written parquet schema would change.
inputs = hour.drop(TARGET_COL)
inputs

instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
u32,datetime[μs],u8,u8,u8,u8,bool,u8,bool,u8,f32,f32,f32,f32,u32,u32
1,2011-01-01 00:00:00,1,0,1,0,false,6,false,1,0.24,0.2879,0.81,0.0,3,13
2,2011-01-01 00:00:00,1,0,1,1,false,6,false,1,0.22,0.2727,0.8,0.0,8,32
3,2011-01-01 00:00:00,1,0,1,2,false,6,false,1,0.22,0.2727,0.8,0.0,5,27
4,2011-01-01 00:00:00,1,0,1,3,false,6,false,1,0.24,0.2879,0.75,0.0,3,10
5,2011-01-01 00:00:00,1,0,1,4,false,6,false,1,0.24,0.2879,0.75,0.0,0,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
17375,2012-12-31 00:00:00,1,1,12,19,false,1,true,2,0.26,0.2576,0.6,0.1642,11,108
17376,2012-12-31 00:00:00,1,1,12,20,false,1,true,2,0.26,0.2576,0.6,0.1642,8,81
17377,2012-12-31 00:00:00,1,1,12,21,false,1,true,1,0.26,0.2576,0.6,0.1642,7,83
17378,2012-12-31 00:00:00,1,1,12,22,false,1,true,1,0.26,0.2727,0.56,0.1343,13,48


In [10]:
# Targets are a single-column frame holding only the target column.
target_expr = pl.col(TARGET_COL)
targets = hour.select(target_expr)
targets

cnt
u32
16
40
32
13
1
…
119
89
90
61


# SPLIT

In [11]:
# Split inputs and targets together so their rows stay aligned; the result is
# ordered (inputs_train, inputs_test, targets_train, targets_test).
split_frames = train_test_split(
    inputs,
    targets,
    test_size=TEST_SIZE,
    shuffle=SHUFFLE,
)
inputs_train, inputs_test, targets_train, targets_test = split_frames
# Show the shape of each split as a quick sanity check.
tuple(frame.shape for frame in split_frames)

((13903, 16), (3476, 16), (13903, 1), (3476, 1))

# SAMPLE

In [12]:
# Keep the last SAMPLE_SIZE rows of both training frames so the sampled inputs
# and targets still line up row by row.
inputs_train_sample, targets_train_sample = (
    frame.tail(SAMPLE_SIZE) for frame in (inputs_train, targets_train)
)
inputs_train_sample.shape, targets_train_sample.shape

((2000, 16), (2000, 1))

# LOAD

In [13]:
# Write every dataset to its parquet destination, in the same order as before:
# train/test splits first, then the training samples.
parquet_outputs = (
    (inputs_train, INPUTS_TRAIN_FILE),
    (inputs_test, INPUTS_TEST_FILE),
    (targets_train, TARGETS_TRAIN_FILE),
    (targets_test, TARGETS_TEST_FILE),
    (inputs_train_sample, INPUTS_SAMPLE_FILE),
    (targets_train_sample, TARGETS_SAMPLE_FILE),
)
for frame, parquet_path in parquet_outputs:
    frame.write_parquet(parquet_path)