1. **Extract** all data from CSV files.
2. **Transform** data into inputs/targets.
3. **Split** inputs/targets into train/test sets.
4. **Sample** a subset of the training inputs/targets for the test data files (`../tests/data`).
5. **Load** the resulting inputs/targets into parquet files.

# IMPORTS

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# CONFIGS

## Extract

In [2]:
# Path of the CSV file read by the EXTRACT step (relative to this notebook).
HOUR_FILE: str = "../data/hour.csv"

## Transform

In [3]:
# Column of the CSV that becomes the DataFrame index when the file is read.
INDEX_COL: str = "instant"
# Column removed from the inputs and kept on its own as the targets.
TARGET_COL: str = "cnt"

## Split

In [4]:
# Fraction of the rows passed to `train_test_split` as the test share.
TEST_SIZE: float = 0.2
# The data is time-sensitive, so rows must not be shuffled before splitting.
SHUFFLE: bool = False

## Sample

In [5]:
# Seed handed to `DataFrame.sample` so the same rows are drawn on every run.
SAMPLE_RANDOM_STATE: int = 0
# Fraction of the training rows drawn for the sample.
SAMPLE_RATIO: float = 0.15

## Load

In [6]:
# Training split.
INPUTS_TRAIN_FILE: str = "../data/inputs_train.parquet"
TARGETS_TRAIN_FILE: str = "../data/targets_train.parquet"
# Test split.
INPUTS_TEST_FILE: str = "../data/inputs_test.parquet"
TARGETS_TEST_FILE: str = "../data/targets_test.parquet"
# Sample of the training split, stored under the tests directory.
INPUTS_SAMPLE_FILE: str = "../tests/data/inputs_sample.parquet"
TARGETS_SAMPLE_FILE: str = "../tests/data/targets_sample.parquet"

# EXTRACT

In [7]:
# Read the CSV and use the INDEX_COL column as the DataFrame index.
hour = pd.read_csv(filepath_or_buffer=HOUR_FILE, index_col=INDEX_COL)
# Report (rows, columns) so a change in the source file is noticed right away.
print("Hour:", hour.shape)
# Trailing expression: the notebook renders the first rows as the cell output.
hour.head()

Hour: (17379, 16)


Unnamed: 0_level_0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


# TRANSFORM

In [8]:
# Inputs are every column of `hour` except the target column.
# NOTE(review): `casual` and `registered` stay in the inputs; in the rows
# previewed by the EXTRACT cell they add up to `cnt` exactly, so they may
# leak the target -- confirm before training a model on them.
inputs = hour.drop(columns=TARGET_COL)
print("Inputs:", inputs.shape)
# Trailing expression: rendered by the notebook as the cell output.
inputs.head()

Inputs: (17379, 15)


Unnamed: 0_level_0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13
2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32
3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27
4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10
5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1


In [9]:
# Selecting with a one-element list keeps the target as a single-column
# DataFrame (not a Series), indexed like `hour`.
targets = hour[[TARGET_COL]]
print("Targets:", targets.shape)
# Trailing expression: rendered by the notebook as the cell output.
targets.head()

Targets: (17379, 1)


Unnamed: 0_level_0,cnt
instant,Unnamed: 1_level_1
1,16
2,40
3,32
4,13
5,1


# SPLIT

In [10]:
# Split inputs and targets in one call so both are cut at the same row.
# With shuffling disabled (see SHUFFLE) the original row order is preserved
# in both the train and the test parts.
inputs_train, inputs_test, targets_train, targets_test = train_test_split(
    inputs,
    targets,
    shuffle=SHUFFLE,
    test_size=TEST_SIZE,
)
# Trailing expression: the notebook shows the four shapes as the cell output.
(
    inputs_train.shape,
    inputs_test.shape,
    targets_train.shape,
    targets_test.shape,
)

((13903, 15), (3476, 15), (13903, 1), (3476, 1))

# SAMPLE

In [11]:
# Draw the sample rows once, from the training inputs.
inputs_train_sample = inputs_train.sample(
    frac=SAMPLE_RATIO, random_state=SAMPLE_RANDOM_STATE
)
# Select the targets by the index labels of the sampled inputs, so the two
# frames are row-aligned by construction. Sampling the targets with a second
# `.sample()` call only stays aligned while both frames have the same length
# and receive the same seed.
# NOTE(review): assumes the index labels of the training frames are unique;
# with duplicated labels `.loc` would return extra rows -- TODO confirm.
targets_train_sample = targets_train.loc[inputs_train_sample.index]
# Trailing expression: the notebook shows both shapes as the cell output.
inputs_train_sample.shape, targets_train_sample.shape

((2085, 15), (2085, 1))

# LOAD

In [12]:
# Write every frame to its configured parquet file, in the same order as the
# configuration block: train/test splits first, then the training samples.
for frame, destination in (
    (inputs_train, INPUTS_TRAIN_FILE),
    (inputs_test, INPUTS_TEST_FILE),
    (targets_train, TARGETS_TRAIN_FILE),
    (targets_test, TARGETS_TEST_FILE),
    (inputs_train_sample, INPUTS_SAMPLE_FILE),
    (targets_train_sample, TARGETS_SAMPLE_FILE),
):
    frame.to_parquet(destination)