## Notebook configuration

In [1]:
# IMPORTS & CONFIG

# utils
from watermark import watermark
from loguru import logger
import sys
import configparser
from tqdm import tqdm

# data science
import pandas as pd

# config
# Log line layout: timestamp | level name left-aligned in an 8-character field | message.
# The <green>/<level> tags are loguru colour markup.
logger_format = (
    "<green>{time:MMM-D HH:mm:ss.SSS}</green> | "
    "<level>{level: <8}</level> | "
    "<level>{message}</level>"
)
# Give every record empty default "ip" and "user" extra fields
# (the format string above does not reference them).
logger.configure(extra={"ip": "", "user": ""})
# Drop the handlers registered so far, so the stderr sink added below is the
# only one emitting records.
logger.remove()
logger.add(sys.stderr, format=logger_format)
# NOTE(review): `pass` presumably only keeps the value returned by `logger.add`
# from being echoed as the cell's output — confirm before removing.
pass

In [2]:
# FUNCTIONS

def get_cutoff_indices(
    data: pd.DataFrame,
    n_rows: int,
    step_size: int
    ) -> list:
    """
    Slide a fixed-size window over ``data`` and return the cutoff indices of each window.

    Every window is described by a tuple ``(first, mid, last)`` where
    ``mid = first + n_rows`` and ``last = mid + 1``. The first window starts
    at position 0 and each following window is shifted by ``step_size``.
    Windows are produced while ``last <= len(data) - 1``.

    Only ``len(data)`` is used; the contents of ``data`` are never read.

    Args:
        data: Frame whose length bounds the windows.
        n_rows: Distance between ``first`` and ``mid`` in every tuple.
        step_size: Number of positions the window advances between
            consecutive tuples. Must be at least 1.

    Returns:
        List of ``(first, mid, last)`` tuples ordered by ``first``; empty
        when not even one window fits.

    Raises:
        ValueError: If ``step_size`` is smaller than 1. A non-positive step
            would never move the window past the stop position.
    """
    if step_size < 1:
        raise ValueError(f"step_size must be a positive integer, got {step_size!r}")

    stop_position = len(data) - 1

    # `last = first + n_rows + 1` has to satisfy `last <= stop_position`,
    # which is the same as `first < stop_position - n_rows`.
    return [
        (first_idx, first_idx + n_rows, first_idx + n_rows + 1)
        for first_idx in range(0, stop_position - n_rows, step_size)
    ]

In [3]:
# WATERMARK
# Keyword options forwarded to watermark(): author tag, date stamps,
# interpreter details and the packages whose versions should be listed.
wmrk_dict = dict(
    author='Gustavo Morales',
    current_date=True,
    updated=True,
    python=True,
    packages='pandas,numpy,plotly',
)

print(watermark(**wmrk_dict))

Author: Gustavo Morales

Last updated: 2023-08-21

Python implementation: CPython
Python version       : 3.11.4
IPython version      : 8.14.0

pandas: 2.0.3
numpy : 1.25.0
plotly: 5.15.0



In [4]:
# hardcoded notebook parameters
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing file fails here
# with a clear message instead of a KeyError on the lookup below.
if not config.read('config.cfg'):
    raise FileNotFoundError("could not read 'config.cfg' (path is relative to the working directory)")

complete_data_file_path = config['PATHS']['complete_data_file_path']

In this notebook, the idea is to use the time-series (TS) data created in notebook `02` to generate tabular data. After feature engineering, this tabular data will be ready to train supervised models.

## 1. Load the input data

In [5]:
# load the dataset stored at the path read from config.cfg
rides_df = pd.read_parquet(complete_data_file_path)
# print the column dtypes and memory footprint as a quick sanity check
rides_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2295120 entries, 0 to 2295119
Data columns (total 3 columns):
 #   Column              Dtype         
---  ------              -----         
 0   pickup_hour         datetime64[ns]
 1   total_rides         int64         
 2   pickup_location_id  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 52.5 MB


Let's say we want to focus on a single location: Central Park (`pickup_location_id == 43`).

In [6]:
# keep only the rows whose pickup_location_id is 43 and renumber them from 0
rides_cp_df = rides_df[rides_df["pickup_location_id"] == 43].reset_index(drop=True)
rides_cp_df.head(10)

Unnamed: 0,pickup_hour,total_rides,pickup_location_id
0,2022-01-01 00:00:00,97,43
1,2022-01-01 01:00:00,60,43
2,2022-01-01 02:00:00,22,43
3,2022-01-01 03:00:00,8,43
4,2022-01-01 04:00:00,6,43
5,2022-01-01 05:00:00,5,43
6,2022-01-01 06:00:00,3,43
7,2022-01-01 07:00:00,10,43
8,2022-01-01 08:00:00,7,43
9,2022-01-01 09:00:00,19,43


In [7]:
# number of rows kept for the selected location
len(rides_cp_df)

8760

For example, we can slide a window of 24 rows (one day's worth of hourly data) over the series, one row at a time, and get the cutoff indices of each window:

In [8]:
# slide a 24-row window (one day of hourly rows) over the whole series,
# one row at a time, and show the first 5 sets of cutoff indices
n_rows = 24
step_size = 1

indices = get_cutoff_indices(data=rides_cp_df, n_rows=n_rows, step_size=step_size)
indices[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]