In [64]:
import pandas as pd
from tqdm import tqdm

In [3]:
# Load the transformed time-series table (the file name points at January 2023)
# and show its (rows, columns) shape as the cell output.
ts = pd.read_parquet("../data/transformed/ts_data_2023_01.parquet")
ts.shape

(191208, 3)

In [4]:
# Keep only the rows of pickup location 43 (Central Park) and renumber them
# from 0 so that positional slicing lines up with the index.
is_central_park = ts["location_id"] == 43
cp_data = ts.loc[is_central_park].reset_index(drop=True)
cp_data.head()

Unnamed: 0,pickup_hour,location_id,nr_rides
0,2023-01-01 00:00:00,43,93
1,2023-01-01 01:00:00,43,81
2,2023-01-01 02:00:00,43,30
3,2023-01-01 03:00:00,43,15
4,2023-01-01 04:00:00,43,4


In [13]:
def get_cutoff_indices(data, history = 12, step = 1)->list:
    """Compute positional sliding-window boundaries over ``data``.

    Every window is described by a ``(from_pos, to_pos, response)`` tuple:
    ``data[from_pos:to_pos]`` covers the ``history`` rows used as features and
    ``data[to_pos:response]`` is the single row right after them (the target).

    Args:
        data: Any sized container; only ``len(data)`` is used.
        history: Number of consecutive rows in one feature window.
        step: Number of rows the window moves forward between two tuples.

    Returns:
        List of ``(from_pos, to_pos, response)`` tuples in ascending order.
        The list is empty when ``data`` has ``history`` rows or fewer, because
        no window would have a target row after it.

    Raises:
        ValueError: If ``step`` is smaller than 1 or ``history`` is negative.
    """
    if step < 1:
        # A non-positive step never moves the window forward, so the scan
        # over ``data`` would never finish.
        raise ValueError(f"step must be a positive integer, got {step!r}")
    if history < 0:
        raise ValueError(f"history must be non-negative, got {history!r}")

    # The target row lives at position ``from_pos + history`` and has to
    # exist, hence ``from_pos < len(data) - history``.
    first_invalid_start = len(data) - history
    return [
        (from_pos, from_pos + history, from_pos + history + 1)
        for from_pos in range(0, first_invalid_start, step)
    ]

In [53]:
# Build a supervised-learning table for Central Park: every row holds 24
# consecutive ride counts plus the hour and ride count of the row that follows.
idx = get_cutoff_indices(data=cp_data, history=24, step=1)

all_features, hours, all_responses = [], [], []
for window_start, window_end, target_end in idx:
    # One-row slice located right after the feature window.
    target_rows = slice(window_end, target_end)
    all_features.append(cp_data.nr_rides.iloc[window_start:window_end].values)
    hours.append(cp_data.pickup_hour.iloc[target_rows].values[0])
    all_responses.append(cp_data.nr_rides.iloc[target_rows].values[0])

ml_data = pd.DataFrame(all_features)
# Column 0 is the oldest value of a window, so the lag labels count down to 1.
new_column_names = [f"rides_{i+1}_h_ago" for i in list(ml_data.columns)[::-1]]
ml_data.columns = new_column_names

ml_data["pickup_hour"] = hours
ml_data["response"] = all_responses
ml_data



Unnamed: 0,rides_24_h_ago,rides_23_h_ago,rides_22_h_ago,rides_21_h_ago,rides_20_h_ago,rides_19_h_ago,rides_18_h_ago,rides_17_h_ago,rides_16_h_ago,rides_15_h_ago,...,rides_8_h_ago,rides_7_h_ago,rides_6_h_ago,rides_5_h_ago,rides_4_h_ago,rides_3_h_ago,rides_2_h_ago,rides_1_h_ago,pickup_hour,response
0,93,81,30,15,4,4,4,12,12,23,...,104,65,39,35,32,41,18,13,2023-01-02 00:00:00,2
1,81,30,15,4,4,4,12,12,23,37,...,65,39,35,32,41,18,13,2,2023-01-02 01:00:00,0
2,30,15,4,4,4,12,12,23,37,41,...,39,35,32,41,18,13,2,0,2023-01-02 02:00:00,2
3,15,4,4,4,12,12,23,37,41,103,...,35,32,41,18,13,2,0,2,2023-01-02 03:00:00,2
4,4,4,4,12,12,23,37,41,103,97,...,32,41,18,13,2,0,2,2,2023-01-02 04:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,74,50,33,16,17,3,1,0,1,1,...,106,107,109,96,107,156,108,88,2023-01-31 19:00:00,81
716,50,33,16,17,3,1,0,1,1,2,...,107,109,96,107,156,108,88,81,2023-01-31 20:00:00,49
717,33,16,17,3,1,0,1,1,2,16,...,109,96,107,156,108,88,81,49,2023-01-31 21:00:00,44
718,16,17,3,1,0,1,1,2,16,69,...,96,107,156,108,88,81,49,44,2023-01-31 22:00:00,35


In [65]:
### Transform time series data into ml data for all locations

def transform_ts_data_into_features_targets(data:pd.DataFrame, historical_feat:int=12, step_size:int=1)->pd.DataFrame:
    """Turn per-location ride counts into a feature/target table.

    For every distinct ``location_id`` the rows of that location are taken in
    the order they appear in ``data`` and cut into sliding windows. The
    ``nr_rides`` values inside a window become the lag features and the row
    right after the window supplies the target.

    Args:
        data: Frame with ``pickup_hour``, ``location_id`` and ``nr_rides``
            columns. Windows are positional, so the rows of each location are
            assumed to already be in chronological order.
        historical_feat: Number of consecutive rows used as features.
        step_size: Number of rows the window moves between two samples.

    Returns:
        Frame with one row per window and a fresh 0..n-1 index. Columns are
        ``rides_<k>_h_ago`` (value ``k`` rows before the target row, oldest
        first), ``pickup_hour`` and ``response`` (``pickup_hour`` / ``nr_rides``
        of the target row) and ``location_id``. An empty frame is returned
        when no location yields a window.
    """
    # Per-location results are collected and concatenated once at the end;
    # re-concatenating a growing frame inside the loop copies all earlier
    # rows on every iteration.
    frames = []

    for loc_id in tqdm(data.location_id.unique()):
        df_by_loc = data[data.location_id==loc_id].reset_index(drop=True)
        idx = get_cutoff_indices(data=df_by_loc, history=historical_feat, step=step_size)
        if not idx:
            # Not enough rows for a single window plus its target row.
            continue

        # Pull the columns out once instead of slicing the Series per window.
        rides = df_by_loc.nr_rides.values
        pickup_hours = df_by_loc.pickup_hour.values

        ml_data = pd.DataFrame([rides[start:stop] for start, stop, _ in idx])
        # Column 0 is the oldest value of a window, so lag labels count down.
        ml_data.columns = [
            f"rides_{lag}_h_ago" for lag in range(ml_data.shape[1], 0, -1)
        ]

        # ``stop`` is the position of the row right after the window.
        ml_data["pickup_hour"] = [pickup_hours[stop] for _, stop, _ in idx]
        ml_data["location_id"] = [loc_id] * len(idx)
        ml_data["response"] = [rides[stop] for _, stop, _ in idx]
        frames.append(ml_data)

    if not frames:
        # ``pd.concat`` refuses an empty list.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [66]:
# Build the feature/target table for every location in `ts`, using 24 past
# values per window and moving the window one row at a time; preview the result.
transformed_df = transform_ts_data_into_features_targets(data=ts, historical_feat=24, step_size=1)
transformed_df.head()

100%|██████████| 257/257 [00:37<00:00,  6.93it/s]


Unnamed: 0,rides_24_h_ago,rides_23_h_ago,rides_22_h_ago,rides_21_h_ago,rides_20_h_ago,rides_19_h_ago,rides_18_h_ago,rides_17_h_ago,rides_16_h_ago,rides_15_h_ago,...,rides_7_h_ago,rides_6_h_ago,rides_5_h_ago,rides_4_h_ago,rides_3_h_ago,rides_2_h_ago,rides_1_h_ago,pickup_hour,location_id,response
0,0,0,0,0,0,1,0,0,1,0,...,3,1,2,4,1,0,0,2023-01-02 00:00:00,1,0
1,0,0,0,0,1,0,0,1,0,0,...,1,2,4,1,0,0,0,2023-01-02 01:00:00,1,0
2,0,0,0,1,0,0,1,0,0,1,...,2,4,1,0,0,0,0,2023-01-02 02:00:00,1,0
3,0,0,1,0,0,1,0,0,1,3,...,4,1,0,0,0,0,0,2023-01-02 03:00:00,1,0
4,0,1,0,0,1,0,0,1,3,4,...,1,0,0,0,0,0,0,2023-01-02 04:00:00,1,1
