# Feature Selection

In [1]:
import datetime
import numpy as np
import pandas as pd
import time
import warnings

from itertools import product
from sklearn.model_selection import train_test_split

warnings.simplefilter('ignore')

### config

In [2]:
DATA_FILE_PATHS = 'D:/rahnema/final project/dataset/'
OUTPUT_PATH = 'D:/rahnema/final project/label/feature.parquet'
START_DATE = '2023-01-01'

### Load Data

In [3]:
def load_data(file_paths, start_date=None):
    df = pd.read_parquet(file_paths)
    df['date'] = df['tpep_pickup_datetime'].dt.date.astype(str)

    if start_date:
        df = df[df['date'] > start_date].reset_index(drop=True)

    return df


rides_df = load_data(DATA_FILE_PATHS, START_DATE)
print(rides_df.shape)
rides_df.head()

(12595923, 20)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,date
0,2,2023-01-02 00:00:37,2023-01-02 00:13:15,1.0,9.29,1.0,N,70,4,1,35.9,1.0,0.5,8.18,0.0,1.0,49.08,2.5,0.0,2023-01-02
1,2,2023-01-02 00:01:53,2023-01-02 00:34:16,1.0,20.4,2.0,N,132,238,1,70.0,0.0,0.5,15.86,6.55,1.0,95.16,0.0,1.25,2023-01-02
2,2,2023-01-02 00:04:59,2023-01-02 00:12:03,5.0,1.68,1.0,N,142,229,1,10.0,1.0,0.5,2.25,0.0,1.0,17.25,2.5,0.0,2023-01-02
3,2,2023-01-02 00:00:28,2023-01-02 00:08:45,1.0,1.74,1.0,N,164,224,1,10.7,1.0,0.5,0.0,0.0,1.0,15.7,2.5,0.0,2023-01-02
4,2,2023-01-02 00:00:08,2023-01-02 00:04:30,6.0,0.63,1.0,N,144,231,1,6.5,1.0,0.5,0.0,0.0,1.0,11.5,2.5,0.0,2023-01-02


### aggregate data and labeling

In [4]:
def labeling(rides_df: pd.DataFrame):
    aggregated_df = rides_df.groupby(['date', 'PULocationID']).size().reset_index(name='count')
    unique_dates = rides_df['date'].unique()
    unique_pu_location_ids = rides_df['PULocationID'].unique()
    all_combinations = list(product(unique_dates, unique_pu_location_ids))
    combinations_df = pd.DataFrame(all_combinations, columns=['date', 'PULocationID'])
    label_df = aggregated_df.merge(combinations_df, how='right', on=['date', 'PULocationID']).fillna(0)
    return label_df


rides_df = labeling(rides_df)
print(rides_df.shape)
rides_df.head()

(31964, 3)


Unnamed: 0,date,PULocationID,count
0,2023-01-02,70,503.0
1,2023-01-02,132,6419.0
2,2023-01-02,142,2028.0
3,2023-01-02,164,1462.0
4,2023-01-02,144,567.0


### adding calender features

In [5]:
def adding_feature(rides_df: pd.DataFrame):   
    #rides_df['count']=rides_df['count'] + 1000
    rides_df['date']=rides_df['date'].astype('datetime64')
    rides_df['PU_day_of_month'] = rides_df['date'].dt.day.astype(np.uint8)
    rides_df['PU_day_of_week'] = rides_df['date'].dt.weekday.astype(np.uint8)        
    rides_df = rides_df.sort_values(['date'])    
    rides_df['last_day_demand'] = rides_df.groupby(['PULocationID'])['count'].shift(1)    
    rides_df['last_week_demand'] = rides_df.groupby(['PULocationID'])['count'].shift(7)
    
    return rides_df

rides_df=adding_feature(rides_df)
print(rides_df.shape)
rides_df.head()

(31964, 7)


Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day_demand,last_week_demand
0,2023-01-02,70,503.0,2,0,,
166,2023-01-02,165,1.0,2,0,,
167,2023-01-02,3,2.0,2,0,,
168,2023-01-02,147,2.0,2,0,,
169,2023-01-02,122,2.0,2,0,,


### checking one week of data as a sample

In [6]:
rides_df[(rides_df['PULocationID'] == 79)].tail(8)

Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day_demand,last_week_demand
29108,2023-04-26,79,2012.0,26,2,1677.0,1851.0
30418,2023-04-27,79,2260.0,27,3,2012.0,2292.0
30942,2023-04-28,79,3145.0,28,4,2260.0,2976.0
31204,2023-04-29,79,4581.0,29,5,3145.0,4641.0
31466,2023-04-30,79,3349.0,30,6,4581.0,3102.0
31728,2023-05-01,79,0.0,1,0,3349.0,1421.0
30156,2023-05-02,79,1.0,2,1,0.0,1677.0
30680,2023-05-03,79,1.0,3,2,1.0,2012.0


In [38]:
rides_df.to_parquet(OUTPUT_PATH)