# Feature Engineering In Advance of HPO

In [1]:
ACCOUNT = "dask-engineering"  # <--- Put your account here

In [2]:
import coiled
import dask.dataframe as dd
from distributed import Client
import optuna
import pandas as pd

### Load data

In [3]:
BOROUGH_MAPPING = {
    "Manhattan": "Superborough 1",
    "Bronx": "Superborough 1",
    "EWR": "Superborough 1",
    "Brooklyn": "Superborough 2",
    "Queens": "Superborough 2",
    "Staten Island": "Superborough 3",
    "Unknown": "Unknown",
    }

In [4]:
def load_data():
    print("loading data")
    to_exclude=["string", "category", "object"]
    ddf= dd.read_parquet("s3://coiled-datasets/prefect-dask/nyc-uber-lyft/processed_data.parquet")
    print(f"size of the total dataset is:  {len(ddf.index)}")
    ddf = ddf.assign(accessible_vehicle = 1)
    print("Make accessible feature")
    ddf.accessible_vehicle = ddf.accessible_vehicle.where(ddf.on_scene_datetime.isnull(),0)  # Only applies if the vehicle is wheelchair accessible
    ddf = ddf.assign(pickup_month = ddf.pickup_datetime.dt.month)
    ddf = ddf.assign(pickup_dow = ddf.pickup_datetime.dt.dayofweek)
    ddf = ddf.assign(pickup_hour = ddf.pickup_datetime.dt.hour)
    
    ddf = ddf.drop(columns=['on_scene_datetime', 'request_datetime',
                            'pickup_datetime', 'dispatching_base_num',
                            'originating_base_num', 'shared_request_flag',
                           'shared_match_flag','dropoff_datetime',
                            'base_passenger_fare', 'bcf', 'sales_tax',
                            'tips', 'driver_pay', 'access_a_ride_flag',
                            'wav_match_flag'
                           ]
                  )

    ddf = ddf.dropna(how="any")
    ddf = ddf.reset_index(drop=True)

    original_rowcount = len(ddf.index)

    # Remove outliers
    # Based on our earlier EDA, we will set the lower bound at zero, which is consistent with our
    # domain knowledge that no trip should have a duration less than zero.  We calculate the upper_bound
    # and filter the IQR
    lower_bound = 0
    Q3 = ddf['trip_time'].quantile(0.75, method="tdigest")
    print(f"Q3 is:  {Q3.compute()}")
    upper_bound = Q3 + (1.5*(Q3 - lower_bound))
    print(f"Upper bound is:  {upper_bound.compute()}")
    
    ddf = ddf.loc[(ddf['trip_time'] >= lower_bound) & (ddf['trip_time'] <= upper_bound)]
    
    ddf = ddf.repartition(partition_size="100MB").persist()
    print(f"Fraction of dataset left after removing outliers:  {len(ddf.index) / original_rowcount}")

    return ddf

In [5]:
def make_cross_borough_cat(df):
    PUSuperborough = [BOROUGH_MAPPING.get(i) for i in df.PUBorough.tolist()]
    DOSuperborough = [BOROUGH_MAPPING.get(i) for i in df.DOBorough.tolist()]
    PUSuperborough_DOSuperborough_Pair = [f"{i}-{j}" for i,j in zip(PUSuperborough, DOSuperborough)]
    return df.assign(PUSuperborough_DOSuperborough = PUSuperborough_DOSuperborough_Pair)

In [10]:
def make_taxi_data(ddf):
    print("Load taxi data")
    taxi_df = pd.read_csv("data/taxi+_zone_lookup.csv", usecols=["LocationID", "Borough"])

    ddf = dd.merge(ddf, taxi_df, left_on="PULocationID", right_on="LocationID", how="inner")
    ddf = ddf.rename(columns={"Borough": "PUBorough"})
    ddf = ddf.drop(columns="LocationID")

    ddf = dd.merge(ddf, taxi_df, left_on="DOLocationID", right_on="LocationID", how="inner")
    ddf = ddf.rename(columns={"Borough": "DOBorough"})
    ddf = ddf.drop(columns="LocationID")  
    
    print("Make superboroughs")
    ddf = ddf.map_partitions(lambda df: make_cross_borough_cat(df))
    ddf['airport_fee'] = ddf['airport_fee'].replace("None", 0)
    ddf['airport_fee'] = ddf['airport_fee'].replace('nan', 0)
    ddf['airport_fee'] = ddf['airport_fee'].astype(float)
    ddf['airport_fee'] = ddf['airport_fee'].fillna(0)

    ddf = ddf.drop(columns="wav_request_flag")
    ddf = ddf.repartition(partition_size="100MB").persist()

    print("Make categoricals")
    categories = ['hvfhs_license_num', 'PULocationID', "DOLocationID",
                  'accessible_vehicle', 'pickup_month', 'pickup_dow', 'pickup_hour', 
                  'PUBorough', 'DOBorough', 'PUSuperborough_DOSuperborough'
                 ]
    ddf[categories] = ddf[categories].astype('category')
    ddf = ddf.categorize(columns=categories)
    ddf = ddf.repartition(partition_size="100MB")
    return ddf

## Test Loading Dataset

In [7]:
cluster = coiled.Cluster(
    worker_vm_types=["m6i.2xlarge"],
    scheduler_vm_types=["m6i.2xlarge"],
    package_sync=True,         # copy local packages,
    shutdown_on_close=True,  
    show_widget=False,
    n_workers=20,
    use_best_zone=True,
    account=ACCOUNT,
    backend_options={"region": "us-east-2", "spot": True, "spot_on_demand_fallback": True},
    scheduler_options={"idle_timeout": "10 minutes"}
    )
client = Client(cluster)

In [12]:
client

0,1
Connection method: Cluster object,Cluster type: coiled.ClusterBeta
Dashboard: http://13.59.146.8:8787,

0,1
Dashboard: http://13.59.146.8:8787,Workers: 20
Total threads: 160,Total memory: 603.65 GiB

0,1
Comm: tls://10.0.14.47:8786,Workers: 20
Dashboard: http://10.0.14.47:8787/status,Total threads: 160
Started: 24 minutes ago,Total memory: 603.65 GiB

0,1
Comm: tls://10.0.13.137:37045,Total threads: 8
Dashboard: http://10.0.13.137:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.13.137:44823,
Local directory: /scratch/dask-worker-space/worker-c32m2wam,Local directory: /scratch/dask-worker-space/worker-c32m2wam

0,1
Comm: tls://10.0.10.157:39421,Total threads: 8
Dashboard: http://10.0.10.157:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.10.157:39145,
Local directory: /scratch/dask-worker-space/worker-axds2wz4,Local directory: /scratch/dask-worker-space/worker-axds2wz4

0,1
Comm: tls://10.0.6.158:44201,Total threads: 8
Dashboard: http://10.0.6.158:8787/status,Memory: 30.19 GiB
Nanny: tls://10.0.6.158:36765,
Local directory: /scratch/dask-worker-space/worker-q899mzvt,Local directory: /scratch/dask-worker-space/worker-q899mzvt

0,1
Comm: tls://10.0.0.97:43977,Total threads: 8
Dashboard: http://10.0.0.97:8787/status,Memory: 30.19 GiB
Nanny: tls://10.0.0.97:46873,
Local directory: /scratch/dask-worker-space/worker-nvhi82pm,Local directory: /scratch/dask-worker-space/worker-nvhi82pm

0,1
Comm: tls://10.0.4.148:43613,Total threads: 8
Dashboard: http://10.0.4.148:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.4.148:45291,
Local directory: /scratch/dask-worker-space/worker-aovnbaj6,Local directory: /scratch/dask-worker-space/worker-aovnbaj6

0,1
Comm: tls://10.0.3.64:39129,Total threads: 8
Dashboard: http://10.0.3.64:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.3.64:41335,
Local directory: /scratch/dask-worker-space/worker-t82lyqhn,Local directory: /scratch/dask-worker-space/worker-t82lyqhn

0,1
Comm: tls://10.0.11.50:36359,Total threads: 8
Dashboard: http://10.0.11.50:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.11.50:34771,
Local directory: /scratch/dask-worker-space/worker-x5w27li7,Local directory: /scratch/dask-worker-space/worker-x5w27li7

0,1
Comm: tls://10.0.11.19:35227,Total threads: 8
Dashboard: http://10.0.11.19:8787/status,Memory: 30.19 GiB
Nanny: tls://10.0.11.19:36671,
Local directory: /scratch/dask-worker-space/worker-ei51nxqe,Local directory: /scratch/dask-worker-space/worker-ei51nxqe

0,1
Comm: tls://10.0.0.79:45817,Total threads: 8
Dashboard: http://10.0.0.79:8787/status,Memory: 30.19 GiB
Nanny: tls://10.0.0.79:42033,
Local directory: /scratch/dask-worker-space/worker-a9w_j2k3,Local directory: /scratch/dask-worker-space/worker-a9w_j2k3

0,1
Comm: tls://10.0.14.240:44555,Total threads: 8
Dashboard: http://10.0.14.240:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.14.240:43035,
Local directory: /scratch/dask-worker-space/worker-t7gb7wxk,Local directory: /scratch/dask-worker-space/worker-t7gb7wxk

0,1
Comm: tls://10.0.0.127:39429,Total threads: 8
Dashboard: http://10.0.0.127:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.0.127:38099,
Local directory: /scratch/dask-worker-space/worker-xouybe8j,Local directory: /scratch/dask-worker-space/worker-xouybe8j

0,1
Comm: tls://10.0.14.125:41259,Total threads: 8
Dashboard: http://10.0.14.125:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.14.125:36195,
Local directory: /scratch/dask-worker-space/worker-_59kqc_w,Local directory: /scratch/dask-worker-space/worker-_59kqc_w

0,1
Comm: tls://10.0.4.130:34587,Total threads: 8
Dashboard: http://10.0.4.130:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.4.130:45763,
Local directory: /scratch/dask-worker-space/worker-5ylvgb_9,Local directory: /scratch/dask-worker-space/worker-5ylvgb_9

0,1
Comm: tls://10.0.1.40:44249,Total threads: 8
Dashboard: http://10.0.1.40:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.1.40:41691,
Local directory: /scratch/dask-worker-space/worker-dcsm7q7h,Local directory: /scratch/dask-worker-space/worker-dcsm7q7h

0,1
Comm: tls://10.0.1.153:36523,Total threads: 8
Dashboard: http://10.0.1.153:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.1.153:45531,
Local directory: /scratch/dask-worker-space/worker-ozmp1vt7,Local directory: /scratch/dask-worker-space/worker-ozmp1vt7

0,1
Comm: tls://10.0.0.152:35567,Total threads: 8
Dashboard: http://10.0.0.152:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.0.152:42309,
Local directory: /scratch/dask-worker-space/worker-ud8ta90r,Local directory: /scratch/dask-worker-space/worker-ud8ta90r

0,1
Comm: tls://10.0.15.187:39117,Total threads: 8
Dashboard: http://10.0.15.187:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.15.187:46735,
Local directory: /scratch/dask-worker-space/worker-nh983n73,Local directory: /scratch/dask-worker-space/worker-nh983n73

0,1
Comm: tls://10.0.1.49:38279,Total threads: 8
Dashboard: http://10.0.1.49:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.1.49:35129,
Local directory: /scratch/dask-worker-space/worker-v9062x75,Local directory: /scratch/dask-worker-space/worker-v9062x75

0,1
Comm: tls://10.0.10.14:43755,Total threads: 8
Dashboard: http://10.0.10.14:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.10.14:43981,
Local directory: /scratch/dask-worker-space/worker-61_dz24w,Local directory: /scratch/dask-worker-space/worker-61_dz24w

0,1
Comm: tls://10.0.15.237:42613,Total threads: 8
Dashboard: http://10.0.15.237:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.15.237:36157,
Local directory: /scratch/dask-worker-space/worker-dtm0yqxl,Local directory: /scratch/dask-worker-space/worker-dtm0yqxl


In [13]:
ddf = load_data()
ddf = make_taxi_data(ddf)

loading data
size of the total dataset is:  1454402254
Make accessible feature
Q3 is:  1459.7605849894153
Upper bound is:  3649.4014624735382
Fraction of dataset left after removing outliers:  0.9841959776756222
Load taxi data
Make superboroughs
Make categoricals


In [14]:
ddf.head()

Unnamed: 0,hvfhs_license_num,PULocationID,DOLocationID,trip_miles,trip_time,tolls,congestion_surcharge,airport_fee,accessible_vehicle,pickup_month,pickup_dow,pickup_hour,PUBorough,DOBorough,PUSuperborough_DOSuperborough
0,HV0005,136,265,24.306,2560,6.21,0.0,0.0,1,2,3,19,Bronx,Unknown,Superborough 1-Unknown
1,HV0005,136,265,6.904,1122,0.0,0.0,0.0,1,7,2,22,Bronx,Unknown,Superborough 1-Unknown
2,HV0003,136,265,6.39,1026,0.0,0.0,0.0,0,7,2,22,Bronx,Unknown,Superborough 1-Unknown
3,HV0005,136,265,17.771999,1988,0.21,0.0,0.0,1,7,2,23,Bronx,Unknown,Superborough 1-Unknown
4,HV0005,136,265,24.875,2212,6.48,0.0,0.0,1,7,2,23,Bronx,Unknown,Superborough 1-Unknown


In [15]:
ddf.columns.tolist()

['hvfhs_license_num',
 'PULocationID',
 'DOLocationID',
 'trip_miles',
 'trip_time',
 'tolls',
 'congestion_surcharge',
 'airport_fee',
 'accessible_vehicle',
 'pickup_month',
 'pickup_dow',
 'pickup_hour',
 'PUBorough',
 'DOBorough',
 'PUSuperborough_DOSuperborough']

In [16]:
ddf.dtypes

hvfhs_license_num                category
PULocationID                     category
DOLocationID                     category
trip_miles                        float32
trip_time                           int32
tolls                             float32
congestion_surcharge              float32
airport_fee                       float64
accessible_vehicle               category
pickup_month                     category
pickup_dow                       category
pickup_hour                      category
PUBorough                        category
DOBorough                        category
PUSuperborough_DOSuperborough    category
dtype: object

In [17]:
ddf.to_parquet("s3://coiled-datasets/prefect-dask/nyc-uber-lyft/feature_table.parquet", overwrite=True)

In [18]:
client.shutdown()

2023-01-12 20:35:14,147 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
