In [None]:
!pip install duckdb --pre && pip install boto3

In [None]:
import os
import time

import boto3
import duckdb
import numpy as np
import pandas as pd
from google.cloud import aiplatform

# Display the installed DuckDB version (the install cell requests a pre-release build).
duckdb.__version__

In [None]:
# Display the installed pandas version.
pd.__version__

In [None]:
# Show the project currently configured in gcloud.
!gcloud config get project

In [None]:
PROJECT_ID = ""

if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Region used for the Vertex AI SDK and, further down, for the BigQuery dataset location.
REGION = "us-west1"  # @param {type:"string"}
# Identifier of the featurestore used by this benchmark.
FEATURESTORE_ID = "taxidata_fs"

# Set the default project and location for all subsequent aiplatform calls.
aiplatform.init(project=PROJECT_ID, location=REGION)

In [None]:
# Show host memory in GiB; use it to choose MAX_MEMORY in the next cell.
!free -g

In [None]:
# DuckDB memory limit: set to roughly 75% of the memory available to Python.
MAX_MEMORY = "35GB"
# Working directory; also used as DuckDB's spill directory.
TMP_DIR = "pit-data-v8"
DUCKDB_FILE = f"{TMP_DIR}/taxi.duckdb"
DATA_FOLDER = f"{TMP_DIR}/taxidata"

# S3 access.
# Credentials come from the standard AWS environment variables so that secrets
# never have to be typed into (and saved with) the notebook. When a variable is
# unset the value falls back to the empty string.
AWS_ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY_ID', '')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '')
AWS_REGION = 'us-east-2'
BUCKET = "hopsworks-bench-datasets"
session = boto3.Session(aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3 = session.resource('s3')


In [None]:
# Create the working directory and the data folder inside it (no error if they exist).
!mkdir -p {TMP_DIR}
!mkdir -p {DATA_FOLDER}

In [None]:
# Open (or create) the on-disk DuckDB database, capped at MAX_MEMORY and
# using TMP_DIR as its temp directory.
duckdb_settings = {'memory_limit': MAX_MEMORY, 'temp_directory': TMP_DIR}
con = duckdb.connect(DUCKDB_FILE, config=duckdb_settings)

# Install both extensions first, then load them.
for extension_statement in (
    "INSTALL httpfs;",
    "INSTALL parquet;",
    "LOAD httpfs;",
    "LOAD parquet;",
):
    con.execute(extension_statement)

# Hand the S3 region and credentials to DuckDB.
con.execute(f"""
    SET s3_region='{AWS_REGION}';
    SET s3_access_key_id='{AWS_ACCESS_KEY}';
    SET s3_secret_access_key='{AWS_SECRET_ACCESS_KEY}';
    """)

In [None]:
# Tune the DuckDB session: run with 16 threads and allow results to be returned
# in any order.
# NOTE(review): with insertion order not preserved, LIMIT/OFFSET pages without an
# ORDER BY may not be stable between runs — confirm this is acceptable.
con.execute("PRAGMA threads=16")
con.execute("SET preserve_insertion_order=false")


In [None]:
def read_feature_data(limit, offset):
    """Load one page of the cleaned taxi data into a pandas DataFrame.

    (Re)creates the ``taxidata`` view over the 2011-2016 parquet files in
    ``BUCKET`` and reads ``limit`` rows starting at ``offset`` from it.

    Args:
        limit: Number of rows to read; interpolated into the SQL ``LIMIT``.
        offset: Number of rows to skip; interpolated into the SQL ``OFFSET``.

    Returns:
        DataFrame whose first column ``row_id`` holds the row position within
        this page (starting at 0 regardless of ``offset``) as a pandas
        ``string``, followed by the columns of the view.
    """
    query = f'''
        CREATE 
        OR REPLACE VIEW taxidata 
        AS
        SELECT 
            *
        FROM 
            read_parquet([
                's3://{BUCKET}/taxidata_cleaned/2011.parquet',
                's3://{BUCKET}/taxidata_cleaned/2012.parquet',
                's3://{BUCKET}/taxidata_cleaned/2013.parquet',
                's3://{BUCKET}/taxidata_cleaned/2014.parquet',
                's3://{BUCKET}/taxidata_cleaned/2015.parquet',
                's3://{BUCKET}/taxidata_cleaned/2016.parquet'
            ])
    '''
    con.execute(query)

    page_sql = f"SELECT * FROM taxidata LIMIT {limit} OFFSET {offset}"
    page = con.execute(page_sql).df()

    # Positional row id: written as a column, then moved to the front.
    page['row_id'] = range(len(page))
    page.insert(0, 'row_id', page.pop('row_id'))
    return page.astype({"row_id": "string"})

In [None]:
# Get the featurestore if it already exists, otherwise create it.
try:
    taxidata_feature_store = aiplatform.Featurestore(
        featurestore_name=FEATURESTORE_ID
    )
    print("Featurestore already exists...")
except Exception as lookup_error:
    # `except Exception` instead of a bare `except:` so that KeyboardInterrupt /
    # SystemExit are not swallowed; the lookup error is shown so that failures
    # other than "not found" (e.g. permissions) are visible.
    print(f"Featurestore not found ({lookup_error}), creating it instead...")
    taxidata_feature_store = aiplatform.Featurestore.create(
        featurestore_id=FEATURESTORE_ID,
        online_store_fixed_node_count=0
    )

taxidata_feature_store

In [None]:
def _get_or_create_entity_type(featurestore, entity_type_id, description, label):
    """Return the entity type ``entity_type_id``, creating it when the lookup fails.

    Args:
        featurestore: Featurestore object to look the entity type up in.
        entity_type_id: Id of the entity type to fetch or create.
        description: Description used only when the entity type is created.
        label: Short name ("pickup"/"dropoff") used in the progress messages.

    Returns:
        The existing or newly created entity type.
    """
    try:
        entity_type = featurestore.get_entity_type(entity_type_id=entity_type_id)
    except Exception as lookup_error:
        # Not a bare `except:` so Ctrl-C still interrupts; show the cause so that
        # failures other than "not found" are not hidden.
        print(f"{label} feature entity type not found ({lookup_error}), creating it instead...")
        return featurestore.create_entity_type(
            entity_type_id=entity_type_id,
            description=description,
        )
    # Report "already exists" only after the lookup has actually succeeded.
    print(f"{label} feature entity already exists...")
    return entity_type


# Create pickup features entity type
pickup_fg_entity_type = _get_or_create_entity_type(
    taxidata_feature_store,
    entity_type_id="pickup_fg_entity_type",
    description="Pickup features entity type",
    label="pickup",
)

# Create dropoff features entity type
dropoff_fg_entity_type = _get_or_create_entity_type(
    taxidata_feature_store,
    entity_type_id="dropoff_fg_entity_type",
    description="Dropoff features entity type",
    label="dropoff",
)

In [None]:
# Feature definitions for the pickup entity type.
# pu_row_id is not declared as a feature: it is passed as the entity id
# (`entity_id_field`) when the pickup features are ingested.
pickup_fg_config = {
    "pu_location_id": {
        "value_type": "INT64",
        "description": "Pickup location ID",
    },
    "pu_borough": {
        "value_type": "STRING",
        "description": "Pickup borough",
    },
    "mean_fare_window_1h_pickup_zip": {
        "value_type": "DOUBLE",
        "description": "Mean fare pickup window 1 hour",
    },
    "count_trips_window_1h_pickup_zip": {
        "value_type": "INT64",
        "description": "Count trips pickup window 1 hour",
    },
}

# Create only the features that are not defined yet. Checking explicitly (rather
# than swallowing every exception) lets genuine API errors surface instead of
# being reported as "already exists".
existing_pickup_feature_ids = {feature.name for feature in pickup_fg_entity_type.list_features()}
missing_pickup_features = {
    feature_id: feature_config
    for feature_id, feature_config in pickup_fg_config.items()
    if feature_id not in existing_pickup_feature_ids
}

if missing_pickup_features:
    pickup_fg_entity = pickup_fg_entity_type.batch_create_features(
        feature_configs=missing_pickup_features,
        sync = True
    )
    print("Entity feature group definition created")
else:
    pickup_fg_entity = pickup_fg_entity_type
    print("Entity feature group definition already exists")


In [None]:
# Feature definitions for the dropoff entity type.
# do_row_id is not declared as a feature: it is passed as the entity id
# (`entity_id_field`) when the dropoff features are ingested.
dropoff_fg_config = {
    "do_location_id": {
        "value_type": "INT64",
        "description": "Dropoff location ID",
    },
    "do_borough": {
        "value_type": "STRING",
        "description": "Dropoff borough",
    },
    "dropoff_is_weekend": {
        "value_type": "BOOL",
        "description": "Dropoff is a weekend or not",
    },
    "count_trips_window_30m_dropoff_zip": {
        "value_type": "INT64",
        "description": "Count trips dropoff window 30 min",
    },
}

# Create only the features that are not defined yet. Checking explicitly (rather
# than swallowing every exception) lets genuine API errors surface instead of
# being reported as "already exists".
existing_dropoff_feature_ids = {feature.name for feature in dropoff_fg_entity_type.list_features()}
missing_dropoff_features = {
    feature_id: feature_config
    for feature_id, feature_config in dropoff_fg_config.items()
    if feature_id not in existing_dropoff_feature_ids
}

if missing_dropoff_features:
    dropoff_fg_entity = dropoff_fg_entity_type.batch_create_features(
        feature_configs=missing_dropoff_features,
        sync = True
    )
    print("Entity feature group definition created")
else:
    dropoff_fg_entity = dropoff_fg_entity_type
    print("Entity feature group definition already exists")


In [None]:
# Load the first 5,000,000 rows of the taxidata view into memory.
limit = 5000000
offset = 0
raw_data = read_feature_data(limit, offset)

In [None]:
# Display the loaded trips.
raw_data

In [None]:
def filter_df_by_ts(df, ts_column, start_date, end_date):
    """Restrict ``df`` to rows whose ``ts_column`` lies in ``[start_date, end_date)``.

    Args:
        df: DataFrame to filter.
        ts_column: Name of the column to compare; when falsy, nothing is filtered.
        start_date: Inclusive lower bound; skipped when falsy.
        end_date: Exclusive upper bound; skipped when falsy.

    Returns:
        The filtered rows, or ``df`` itself (same object) when no bound applies.
    """
    if not ts_column:
        return df

    selected = df
    if start_date:
        selected = selected.loc[selected[ts_column] >= start_date]
    if end_date:
        selected = selected.loc[selected[ts_column] < end_date]
    return selected

def pickup_features_fn(df, ts_column, start_date, end_date, window_freq='15min'):
    """Aggregate fares and trip counts per pickup location and time window.

    The input frame is never modified: derived columns are added to a new frame
    via ``assign`` instead of being written onto the (possibly un-copied) result
    of ``filter_df_by_ts``.

    Args:
        df: Trips with ``tpep_pickup_datetime``, ``pu_location_id``,
            ``pu_borough`` and ``fare_amount`` columns.
        ts_column: Column used for the date-range filter (see ``filter_df_by_ts``).
        start_date: Inclusive lower bound of the filter, or a falsy value.
        end_date: Exclusive upper bound of the filter, or a falsy value.
        window_freq: Frequency the pickup time is floored to.
            NOTE(review): the default is 15 minutes although the feature names
            say ``window_1h`` — confirm which one is intended.

    Returns:
        DataFrame with columns ``pu_row_id`` (0..n-1), ``pu_location_id``,
        ``pu_borough``, ``ts`` (window start formatted ``%Y-%m-%d %H:%M:%S``),
        ``mean_fare_window_1h_pickup_zip`` and ``count_trips_window_1h_pickup_zip``.
    """
    trips = filter_df_by_ts(df, ts_column, start_date, end_date)
    pickup_ts = pd.to_datetime(trips['tpep_pickup_datetime'])
    trips = trips.assign(
        tpep_pickup_datetime=pickup_ts,
        window=pickup_ts.dt.floor(window_freq).dt.strftime('%Y-%m-%d %H:%M:%S'),
    )

    pickup_features = (
        trips.groupby(['pu_location_id', 'pu_borough', 'window'])
        .agg(
            mean_fare_window_1h_pickup_zip=('fare_amount', 'mean'),
            count_trips_window_1h_pickup_zip=('fare_amount', 'count')
        )
        .reset_index()
        .rename(columns={'window': 'ts'})
    )
    # One sequential id per aggregated row, placed first.
    pickup_features.insert(0, 'pu_row_id', range(len(pickup_features)))

    return pickup_features

def dropoff_features_fn(df, ts_column, start_date, end_date, window_freq='30min'):
    """Count trips per dropoff location and time window and flag weekend windows.

    The input frame is never modified: derived columns are added to a new frame
    via ``assign`` instead of being written onto the (possibly un-copied) result
    of ``filter_df_by_ts``.

    Args:
        df: Trips with ``tpep_dropoff_datetime``, ``do_location_id`` and
            ``do_borough`` columns.
        ts_column: Column used for the date-range filter (see ``filter_df_by_ts``).
        start_date: Inclusive lower bound of the filter, or a falsy value.
        end_date: Exclusive upper bound of the filter, or a falsy value.
        window_freq: Frequency the dropoff time is floored to (30 minutes by default).

    Returns:
        DataFrame with columns ``do_row_id`` (0..n-1), ``do_location_id``,
        ``do_borough``, ``ts`` (window start as datetime),
        ``count_trips_window_30m_dropoff_zip`` and ``dropoff_is_weekend``.
    """
    trips = filter_df_by_ts(df, ts_column, start_date, end_date)
    dropoff_ts = pd.to_datetime(trips['tpep_dropoff_datetime'])
    trips = trips.assign(
        tpep_dropoff_datetime=dropoff_ts,
        window=dropoff_ts.dt.floor(window_freq).dt.strftime('%Y-%m-%d %H:%M:%S'),
    )

    dropoff_features = (
        trips.groupby(['do_location_id', 'do_borough', 'window'])
        .agg(count_trips_window_30m_dropoff_zip=('do_borough', 'count'))
        .reset_index()
        .rename(columns={'window': 'ts'})
    )
    # The window label was grouped as text; turn it back into a datetime so the
    # weekday can be derived (Monday=0, so 5 and 6 are Saturday and Sunday).
    dropoff_features['ts'] = pd.to_datetime(dropoff_features['ts'])
    dropoff_features['dropoff_is_weekend'] = dropoff_features['ts'].dt.dayofweek.isin([5, 6])
    # One sequential id per aggregated row, placed first.
    dropoff_features.insert(0, 'do_row_id', range(len(dropoff_features)))

    return dropoff_features

In [None]:
from datetime import datetime, timedelta

# Aggregate the pickup features over the loaded trips.
pickup_features = pickup_features_fn(
    df=raw_data,
    ts_column="tpep_pickup_datetime",
    start_date=datetime(2011, 1, 1),
    end_date=datetime(2023, 1, 31),
)
# Parse the window labels as UTC. `utc=True` is used because casting a tz-naive
# series straight to a tz-aware dtype with `.astype` raises a TypeError on
# pandas >= 2.0; the trailing aware-to-aware cast only pins the dtype.
pickup_features['ts'] = pd.to_datetime(
    pickup_features['ts'], format='%Y-%m-%d %H:%M:%S', utc=True
).astype('datetime64[ns, UTC]')
pickup_features = pickup_features.astype({"pu_row_id": "string"})

In [None]:
# Display the aggregated pickup features.
pickup_features

In [None]:
# Aggregate the dropoff features over the loaded trips.
dropoff_features = dropoff_features_fn(
    df=raw_data,
    ts_column="tpep_dropoff_datetime",
    start_date=datetime(2011, 1, 1),
    end_date=datetime(2023, 1, 31),
)

# Mark the window timestamps as UTC. `utc=True` is used because casting a
# tz-naive series straight to a tz-aware dtype with `.astype` raises a TypeError
# on pandas >= 2.0; the trailing aware-to-aware cast only pins the dtype.
dropoff_features['ts'] = pd.to_datetime(
    dropoff_features['ts'], format='%Y-%m-%d %H:%M:%S', utc=True
).astype('datetime64[ns, UTC]')
dropoff_features = dropoff_features.astype({"do_row_id": "string"})

In [None]:
# Display the aggregated dropoff features.
dropoff_features

In [None]:
# Collect the names of the features currently defined on each entity type;
# these lists are passed as `feature_ids` when ingesting.
PICKUP_FEAT_IDS = [feature.name for feature in pickup_fg_entity_type.list_features()]
DROPOFF_FEAT_IDS = [feature.name for feature in dropoff_fg_entity_type.list_features()]
PICKUP_FEAT_IDS

In [None]:
# Ingest the pickup aggregates: `ts` is the feature timestamp and `pu_row_id`
# is the entity id.
pickup_fg_entity_type.ingest_from_df(
    feature_ids=PICKUP_FEAT_IDS,
    feature_time="ts",
    entity_id_field="pu_row_id",
    df_source=pickup_features
)

# Ingest the dropoff aggregates: `ts` is the feature timestamp and `do_row_id`
# is the entity id.
dropoff_fg_entity_type.ingest_from_df(
    feature_ids=DROPOFF_FEAT_IDS,
    feature_time="ts",
    entity_id_field="do_row_id",
    df_source=dropoff_features,
)

In [None]:
# Replace the single row_id column by two leading copies, one per entity type,
# both stored as pandas strings.
row_id = raw_data.pop('row_id')
for position, id_column in enumerate(('pu_row_id', 'do_row_id')):
    raw_data.insert(position, id_column, row_id)
raw_data = raw_data.astype({"pu_row_id": "string", "do_row_id": "string"})

In [None]:
# The pickup time becomes the `timestamp` column of the read instances built below.
# Note: this renames in place, so the cell changes raw_data itself.
raw_data.rename(columns = {'tpep_pickup_datetime':'timestamp'}, inplace = True)
raw_data.head(5)

In [None]:
# Features to serve, keyed by entity type id; the ids match the feature
# definitions created earlier in this notebook.
SERVING_FEATURE_IDS = {
    "pickup_fg_entity_type": ["pu_location_id", "pu_borough", "mean_fare_window_1h_pickup_zip", "count_trips_window_1h_pickup_zip"],
    "dropoff_fg_entity_type": ["do_location_id", "do_borough", "count_trips_window_30m_dropoff_zip", "dropoff_is_weekend"],
}

In [None]:
# Build the read-instances frame: one column per entity type holding the entity
# id, the pass-through columns, and the lookup `timestamp` as the last column.
read_inst_df = (
    raw_data
    .drop(columns=[
        'tpep_dropoff_datetime',
        'pu_location_id',
        'do_location_id',
        'pu_borough',
        'do_borough',
        'pu_svc_zone',
        'do_svc_zone',
        'pu_zone',
        'do_zone',
    ])
    .rename(columns={'pu_row_id':'pickup_fg_entity_type', 'do_row_id':'dropoff_fg_entity_type'})
)

# Pull the timestamp out so that it is re-added as the last column.
ts = read_inst_df.pop('timestamp')
print("before: ", ts.dtype)
# Interpret the timestamps as UTC. `utc=True` is used because casting a tz-naive
# series straight to a tz-aware dtype with `.astype` raises a TypeError on
# pandas >= 2.0; the trailing aware-to-aware cast only pins the dtype.
read_inst_df['timestamp'] = pd.to_datetime(
    ts, format='%Y-%m-%dT%H:%M:%SZ', utc=True
).astype('datetime64[ns, UTC]')
print("after:  ", read_inst_df['timestamp'].dtype)
read_inst_df

In [None]:
# Time the batch serving of features for every read instance into a DataFrame;
# trip_distance, fare_amount and tip_amount are passed through from the read instances.
start = time.time()
training_df = taxidata_feature_store.batch_serve_to_df(
    serving_feature_ids=SERVING_FEATURE_IDS,
    read_instances_df=read_inst_df,
    pass_through_fields=["trip_distance", "fare_amount", "tip_amount"],
)
print(f"Time taken for in-memory PIT Join training data: {time.time() - start}")

training_df

In [None]:
# Non-null value count per column of the served training data.
training_df.count()

In [None]:
# Write the read instances to a local CSV file (with header, without the index)
# so that they can be uploaded and used for the BigQuery batch serving below.

READ_INSTANCES_CSV = 'read-instances-5m.csv'

# Expected source columns, kept for reference:
# cols = ['pu_row_id','do_row_id','trip_distance','fare_amount','tip_amount','timestamp']
read_inst_df.to_csv(READ_INSTANCES_CSV, header=True, index=False)

In [None]:
! gsutil cp read-instances-5m.csv 'gs://ayush-bench/'

In [None]:
# Check first 5 lines of CSV
import csv

# newline='' is what the csv module expects for files passed to csv.reader;
# the `with` block closes the file, so no explicit close() is needed.
with open(READ_INSTANCES_CSV, newline='') as file:
    for row_number, row in enumerate(csv.reader(file, delimiter=',')):
        if row_number >= 5:
            break
        print(row)

In [None]:
from google.cloud import bigquery

# Location of the uploaded read-instances file.
READ_INSTANCES_CSV_URI = f'gs://ayush-bench/{READ_INSTANCES_CSV}'

# Output dataset
DESTINATION_DATA_SET = "taxidata"  # @param {type:"string"}
VERSION = "v1"
DESTINATION_DATA_SET = "{prefix}_{version}".format(
    prefix=DESTINATION_DATA_SET, version=VERSION
)

# Output table. Make sure that the table does NOT already exist; the BatchReadFeatureValues API cannot overwrite an existing table
DESTINATION_TABLE_NAME = "taxidata_pit_join_training_data"  # @param {type:"string"}

DESTINATION_PATTERN = "bq://{project}.{dataset}.{table}"
DESTINATION_TABLE_URI = DESTINATION_PATTERN.format(
    project=PROJECT_ID, dataset=DESTINATION_DATA_SET, table=DESTINATION_TABLE_NAME
)


# Create the dataset in the same region as the featurestore.
# exists_ok=True keeps the cell re-runnable: without it, create_dataset raises
# when the dataset already exists.
client = bigquery.Client(project=PROJECT_ID)
dataset_id = "{}.{}".format(client.project, DESTINATION_DATA_SET)
dataset = bigquery.Dataset(dataset_id)
dataset.location = REGION
dataset = client.create_dataset(dataset, exists_ok=True)
print("Dataset {}.{} is ready".format(client.project, dataset.dataset_id))


# Time the batch serving that writes its result to the BigQuery table.
# NOTE(review): the value returned here is presumably not a DataFrame, although
# it reuses the name training_df — confirm before using it downstream.
start = time.time()
training_df = taxidata_feature_store.batch_serve_to_bq(
    bq_destination_output_uri=DESTINATION_TABLE_URI,
    serving_feature_ids=SERVING_FEATURE_IDS,
    read_instances_uri=READ_INSTANCES_CSV_URI,
)
print(f"Time taken for BigQuery write PIT Join training data: {time.time() - start}")

training_df