In [None]:
# !pip install --pre pandas==2.*

In [None]:
import os
import time

import boto3
import duckdb
import hopsworks
import pandas as pd

# duckdb.__version__

In [None]:
# Display the installed pandas version; the Arrow Flight patch later in this
# notebook is annotated as needing pandas>=2.0.3.
pd.__version__

In [None]:
# --- Local DuckDB scratch space ---------------------------------------------
# Memory cap handed to DuckDB: raise it to roughly the memory available to
# Python minus 25%.
MAX_MEMORY = "35GB"
TMP_DIR = "fg-data-v8"
DUCKDB_FILE = f"{TMP_DIR}/taxi.duckdb"
DATA_FOLDER = f"{TMP_DIR}/taxidata"

# --- S3 Uploads --------------------------------------------------------------
# Secrets come from the environment so they never have to be typed into (and
# accidentally committed with) the notebook. The empty-string fallback keeps
# the previous behaviour when the variables are not set.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
AWS_REGION = 'us-east-2'
BUCKET = "hopsworks-bench-datasets"
# Pass the region explicitly so boto3 targets the same region that DuckDB is
# configured with, instead of whatever the local AWS profile defaults to.
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION,
)
s3 = session.resource('s3')

# --- HDFS Uploads ------------------------------------------------------------
# Hopsworks connection details, also taken from the environment when present.
HOPS_HOST = os.environ.get("HOPSWORKS_HOST", "")
HOPS_API_KEY = os.environ.get("HOPSWORKS_API_KEY", "")
HDFS_PATH = "/Projects/testproj/Resources/"


In [None]:
# Create the scratch directories with the standard library rather than a shell
# escape: this works on any OS, is safe for paths containing spaces, and raises
# immediately if a directory cannot be created.
os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(DATA_FOLDER, exist_ok=True)


In [None]:
# Open (or create) the on-disk DuckDB database, capping its memory use and
# pointing its temp directory at the local scratch folder.
con = duckdb.connect(
    DUCKDB_FILE,
    config={'memory_limit': MAX_MEMORY, 'temp_directory': TMP_DIR},
)

# Install both extensions first, then load them, so parquet files can be read
# straight from S3.
for extension_statement in (
    "INSTALL httpfs;",
    "INSTALL parquet;",
    "LOAD httpfs;",
    "LOAD parquet;",
):
    con.execute(extension_statement)

# Give DuckDB's S3 reader the region and credentials from the config cell.
con.execute(f"""
    SET s3_region='{AWS_REGION}';
    SET s3_access_key_id='{AWS_ACCESS_KEY}';
    SET s3_secret_access_key='{AWS_SECRET_ACCESS_KEY}';
    """)

In [None]:
def read_feature_data(limit, offset):
    """Read one page of pickup features from the cleaned taxi data on S3.

    (Re)creates the DuckDB view ``taxidata`` over the yearly parquet files
    2011-2016 in ``BUCKET``, then fetches ``limit`` rows starting at
    ``offset`` through the module-level connection ``con``.

    Args:
        limit: Maximum number of rows to fetch.
        offset: Number of rows to skip before the page starts.

    Returns:
        A pandas DataFrame with the selected pickup columns, preceded by a
        ``row_id`` column numbered ``offset, offset + 1, ...``.

    NOTE(review): the page query has no ORDER BY, so consistent paging relies
    on DuckDB returning rows in the same order on every call - confirm.
    """
    view_sql = f'''
        CREATE 
        OR REPLACE VIEW taxidata 
        AS
        SELECT 
            tpep_pickup_datetime, 
            pu_location_id, 
            pu_borough,
            pu_svc_zone,
            pu_zone 
        FROM 
            read_parquet([
                's3://{BUCKET}/taxidata_cleaned/2011.parquet',
                's3://{BUCKET}/taxidata_cleaned/2012.parquet',
                's3://{BUCKET}/taxidata_cleaned/2013.parquet',
                's3://{BUCKET}/taxidata_cleaned/2014.parquet',
                's3://{BUCKET}/taxidata_cleaned/2015.parquet',
                's3://{BUCKET}/taxidata_cleaned/2016.parquet'
            ])
    '''
    con.execute(view_sql)

    page_sql = f"SELECT * FROM taxidata LIMIT {limit} OFFSET {offset}"
    page = con.execute(page_sql).df()

    # Number the rows by their global position and make that the first column.
    page.insert(0, 'row_id', range(offset, offset + len(page)))
    return page

In [None]:
# Log in to Hopsworks with the host / API key from the config cell when they
# are filled in. Empty values are passed as None so hopsworks.login() falls
# back to its own defaults (environment, cached key or interactive prompt),
# which is what the bare login() call did before.
project = hopsworks.login(
    host=HOPS_HOST or None,
    api_key_value=HOPS_API_KEY or None,
)
# Handle to the project's feature store, used for all feature-group calls.
fs = project.get_feature_store()

In [None]:
# This code needs pandas>=2.0.3
from hsfs.core import arrow_flight_client


def _get_dataset(descriptor):
    """Fetch the Arrow Flight result for ``descriptor`` as a pandas DataFrame.

    Looks up the flight info for the descriptor, converts it to a ticket,
    reads the whole stream into an Arrow table and converts that table with
    ``types_mapper=pd.ArrowDtype`` so the columns stay Arrow-backed.
    """
    client = arrow_flight_client.get_instance()
    flight_info = client._connection.get_flight_info(descriptor)
    ticket = client._info_to_ticket(flight_info)
    arrow_table = client._connection.do_get(ticket).read_all()
    return arrow_table.to_pandas(types_mapper=pd.ArrowDtype)


# Override the `_get_dataset` attribute on the Arrow Flight client instance so
# reads go through the function above.
arrow_flight_client.get_instance()._get_dataset = _get_dataset

In [None]:
scale_factor = [50]  # Feature-group sizes to ingest, in millions of rows

for sf in scale_factor:
    # Rows pulled from S3 per round trip: larger targets use larger batches.
    if sf < 10:
        limit = 5_000_000  # Get 5M at once
    elif sf < 20:
        limit = 10_000_000  # Get 10M
    elif sf < 50:
        limit = 20_000_000  # Get 20M
    else:
        limit = 50_000_000  # Get 50M
    offset = 0
    total_rows = sf * 1_000_000  # Millions

    # The feature group is the same for every batch of a scale factor, so it
    # is resolved once here instead of on every loop iteration.
    pickup_fg = fs.get_or_create_feature_group(
        name=f"pickup_features_{sf}",
        version=1,
        primary_key=["row_id"],
        event_time="tpep_pickup_datetime",  # the API documents a column name, not a list
        online_enabled=False,
        description="NYC Taxi data pickup features")

    while offset < total_rows:
        # Clamp the last batch so exactly `total_rows` rows are ingested even
        # when the target is not a multiple of the batch size.
        batch_rows = min(limit, total_rows - offset)
        print(f"Total rows: {total_rows}; Offset: {offset}")
        pickup_features = read_feature_data(batch_rows, offset)
        if pickup_features.empty:
            # The source ran out of rows before the target was reached, so
            # stop rather than keep inserting empty frames.
            print(f"No more source rows at offset {offset}; stopping early")
            break
        print(f"Inserting into FG: pickup_features_{sf}")
        pickup_fg.insert(
            pickup_features,
            write_options={
                "wait_for_job": True,
                "hoodie.deltastreamer.kafka.source.maxEvents": limit,
            })
        offset += batch_rows

## Benchmark Reads

In [None]:
# Feature-group sizes (millions of rows) to time. Each `pickup_features_<sf>`
# group must already have been ingested, otherwise the lookup below fails.
scale_factor = [5, 10, 20, 50]

for sf in scale_factor:
    pickup_fg = fs.get_feature_group(
        name=f"pickup_features_{sf}",
        version=1
    )
    # perf_counter() is monotonic and high-resolution, so the measured interval
    # is not distorted by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    df = pickup_fg.select_all().read()
    elapsed = time.perf_counter() - start
    print(f"time for SF {sf}: {elapsed}")
    # count() reports the non-null count of each column rather than a single
    # row total (assuming `df` is a pandas DataFrame).
    print(f"Num of rows of training data:\n {df.count()}")