In [1]:
import pandas as pd
import hopsworks

In [2]:
# Source file: green-taxi trip records for 2016-12 (per the file name), in Parquet format.
url = "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2016-12.parquet"

# Only these columns are read from the Parquet file; everything else is skipped.
columns = [
    'lpep_pickup_datetime',
    'PULocationID',
    'DOLocationID',
    'trip_distance',
    'VendorID',
    'payment_type',
    'fare_amount',
    'tip_amount',
]

data = pd.read_parquet(url, columns=columns)

In [3]:
# Cleaning pipeline, written as one chain:
#   1. keep trips with payment_type == 1 (credit card), then drop that column
#   2. keep rows whose tip is non-negative (zero tips are retained)
#   3. order chronologically by pick-up timestamp and renumber the rows
#   4. derive the pick-up hour of day as `pickup_time`
#   5. turn the row number into an explicit `index` column
data = (
    data
    .loc[data['payment_type'] == 1]
    .drop(columns='payment_type')
    .loc[lambda trips: trips['tip_amount'] >= 0]
    .sort_values('lpep_pickup_datetime')
    .reset_index(drop=True)
    .assign(pickup_time=lambda trips: trips['lpep_pickup_datetime'].dt.hour)
    .reset_index()
)
data

Unnamed: 0,index,lpep_pickup_datetime,PULocationID,DOLocationID,trip_distance,VendorID,fare_amount,tip_amount,pickup_time
0,0,2016-12-01 00:00:02,82,129,0.60,1,5.0,1.00,0
1,1,2016-12-01 00:00:11,95,135,1.14,2,6.0,2.19,0
2,2,2016-12-01 00:00:16,181,79,6.91,2,22.0,3.50,0
3,3,2016-12-01 00:00:19,25,49,1.50,1,8.0,2.75,0
4,4,2016-12-01 00:00:25,52,181,1.96,2,8.5,1.96,0
...,...,...,...,...,...,...,...,...,...
581829,581829,2016-12-31 23:59:42,216,197,0.90,1,7.0,0.00,23
581830,581830,2016-12-31 23:59:47,256,48,9.02,2,30.5,0.00,23
581831,581831,2016-12-31 23:59:49,42,74,2.48,2,11.0,2.46,23
581832,581832,2016-12-31 23:59:51,42,42,0.54,2,4.0,1.06,23


In [4]:
# Log in to Hopsworks and open the project's feature store.
proj = hopsworks.login()
fs = proj.get_feature_store()

# Get (or create) version 1 of the `nycgreen2` feature group, keyed by the
# `index` column and using the pick-up timestamp as event time.
fg = fs.get_or_create_feature_group(
    name="nycgreen2",
    version=1,
    primary_key=["index"],
    event_time="lpep_pickup_datetime",
)

2025-04-14 00:21:31,135 INFO: Initializing external client
2025-04-14 00:21:31,137 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-04-14 00:21:34,196 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/398


In [5]:
# Upload the prepared frame to the feature group. The insert launches an
# offline materialization job and would otherwise return right away; wait for
# that job to finish so the training-data reads further down see every row.
fg.insert(data, write_options={"wait_for_job": True})


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/398/fs/335/fg/1434137


Uploading Dataframe: 100.00% |████████████████████████| Rows 581834/581834 | Elapsed Time: 00:28 | Remaining Time: 00:00


Launching job: nycgreen2_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/398/jobs/named/nycgreen2_1_offline_fg_materialization/executions


(Job('nycgreen2_1_offline_fg_materialization', 'SPARK'), None)

In [6]:
# Build a query over the feature group's feature columns; per the log output
# this leaves out the primary key and the event-time column.
select = fg.select_features()

2025-04-14 00:22:16,197 INFO: Using ['pulocationid', 'dolocationid', 'trip_distance', 'vendorid', 'fare_amount', 'tip_amount', 'pickup_time'] from feature group `nycgreen2` as features for the query. To include primary key and event time use `select_all`.


In [7]:
# Get (or create) version 1 of the `nycgreen` feature view on top of the
# feature query, with `tip_amount` declared as the label.
fv = fs.get_or_create_feature_view(
    name="nycgreen",
    query=select,
    labels=['tip_amount'],
    version=1,
)
                                   

In [8]:
# Chronological split on the feature view's event time, using half-open
# intervals [start, end): consecutive splits share a boundary date so they are
# contiguous and disjoint. Previously train ended on 12-08 while validation
# started on 12-09, which dropped every trip from 2016-12-08.
# NOTE(review): relies on split end bounds being exclusive, as described in the
# Hopsworks documentation — confirm for the installed client version.
X_train, X_val, X_test, y_train, y_val, y_test = fv.train_validation_test_split(
    train_start="2016-12-01",
    train_end="2016-12-09",
    validation_start="2016-12-09",
    validation_end="2016-12-16",
    test_start="2016-12-16",
    test_end="2017-01-01",
)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (8.54s) 



In [9]:
# Inspect the training-split features (the label column is not part of this frame).
X_train

Unnamed: 0,pulocationid,dolocationid,trip_distance,vendorid,fare_amount,pickup_time
1,80,62,5.42,2,20.5,22
10,134,197,1.50,1,8.0,20
15,166,42,1.74,2,8.5,20
22,70,70,0.11,2,16.0,6
25,166,151,0.69,2,4.5,19
...,...,...,...,...,...,...
581805,181,181,0.83,2,5.0,21
581819,166,74,1.52,2,12.5,17
581823,42,130,14.50,1,52.0,17
581824,52,189,1.40,2,10.0,8


In [10]:
# Inspect the validation-split features (the label column is not part of this frame).
X_val

Unnamed: 0,pulocationid,dolocationid,trip_distance,vendorid,fare_amount,pickup_time
2,212,182,0.60,1,5.0,18
3,256,238,8.46,2,28.5,0
5,112,36,2.89,2,12.0,0
6,174,238,12.27,2,33.5,0
8,181,37,7.43,2,23.5,21
...,...,...,...,...,...,...
581817,74,43,3.95,2,15.5,13
581818,95,95,1.18,2,8.0,13
581821,33,112,5.63,2,20.5,10
581829,92,7,6.67,2,22.5,14


In [11]:
# Inspect the test-split features (the label column is not part of this frame).
X_test

Unnamed: 0,pulocationid,dolocationid,trip_distance,vendorid,fare_amount,pickup_time
0,40,229,7.48,2,26.0,22
4,112,36,7.60,1,32.5,4
7,145,137,2.76,2,13.0,9
11,166,74,1.74,2,10.0,13
12,116,151,2.65,2,9.0,9
...,...,...,...,...,...,...
581826,145,112,3.35,2,14.0,7
581827,244,42,3.75,2,12.0,21
581828,33,113,3.13,2,13.0,9
581831,7,163,3.35,2,13.5,20


In [12]:
# Gather the name of every feature registered on the feature view and print them.
features = [feature.name for feature in fv.features]
print(features)

['pulocationid', 'dolocationid', 'trip_distance', 'vendorid', 'fare_amount', 'tip_amount', 'pickup_time']


In [14]:
# Show the feature view's labels. As the output shows, `fv.labels` already
# holds the label names as plain strings, so no `.name` extraction is needed
# (unlike `fv.features`).

print(fv.labels)

['tip_amount']
