In [1]:
import pandas as pd
import numpy as np
from feast.sdk.resources.entity import Entity
from feast.sdk.resources.storage import Storage
from feast.sdk.resources.feature import Feature, Datastore, ValueType
from feast.sdk.resources.feature_set import FeatureSet, FileType
import feast.specs.FeatureSpec_pb2 as feature_pb

from feast.sdk.importer import Importer
from feast.sdk.client import Client
import warnings
warnings.filterwarnings("ignore", "Your application has authenticated using end user credentials")

## Feature Engineering

In [2]:
# Feature engineering steps 
## Referenced from https://www.kaggle.com/karelrv/nyct-from-a-to-z-with-xgboost-tutorial/notebook

def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    a = haversine_array(lat1, lng1, lat1, lng2)
    b = haversine_array(lat1, lng1, lat2, lng1)
    return a + b

def bearing_array(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6371  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

df = pd.read_csv('~/Workspace/feast/sdk/python/examples/quickstart/taxi_small.csv')
df['pickup_datetime'] = pd.to_datetime(df.pickup_datetime)
df['dropoff_datetime'] = pd.to_datetime(df.dropoff_datetime)
df['log_trip_duration'] = np.log(df['trip_duration'].values + 1)

# location features
df.loc[:, 'distance_haversine'] = haversine_array(df['pickup_latitude'].values, df['pickup_longitude'].values, df['dropoff_latitude'].values, df['dropoff_longitude'].values)
df.loc[:, 'distance_dummy_manhattan'] =  dummy_manhattan_distance(df['pickup_latitude'].values, df['pickup_longitude'].values, df['dropoff_latitude'].values, df['dropoff_longitude'].values)
df.loc[:, 'direction'] = bearing_array(df['pickup_latitude'].values, df['pickup_longitude'].values, df['dropoff_latitude'].values, df['dropoff_longitude'].values)

# time features
df['month'] = df['pickup_datetime'].dt.month
df['day_of_month'] = df['pickup_datetime'].dt.day
df['hour'] = df['pickup_datetime'].dt.hour
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek

# one hot encoding
vendor = pd.get_dummies(df['vendor_id'], prefix='vi', prefix_sep='_')
store_and_fwd_flag = pd.get_dummies(df['store_and_fwd_flag'], prefix='sf', prefix_sep='_')

df = df.drop(['trip_duration','vendor_id','passenger_count','store_and_fwd_flag', 'dropoff_datetime',
           'pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],axis = 1)
df.columns = ['ride'] + list(df.columns[1:])
df_complete = pd.concat([df, vendor, store_and_fwd_flag], axis=1)
df_complete.columns = [col.lower() for col in df_complete.columns]
df_complete.head()

Unnamed: 0,ride,pickup_datetime,log_trip_duration,distance_haversine,distance_dummy_manhattan,direction,month,day_of_month,hour,day_of_week,vi_1,vi_2,sf_n,sf_y
0,id2875421,2016-03-14 17:24:55,6.122493,1.498521,1.735433,99.970196,3,14,17,0,0,1,1,0
1,id2377394,2016-06-12 00:43:35,6.498282,1.805507,2.430506,-117.153768,6,12,0,6,1,0,1,0
2,id3858529,2016-01-19 11:35:24,7.661527,6.385098,8.203575,-159.680165,1,19,11,1,0,1,1,0
3,id3504673,2016-04-06 19:32:31,6.063785,1.485498,1.661331,-172.7377,4,6,19,2,0,1,1,0
4,id2181028,2016-03-26 13:30:55,6.077642,1.188588,1.199457,179.473585,3,26,13,5,0,1,1,0


## Ingesting into Feast

In [3]:
FEAST_CORE_URL = 'localhost:8433'
FEAST_SERVING_URL = 'localhost:8433'
STAGING_LOCATION = 'gs://staging-location/'

In [4]:
# Now that we have finished creating our features, we ingest them into feast

# Initialise client
fs = Client(core_url=FEAST_CORE_URL, verbose=True)

# Create importer
importer = Importer.from_df(df_complete, 
                           entity='ride', 
                           owner='user@website.com',  
                           staging_location=STAGING_LOCATION,
                           id_column='ride', 
                           timestamp_column='pickup_datetime')

# Update feature and entity metadata. Ideally you want to update these manually
# so that they contain adequate information for the next user
importer.entity.description = 'nyc taxi dataset' 
for feature_id in importer.features:
    importer.features[feature_id].description = 'nyc taxi dataset'
    
# Ingest the feature data into the store
fs.run(importer, apply_features=True, apply_entity=True)

Successfully applied entity with name: ride
---
name: ride
description: nyc taxi dataset

Successfully applied feature with id: ride.log_trip_duration
---
id: ride.log_trip_duration
name: log_trip_duration
owner: user@website.com
description: nyc taxi dataset
valueType: DOUBLE
entity: ride
dataStores: {}

Successfully applied feature with id: ride.distance_haversine
---
id: ride.distance_haversine
name: distance_haversine
owner: user@website.com
description: nyc taxi dataset
valueType: DOUBLE
entity: ride
dataStores: {}

Successfully applied feature with id: ride.distance_dummy_manhattan
---
id: ride.distance_dummy_manhattan
name: distance_dummy_manhattan
owner: user@website.com
description: nyc taxi dataset
valueType: DOUBLE
entity: ride
dataStores: {}

Successfully applied feature with id: ride.direction
---
id: ride.direction
name: direction
owner: user@website.com
description: nyc taxi dataset
valueType: DOUBLE
entity: ride
dataStores: {}

Successfully applied feature with id: ride

_Rendezvous: <_Rendezvous of RPC that terminated with:
	status = StatusCode.INTERNAL
	details = "Error running ingestion job: feast.core.exception.JobExecutionException: Error running ingestion job: feast.core.exception.JobExecutionException: Error running ingestion job: java.lang.RuntimeException: Could not submit job: 
Optional[Error: Unable to access jarfile feast-ingestion.jar]"
	debug_error_string = "{"created":"@1559191180.478136000","description":"Error received from peer ipv6:[::1]:8433","file":"src/core/lib/surface/call.cc","file_line":1041,"grpc_message":"Error running ingestion job: feast.core.exception.JobExecutionException: Error running ingestion job: feast.core.exception.JobExecutionException: Error running ingestion job: java.lang.RuntimeException: Could not submit job: \nOptional[Error: Unable to access jarfile feast-ingestion.jar]","grpc_status":13}"
>

## Creating a training dataset

Creating a training dataset allows you to isolate the data that goes into the model training step, allowing for reproduction and traceability.

In [8]:
# Retrieving data: Training

feature_set = FeatureSet(entity="ride", 
                         features=["ride.log_trip_duration", 
                                  "ride.distance_haversine",
                                  "ride.distance_dummy_manhattan",
                                  "ride.month",
                                  "ride.direction",
                                  "ride.day_of_month",
                                  "ride.hour",
                                  "ride.day_of_week",
                                  "ride.vi_1",
                                  "ride.vi_2",
                                  "ride.sf_n",
                                  "ride.sf_y"])
dataset_info = fs.create_dataset(feature_set, "2016-06-01", "2016-08-01")
dataset = fs.download_dataset_to_df(dataset_info, staging_location=STAGING_LOCATION)

dataset.head()

# train your model
# ...

creating training dataset for features: ['ride.log_trip_duration', 'ride.distance_haversine', 'ride.distance_dummy_manhattan', 'ride.month', 'ride.direction', 'ride.day_of_month', 'ride.hour', 'ride.day_of_week', 'ride.vi_1', 'ride.vi_2', 'ride.sf_n', 'ride.sf_y']


ValueError: Core API URL not set. Either set the environment variable FEAST_CORE_URL or set it explicitly.

## Retrieving serving data

In [7]:
# Retrieving data: Serving

# set serving endpoint
fs = Client(serving_url=FEAST_SERVING_URL, verbose=True)

feature_set = FeatureSet(entity="ride", 
                         features=["ride.log_trip_duration", 
                                  "ride.distance_haversine",
                                  "ride.distance_dummy_manhattan",
                                  "ride.direction",
                                  "ride.month",
                                  "ride.day_of_month",
                                  "ride.hour",
                                  "ride.day_of_week",
                                  "ride.vi_1",
                                  "ride.vi_2",
                                  "ride.sf_n",
                                  "ride.sf_y"])

# retrieve features
feats = fs.get_serving_data(feature_set, entity_keys=["id2875421","id1244481"])
feats.head()

# Feed data into model
# ...

Unnamed: 0,ride,ride.log_trip_duration,ride.distance_haversine,ride.distance_dummy_manhattan,ride.direction,ride.month,ride.day_of_month,ride.hour,ride.day_of_week,ride.vi_1,ride.vi_2,ride.sf_n,ride.sf_y
0,id1244481,8.084254,17.988218,23.770274,114.118984,1,15,13,4,0,1,1,0
1,id2875421,6.122493,1.498521,1.735433,99.970196,3,14,17,0,0,1,1,0
