In [None]:
import numpy as np
import pandas as pd

import feast.specs.FeatureSpec_pb2 as feature_pb
from feast.sdk.client import Client, ServingRequestType
from feast.sdk.importer import Importer
from feast.sdk.resources.entity import Entity
from feast.sdk.resources.feature import Datastore, Feature, Granularity, ValueType
from feast.sdk.resources.feature_set import FeatureSet, FileType
from feast.sdk.resources.storage import Storage

## Feature Engineering

In [None]:
# Feature engineering steps 
## Referenced from https://www.kaggle.com/karelrv/nyct-from-a-to-z-with-xgboost-tutorial/notebook

def haversine_array(lat1, lng1, lat2, lng2):
    """Haversine (great-circle) distance in kilometres between two points.

    All four arguments are angles in degrees. Scalars and NumPy arrays are
    both accepted; arrays are combined element-wise.
    """
    earth_radius_km = 6371  # average Earth radius, in km
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlng/2)
    hav = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Rough Manhattan-style distance between two points.

    Sums two haversine legs that start at the first point: one that changes
    only the longitude and one that changes only the latitude.
    """
    longitude_leg = haversine_array(lat1, lng1, lat1, lng2)
    latitude_leg = haversine_array(lat1, lng1, lat2, lng1)
    return longitude_leg + latitude_leg

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial bearing, in degrees, when travelling from point 1 to point 2.

    All four arguments are angles in degrees (scalars or NumPy arrays,
    combined element-wise). The result comes from ``arctan2`` and therefore
    lies in [-180, 180]: 0 points north, 90 east, -90 west.
    """
    # Only the longitude *difference* enters the formula, so the individual
    # longitudes never need converting to radians.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

# Load the raw trips, parse both timestamps and derive the log-duration column.
df = pd.read_csv('taxi_small.csv')
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
df['log_trip_duration'] = np.log(df['trip_duration'].values + 1)

# location features: every helper takes (pickup lat, pickup lng, dropoff lat, dropoff lng)
_trip_coords = tuple(
    df[coord_col].values
    for coord_col in ('pickup_latitude', 'pickup_longitude',
                      'dropoff_latitude', 'dropoff_longitude')
)
df['distance_haversine'] = haversine_array(*_trip_coords)
df['distance_dummy_manhattan'] = dummy_manhattan_distance(*_trip_coords)
df['direction'] = bearing_array(*_trip_coords)

# time features, all taken from the pickup timestamp
_pickup = df['pickup_datetime'].dt
df['month'] = _pickup.month
df['day_of_month'] = _pickup.day
df['hour'] = _pickup.hour
df['day_of_week'] = _pickup.dayofweek

# one hot encoding (must happen before the source columns are dropped below)
vendor = pd.get_dummies(df['vendor_id'], prefix='vi', prefix_sep='_')
store_and_fwd_flag = pd.get_dummies(df['store_and_fwd_flag'], prefix='sf', prefix_sep='_')

# Drop the raw columns that have been replaced by engineered features.
df = df.drop(columns=[
    'trip_duration', 'vendor_id', 'passenger_count', 'store_and_fwd_flag', 'dropoff_datetime',
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
])
# Rename the first remaining column to 'ride'; it is used as the id column on ingestion.
df.columns = ['ride', *df.columns[1:]]
df_complete = pd.concat([df, vendor, store_and_fwd_flag], axis=1)
df_complete.columns = df_complete.columns.str.lower()
df_complete.head()

## Ingesting into Feast

In [None]:
# Connection settings for this notebook; adjust them to match your own deployment.
FEAST_CORE_URL = "localhost:6565"                # passed to Client(core_url=...)
FEAST_SERVING_URL = "localhost:6566"             # passed to Client(serving_url=...)
STAGING_LOCATION = "gs://feast-bucket/staging"   # staging path used for ingestion and dataset download

In [None]:
# Now that we have finished creating our features, we ingest them into feast

# Initialise client (ingestion talks to the Feast core endpoint)
fs = Client(core_url=FEAST_CORE_URL, verbose=True)

# Stores the imported features are registered against.
# NOTE(review): 'REDIS1' and 'NOOP' must match storage ids that exist in
# your Feast deployment - confirm before running.
serving_ds = Datastore(id='REDIS1')
warehouse_ds = Datastore(id='NOOP')

# Create importer from the engineered dataframe. `Granularity` is imported in
# the notebook's import cell; without that import this call raises NameError.
importer = Importer.from_df(
    df_complete,
    entity='ride',
    granularity=Granularity.NONE,
    owner='user@website.com',
    staging_location=STAGING_LOCATION,
    id_column='ride',
    timestamp_column='pickup_datetime',
    serving_store=serving_ds,
    warehouse_store=warehouse_ds,
)

# Update feature and entity metadata. Ideally you want to update these manually
# so that they contain adequate information for the next user
importer.entity.description = 'nyc taxi dataset'
for feature_id in importer.features:
    importer.features[feature_id].description = 'nyc taxi dataset'

# Ingest the feature data into the store, registering the entity and features as well
fs.run(importer, apply_features=True, apply_entity=True)

## Creating a training dataset

Creating a training dataset isolates the data that goes into the model training step, which makes training runs reproducible and traceable.

In [None]:
# Retrieving data: Training

# Feature ids all share the "ride.none." prefix, so build them from the bare names.
feature_set = FeatureSet(
    entity="ride",
    features=[
        "ride.none." + feature_name
        for feature_name in (
            "log_trip_duration",
            "distance_haversine",
            "distance_dummy_manhattan",
            "direction",
            "month",
            "day_of_month",
            "hour",
            "day_of_week",
            "vi_1",
            "vi_2",
            "sf_n",
            "sf_y",
        )
    ],
)

# NOTE(review): the two date strings are presumably the start and end of the
# training window - confirm against the Feast client documentation.
dataset_info = fs.create_dataset(feature_set, "2016-06-01", "2016-08-01")
dataset = fs.download_dataset_to_df(dataset_info, staging_location=STAGING_LOCATION)

dataset.head()

# train your model
# ...

## Retrieving serving data

In [None]:
# Retrieving data: Serving

# Set the serving endpoint. The core endpoint is passed as well, so that
# rebinding `fs` here does not break earlier cells (e.g. dataset creation)
# if they are re-run after this one.
fs = Client(core_url=FEAST_CORE_URL, serving_url=FEAST_SERVING_URL, verbose=True)

# Same features as the training dataset, so the model sees matching inputs.
feature_set = FeatureSet(
    entity="ride",
    features=[
        "ride.none.log_trip_duration",
        "ride.none.distance_haversine",
        "ride.none.distance_dummy_manhattan",
        "ride.none.direction",
        "ride.none.month",
        "ride.none.day_of_month",
        "ride.none.hour",
        "ride.none.day_of_week",
        "ride.none.vi_1",
        "ride.none.vi_2",
        "ride.none.sf_n",
        "ride.none.sf_y",
    ],
)

# retrieve features for the given ride ids
feats = fs.get_serving_data(feature_set, entity_keys=["id2875421", "id1244481"])

# Feed data into model
# ...