# __Feast Python SDK Proposal 2__

In [1]:
# Stop-gap: make a local checkout of the Feast Python SDK importable.
# Ideally the feast package would be installed instead of patching sys.path.
import sys

_FEAST_SDK_PATH = "/Users/zhilingc/Documents/go-projects/src/github.com/gojektech/feast/sdk/python"
sys.path.append(_FEAST_SDK_PATH)

In [2]:
import pandas as pd

from feast.sdk.resources.entity import Entity
from feast.sdk.resources.feature import Feature
from feast.sdk.resources.storage import Datastore
from feast.types.Granularity_pb2 import Granularity
from feast.types.Value_pb2 import ValueType
import feast.specs.FeatureSpec_pb2 as feature_pb

from feast.sdk.importer import Importer

from feast.sdk.client import Client

### Connect to your Feast server

In [3]:
# `fs` is the client handle reused by every later cell (apply,
# create_feature_set). The first argument is the server address.
fs = Client('localhost:8433', verbose=True)

### Load feature data from a local file

In [4]:
# Load the driver feature CSV into a DataFrame; the trailing expression makes
# the notebook render its first rows as a preview.
df = pd.read_csv('driver_features.csv')
df.head()

Unnamed: 0,driver_id,ts,completed,avg_distance_completed,avg_customer_distance_completed,avg_distance_cancelled
0,1,2018-09-25T00:00:00.000,12,1.102,172.0,-1.0
1,2,2018-09-25T00:00:00.000,23,8.16,783.0,15.619
2,3,2018-09-25T00:00:00.000,14,2.833286,138.142857,18.5935
3,4,2018-09-25T00:00:00.000,7,4.593,1575.0,-1.0
4,5,2018-09-25T00:00:00.000,15,11.7656,314.0,6.733


### Create customer entity

In [5]:
# Entity spec, given positionally as: name, description, tags.
customer_entity = Entity(
    "customer",
    "desc",
    ["loyal", "customer"],
)
print(customer_entity)

name: customer
description: desc
tags:
- loyal
- customer



### Create customer feature

In [6]:
# INT64 "age" feature on the customer entity at DAY granularity, with
# REDIS1 as serving store and BIGQUERY1 as warehouse store.
customer_age = Feature(
    name="age",
    entity="customer",
    granularity=Granularity.DAY,
    owner="willem.p@go-jek.com",
    description="Customer's age",
    value_type=ValueType.INT64,
    serving_store=Datastore(id="REDIS1"),
    warehouse_store=Datastore(id="BIGQUERY1"),
)
print(customer_age)

id: customer.day.age
name: age
owner: willem.p@go-jek.com
description: Customer's age
granularity: DAY
valueType: INT64
entity: customer
dataStores:
  serving:
    id: REDIS1
  warehouse:
    id: BIGQUERY1



In [7]:
# FLOAT "balance" feature on the customer entity at DAY granularity, using
# the same serving and warehouse stores as the "age" feature.
customer_balance = Feature(
    name="balance",
    entity="customer",
    granularity=Granularity.DAY,
    owner="willem.p@go-jek.com",
    value_type=ValueType.FLOAT,
    description="Customer's account balance",
    serving_store=Datastore(id="REDIS1"),
    warehouse_store=Datastore(id="BIGQUERY1"),
)

### Register customer entity in Feast

In [8]:
# Register the entity spec through the client. In the recorded output the
# applied spec is echoed and the call returns the entity name.
fs.apply(customer_entity)

Successfully applied entity with name: customer
---
name: customer
description: desc
tags:
- loyal
- customer



'customer'

### Register multiple customer features

In [9]:
# apply() also accepts a list of specs; in the recorded output it returns the
# list of applied feature ids.
fs.apply([customer_age, customer_balance])

Successfully applied feature with id: customer.day.age
---
id: customer.day.age
name: age
owner: willem.p@go-jek.com
description: Customer's age
granularity: DAY
valueType: INT64
entity: customer
dataStores:
  serving:
    id: REDIS1
  warehouse:
    id: BIGQUERY1

Successfully applied feature with id: customer.day.balance
---
id: customer.day.balance
name: balance
owner: willem.p@go-jek.com
description: Customer's account balance
granularity: DAY
valueType: FLOAT
entity: customer
dataStores:
  serving:
    id: REDIS1
  warehouse:
    id: BIGQUERY1



['customer.day.age', 'customer.day.balance']

### Register customer's "age" feature in Feast using the apply method
* This command is idempotent

In [10]:
# Re-apply a feature that was already registered in the previous cell; the
# recorded output shows it succeeds and returns the same feature id.
fs.apply(customer_age)

Successfully applied feature with id: customer.day.age
---
id: customer.day.age
name: age
owner: willem.p@go-jek.com
description: Customer's age
granularity: DAY
valueType: INT64
entity: customer
dataStores:
  serving:
    id: REDIS1
  warehouse:
    id: BIGQUERY1



'customer.day.age'

### Create an importer from a CSV file

In [11]:
# Build an importer for the local driver CSV. `id_column` and
# `timestamp_column` name the CSV columns holding the entity id and the
# event timestamp.
driver_importer = Importer.from_csv(
    "driver_features.csv",
    granularity=Granularity.DAY,
    entity="driver",
    owner="willem.p@go-jek.com",
    staging_location="gs://staging-bucket/feast",
    id_column="driver_id",
    timestamp_column="ts",
)

### [Alternative] Create an importer from a BigQuery table

In [12]:
# Same idea as the CSV importer, but sourced from an existing BigQuery table
# at MINUTE granularity for the "s2id" entity.
driver_importer_from_bq = Importer.from_bq(
    "the-big-data-staging-007.feast_test.surge_gocar",
    entity="s2id",
    granularity=Granularity.MINUTE,
    owner="willem.p@go-jek.com",
    timestamp_column="start_time",
)



### [Alternative] Create an importer from a Pandas dataframe

In [13]:
# Reuse the DataFrame already held by the CSV importer as the in-memory
# source for a second importer with the same entity/column configuration.
my_pandas_df = driver_importer.df
driver_importer_from_df = Importer.from_df(
    my_pandas_df,
    entity="driver",
    granularity=Granularity.DAY,
    owner="willem.p@go-jek.com",
    staging_location="gs://staging-bucket/feast",
    id_column="driver_id",
    timestamp_column="ts",
)

### Preview the dataframe loaded by the importer

In [14]:
# Render the first rows of the DataFrame held by the importer.
driver_importer_from_df.df.head()

Unnamed: 0,driver_id,ts,completed,avg_distance_completed,avg_customer_distance_completed,avg_distance_cancelled
0,1,2018-09-25T00:00:00.000,12,1.102,172.0,-1.0
1,2,2018-09-25T00:00:00.000,23,8.16,783.0,15.619
2,3,2018-09-25T00:00:00.000,14,2.833286,138.142857,18.5935
3,4,2018-09-25T00:00:00.000,7,4.593,1575.0,-1.0
4,5,2018-09-25T00:00:00.000,15,11.7656,314.0,6.733


### Describe the importer

In [15]:
# Display the import spec built by the importer; the recorded output lists
# the source type and options, the entities, and the column schema.
driver_importer_from_df.describe()

type: file
options:
  format: csv
  url: gs://staging-bucket/feast/tmp_driver_120418
entities:
- driver
schema:
  entityIdColumn: driver_id
  fields:
  - name: driver_id
  - name: ts
  - featureId: driver.day.completed
    name: completed
  - featureId: driver.day.avg_distance_completed
    name: avg_distance_completed
  - featureId: driver.day.avg_customer_distance_completed
    name: avg_customer_distance_completed
  - featureId: driver.day.avg_distance_cancelled
    name: avg_distance_cancelled
  timestampColumn: ts



### Submit the import job
* This loads the CSV from GCS into Feast
* Automatically registers entities and features with Feast during submission

In [None]:
# Submit the CSV-based importer as an import job.
# NOTE(review): create_entity / create_features presumably register any
# missing entity and features during submission, per the heading of this
# cell - confirm against the Client API.
fs.apply(driver_importer, create_entity=True, create_features=True)

_starting import..._  
_10%_   
_50%_   
_100%_    
_10 rows imported successfully_ 

### Write out specification files for later use

In [None]:
# Write each spec to its own YAML file so it can be reused later.
driver_importer.dump("driver_feature_import.yaml")
customer_entity.dump("customer_entity.yaml")
# The feature spec gets a distinct file name: dumping it to
# "customer_entity.yaml" would target the file the entity spec was just
# written to, so the two specs would collide.
customer_age.dump("customer_age.yaml")

### Create a “feature set” which can be used to query both training data and serving data.
* The feature set is simply an object that locally tracks which entity, granularity, and features you are interested in.

In [None]:
# Local handle describing which entity, granularity and features to query;
# it is reused below for both the training dataset and serving data.
feature_set = fs.create_feature_set(
    entity="driver",
    granularity="minute",
    features=["latitude", "longitude", "event_time"],
)

### Produce training dataset
* Stages a table in BQ with output data
* Returns information about the dataset that has been created

In [None]:
# Request a training dataset for the feature set over the given date range;
# `dataset_info` is used by the following cells to download the result.
# NOTE(review): whether end_date is inclusive is not specified here - confirm.
dataset_info = feature_set.create_training_dataset(start_date='2018-01-01', end_date='2018-02-01')

### Retrieve training dataset

In [None]:
# Download the dataset to a local file in feather format; `file_path` is
# reused by the next cell to load the file.
file_path = 'mypath.feather'
dataset_info.download(destination=file_path, type='feather')

### Load training dataset into Pandas

In [None]:
# Read the downloaded feather file into a DataFrame. This rebinds `df`,
# replacing the CSV preview loaded near the top of the notebook.
import feather
df = feather.read_dataframe(file_path)

### [Alternative] Download dataset directly into a Pandas dataframe

In [None]:
# Alternative to the download + read steps above: fetch the dataset straight
# into a DataFrame (also rebinds `df`).
df = dataset_info.download_to_df()

### Do your model training...

### Ensure you have the list of entity keys for which you want to retrieve features

In [None]:
# Example entity keys to look up at serving time.
# NOTE(review): presumably driver ids, since feature_set targets the
# 'driver' entity - confirm.
keys = [12345, 67890]

### Fetch serving data from Feast by reusing the same feature set
* The result is a pandas DataFrame (possibly with some extra methods that we will add later)

In [None]:
# Query serving data for the given keys through the same feature set that
# produced the training dataset.
# NOTE(review): type='last' presumably selects the most recent value per
# key - confirm against the API.
feature_data = feature_set.get_serving_data(keys, type='last')