# Create and store features

In the previous notebook we saw that data governance helps to discover and browse the data and to see its lineage. Now we want to look at ML-specific governance: creating features that a model can understand from that data.

## 0.0 Install and import dependencies

In [None]:
# install dependencies: feast (the feature store Python package) and pyarrow (used for the parquet file that will source our dataset)
%pip install feast==0.34.1 pyarrow==10.0.1

In [None]:
# import dependencies
### standard library
import json
from datetime import timedelta, datetime
from pathlib import Path

### third-party
import pandas as pd
import requests

### feature store specific dependencies
from feast import (
    FeatureStore,
    Entity,
    FeatureService,
    FeatureView,
    Field,
    FileSource,
    PushSource,
    RequestSource,
    RepoConfig,
)
from feast.data_source import PushMode
from feast.on_demand_feature_view import on_demand_feature_view
from feast.types import Float32, Float64, Int64,UnixTimestamp, String

## 0.1 create the store

In [None]:
# a YAML file is provided to configure a feature store that relies on a Postgres database; display it
%cat feature_repo/feature_store.yaml

In [None]:
# Instantiate the feature store from the repository folder that holds feature_store.yaml
feature_repo_dir = "./feature_repo"
fs = FeatureStore(repo_path=feature_repo_dir)

## 0.2 Create a source that will be used in the registry

### 0.2.1 Ensure source requirements (provide an index, a time column and export it in parquet)

In [None]:
## Load the trip data from the local CSV file
trip_csv_path = "../Data_Pipeline_2/chicagodata/trip.csv"
data = pd.read_csv(trip_csv_path)

In [None]:
### Parse the trip_start_timestamp field into a datetime column
data['trip_start_timestamp'] = pd.to_datetime(data['trip_start_timestamp'])
### Add an "index" column that will be used as a join key.
### The guard keeps this cell idempotent: without it, re-running the cell would
### add a "level_0" column, and a further run would raise a ValueError.
if 'index' not in data.columns:
    data = data.reset_index()

In [None]:
# Create the output folder for the parquet export.
# exist_ok=True makes the cell safe to re-run when the folder is already there.
Path("chicagodata").mkdir(exist_ok=True)

In [None]:
### Write the prepared trips to a local parquet file
trip_parquet_path = 'chicagodata/trip.parquet'
data.to_parquet(trip_parquet_path)

### 0.2.2 register the source 

In [None]:
# Declare the parquet file as the Feast source for the trip data.
# `timestamp_field` names the event-time column of the source; it is the
# supported replacement for the deprecated `event_timestamp_column` keyword.
taxi_data_source = FileSource(
    path="chicagodata/trip.parquet",
    timestamp_field="trip_start_timestamp",
)

## 0.3 Create an Entity

An entity represents a business object: a logical group to which features are attached.

In [None]:
# Fill in your initials, e.g. "jd" for John Doe.
# The value is appended to the entity and feature view names created below.
username_initials = ""

In [None]:
### Entity "taxi_trip": its features could be, for example, its duration or its length.
### Rows are identified through the "index" join key.
taxi_trip_entity = Entity(
    name=f"taxi_trip_{username_initials}",
    join_keys=["index"],
)

## 0.4 Create our data model

### 0.4.1 Create a feature view that maps the source

In [None]:
### Feature view that maps the columns of our source to typed features.
### Use the Feast documentation to check that each column has the right type.
### NOTE(review): "tips" is declared Int64 while the other amount columns
### (fare, tolls, extras, trip_total) are Float64 — confirm against the data.
trip_columns = [
    ("index", Int64),
    ("tips", Int64),
    ("trip_start_timestamp", UnixTimestamp),
    ("trip_seconds", Int64),
    ("trip_miles", Float64),
    ("pickup_community_area", Float64),
    ("pickup_centroid_latitude", Float64),
    ("pickup_centroid_longitude", Float64),
    ("dropoff_community_area", Float64),
    ("fare", Float64),
    ("tolls", Float64),
    ("extras", Float64),
    ("trip_total", Float64),
]
taxi_trip_all_stats = FeatureView(
    name=f"taxi_trip_all_stats_{username_initials}",
    entities=[taxi_trip_entity],
    ttl=timedelta(days=365),
    schema=[Field(name=column, dtype=feast_type) for column, feast_type in trip_columns],
    source=taxi_data_source,
    tags={'sticktodata': "yes"},
)

## 0.5 Create a service that serves a set of feature views, corresponding to your future model

In [None]:
# Feature service v1: serves the features of the taxi_trip_all_stats view
served_views_v1 = [taxi_trip_all_stats]
taxi_trip_service = FeatureService(name="taxi_trip_service_v1", features=served_views_v1)

## 0.6 Create all feast resources

In [None]:
### Apply the objects we created (source, entity, feature view, service) to the feature store
base_objects = [
    taxi_data_source,
    taxi_trip_entity,
    taxi_trip_all_stats,
    taxi_trip_service,
]
fs.apply(base_objects)

Now, in the Feast UI, we can see the model that we just created. Browse the feature view to see what has been created.

## 1.0 Test the offline store


Here we will get a sample from the offline store, simulating a call for model training

In [None]:
# scope the features to retrieve
# take the index values 1 to 1000
entity_df = pd.DataFrame.from_dict({"index": [*range(1, 1001)]})
# use "now" as the event timestamp of every requested row
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)

# reset the store used
store = FeatureStore(repo_path="./feature_repo")

# The feature view name ends with the user's initials, so the feature
# references are built from the view's own name rather than a hard-coded
# suffix that only matches one user.
feature_view_name = taxi_trip_all_stats.name

# get the list of the wanted features, for the scoped entity dataframe
training_data = store.get_historical_features(
    entity_df=entity_df,
    features=[
        f"{feature_view_name}:trip_seconds",
        f"{feature_view_name}:fare",
        f"{feature_view_name}:trip_start_timestamp",
    ],
).to_df()

In [None]:
# preview the first rows of the retrieved training sample
training_data.head(n=5)

## 1.1 Test the online store

### 1.1.1 Materialization

Materialization feeds the source data into the online store (the store made for inference). Here our dataset is static, so we take a wide time window to get all the data into the store.

In [None]:
# Materialization window: from 700 days ago up to now
window_end = datetime.now()
window_start = datetime.now() - timedelta(days=700)
store.materialize(start_date=window_start, end_date=window_end)

### 1.1.2 Query the online store

In [None]:
from pprint import pprint

In [None]:
# The scope is now automatically the freshest version of the data: if a trip
# exists several times, we only get its latest values.
# The features to fetch are the ones declared in our service.
feature_service = store.get_feature_service("taxi_trip_service_v1")
trip_keys = [{"index": 12}, {"index": 78}]
online_response = store.get_online_features(
    features=feature_service,
    entity_rows=trip_keys,
)
feature_vector = online_response.to_dict()

In [None]:
# pretty-print the online features returned for the two requested trips
pprint(feature_vector, width=80)

---

## 2.0 Create custom features

Using "on demand views" and user-defined functions, we are able to create features that are derived from other features.

So here we can create NEW features that were not part of our initial dataset.

In [None]:
from feast.on_demand_feature_view import OnDemandFeatureView

### 2.0.1 Define the feature logic in a UDF

In [None]:
# here we want to create a single feature to represent pickup centroid longitude and latitude
def customfeatures(features_df: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the pickup centroid coordinates into a single feature.

    inputs:
    features_df : input features; the columns 'pickup_centroid_latitude'
                  and 'pickup_centroid_longitude' are read
    return:
    a dataframe with one column, 'pickup_centroid_latlon', holding the
    row-wise product latitude * longitude on the same index as the input
    """
    latitude = features_df['pickup_centroid_latitude']
    longitude = features_df['pickup_centroid_longitude']
    # to_frame names the resulting single column
    return latitude.mul(longitude).to_frame(name='pickup_centroid_latlon')

### 2.0.2 Create a feature view containing the udf result

In [None]:

# schema of the on-demand view: it must match the columns returned by the udf
lonlat_schema = [Field(name='pickup_centroid_latlon', dtype=Float64)]

taxitrip_pickup_lonlat = OnDemandFeatureView(
    name='taxitrip_pickup_lonlat',  # the name of the feature view
    schema=lonlat_schema,
    sources=[taxi_trip_all_stats],  # the feature view providing the udf inputs
    udf=customfeatures,  # the function that computes the custom feature
)

### 2.0.3 Create a new service that adds your custom feature to the initial feature view

In [None]:
# Feature service v2 = initial feature view + custom on-demand feature.
# The initial view must be included: it provides the inputs of the custom feature.
served_views_v2 = [taxi_trip_all_stats, taxitrip_pickup_lonlat]
taxi_trip_service_v2 = FeatureService(
    name="taxi_trip_service_v2",
    features=served_views_v2,
)

### 2.0.4 Apply new objects (custom feature view, new service)

In [None]:
# Apply the existing objects together with the new on-demand view and the v2 service
all_objects = [
    taxi_data_source,
    taxi_trip_entity,
    taxi_trip_all_stats,
    taxi_trip_service,
    taxitrip_pickup_lonlat,
    taxi_trip_service_v2,
]
fs.apply(all_objects)

### 2.1.0 Test the offline store with the new service

In [None]:
# (Re)create the store first, so that this cell does not depend on a `store`
# object left over from an earlier cell.
store = FeatureStore(repo_path="./feature_repo")

# Choose a service to query features on
feature_service = store.get_feature_service("taxi_trip_service_v2")

# scope the features to retrieve
# take the index values 1 to 1000
entity_df = pd.DataFrame.from_dict({"index": [*range(1, 1001)]})

# use "now" as the event timestamp of every requested row
entity_df["event_timestamp"] = pd.to_datetime("now", utc=True)

# get the features declared in the service, for the scoped entity dataframe
training_data = store.get_historical_features(
    entity_df=entity_df,
    features=feature_service,
).to_df()

In [None]:
### check that the new columns are available
training_data.head(n=5)