# This notebook illustrates adding data to a feature store and retrieving training and prediction data from it

In [None]:

import sqlalchemy as db
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from pytz import timezone, utc
from new_api import (add_entity, create_new_entity_set, add_aggregation_features, get_training_df, get_prediction_df, 
add_entity_df, list_features)


In [None]:
# Build a synthetic "agent" table -- one row per (effective date, agent) with
# eight random feature columns -- and pass it to add_entity_df together with
# its entity description.

# Midnight UTC of the current day. datetime.now(utc) returns an aware datetime
# directly and replaces datetime.utcnow(), which is deprecated since Python 3.12.
# Computing it once also keeps every snapshot anchored to the same instant.
today = datetime.now(utc).replace(hour=0, minute=0, second=0, microsecond=0)

# Three snapshots spaced 365 days apart, oldest first.
days = [today - timedelta(days=365 * years_back) for years_back in reversed(range(3))]

agents = [1001, 1002, 1003, 1004, 1005]

n_rows = len(days) * len(agents)

df = pd.DataFrame(
    {
        # Rows are day-major: all agents for the oldest day, then the next day, ...
        "effective_date": [day for day in days for _ in agents],
        "agent_id": [agent for _ in days for agent in agents],
        # feature_1 .. feature_8: independent uniform draws scaled to [0, 10).
        **{
            f"feature_{i}": [np.random.rand() * 10 for _ in range(n_rows)]
            for i in range(1, 9)
        },
    }
)

# Entity description: a "primary" entity keyed by agent_id whose time column
# (effective_date) is declared with type "effective_date".
agent_entity = {
    "name": "agent",
    "table": "agent",
    "type": "primary",
    "index": "agent_id",
    "time": {"field": "effective_date", "type": "effective_date"},
}
add_entity_df(df, agent_entity)

In [24]:
# Build a synthetic "acxiom" table -- one row per (date, agent) -- and pass it
# to add_entity_df together with its entity description.

# Midnight UTC of the current day. datetime.now(utc) returns an aware datetime
# directly and replaces datetime.utcnow(), which is deprecated since Python 3.12.
today = datetime.now(utc).replace(hour=0, minute=0, second=0, microsecond=0)

# Three snapshots spaced 365 days apart, oldest first.
days = [today - timedelta(days=365 * years_back) for years_back in reversed(range(3))]

agents = [1001, 1002, 1003, 1004, 1005]

n_rows = len(days) * len(agents)

df = pd.DataFrame(
    {
        # Rows are day-major: all agents for the oldest day, then the next day, ...
        "date": [day for day in days for _ in agents],
        "agent_id": [agent for _ in days for agent in agents],
        # Placeholder values only: both columns are uniform random floats in [0, 10),
        # not real zip codes or household counts.
        "zipcode": [np.random.rand() * 10 for _ in range(n_rows)],
        "num_household": [np.random.rand() * 10 for _ in range(n_rows)],
    }
)

# Entity description: keyed by agent_id, with "date" as the time column.
acxiom_entity = {
    "name": "acxiom",
    "table": "agent_acxiom",
    "type": "primary",
    "index": "agent_id",
    "time": {"field": "date", "type": "effective_date"},
}
add_entity_df(df, acxiom_entity)


In [25]:

# Build a synthetic commission-event table -- one event per (week, agent) over
# 3 * 52 weeks -- and pass it to add_entity_df together with its description.

# Midnight UTC of the current day. datetime.now(utc) returns an aware datetime
# directly and replaces datetime.utcnow(), which is deprecated since Python 3.12.
today = datetime.now(utc).replace(hour=0, minute=0, second=0, microsecond=0)

# Weekly timestamps (7-day steps) going back 3 * 52 weeks, oldest first.
days = [today - timedelta(days=7 * weeks_back) for weeks_back in reversed(range(3 * 52))]

agents = [1001, 1002, 1003, 1004, 1005]

n_rows = len(days) * len(agents)

df = pd.DataFrame(
    {
        # Unique event ids: 1000, 1001, ...
        "id": list(range(1000, 1000 + n_rows)),
        # Rows are week-major: all agents for the oldest week, then the next week, ...
        "date": [day for day in days for _ in agents],
        "agent_id": [agent for _ in days for agent in agents],
        # Uniform random values in [0, 100).
        "amount": [np.random.rand() * 100 for _ in range(n_rows)],
        "feature_2": [np.random.rand() * 100 for _ in range(n_rows)],
        "feature_3": [np.random.rand() * 100 for _ in range(n_rows)],
    }
)

# Entity description: an "event" entity keyed by the event id, with "date" as
# the event time. (The variable name keeps its original spelling so that any
# other cell referring to it continues to work.)
comission_events = {
    "name": "agent_commission",
    "table": "agent_sales",
    "type": "event",
    "index": "id",
    "time": {"field": "date", "type": "event"},
}
add_entity_df(df, comission_events)

In [26]:
# Declare how the other entities attach to "agent": every relationship joins on
# agent_id, with "agent" on the left-hand side. Each tuple is
# (relationship kind, left entity key, right entity key).
relationships = [
    (
        kind,
        {"name": "agent", "index": "agent_id"},
        {"name": related_entity, "index": "agent_id"},
    )
    for kind, related_entity in (
        ("one_to_one", "acxiom"),
        ("one_to_many", "agent_commission"),
    )
]

# Group the three entities into one entity set named "nyl_agents".
entity_names = ["agent", "acxiom", "agent_commission"]
create_new_entity_set(
    name="nyl_agents",
    entities=entity_names,
    relationships=relationships,
)


In [28]:
# Aggregations over the "agent_commission" entity, all computed over the
# "full_history" time window. Each spec is (feature name, source column, function).
agg_specs = (
    ("total_sales", "amount", "sum"),
    ("max_sales", "amount", "max"),
    ("total_num_sales", "id", "count"),
)

# The mapping is keyed by feature name; each definition repeats that name under "name".
agent_commission_agg_features = {
    feature_name: {
        "feature": source_column,
        "function": agg_function,
        "name": feature_name,
        "time_window": "full_history",
    }
    for feature_name, source_column, agg_function in agg_specs
}
add_aggregation_features("agent_commission", agent_commission_agg_features)


In [29]:
# List the features exposed by the "nyl_agents" entity set. The bare name on the
# last line makes the notebook display the returned value.
available_features = list_features("nyl_agents")
available_features

{'agent': {'raw_features': ['index',
   'effective_date',
   'agent_id',
   'feature_1',
   'feature_2',
   'feature_3',
   'feature_4',
   'feature_5',
   'feature_6',
   'feature_7',
   'feature_8']},
 'acxiom': {'raw_features': ['index',
   'date',
   'agent_id',
   'zipcode',
   'num_household']},
 'agent_commission': {'raw_features': ['index',
   'id',
   'date',
   'agent_id',
   'amount',
   'feature_2',
   'feature_3'],
  'calulated_features': ['total_sales', 'max_sales', 'total_num_sales']}}

In [30]:
# Build the observation frame used to request training data: one row per
# (observation time, agent) carrying a random value in the "prediction" column.

# Midnight UTC of the current day. datetime.now(utc) returns an aware datetime
# directly and replaces datetime.utcnow(), which is deprecated since Python 3.12.
today = datetime.now(utc).replace(hour=0, minute=0, second=0, microsecond=0)

# Two observation times 365 days apart, oldest first.
days = [today - timedelta(days=365 * years_back) for years_back in reversed(range(2))]

agents = [1001, 1002, 1003, 1004, 1005]

df = pd.DataFrame(
    {
        # Rows are day-major: all agents for the older day, then today.
        "observation_time": [day for day in days for _ in agents],
        "agent_id": [agent for _ in days for agent in agents],
        # Uniform random values in [0, 1).
        "prediction": [np.random.rand() for _ in range(len(days) * len(agents))],
    }
)

# NOTE: this really applies more to the compliance model.

# Names which columns of `df` hold the primary key, the observation date and
# the label.
eol = {"pk": "agent_id", "observation_date": "observation_time", "label": "prediction"}

In [31]:
# Columns to pull from each entity of the entity set.
selected_features = {
    "agent": ["feature_1", "feature_2", "feature_6"],
    "acxiom": ["zipcode", "num_household"],
    "agent_commission": ["total_sales", "max_sales"],
}

# Observation rows plus the mapping that names their key/date/label columns.
# `eol` and `df` come from the observation cell that precedes this one.
observations = {"type": "eol", "eol": eol, "data": df}

# Full request: the entity set, the entity the rows are built around, the
# selected features and the observations.
features = dict(
    entity_set="nyl_agents",
    target_entity="agent",
    features=selected_features,
    observations=observations,
)

training_df = get_training_df(features)
training_df

Unnamed: 0,agent_id,observation_time,prediction,feature_1,feature_2,feature_6,zipcode,num_household,total_sales,max_sales
0,1001,2019-04-21,0.064332,9.461729,0.60703,4.984779,6.52797,0.232106,4884.966123,99.683471
1,1002,2019-04-21,0.529459,5.817378,1.339451,9.892323,2.318984,0.818003,5005.470049,99.829563
2,1003,2019-04-21,0.572366,2.748388,6.614324,4.362094,2.288449,2.280654,5533.966345,98.999679
3,1004,2019-04-21,0.059805,4.902936,2.310887,4.552108,1.094327,6.804726,5066.361018,99.753397
4,1005,2019-04-21,0.635581,7.21597,5.310957,0.708176,8.116019,9.217045,5710.476118,99.512503
5,1001,2020-04-20,0.546643,8.676654,6.617975,3.793197,0.189578,0.180422,7763.183214,99.683471
6,1002,2020-04-20,0.060695,0.619986,1.829505,4.545613,8.482659,8.79261,7620.647214,99.829563
7,1003,2020-04-20,0.335496,3.943603,5.452559,2.13342,8.535475,2.040803,8194.756111,99.406206
8,1004,2020-04-20,0.847669,0.795607,6.48711,0.468483,3.698678,5.276331,7870.330874,99.753397
9,1005,2020-04-20,0.484405,5.625281,2.807297,5.041419,9.19602,3.59473,8326.790783,99.512503


In [32]:
# Request the prediction frame with the same `features` request that was passed
# to get_training_df. The bare name on the last line makes the notebook display it.
prediction_df = get_prediction_df(features)
prediction_df


Unnamed: 0,agent_id,feature_1,feature_2,feature_6,zipcode,num_household,total_sales,max_sales
0,1001,8.676654,6.617975,3.793197,0.189578,0.180422,7763.183214,99.683471
1,1002,0.619986,1.829505,4.545613,8.482659,8.79261,7620.647214,99.829563
2,1003,3.943603,5.452559,2.13342,8.535475,2.040803,8194.756111,99.406206
3,1004,0.795607,6.48711,0.468483,3.698678,5.276331,7870.330874,99.753397
4,1005,5.625281,2.807297,5.041419,9.19602,3.59473,8326.790783,99.512503


In [None]:

# Request a training frame whose target entity is the commission *event* entity
# rather than the agent entity.

# get_event_training_df is called below but is absent from the notebook's
# import cell, so the call raised NameError. Import it here.
# NOTE(review): assumes new_api exports get_event_training_df alongside
# get_training_df -- confirm.
from new_api import get_event_training_df

# Midnight UTC of the current day. datetime.now(utc) returns an aware datetime
# directly and replaces datetime.utcnow(), which is deprecated since Python 3.12.
today = datetime.now(utc).replace(hour=0, minute=0, second=0, microsecond=0)

# Weekly timestamps (7-day steps) going back 3 * 52 weeks, oldest first.
days = [today - timedelta(days=7 * weeks_back) for weeks_back in reversed(range(3 * 52))]

agents = [1001, 1002, 1003, 1004, 1005]

n_rows = len(days) * len(agents)

# TODO: the "date" column should not be needed in the observations -- it is the
# same as the event date.
eol_df = pd.DataFrame(
    {
        # Event ids 1000, 1001, ... -- one per (week, agent).
        "id": list(range(1000, 1000 + n_rows)),
        "date": [day for day in days for _ in agents],
        # Uniform random values in [0, 100).
        "prediction": [np.random.rand() * 100 for _ in range(n_rows)],
    }
)

# Names which columns of `eol_df` hold the primary key, the observation date
# and the label.
eol = {"pk": "id", "observation_date": "date", "label": "prediction"}

features = {
    "entity_set": "nyl_agents",
    "target_entity": "agent_commission",
    "features": {
        "agent": ["feature_1", "feature_2", "feature_6"],
        "acxiom": ["zipcode", "num_household"],
        "agent_commission": ["amount", "date", "feature_2", "feature_3", "total_sales", "max_sales"],
    },
    # NOTE(review): the agent-level request uses observation type "eol"; confirm
    # that "el" is a distinct, supported type and not a typo.
    "observations": {"type": "el", "eol": eol, "data": eol_df},
}

df = get_event_training_df(features)

df


In [33]:
from ludwig.api import LudwigModel

ModuleNotFoundError: No module named 'ludwig'