# Stream to Features
  --------------------------------------------------------------------

##### This notebook creates a function that builds feature vectors from streaming events


## Create and Test a Local Function 
Import the nuclio SDK and magics. <b>Do not remove this cell or its `# nuclio: ignore` comment!</b>

In [1]:
# nuclio: ignore
import nuclio

#### Function imports

In [2]:
# nuclio: start-code

In [3]:
import os
import json
import numpy as np
from v3io import dataplane, common
from datetime import datetime

<b>Specify function dependencies and configuration</b>

In [4]:
%nuclio cmd -c pip install v3io numpy

In [5]:
%%nuclio env
V3IO_ACCESS_KEY = ${V3IO_ACCESS_KEY}
CONTAINER = users
FEATURE_TABLE_PATH = ${V3IO_USERNAME}/examples/rapid-prototype/feature-table
FEATURE_LIST = ['socioeconomic_idx','purchase_sum','purchase_mean','purchase_count','purchase_var','bet_sum','bet_mean','bet_count','bet_var','win_sum','win_mean','win_count','win_var']
SERVING_EVENTS = ['bet','win']
OUTPUT_STREAM_PATH = ${V3IO_USERNAME}/examples/rapid-prototype/serving-stream
SHARDS_COUNT = 8
PARTITION_ATTR = user_id


%nuclio: setting 'V3IO_ACCESS_KEY' environment variable
%nuclio: setting 'CONTAINER' environment variable
%nuclio: setting 'FEATURE_TABLE_PATH' environment variable
%nuclio: setting 'FEATURE_LIST' environment variable
%nuclio: setting 'SERVING_EVENTS' environment variable
%nuclio: setting 'OUTPUT_STREAM_PATH' environment variable
%nuclio: setting 'SHARDS_COUNT' environment variable
%nuclio: setting 'PARTITION_ATTR' environment variable


In [6]:
%%nuclio config
spec.triggers.v3io_stream.kind = "v3ioStream"
spec.triggers.v3io_stream.disabled = false
spec.triggers.v3io_stream.url = "http://v3io-webapi:8081/users/${V3IO_USERNAME}/examples/rapid-prototype/enriched-events-stream@stream2features"
spec.triggers.v3io_stream.maxWorkers = 10
spec.triggers.v3io_stream.password = "${V3IO_ACCESS_KEY}"
spec.triggers.v3io_stream.attributes.pollingIntervalMs = 500
spec.triggers.v3io_stream.attributes.seekTo = "earliest"
spec.triggers.v3io_stream.attributes.readBatchSize = 64


%nuclio: setting spec.triggers.v3io_stream.kind to 'v3ioStream'
%nuclio: setting spec.triggers.v3io_stream.disabled to False
%nuclio: setting spec.triggers.v3io_stream.url to 'http://v3io-webapi:8081/users/michaelk/examples/rapid-prototype/enriched-events-stream@stream2features'
%nuclio: setting spec.triggers.v3io_stream.maxWorkers to 10
%nuclio: setting spec.triggers.v3io_stream.password to '<redacted>'
%nuclio: setting spec.triggers.v3io_stream.attributes.pollingIntervalMs to 500
%nuclio: setting spec.triggers.v3io_stream.attributes.seekTo to 'earliest'
%nuclio: setting spec.triggers.v3io_stream.attributes.readBatchSize to 64


### Manage output stream

In [7]:
# nuclio: ignore
# Local notebook setup for managing the output stream: read the settings that the
# `%%nuclio env` cell placed in the environment and build a v3io client from them.
# NOTE(review): the `nuclio: ignore` marker presumably keeps this cell out of the
# deployed function code - confirm against the nuclio-jupyter documentation.


V3IO_ACCESS_KEY = os.getenv('V3IO_ACCESS_KEY')
CONTAINER = os.getenv('CONTAINER')
OUTPUT_STREAM_PATH = os.getenv('OUTPUT_STREAM_PATH')
# int() raises TypeError when SHARDS_COUNT is unset, so the env cell must run first.
SHARDS_COUNT = int(os.getenv('SHARDS_COUNT'))

# Client used by the stream-creation cell that follows.
v3io_client = dataplane.Client(endpoint='http://v3io-webapi:8081', access_key=V3IO_ACCESS_KEY)


In [None]:
# nuclio: ignore
# Create the output stream (at OUTPUT_STREAM_PATH in CONTAINER, with SHARDS_COUNT shards)
# using the client and settings defined in the previous cell.
# NOTE(review): running this again when the stream already exists presumably fails or
# returns a non-2xx status - confirm before relying on Restart & Run All.
resp = v3io_client.create_stream(container=CONTAINER,
                           path=OUTPUT_STREAM_PATH,
                           shard_count=SHARDS_COUNT)
# Show the HTTP status of the create call as the cell output.
resp.status_code

## Function code

In [8]:
def init_context(context):
    """Attach the v3io client and the runtime settings to the nuclio ``context``.

    Settings are read from the environment variables V3IO_ACCESS_KEY,
    CONTAINER, FEATURE_TABLE_PATH, FEATURE_LIST, SERVING_EVENTS,
    OUTPUT_STREAM_PATH and PARTITION_ATTR. FEATURE_LIST and SERVING_EVENTS
    hold list literals such as ``['bet','win']``.

    After the call ``context`` carries: ``v3io_client``, ``container``,
    ``feature_table_path``, ``feature_list``, ``serving_events``,
    ``output_stream_path``, ``partition_attr`` and ``event_handlers`` (a map
    from event type to its processing function).

    Raises:
        ValueError: if FEATURE_LIST or SERVING_EVENTS is not set.
    """
    def read_list(name):
        # Both list variables share one parser so that optional whitespace
        # around items (``['a', 'b']``) and either quote style are accepted
        # for each of them, and an empty list yields [] rather than [''].
        raw = os.getenv(name)
        if raw is None:
            raise ValueError(f'environment variable {name} is not set')
        items = (item.strip().strip('\'"') for item in raw.strip().strip('[]').split(','))
        return [item for item in items if item]

    access_key = os.getenv('V3IO_ACCESS_KEY')
    container = os.getenv('CONTAINER')
    feature_table_path = os.getenv('FEATURE_TABLE_PATH')
    feature_list = read_list('FEATURE_LIST')
    serving_events = read_list('SERVING_EVENTS')
    output_stream_path = os.getenv('OUTPUT_STREAM_PATH')
    partition_attr = os.getenv('PARTITION_ATTR')

    context.v3io_client = dataplane.Client(endpoint='http://v3io-webapi:8081', access_key=access_key)
    context.container = container
    context.feature_table_path = feature_table_path
    context.feature_list = feature_list
    context.serving_events = serving_events
    context.output_stream_path = output_stream_path
    context.partition_attr = partition_attr
    # Dispatch table used by the handler to pick the processing function.
    context.event_handlers = {'registration': process_registration,
                              'purchase': process_purchase,
                              'bet': process_bet,
                              'win': process_win}
    


def handler(context, event):
    """Nuclio entry point: process one incoming stream event.

    The event body is used directly when it is already a dict, otherwise it is
    decoded with ``json.loads``. Events whose type has a registered handler in
    ``context.event_handlers`` are processed by that handler; when the type is
    also listed in ``context.serving_events`` and the handler's response status
    is 2xx, the event is forwarded via ``write_to_output_stream``. Everything
    else is logged as not relevant.

    Args:
        context: nuclio context prepared by ``init_context``.
        event: nuclio event; only ``event.body`` is read.

    Returns:
        None.
    """
    body = event.body
    # isinstance rather than an exact type comparison, so dict subclasses are
    # used as-is instead of being handed to json.loads (which rejects them).
    event_dict = body if isinstance(body, dict) else json.loads(body)

    # A decoded payload that is not a JSON object (e.g. a list) cannot carry an
    # event type, so it is treated like any other irrelevant event.
    if not isinstance(event_dict, dict) or not is_relevant_event(context, event_dict):
        context.logger.info('Not relevant event')
        return

    event_type = get_event_type(event_dict)
    context.logger.info(f'Incoming event type: {event_type}')

    # The handler table plays the role of a switch-case on the event type.
    process_func = context.event_handlers.get(event_type)
    context.logger.info(f'Processing event {event_dict}')
    response = process_func(context, event_dict)
    context.logger.info(f'Finished processing with status: {response.status_code} - and response body: {response.body} , event: {event_dict}')

    # Only successfully stored serving events are pushed to the output stream.
    if event_type in context.serving_events and (200 <= response.status_code < 300):
        context.logger.info('sending event for serving')
        write_to_output_stream(context, event_dict)

        
def get_event_type(event):
    """Return the value stored under ``'event_type'`` in ``event``.

    Returns ``None`` when the key is absent, so an event without a type is
    reported as not relevant by the relevance check instead of raising
    ``KeyError`` in the middle of stream processing.
    """
    return event.get('event_type')


def is_relevant_event(context, event):
    """Tell whether ``event`` has a type with a registered handler.

    Returns True when the event's type is a key of ``context.event_handlers``.
    """
    known_types = context.event_handlers
    event_type = get_event_type(event)
    return event_type in known_types
        
def get_features(context, event):
    """Read the user's feature item and return it as a JSON serving payload.

    The item stored under ``<feature_table_path>/<user_id>`` is fetched with
    ``raise_for_status`` set to ``never``; the attributes named in
    ``context.feature_list`` are picked in that order (``None`` for an
    attribute the item does not have) and wrapped as a single row.

    Returns:
        A JSON string of the form ``{"instances": [[v1, v2, ...]]}``.
    """
    item_key = str(event['user_id'])
    wanted = context.feature_list
    response = context.v3io_client.get_item(
        container=context.container,
        path=common.helpers.url_join(context.feature_table_path, item_key),
        raise_for_status=dataplane.RaiseForStatus.never,
    )

    values = []
    for name in wanted:
        values.append(response.output.item.get(name))

    # reshape(1, -1) turns the flat vector into a one-row matrix.
    row = np.array(values).reshape(1, -1)
    return json.dumps({'instances': row.tolist()})
    
def write_to_output_stream(context, event):
    """Send the user's current feature vector to the output (serving) stream.

    One record is written whose partition key is the string form of the event
    attribute named by ``context.partition_attr`` and whose data is the JSON
    payload produced by ``get_features``.

    The write uses ``raise_for_status`` set to ``never``, so the response
    status is checked here: success is logged at info level, any non-2xx
    status is logged as an error instead of being reported as sent.
    """
    partition_key = event.get(context.partition_attr)
    data = get_features(context, event)

    record = {'partition_key': str(partition_key), 'data': data}
    resp = context.v3io_client.put_records(container=context.container,
                                           path=context.output_stream_path,
                                           records=[record],
                                           raise_for_status=dataplane.RaiseForStatus.never)

    if 200 <= resp.status_code < 300:
        context.logger.info(f'Sent features for user: {event["user_id"]} to serving stream')
    else:
        context.logger.error(f'Failed to send features for user: {event["user_id"]} to serving stream, status: {resp.status_code}')


def event_time_to_ts(event_time):
    """Convert an event time string to a POSIX timestamp (float seconds).

    Accepts ``YYYY-MM-DD HH:MM:SS`` with or without a fractional-seconds part
    (``.ffffff``). The parsed datetime is naive, so ``datetime.timestamp``
    interprets it in the local timezone of the running process.

    Raises:
        ValueError: if ``event_time`` matches neither format.
    """
    # Try the fractional form first; events with whole seconds omit the ".%f" part.
    for fmt in ('%Y-%m-%d %H:%M:%S.%f', '%Y-%m-%d %H:%M:%S'):
        try:
            parsed = datetime.strptime(event_time, fmt)
        except ValueError:
            continue
        return parsed.timestamp()
    raise ValueError(f'unsupported event time format: {event_time!r}')


def get_sum_count_mean_var_expr(feature: str, current_value):
    """Build the update expression that folds one value into a feature's running stats.

    The result is six ``SET`` statements concatenated in this order:
    ``<feature>_sum``, ``_count``, ``_delta``, ``_mean``, ``_m2`` and ``_var``.
    ``current_value`` is interpolated into the expression text as-is.

    NOTE(review): the delta / mean / m2 sequence has the shape of an
    incremental (Welford-style) variance update and presumably relies on the
    statements being applied in order - confirm against the v3io expression
    semantics.
    """
    templates = (
        "SET {f}_sum= if_not_exists({f}_sum, 0) + {x};",
        "SET {f}_count= if_not_exists({f}_count, 0) + 1;",
        "SET {f}_delta= {x} - if_not_exists({f}_mean, 0);",
        "SET {f}_mean= if_not_exists({f}_mean, 0) + ({f}_delta / {f}_count);",
        "SET {f}_m2= if_not_exists({f}_m2, 0) + ({f}_delta * ({x} - {f}_mean));",
        # max(2, count) keeps the divisor at 1 or more for the first observation.
        "SET {f}_var= {f}_m2 / (max(2, {f}_count)-1) ;",
    )
    return "".join(template.format(f=feature, x=current_value) for template in templates)


def update_features(context, user_id, expression, condition):
    """Apply an update expression to the user's item in the feature table.

    The item lives at ``<feature_table_path>/<user_id>``. The update is sent
    with the given ``condition`` and with ``raise_for_status`` set to
    ``never``; the client's response is returned to the caller.
    """
    item_path = common.helpers.url_join(context.feature_table_path, str(user_id))
    request = {
        'container': context.container,
        'path': item_path,
        'condition': condition,
        'expression': expression,
        'raise_for_status': dataplane.RaiseForStatus.never,
    }
    return context.v3io_client.update_item(**request)


def process_registration(context, event):
    """Store the registration attributes as the user's item in the feature table.

    The item is written to ``<feature_table_path>/<user_id>`` with
    ``raise_for_status`` set to ``never``. The event's ``event_time`` is stored
    under ``registration_date``; the other attributes keep their event names.

    Returns:
        The response of the ``put_item`` call.
    """
    user_id = event['user_id']

    # (stored attribute name, event key) pairs, in the order they are written.
    attribute_sources = (
        ('user_id', 'user_id'),
        ('registration_date', 'event_time'),
        ('date_of_birth', 'date_of_birth'),
        ('socioeconomic_idx', 'socioeconomic_idx'),
        ('affiliate_url', 'affiliate_url'),
        ('label', 'label'),
    )
    attributes = {stored_name: event[event_key] for stored_name, event_key in attribute_sources}

    item_path = common.helpers.url_join(context.feature_table_path, str(user_id))
    return context.v3io_client.put_item(container=context.container,
                                        path=item_path,
                                        attributes=attributes,
                                        raise_for_status=dataplane.RaiseForStatus.never)


def process_purchase(context, event):
    """Fold a purchase event into the user's purchase statistics.

    The update sets ``first_purchase_ts`` if it is not set yet and updates the
    ``purchase_*`` running statistics with the event's ``amount``. It is
    conditional on the item having a ``registration_date`` and on
    ``first_purchase_ts`` being either absent or no more than 86400 seconds
    before this event's timestamp.

    Returns:
        The response of ``update_features``.
    """
    user_id = event['user_id']
    event_ts = event_time_to_ts(event['event_time'])
    amount = event['amount']

    # Update expression: remember the first purchase time, then the running stats.
    remember_first_purchase = f"SET first_purchase_ts=if_not_exists(first_purchase_ts, {event_ts});"
    expression = remember_first_purchase + get_sum_count_mean_var_expr('purchase', amount)

    # Condition: a registered user whose first purchase is missing or recent enough.
    within_window = f"first_purchase_ts >= ({event_ts} - 86400 )"
    condition = f"exists(registration_date) AND (NOT exists(first_purchase_ts) OR {within_window})"

    return update_features(context, user_id, expression, condition)


def process_bet(context, event):
    """Fold a bet event into the user's bet statistics.

    The ``bet_*`` running statistics are updated with the event's
    ``bet_amount``, on the condition that the item's ``first_purchase_ts`` is
    no more than 86400 seconds before this event's timestamp.

    Returns:
        The response of ``update_features``.
    """
    user_id = event['user_id']
    event_ts = event_time_to_ts(event['event_time'])
    amount = event['bet_amount']

    stats_update = get_sum_count_mean_var_expr('bet', amount)
    # Only events inside the window that starts at the first purchase are counted.
    within_window = f"first_purchase_ts >= ({event_ts} - 86400 )"

    return update_features(context, user_id, stats_update, within_window)


def process_win(context, event):
    """Fold a win event into the user's win statistics.

    The ``win_*`` running statistics are updated with the event's
    ``win_amount``, on the condition that the item's ``first_purchase_ts`` is
    no more than 86400 seconds before this event's timestamp.

    Returns:
        The response of ``update_features``.
    """
    user_id = event['user_id']
    event_ts = event_time_to_ts(event['event_time'])
    amount = event['win_amount']

    stats_update = get_sum_count_mean_var_expr('win', amount)
    # Only events inside the window that starts at the first purchase are counted.
    within_window = f"first_purchase_ts >= ({event_ts} - 86400 )"

    return update_features(context, user_id, stats_update, within_window)


The following end-code annotation tells ```nuclio``` to stop parsing the notebook from this cell. _**Please do not remove this cell**_:

In [9]:
# nuclio: end-code
# marks the end of a code section

## Test locally

In [10]:
# Sample events for a single user: a registration, followed by a purchase and a bet
# with timestamps a fraction of a second after it.
reg_event = nuclio.Event(body=b'{"user_id" : 111111 ,"affiliate_url":"aa.biz", "event_type": "registration", "postcode": 11014, "event_time": "2020-07-20 11:00:00","date_of_birth": "1970-03-03", "socioeconomic_idx": 3, "label":0}')
pur_event = nuclio.Event(body=b'{"user_id" : 111111 ,"amount": 3000, "event_type": "purchase", "event_time": "2020-07-20 11:00:00.009"}') 
bet_event = nuclio.Event(body=b'{"user_id" : 111111 ,"bet_amount": 300, "event_type": "bet", "event_time": "2020-07-20 11:00:00.889"}') 
# NOTE(review): `context` is not defined anywhere in this notebook; presumably it is
# injected into the namespace by `import nuclio` - confirm.
init_context(context)
# Replay the events in order; the purchase and bet updates are conditional on the
# item written by the registration event.
handler(context, reg_event)
handler(context, pur_event)
handler(context, bet_event)




Python> 2020-07-30 14:38:50,113 [info] Incoming event type: registration
Python> 2020-07-30 14:38:50,113 [info] Processing event {'user_id': 111111, 'affiliate_url': 'aa.biz', 'event_type': 'registration', 'postcode': 11014, 'event_time': '2020-07-20 11:00:00', 'date_of_birth': '1970-03-03', 'socioeconomic_idx': 3, 'label': 0}
Python> 2020-07-30 14:38:50,115 [info] Finished processing with status: 200 - and response body: b'' , event: {'user_id': 111111, 'affiliate_url': 'aa.biz', 'event_type': 'registration', 'postcode': 11014, 'event_time': '2020-07-20 11:00:00', 'date_of_birth': '1970-03-03', 'socioeconomic_idx': 3, 'label': 0}
Python> 2020-07-30 14:38:50,116 [info] Incoming event type: purchase
Python> 2020-07-30 14:38:50,116 [info] Processing event {'user_id': 111111, 'amount': 3000, 'event_type': 'purchase', 'event_time': '2020-07-20 11:00:00.009'}
Python> 2020-07-30 14:38:50,119 [info] Finished processing with status: 200 - and response body: b'' , event: {'user_id': 111111, 'am

## Deploy function

In [11]:
%nuclio deploy -p rapid-prototype-mk -n ${V3IO_USERNAME}-stream-to-features

[nuclio] 2020-07-30 14:39:13,000 (info) Build complete
[nuclio] 2020-07-30 14:39:16,051 done updating michaelk-stream-to-features, function address: 192.168.226.12:30268
%nuclio: function deployed
