# Stream to Parquet

In [1]:
import nuclio

In [2]:
%nuclio env BATCH_SIZE = 1024
%nuclio env TARGET_PATH = /User/examples/rapid-prototype/events-pq
#optional %nuclio env PQ_PARTITIONS = part1,part2

%nuclio: setting 'BATCH_SIZE' environment variable
%nuclio: setting 'TARGET_PATH' environment variable


In [3]:
# Define function spec
%nuclio config kind = "nuclio"

%nuclio: setting kind to 'nuclio'


In [4]:
%%nuclio cmd -c

python -m pip install pandas
python -m pip install pyarrow

In [5]:
%%nuclio config
spec.readinessTimeoutSeconds = 200
spec.triggers.v3io_stream.kind = "v3ioStream"
spec.triggers.v3io_stream.disabled = false
spec.triggers.v3io_stream.url = "http://v3io-webapi:8081/users/${V3IO_USERNAME}/examples/rapid-prototype/incoming-events-stream@stream2pq"
spec.triggers.v3io_stream.maxWorkers = 10
spec.triggers.v3io_stream.password = "${V3IO_ACCESS_KEY}"
spec.triggers.v3io_stream.attributes.pollingIntervalMs = 500
spec.triggers.v3io_stream.attributes.seekTo = "earliest"
spec.triggers.v3io_stream.attributes.readBatchSize = 64


%nuclio: setting spec.readinessTimeoutSeconds to 200
%nuclio: setting spec.triggers.v3io_stream.kind to 'v3ioStream'
%nuclio: setting spec.triggers.v3io_stream.disabled to False
%nuclio: setting spec.triggers.v3io_stream.url to 'http://v3io-webapi:8081/users/michaelk/examples/rapid-prototype/incoming-events-stream@stream2pq'
%nuclio: setting spec.triggers.v3io_stream.maxWorkers to 10
%nuclio: setting spec.triggers.v3io_stream.password to 'b01eb2f1-294a-4f63-b0a6-42561e3e1706'
%nuclio: setting spec.triggers.v3io_stream.attributes.pollingIntervalMs to 500
%nuclio: setting spec.triggers.v3io_stream.attributes.seekTo to 'earliest'
%nuclio: setting spec.triggers.v3io_stream.attributes.readBatchSize to 64


In [6]:
%nuclio mount /User ~/

mounting volume path /User as ~/


In [7]:
# nuclio: start-code

In [8]:
import os
import pandas as pd
import numpy as np
import json
import datetime

In [9]:
def init_context(context):
    setattr(context, 'batch', [])
    setattr(context, 'batch_size', int(os.getenv('BATCH_SIZE', 1024)))
    setattr(context, 'batch_count',int(os.getenv('BATCH_COUNT', 0)))
    
    pq_partitions = os.getenv('PQ_PARTITIONS')
    if pq_partitions:
        setattr(context, 'pq_partitions', pq_partitions.split(','))
    else:
        setattr(context, 'pq_partitions', pq_partitions)
    
    setattr(context, 'target_path', os.getenv('TARGET_PATH'))
    os.makedirs(context.target_path, exist_ok=True)
    

In [10]:
def handler(context, event):
    if type(event.body) is dict:
        event_dict = event.body
    else:
        event_dict = json.loads(event.body)
        
    context.logger.info_with('Got invoked',
                             trigger_kind=event.trigger.kind,
                             event_body=event_dict)
    
    # add the incoming event to the current batch
    context.batch.append(event_dict)
    
    #check if batch size reached
    if context.batch_size == len(context.batch):
        context.logger.info_with('Writing batch',
                                 batch_count=context.batch_count,
                                 batch_size=len(context.batch))
        write_batch(context)
        context.logger.info_with('Written batch',
                                 batch_count=context.batch_count,
                                 batch_size=len(context.batch))
        
        
def write_batch(context):
    file_name = str(context.worker_id)+'_'+str(context.batch_count)
    df = pd.DataFrame.from_records(context.batch)
    df.to_parquet(path=os.path.join(context.target_path, file_name), partition_cols=context.pq_partitions)
    # post write cleanup and counter update
    context.batch = []
    context.batch_count += 1
        
    

In [11]:
# nuclio: end-code

## Test Locally

In [12]:
init_context(context)
#reduce the batch size to 10
context.batch_size = 10

# trigger with 9 events:

nine_events = [b'{"user_id" : 1 , "event_type": "spin"}',
              b'{"user_id" : 2 , "event_type": "spin"}',
              b'{"user_id" : 3 , "event_type": "spin"}',
              b'{"user_id" : 4 , "event_type": "spin"}',
              b'{"user_id" : 5 , "event_type": "spin"}',
              b'{"user_id" : 6 , "event_type": "spin"}',
              b'{"user_id" : 7 , "event_type": "spin"}',
              b'{"user_id" : 8 , "event_type": "spin"}',
              b'{"user_id" : 9 , "event_type": "spin"}']

for e in nine_events:
    event = nuclio.Event(body=e)
    handler(context, event)

Python> 2020-07-28 14:54:34,814 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 1, 'event_type': 'spin'}}
Python> 2020-07-28 14:54:34,814 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 2, 'event_type': 'spin'}}
Python> 2020-07-28 14:54:34,815 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 3, 'event_type': 'spin'}}
Python> 2020-07-28 14:54:34,815 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 4, 'event_type': 'spin'}}
Python> 2020-07-28 14:54:34,816 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 5, 'event_type': 'spin'}}
Python> 2020-07-28 14:54:34,816 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 6, 'event_type': 'spin'}}
Python> 2020-07-28 14:54:34,817 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 7, 'event_type': 'spin'}}
Python> 2020-07-28 14:54:34,817 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 8, 'event_type': 'spin'}}


In [13]:
# check weather a parquet has been created
!ls -l ~/examples/rapid-prototype/events-pq

total 0


In [14]:
# trigger the tenth event which should trigger the creation of the parquet file.
tenth_event = b'{"user_id" : 10 , "event_type": "spin"}'
event = nuclio.Event(body=tenth_event)
handler(context, event)

Python> 2020-07-28 14:55:11,069 [info] Got invoked: {'trigger_kind': '', 'event_body': {'user_id': 10, 'event_type': 'spin'}}
Python> 2020-07-28 14:55:11,070 [info] Writing batch: {'batch_count': 0, 'batch_size': 10}
Python> 2020-07-28 14:55:11,132 [info] Written batch: {'batch_count': 1, 'batch_size': 0}


In [22]:
# check weather a parquet has been created
!ls -l ~/examples/rapid-prototype/events-pq/

total 0


In [21]:
# cleanup
!rm ~/examples/rapid-prototype/events-pq/*

rm: cannot remove '/User/examples/rapid-prototype/events-pq/*': No such file or directory


## Deploy  function

In [23]:
%nuclio deploy -p rapid-prototype-mk -n ${V3IO_USERNAME}-stream-to-parquet

[nuclio] 2020-07-28 14:59:01,108 project name not found created new (rapid-prototype-mk)
[nuclio] 2020-07-28 14:59:03,215 (info) Build complete
[nuclio] 2020-07-28 14:59:09,289 (info) Function deploy complete
[nuclio] 2020-07-28 14:59:09,296 done creating michaelk-stream-to-parquet, function address: 192.168.226.12:31891
%nuclio: function deployed
