# MLRun
# Nuclio - Inference function
## Environment
Preparation for MLRun (until a pip-installable version is available)

In [1]:
# nuclio: ignore
# Notebook-only setup: install the packages needed to run this notebook.
# nest_asyncio and aiohttp are installed with pip from the package index.
!pip install nest_asyncio
!pip install aiohttp
# nuclio-jupyter is installed straight from GitHub, from its `enhance-spec` branch.
!pip install git+https://github.com/nuclio/nuclio-jupyter.git@enhance-spec
# mlrun is installed straight from the default branch of its GitHub repository.
!pip install git+https://github.com/v3io/mlrun.git

Collecting git+https://github.com/nuclio/nuclio-jupyter.git@enhance-spec
  Cloning https://github.com/nuclio/nuclio-jupyter.git (to revision enhance-spec) to /tmp/pip-req-build-akmgcxx7
Branch 'enhance-spec' set up to track remote branch 'enhance-spec' from 'origin'.
Switched to a new branch 'enhance-spec'
Building wheels for collected packages: nuclio-jupyter
  Running setup.py bdist_wheel for nuclio-jupyter ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-q5tx6v4k/wheels/27/dd/f2/d906ffa5224575ab70b4c6a404d0d2acda1808960485a5624e
Successfully built nuclio-jupyter
Collecting git+https://github.com/v3io/mlrun.git
  Cloning https://github.com/v3io/mlrun.git to /tmp/pip-req-build-m61j3k1t
Building wheels for collected packages: mlrun
  Running setup.py bdist_wheel for mlrun ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-qr0sb_9a/wheels/2b/0a/b1/2800e59ea6571091083a35a67a92d5d5744a64b61928c849ab
Successfully built mlrun


Add MLRun to our Python path (a temporary fix until it is pip-installable)

In [49]:
# nuclio: ignore
import nuclio

### Configurations

In [72]:
# nuclio: ignore

# Environment variables handed to the function spec (none are defined yet)
env_vars = dict()

# Function configuration, keyed by the spec path of each setting:
#   - the base image the function is built from
#   - a single HTTP trigger named 'web', served by one worker
configs = {
    'spec.build.baseImage': 'python:3.6-jessie',
    'spec.triggers': {'web': {'kind': 'http', 'maxWorkers': 1}},
}

# Build commands, one shell command per list entry, in execution order
build_commands = [
    'pip install pyyaml',
    'pip install pyarrow',
    'pip install pandas',
    'pip install v3io_frames --upgrade',
    'pip install scikit-learn==0.20.1',
    'pip install xgboost',
    'pip install git+https://github.com/v3io/mlrun.git',
]


In [88]:
# DB Config
# Each `%nuclio env` line sets one environment variable for the function
# (frames address, user name and access key).
# NOTE(review): `${VAR}` is presumably expanded from the notebook's own
# environment when the cell runs - confirm against the nuclio-jupyter docs.
%nuclio env V3IO_FRAMESD=${V3IO_FRAMESD}
%nuclio env V3IO_USERNAME=${V3IO_USERNAME}
%nuclio env V3IO_ACCESS_KEY=${V3IO_ACCESS_KEY}

%nuclio: setting 'V3IO_FRAMESD' environment variable
%nuclio: setting 'V3IO_USERNAME' environment variable
%nuclio: setting 'V3IO_ACCESS_KEY' environment variable


## Function

In [82]:
# Utilities
import os
import pickle

# ML Pipeline Context
from mlrun import get_or_create_ctx, run_start

# DB Connection
import v3io_frames as v3f

# Dataframe
import pandas as pd

# Model framework
import xgboost as xgb

### Helper functions

In [100]:
def get_data_parquet(client, metrics_table, indexes=None):
    """Load the most recently modified file of a parquet directory.

    Args:
        client: Not used by this function; kept so existing callers keep working.
        metrics_table: Path of the directory that holds the parquet files.
        indexes: Optional list of names to assign to the index levels of the
            loaded frame. When omitted, the index names stored in the file
            are left as they are.

    Returns:
        pandas.DataFrame read from the newest entry in ``metrics_table``.

    Raises:
        FileNotFoundError: If ``metrics_table`` contains no entries.
    """
    # Build the full path of every entry in the table directory
    mpath = [os.path.join(metrics_table, name) for name in os.listdir(metrics_table)]
    if not mpath:
        # max() on an empty list would raise an unhelpful ValueError
        raise FileNotFoundError(f'No parquet files found in {metrics_table}')

    # Get latest filename (by modification time)
    latest = max(mpath, key=os.path.getmtime)

    # Load parquet
    df = pd.read_parquet(latest)

    # Set index names only when the caller asked for it
    if indexes is not None:
        df.index.names = indexes
    return df

In [92]:
def save_to_parquet(df, indexes, features_table):
    """Write ``df`` to a parquet file named after its first and last timestamps.

    The file is written to
    ``<features_table>/<first>-<last>.parquet`` where ``first`` and ``last``
    are the values of the first index level of the first and last rows,
    formatted as ``%Y%m%dT%H%M%S``.

    Args:
        df: DataFrame to save. After ``reset_index()`` it must contain a
            ``timestamp`` column.
        indexes: Column name(s) to use as the index of the saved frame. The
            first one is used for the file name, so it is expected to be the
            timestamp.
        features_table: Target directory; created if it does not exist.

    Returns:
        None. An empty ``df`` is skipped without writing a file.
    """
    print('Saving features to Parquet')

    # Nothing to name the file after, and nothing worth writing
    if len(df) == 0:
        print('No rows to save, skipping Parquet write')
        return

    # Need to fix timestamps from ns to ms if we write to parquet
    df = df.reset_index()
    df['timestamp'] = df['timestamp'].astype('datetime64[ms]')

    # Fix indexes
    df = df.set_index(indexes)

    # Works for both a single-level index and a MultiIndex
    stamps = df.index.get_level_values(0)
    first_timestamp = stamps[0].strftime('%Y%m%dT%H%M%S')
    last_timestamp = stamps[-1].strftime('%Y%m%dT%H%M%S')
    filename = first_timestamp + '-' + last_timestamp + '.parquet'

    # Save parquet, making sure the target directory exists first
    os.makedirs(features_table, exist_ok=True)
    filepath = os.path.join(features_table, filename)
    with open(filepath, 'wb+') as f:
        df.to_parquet(f)

### Handler

In [94]:
# Persistent objects
# Module-level state shared by handler(): the loaded model, the path it was
# loaded from (compared against the requested path to decide on a reload),
# and the frames client used for reading and writing.
model = None
model_filepath = ''
# Prefer the V3IO_FRAMESD address configured for the function; fall back to
# the previous hard-coded address when the variable is unset or empty.
client = v3f.Client(address=os.environ.get('V3IO_FRAMESD') or 'framesd:8081',
                    container='bigdata')

In [110]:
def handler(context, event):
    """Run model inference on a batch of features and deliver the predictions.

    All settings are read from the MLRun context created for ``event``:

    - ``model_filepath``: file to load the XGBoost model from. The model is
      reloaded only when this differs from the path already loaded.
    - ``features_table`` / ``indexes``: input table and its index names.
    - ``input_type``: ``'v3io'`` (TSDB query between ``input_range_start``
      and ``input_range_end``), ``'parquet'`` (newest file in
      ``features_table``) or ``'http'`` (JSON in ``event.body``).
    - ``return_full_df`` / ``prediction_col``: whether the result holds the
      input columns plus the prediction column, or the prediction column only.
    - ``response_type``: ``'return'`` (JSON string), ``'parquet'`` or
      ``'v3io'`` (both written to ``prediction_table``).

    Args:
        context: Function context; only ``context.logger`` is used.
        event: Incoming event; passed to MLRun and, for ``'http'`` input,
            its ``body`` is parsed as JSON.

    Returns:
        The predictions as a JSON string when ``response_type`` is
        ``'return'``; otherwise ``None`` (also on any reported error).
    """
    # Expose persistent objects
    global model
    global model_filepath

    # NOTE(review): the context is named 'netops_training' although this is
    # the inference function - confirm the name is intended.
    mlctx = get_or_create_ctx('netops_training', event=event)

    context.logger.info(
        f'Run: {mlctx.name} uid={mlctx.uid}:{mlctx.iteration}')

    # Do we need to load or change the model ?
    current_model_filepath = mlctx.get_param('model_filepath', '/v3io/bigdata/netops/models/')
    if current_model_filepath != model_filepath:
        try:
            # Load into a local first so a failed load never replaces a
            # working model with an empty one
            candidate = xgb.XGBModel()
            candidate.load_model(current_model_filepath)
        except Exception as exc:
            context.logger.error(
                f'Model could not be loaded from {current_model_filepath}: {exc}')
        else:
            model = candidate
            model_filepath = current_model_filepath

    if model is None:
        context.logger.error('Please specify model_filepath')
        return None

    # Get the data
    features_table = mlctx.get_param('features_table', 'netops_features')
    indexes = mlctx.get_param('indexes', ['timestamp', 'company', 'data_center', 'device'])
    input_type = mlctx.get_param('input_type', 'v3io')
    if input_type == 'v3io':
        range_start = mlctx.get_param('input_range_start', 'now-1m')
        range_end = mlctx.get_param('input_range_end', 'now')
        # Get the data from TSDB
        df = client.read(backend='tsdb', query=f'select * from {features_table}',
                         start=range_start, end=range_end, multi_index=True)

        # Name the index levels
        df.index.names = indexes

    elif input_type == 'parquet':
        # Create the table directory if needed
        os.makedirs(features_table, exist_ok=True)

        # Read the newest parquet file and name its index levels
        df = get_data_parquet(client, features_table)
        df.index.names = indexes

    elif input_type == 'http':
        # Create Dataframe from event
        df = pd.read_json(event.body)

    else:
        context.logger.error(f'Unsupported input_type: {input_type}')
        return None

    # Predict - one line
    prediction = model.predict(df.values)

    # Build response: always a DataFrame, so every response type can handle it
    prediction_col = mlctx.get_param('prediction_col', 'prediction')
    return_full_df = mlctx.get_param('return_full_df', False)
    if return_full_df:
        result = df.copy()
        result[prediction_col] = prediction
        # Only needed when the index fields are still regular columns;
        # the v3io and parquet inputs are already indexed
        if all(name in result.columns for name in indexes):
            result = result.set_index(indexes)
    else:
        result = pd.DataFrame({prediction_col: prediction}, index=df.index)

    # Return response
    prediction_table = mlctx.get_param('prediction_table', '/v3io/bigdata/netops_predictions/')
    response_type = mlctx.get_param('response_type', 'return')
    if response_type == 'return':
        return result.to_json()

    if response_type == 'parquet':
        save_to_parquet(result, indexes, prediction_table)
    elif response_type == 'v3io':
        client.write('tsdb', prediction_table, result)
    else:
        context.logger.error(f'Unsupported response_type: {response_type}')
    return None
    

### Test

In [None]:
# nuclio: ignore
# Run the handler from inside the notebook through MLRun's run_start.
# NOTE(review): `{}` is presumably an empty run spec, so every parameter
# falls back to the defaults read in handler() - confirm against the MLRun API.
resp = run_start({}, handler=handler)

## Deployment

In [96]:
# nuclio: ignore
# Assemble the function spec from the environment variables, build commands
# and configuration dict, plus one volume mount.
# NOTE(review): the meaning of the 'User' and '~/' volume arguments is not
# visible here - confirm against the nuclio-jupyter docs.
spec = nuclio.ConfigSpec(env=env_vars, 
                         cmd=build_commands,
                         config=configs,
                         mount=nuclio.Volume('User','~/'))

# Deploy as function 'inference' in project 'netops_mlrun'.
# NOTE(review): the empty first argument presumably means "this notebook" - confirm.
addr = nuclio.deploy_file('',name='inference', project='netops_mlrun', verbose=False, spec=spec)

[nuclio.deploy] 2019-08-15 14:43:50,859 (info) Building processor image
[nuclio.deploy] 2019-08-15 14:43:52,881 (info) Pushing image
[nuclio.deploy] 2019-08-15 14:43:52,882 (info) Build complete
[nuclio.deploy] 2019-08-15 14:43:58,949 (info) Function deploy complete
[nuclio.deploy] 2019-08-15 14:43:58,957 done updating inference, function address: 3.120.15.118:30064
