# MLRun
# Nuclio - Training function
## Environment
Preparation for MLRun (until a pip-installable version is available)

In [4]:
# nuclio: ignore
!pip install nest_asyncio
!pip install aiohttp
!pip install git+https://github.com/nuclio/nuclio-jupyter.git@enhance-spec
!pip install git+https://github.com/v3io/mlrun.git

Collecting git+https://github.com/nuclio/nuclio-jupyter.git@enhance-spec
  Cloning https://github.com/nuclio/nuclio-jupyter.git (to revision enhance-spec) to /tmp/pip-req-build-3uf_0yf7
Branch 'enhance-spec' set up to track remote branch 'enhance-spec' from 'origin'.
Switched to a new branch 'enhance-spec'
Building wheels for collected packages: nuclio-jupyter
  Running setup.py bdist_wheel for nuclio-jupyter ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-aeapvxjk/wheels/27/dd/f2/d906ffa5224575ab70b4c6a404d0d2acda1808960485a5624e
Successfully built nuclio-jupyter
Collecting git+https://github.com/v3io/mlrun.git
  Cloning https://github.com/v3io/mlrun.git to /tmp/pip-req-build-v1bdxxsf
Collecting kubernetes>=9.0.0 (from mlrun==0.1.2)
[?25l  Downloading https://files.pythonhosted.org/packages/2a/09/365f4ad63f71c698c76edb3e666852b87a751ee4b6d23222b09952557d17/kubernetes-10.0.0-py2.py3-none-any.whl (1.5MB)
[K    100% |████████████████████████████████| 1.5MB 17.5MB

Add MLRun to our Python path (a temporary fix until it can be installed with pip)

### Load nuclio

In [2]:
# nuclio: ignore
import nuclio

## Configurations

In [3]:
# nuclio: ignore

# Environment variables for the function (none are set at the moment).
env_vars = dict()

# Function configuration, keyed by dotted spec path.
configs = dict([
    # Base image used for the build
    ('spec.build.baseImage', 'python:3.6-jessie'),

    # A single HTTP trigger served by one worker
    ('spec.triggers', {'web': dict(kind='http', maxWorkers=1)}),
])

# Build commands, one shell command per list entry.
build_commands = [
    'pip install pyyaml',
    'pip install pyarrow',
    'pip install pandas',
    'pip install v3io_frames --upgrade',
    'pip install scikit-learn==0.20.1',
    'pip install xgboost --upgrade',
    'pip install dask["complete"] --upgrade',
    'pip install dask-ml["complete"] --upgrade',
    'pip install git+https://github.com/v3io/mlrun.git',
]


## Function

In [4]:
# Utils
import os
import time
import yaml
import pandas as pd
import datetime
import itertools
import pickle

# ML Pipeline Context
from mlrun import get_or_create_ctx, run_start

# DB Connection
import v3io_frames as v3f

# Parallelization
import dask.dataframe as dd
from dask.distributed import Client

# Function
import dask_ml.model_selection as dcv
import xgboost as xgb

### Helper functions

In [5]:
def format_df_from_tsdb(df, shards):
    """Flatten a four-level-indexed frame and wrap it with dask.

    The index levels are labelled ``timestamp``, ``company``,
    ``data_center`` and ``device`` and then moved into regular columns.

    Args:
        df: pandas DataFrame whose index has exactly four levels.
        shards: number of partitions passed to ``dd.from_pandas``.

    Returns:
        The result of ``dd.from_pandas`` for the flattened frame.

    Note:
        The caller's frame is not modified: the level names are applied to a
        renamed copy instead of being assigned onto ``df.index`` in place.
    """
    index_levels = ['timestamp', 'company', 'data_center', 'device']
    # rename_axis returns a new object, so the input index keeps its own names.
    flat = df.rename_axis(index_levels).reset_index()
    return dd.from_pandas(flat, npartitions=shards)

In [6]:
def get_data_tsdb(client, features_table, train_on_last, shards):
    """Read a recent window of a TSDB table into a dask DataFrame.

    Args:
        client: object exposing a ``read`` method (a v3io_frames client in
            the handler).
        features_table: table name interpolated into the ``select *`` query.
        train_on_last: look-back window; the read starts at
            ``now-<train_on_last>`` and ends at ``now``.
        shards: number of partitions passed to ``dd.from_pandas``.

    Returns:
        The result of ``dd.from_pandas`` for the frame, after its index has
        been dropped and its columns sorted by name.
    """
    read_kwargs = {
        'backend': 'tsdb',
        'query': f'select * from {features_table}',
        'start': f'now-{train_on_last}',
        'end': 'now',
        'multi_index': True,
    }
    raw = client.read(**read_kwargs)

    # The index is discarded entirely (not turned into columns).
    flat = raw.reset_index(drop=True)

    # Sort columns by name so the column order is deterministic.
    ordered = flat[sorted(flat.columns)]

    return dd.from_pandas(ordered, npartitions=shards)

In [7]:
def get_data_parquet(features_table):
    """Load the most recently modified entry of a directory with dask.

    Args:
        features_table: path of the directory that holds the parquet data.

    Returns:
        The result of ``dd.read_parquet`` for the entry with the latest
        modification time.

    Raises:
        ValueError: if the directory contains no entries. Previously this
            surfaced as the opaque "max() arg is an empty sequence".
        OSError: propagated from ``os.listdir`` when the directory cannot
            be listed.
    """
    # Every directory entry is a candidate; no filtering by extension is done.
    candidates = [os.path.join(features_table, entry)
                  for entry in os.listdir(features_table)]

    if not candidates:
        raise ValueError(f"No parquet files found in '{features_table}'")

    # Pick the entry that was modified last
    latest = max(candidates, key=os.path.getmtime)

    # Load parquet to dask
    return dd.read_parquet(latest)

In [8]:
def get_train_test_sets_from_data(df: pd.DataFrame, train_size: float=0.7, ):
    """Split a feature frame into train and test sets.

    Every column whose name contains ``is_error`` is removed from the
    features; the column named exactly ``is_error`` is used as the target.

    Args:
        df: frame holding the feature columns and an ``is_error`` column.
        train_size: fraction of rows assigned to the training set; the test
            set receives ``1 - train_size``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` as produced by
        ``dcv.train_test_split``.
    """
    # Substring match: this also removes columns such as '<metric>_is_error'.
    label_columns = [name for name in df.columns if 'is_error' in name]

    features = df.drop(label_columns, axis=1)
    target = df.loc[:, 'is_error']

    splits = dcv.train_test_split(
        features,
        target,
        train_size=train_size,
        test_size=1-train_size,
    )
    train_features, test_features, train_target, test_target = splits
    return train_features, test_features, train_target, test_target

### Handler

In [9]:
def handler(context, event):
    """Train an XGBoost classifier on the features table and save the model.

    Parameters are read from the mlrun context (name -> default):
        from_tsdb      -> True   read from the TSDB when truthy, otherwise
                                 from a parquet directory
        features_table -> 'netops_metrics'
        model_filepath -> '/v3io/bigdata/netops/models/'
        TRAIN_ON_LAST  -> '7d'   look-back window for the TSDB read
        shards         -> 4      number of dask partitions (coerced to int)
        TRAIN_SIZE     -> 0.7    training fraction (coerced to float)

    Args:
        context: function context; only ``context.logger.info`` is used.
        event: incoming event, forwarded to ``get_or_create_ctx``.

    Returns:
        None. The test-set score is logged as the ``accuracy`` result and the
        saved model file is logged as the ``model`` artifact.
    """
    mlctx = get_or_create_ctx('netops_training', event=event)

    context.logger.info(
        f'Run: {mlctx.name} uid={mlctx.uid}:{mlctx.iteration}')

    # Get properties from mlrun context
    from_tsdb = mlctx.get_param('from_tsdb', True)
    features_table = mlctx.get_param('features_table', 'netops_metrics')
    model_filepath = mlctx.get_param('model_filepath', '/v3io/bigdata/netops/models/')
    train_on_last = mlctx.get_param('TRAIN_ON_LAST', '7d')
    # Coerce numeric parameters so values that arrive as strings still work.
    shards = int(mlctx.get_param('shards', 4))
    train_set_size = float(mlctx.get_param('TRAIN_SIZE', 0.7))

    # Make sure the model directory exists
    os.makedirs(model_filepath, exist_ok=True)

    # Handle source specific part
    if from_tsdb:
        # Create V3IO client
        client = v3f.Client(address='framesd:8081')

        # Create features table. This is deliberately best-effort: a failure
        # is logged and the read below is still attempted. Only Exception is
        # caught so KeyboardInterrupt / SystemExit are not swallowed.
        try:
            client.create('tsdb', features_table, attrs={'rate': '1/s'}, if_exists=1)
        except Exception as err:
            context.logger.info(
                f'Frames bug IG-12910 handling {features_table}: {err}')

        # Get training data from the v3io source
        df = get_data_tsdb(client, features_table, train_on_last, shards)

    else:
        # Make sure the features directory exists
        os.makedirs(features_table, exist_ok=True)

        # Get training data from the parquet source
        df = get_data_parquet(features_table)

    # Split to Train / Test datasets
    X_train, X_test, y_train, y_test = get_train_test_sets_from_data(df, train_set_size)

    # Train
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # Score
    score = model.score(X_test, y_test)
    mlctx.log_result('accuracy', score)

    # Save model. os.path.join avoids the doubled separator that string
    # concatenation produced when model_filepath ends with '/'.
    full_model_path = os.path.join(model_filepath, 'netops_xgb.model')
    model.save_model(full_model_path)
    mlctx.log_artifact('model', full_model_path)


## Test

In [None]:
# nuclio: ignore
# Local smoke test: call the handler directly with an empty-body event.
# NOTE(review): `context` is not defined in any visible cell of this notebook --
# presumably it is provided by the nuclio Jupyter integration; confirm before
# running on a fresh kernel.
event = nuclio.Event(body='')
output = handler(context, event)
# `handler` has no return statement, so this displays nothing (None).
output

## Deployment

In [11]:
# nuclio: ignore
# Execute `handler` through mlrun's `run_start`, passing an empty dict as the
# first argument, and keep the response in `resp`.
# NOTE(review): presumably this runs the handler locally in the notebook kernel
# rather than on a deployed function -- confirm against the mlrun docs.
resp = run_start({}, handler=handler)

Python> 2019-08-09 09:00:03,747 [info] Run: netops_training uid=0b2b0c12fa65400ea7068c6a44972860:0


uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
...972860,,,completed,,owner=iguaziohost=jupyter-70u91h6hx0-mmfj1-56b8bf8cf5-447q7runtime=handlerrepo=https://github.com/v3io/tutorials.gitcommit=ead08b6662dbddced9974f5973c95396622e8407,,,,


In [12]:
# nuclio: ignore

### Working ###
# Assemble the deployment spec from the values defined in the configuration
# cell: environment variables, build commands and function config, plus a
# volume built from nuclio.Volume('User','~/').
spec = nuclio.ConfigSpec(env=env_vars, 
                         cmd=build_commands,
                         config=configs,
                         mount=nuclio.Volume('User','~/'))

# Deploy as function 'training' in project 'netops_mlrun'; the return value is
# stored in `addr`.
# NOTE(review): the empty string passed as the first argument presumably means
# "deploy the current notebook" -- confirm against the nuclio-jupyter docs.
addr = nuclio.deploy_file('',name='training', project='netops_mlrun', verbose=False, spec=spec)

[nuclio.deploy] 2019-08-09 09:04:44,566 (info) Building processor image
[nuclio.deploy] 2019-08-09 09:04:47,610 (info) Pushing image
[nuclio.deploy] 2019-08-09 09:04:47,611 (info) Build complete
[nuclio.deploy] 2019-08-09 09:04:53,690 (info) Function deploy complete
[nuclio.deploy] 2019-08-09 09:04:53,697 done updating training, function address: 3.120.15.118:32242
