In [None]:
manifest = {
    # Resource limits requested for the deployed app.
    # NOTE(review): units are not stated in this file — presumably MB; confirm.
    'memory': 2048,
    'disk_quota': 2048,
    'buildpack': 'python_buildpack',
    # Packages required by the cells of this notebook; no versions are pinned.
    # NOTE(review): the import cell uses `sklearn.cross_validation` and
    # `sklearn.externals.joblib`, which recent scikit-learn releases no longer
    # ship — consider pinning a compatible scikit-learn version here.
    'requirements': [
        'numpy',
        'pandas',
        'scikit-learn',
        'influxdb',
        'requests',
        'scipy',
        'urllib3',
        'afs'
    ],
    # Presumably marks the notebook as an HTTP API (it contains a `# POST /`
    # cell) — TODO confirm against the platform documentation.
    'type': 'API'
}

In [1]:
from afs import config_handler
from pandas import DataFrame
import json
cfg = config_handler()

# (name, declared type, default) of every parameter this node exposes.
# All of them are registered as required, in the order listed here.
# The tree hyper-parameters are declared as strings because they are later
# split on commas into lists of candidate values.
_PARAM_SPECS = (
    ('criterion', 'string', "gini"),
    ('random_state', 'string', "2"),
    ('max_depth', 'string', "3"),
    ('K_fold', 'integer', 10),
    ('model_name', 'string', "model.pkl"),
)
for _name, _kind, _default in _PARAM_SPECS:
    cfg.set_param(_name, type=_kind, required=True, default=_default)

cfg.set_features(True)
cfg.set_column('data')
cfg.summary()


{"features": true, "param": [{"name": "criterion", "type": "string", "required": true, "default": "gini"}, {"name": "random_state", "type": "string", "required": true, "default": "2"}, {"name": "max_depth", "type": "string", "required": true, "default": "3"}, {"name": "K_fold", "type": "integer", "required": true, "default": 10}, {"name": "model_name", "type": "string", "required": true, "default": "model.pkl"}], "column": ["data"]}


In [None]:
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.externals import joblib
from afs import models
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import pandas as pd
import numpy as np
import json
import requests


In [None]:
def grid(data, target, parameters_dt, cv):
    """Grid-search a decision tree classifier and report the winning setup.

    Parameters
    ----------
    data : feature matrix handed to ``GridSearchCV.fit``.
    target : label vector handed to ``GridSearchCV.fit``.
    parameters_dt : dict
        Parameter grid passed as ``param_grid``.
    cv :
        Cross-validation setting passed straight through as ``cv``.

    Returns
    -------
    tuple
        ``(best_score_, best_params_)`` of the fitted search, scored by
        ``'accuracy'``.
    """
    search = GridSearchCV(
        estimator=tree.DecisionTreeClassifier(),
        param_grid=parameters_dt,
        cv=cv,
        scoring='accuracy',
    )
    search.fit(data, target)
    return search.best_score_, search.best_params_

In [None]:
def training_model(data, target, best_params, best_accuracy, model_name):
    """Fit the final decision tree, persist it and upload it.

    Parameters
    ----------
    data, target :
        Training features and labels passed to ``fit``.
    best_params : dict
        Keyword arguments for ``tree.DecisionTreeClassifier``.
    best_accuracy :
        Value reported as ``accuracy`` when uploading the model.
    model_name : str
        File name the fitted model is dumped to, and the name given to
        ``upload_model``.

    Returns
    -------
    The ``model_name`` that was passed in.
    """
    fitted_tree = tree.DecisionTreeClassifier(**best_params).fit(data, target)

    # Persist the fitted estimator to disk before handing the file to the uploader.
    joblib.dump(fitted_tree, model_name)

    uploader = models()
    uploader.upload_model(
        model_name,
        accuracy=best_accuracy,
        loss=0.0,
        tags={'machine': 'dt'},
    )
    return model_name

In [None]:
def parameters(criterion , random_state , max_depth):
    """Turn comma-separated hyper-parameter strings into a parameter grid.

    Parameters
    ----------
    criterion : str
        Comma-separated split criteria. Only ``"gini"`` and ``"entropy"``
        are kept; anything else is ignored.
    random_state : str
        Comma-separated non-negative integers.
    max_depth : str
        Comma-separated non-negative integers.

    Returns
    -------
    dict
        ``{"criterion": [...], "random_state": [...], "max_depth": [...]}``.
        A list that ends up empty after filtering falls back to its default:
        ``["gini"]``, ``[2]`` and ``[3]`` respectively.
    """
    def _tokens(raw):
        # Surrounding whitespace is not significant: "gini, entropy" == "gini,entropy".
        return [piece.strip() for piece in raw.split(",")]

    def _int_choices(raw, fallback):
        # isdecimal() (rather than isdigit()) only accepts characters that
        # int() can actually parse, so stray symbols such as a superscript
        # digit are skipped instead of raising ValueError.
        values = [int(tok) for tok in _tokens(raw) if tok.isdecimal()]
        return values if values else [fallback]

    allowed_criteria = ("gini", "entropy")
    criteria = [name for name in _tokens(criterion) if name in allowed_criteria]
    if not criteria:
        criteria = ["gini"]

    return {
        "criterion": criteria,
        "random_state": _int_choices(random_state, 2),
        "max_depth": _int_choices(max_depth, 3),
    }

In [None]:
def pre_process_features(select_feature,numerical_features,target,df):
    """Build the feature matrix and label vector used for training.

    Parameters
    ----------
    select_feature : list
        Column names selected for the model.
    numerical_features : list
        Column names to use as-is (no encoding).
    target :
        Name of the label column in ``df``.
    df : pandas.DataFrame
        Source data.

    Returns
    -------
    tuple
        ``(data, target)`` where ``data`` is a 2-D array with one row per row
        of ``df`` and ``target`` is the flattened label column.

    Notes
    -----
    * The ``"time"`` column is never used as a feature.
    * Selected columns that are neither numerical nor the target are treated
      as categorical: columns whose values are all ``str`` go through
      ``LabelBinarizer``, columns whose values are all ``int`` go through
      ``OneHotEncoder``. Columns of any other (or mixed) type are skipped.
    """
    ignored = ("time",)

    # Compare against the target by equality: a substring test would drop a
    # feature such as "a" whenever the target name merely contains it ("label").
    categorical = [
        col for col in select_feature
        if col not in numerical_features and col != target and col not in ignored
    ]
    numerical = [col for col in numerical_features if col not in ignored]

    labels = np.array(df.loc[:, [target]]).ravel()

    # Emptiness is decided *after* "time" has been removed, so a list holding
    # only "time" is handled like an empty list instead of failing on [0].
    if numerical:
        data = np.array(df.loc[:, numerical])
    else:
        # Zero-width matrix so encoded columns can still be hstack-ed onto it.
        data = np.array([]).reshape(df.shape[0], 0)

    for col in categorical:
        values = df[col].tolist()
        if all(type(value) is str for value in values):
            encoded = LabelBinarizer().fit_transform(df[col])
            data = np.hstack((data, encoded))
        elif all(type(value) is int for value in values):
            # The encoder's default output is a SciPy sparse matrix; calling
            # .toarray() densifies it without relying on the `sparse` keyword,
            # whose name differs between scikit-learn releases.
            encoded = OneHotEncoder().fit_transform(
                df[col].values.reshape(-1, 1)
            ).toarray()
            data = np.hstack((data, encoded))

    return data, labels

In [None]:
# POST /
cfg.set_kernel_gateway(REQUEST)

# Read the parameters configured on the node-red node.
# The three tree hyper-parameters are comma-separated strings of candidates.
criterion = str(cfg.get_param('criterion'))
random_state = str(cfg.get_param('random_state'))
max_depth = str(cfg.get_param('max_depth'))
cv = cfg.get_param('K_fold')
model_name = str(cfg.get_param('model_name'))

# Feature roles chosen on the node.
select_feature = cfg.get_features_selected()
numerical_features = cfg.get_features_numerical()
target = cfg.get_features_target()

# "time" is always dropped from the feature set during pre-processing and is
# not accepted as the label column either.
if target == "time":
    raise NameError('Please select another target feature')

# Get the data from the request and convert it to a DataFrame.
df = pd.DataFrame(cfg.get_data())

# Expand the comma-separated settings into a parameter grid.
parameters_dt = parameters(criterion, random_state, max_depth)

# Build the feature matrix and label vector.
data, target = pre_process_features(select_feature, numerical_features, target, df)

# Find the best parameter combination by cross-validated grid search.
best_accuracy, best_params = grid(data, target, parameters_dt, cv)

# Train on all data with the best parameters, save and upload the model;
# the call returns the name of the stored model.
result = training_model(data, target, best_params, best_accuracy, model_name)

# Send the model name to the next node as a one-row DataFrame.
df2 = pd.DataFrame([result], columns=['model_name'])
ret = cfg.next_node(df2, debug=False)

# The printed JSON is the API response.
print(json.dumps(ret))