# XGBoost regressor

## Stage 0 - import libraries

In [1]:
# this definition exposes all python module imports that should be available in all subsequent commands
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
# ...
# global constants
# Directory prefix that save() and load() prepend to "<name>.json" when
# writing and reading model files.
MODEL_DIRECTORY = "/srv/app/model/data/"

In [2]:
# THIS CELL IS NOT EXPORTED - free notebook cell for testing or development purposes
# Report the library versions available in this notebook environment.
print("numpy version: ", np.__version__, sep="")
print("pandas version: ", pd.__version__, sep="")

numpy version: 1.18.1
pandas version: 1.0.1


## Stage 1 - get a data sample from Splunk

| inputlookup server_power.csv <br>
| fields - total-cpu-utilization <br>
| fit MLTKContainer mode=stage algo=xgboost_regressor ac_power from total* into app:mj_model <br>

In [9]:
# this cell is not executed from MLTK and should only be used for staging data into the notebook environment
def stage(name):
    """Load a staged data sample and its parameters from the ``data/`` folder.

    Reads ``data/<name>.csv`` into a DataFrame and ``data/<name>.json`` into a
    Python object, in that order.

    Args:
        name: Base file name (without extension) of the staged files.

    Returns:
        tuple: ``(df, param)`` - the parsed CSV and the decoded JSON content.
    """
    prefix = "data/" + name
    # The CSV is read first, then the JSON parameter file.
    with open(prefix + ".csv", 'r') as csv_handle:
        df = pd.read_csv(csv_handle)
    with open(prefix + ".json", 'r') as json_handle:
        param = json.load(json_handle)
    return df, param

In [10]:
# Pull the staged sample into the notebook and preview it:
# first row, overall shape, then the parameter object.
df, param = stage("mj_model")
print(df.head(1))
print(df.shape)
print(param)

   ac_power  total-unhalted_core_cycles  total-instructions_retired  \
0     220.0                   4708152.0                   3924639.0   

   total-last_level_cache_references  total-memory_bus_transactions  \
0                            75140.0                         5130.0   

   total-cpu-utilization  total-disk-accesses  total-disk-blocks  \
0                   0.99                  0.0                0.0   

   total-disk-utilization  
0                     0.0  
(31271, 9)
{'options': {'params': {'mode': 'stage', 'algo': 'xgboost_mj'}, 'args': ['ac_power', 'total*'], 'target_variable': ['ac_power'], 'feature_variables': ['total*'], 'model_name': 'mj_model', 'algo_name': 'MLTKContainer', 'mlspl_limits': {'handle_new_cat': 'default', 'max_distinct_cat_values': '100', 'max_distinct_cat_values_for_classifiers': '100', 'max_distinct_cat_values_for_scoring': '100', 'max_fit_time': '600', 'max_inputs': '100000', 'max_memory_usage_mb': '1000', 'max_model_size_mb': '15', 'max_score_

## Stage 2 - create and initialize a model

In [11]:
# initialize your model
# available inputs: data and parameters
# returns the model object which will be used as a reference to call fit, apply and summary subsequently
def init(df, param):
    """Create a new, untrained model.

    Args:
        df: Staged data; not used by this implementation.
        param: Staged parameters; not used by this implementation.

    Returns:
        An ``XGBRegressor`` constructed with no arguments.
    """
    return XGBRegressor()

In [12]:
# test init: build the model from the staged data/parameters and show its settings
model = init(df,param)
print(model)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             objective='reg:squarederror', random_state=None, reg_alpha=None,
             reg_lambda=None, scale_pos_weight=None, subsample=None,
             tree_method=None, validate_parameters=None, verbosity=None)


## Stage 3 - fit the model

In [13]:
# train your model
# returns a fit info json object and may modify the model object
def fit(model, df, param):
    """Train ``model`` on the feature and target columns named in ``param``.

    Args:
        model: Object exposing ``fit(X, y, verbose=...)``; that method is
            called once with the selected columns.
        df: pandas DataFrame containing both the feature and target columns.
        param: Dict whose ``'feature_variables'`` and ``'target_variables'``
            entries are used to select columns from ``df``.

    Returns:
        dict: ``{"message": "model trained"}``.

    Raises:
        KeyError: If either key is missing from ``param`` or a selected
            column is missing from ``df``.
    """
    X = df[param['feature_variables']]
    y = df[param['target_variables']]
    # Training uses every row passed in; no hold-out split or error metric
    # is computed here.
    model.fit(X, y, verbose=False)
    return {"message": "model trained"}

In [14]:
# test fit: train the model on the staged data and show the returned info object
returns = fit(model,df,param)
print(returns)

{'message': 'model trained'}


## Stage 4 - apply the model

In [17]:
# apply your model
# returns the calculated results
def apply(model, df, param):
    """Run ``model`` on the feature columns of ``df``.

    Args:
        model: Object exposing ``predict(X)``.
        df: pandas DataFrame containing the feature columns.
        param: Dict whose ``'feature_variables'`` entry selects the columns
            passed to ``model.predict``.

    Returns:
        pandas.DataFrame: The predictions in a single column named
        ``predicted_value``.
    """
    features = df[param['feature_variables']]
    predictions = model.predict(features)
    return pd.DataFrame(predictions, columns=['predicted_value'])

In [18]:
# test apply: score the staged data with the trained model and show the result
returns = apply(model,df,param)
print(returns)

       predicted_value  total-unhalted_core_cycles  \
0           220.204880                4.708152e+06   
1           220.855728                4.082736e+07   
2           222.982483                2.171773e+07   
3           220.731644                5.042548e+06   
4           220.139694                4.418062e+06   
...                ...                         ...   
31266       248.072601                2.337313e+09   
31267       248.036270                2.338347e+09   
31268       248.091721                2.337886e+09   
31269       248.141388                2.337613e+09   
31270       242.140778                2.337861e+09   

       total-instructions_retired  total-last_level_cache_references  \
0                    3.924639e+06                            75140.0   
1                    2.843336e+07                           590082.0   
2                    1.041723e+07                           360018.0   
3                    3.985700e+06                            80

## Stage 5 - save the model

In [20]:
# save model to name in expected convention "<algo_name>_<model_name>"
def save(model, name):
    """Write ``model`` to ``MODEL_DIRECTORY + name + ".json"``.

    Args:
        model: Object exposing ``save_model(path)``.
        name: File name stem for the saved model.

    Returns:
        The same ``model`` object that was passed in.
    """
    model.save_model(MODEL_DIRECTORY + name + ".json")
    return model

## Stage 6 - load the model

In [21]:
# load model from name in expected convention "<algo_name>_<model_name>"
def load(name):
    """Load a model from ``MODEL_DIRECTORY + name + ".json"``.

    Args:
        name: File name stem of the saved model.

    Returns:
        A new ``XGBRegressor`` on which ``load_model`` has been called with
        that path.
    """
    path = MODEL_DIRECTORY + name + ".json"
    restored = XGBRegressor()
    restored.load_model(path)
    return restored

## Stage 7 - provide a summary of the model

In [22]:
# return a model summary
def summary(model=None):
    """Return a summary dict holding the numpy and pandas versions.

    Args:
        model: Accepted for interface compatibility; not used.

    Returns:
        dict: ``{"version": {"numpy": <version>, "pandas": <version>}}``.
    """
    versions = {"numpy": np.__version__, "pandas": pd.__version__}
    return {"version": versions}