## Import packages and data

In [1]:
# import packages
import pandas as pd
import numpy as np
import os 
import re

from sklearn import datasets
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

import m2cgen as m2c

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the iris dataset and pull out its `data` and `target` arrays
iris = datasets.load_iris()
X, Y = iris.data, iris.target

## Train a simple model

In [3]:
# Hold out 30% of the samples as a test set; the fixed seed keeps the split reproducible
seed = 2020
test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [4]:
# Train an XGBoost classifier (constructor defaults) on the training split
model = XGBClassifier()
model.fit(X=X_train, y=y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=24, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

## Convert XGBoost model to VBA

In [5]:
# Export the fitted model as Visual Basic source; the generated function is named 'pred'
code = m2c.export_to_visual_basic(
    model,
    function_name='pred',
)

In [6]:
# Show the Visual Basic source produced by m2cgen
print(code)

Module Model
Function pred(ByRef inputVector() As Double) As Double()
    Dim var0 As Double
    If (inputVector(2)) >= (2.45) Then
        var0 = -0.21827246
    Else
        var0 = 0.42043796
    End If
    Dim var1 As Double
    If (inputVector(2)) >= (2.45) Then
        var1 = -0.19534391
    Else
        var1 = 0.2908748
    End If
    Dim var2 As Double
    If (inputVector(2)) >= (2.45) Then
        var2 = -0.17986836
    Else
        var2 = 0.23301946
    End If
    Dim var3 As Double
    If (inputVector(2)) >= (2.45) Then
        var3 = -0.16835201
    Else
        var3 = 0.20049937
    End If
    Dim var4 As Double
    If (inputVector(2)) >= (2.45) Then
        var4 = -0.15951659
    Else
        var4 = 0.17969704
    End If
    Dim var5 As Double
    If (inputVector(2)) >= (2.45) Then
        var5 = -0.15227905
    Else
        var5 = 0.16499345
    End If
    Dim var6 As Double
    If (inputVector(2)) >= (2.45) Then
        var6 = -0.14613919
    Else
        var6 = 0.153659

## Convert the VBA code to SAS with manual scripts

In [7]:
# Rewrite the Visual Basic source held in `code` into a SAS DATA step.
# Every step is a plain text substitution, so the result depends on the exact
# layout of the exported text (function name 'pred', argument 'inputVector').

# remove unnecessary things: VB variable declarations and block terminators
code = re.sub(r'Dim var.* As Double', '', code)
code = code.replace('End If', '')

# change the beginning: VB module/function header -> DATA step header
code = code.replace(
    'Module Model\nFunction pred(ByRef inputVector() As Double) As Double()\n',
    'DATA pred_result;\nSET dataset_name;',
)

# change the ending
code = code.replace('End Function\nEnd Module\n', 'RUN;')

# insert ';' after every line that ends with a digit or with ')'.
# One regex pass gives the same text as collecting all matches and calling
# str.replace once per match, without rescanning the whole text each time.
code = re.sub(r'(?<=[0-9)])\n', ';\n', code)

# replace the 'inputVector' entries with the column names used in the SAS dataset
dictionary = {'inputVector(0)':'sepal_length',
              'inputVector(1)':'sepal_width',
              'inputVector(2)':'petal_length',
              'inputVector(3)':'petal_width'}
for key, col_name in dictionary.items():
    code = code.replace(key, col_name)

# change the prediction labels
code = code.replace('Math.Exp', 'Exp')  # literal match; the '.' is not a regex wildcard here
code = re.sub(r'pred = .*\n', '', code)  # drop the VB return assignment

# The indexed variables still present (of the form varN(i)) are renamed to
# '<class>_prob' in order of first appearance.
# NOTE(review): assumes they hold the class probabilities in class order - confirm
# against the exported code.
# dict.fromkeys de-duplicates while keeping order, so a variable that occurs more
# than once cannot shift the labels of the ones after it.
temp_var_list = list(dict.fromkeys(re.findall(r"var[0-9]+\([0-9]+\)", code)))
if len(temp_var_list) > len(iris.target_names):
    raise ValueError(
        'found %d indexed output variables but only %d class names'
        % (len(temp_var_list), len(iris.target_names))
    )
for temp_var, target_name in zip(temp_var_list, iris.target_names):
    code = code.replace(temp_var, str(target_name) + '_prob')

In [8]:
# Show the contents of `code` after the VBA-to-SAS substitutions
print(code)

DATA pred_result;
SET dataset_name;    
    If (petal_length) >= (2.45) Then
        var0 = -0.21827246;
    Else
        var0 = 0.42043796;
    
    
    If (petal_length) >= (2.45) Then
        var1 = -0.19534391;
    Else
        var1 = 0.2908748;
    
    
    If (petal_length) >= (2.45) Then
        var2 = -0.17986836;
    Else
        var2 = 0.23301946;
    
    
    If (petal_length) >= (2.45) Then
        var3 = -0.16835201;
    Else
        var3 = 0.20049937;
    
    
    If (petal_length) >= (2.45) Then
        var4 = -0.15951659;
    Else
        var4 = 0.17969704;
    
    
    If (petal_length) >= (2.45) Then
        var5 = -0.15227905;
    Else
        var5 = 0.16499345;
    
    
    If (petal_length) >= (2.45) Then
        var6 = -0.14613919;
    Else
        var6 = 0.1536592;
    
    
    If (petal_length) >= (2.45) Then
        var7 = -0.14011411;
    Else
        var7 = 0.14420119;
    
    
    If (petal_length) >= (2.45) Then
        var8 = -0.13456865;
    Else


In [9]:
# save output: write the converted code to 'vb1.sas'.
# The context manager closes the file even if the write raises.
with open('vb1.sas', 'w') as vb:
    vb.write(code)

In [10]:
# Display the dataset's own feature names
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [11]:
# Column names used when exporting the test features to CSV
col_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]

In [12]:
# Write the held-out features to 'iris_test.csv' under the names in col_names, without the index
test_features = pd.DataFrame(X_test, columns=col_names)
test_features.to_csv('iris_test.csv', index=False)

## Compare prediction output

In [13]:
# python pred: predict_proba output for the test set, with named probability columns
python_pred = pd.DataFrame(
    model.predict_proba(X_test),
    columns=['setosa_prob', 'versicolor_prob', 'virginica_prob'],
)
python_pred

Unnamed: 0,setosa_prob,versicolor_prob,virginica_prob
0,0.001542,0.000793,0.997665
1,0.890853,0.107015,0.002132
2,0.012138,0.982334,0.005527
3,0.003852,0.992223,0.003925
4,0.002903,0.995775,0.001322
5,0.00959,0.795294,0.195116
6,0.002376,0.022084,0.97554
7,0.001774,0.997418,0.000808
8,0.990309,0.00732,0.002371
9,0.990309,0.00732,0.002371


In [14]:
# sas pred: read 'pred_result.csv' and keep only its last three columns
sas_pred = pd.read_csv('pred_result.csv').iloc[:, -3:]
sas_pred

Unnamed: 0,setosa_prob,versicolor_prob,virginica_prob
0,0.001542,0.000793,0.997665
1,0.890853,0.107015,0.002132
2,0.012138,0.982334,0.005527
3,0.003852,0.992223,0.003925
4,0.002903,0.995775,0.001322
5,0.00959,0.795294,0.195116
6,0.002376,0.022084,0.97554
7,0.001774,0.997418,0.000808
8,0.990309,0.00732,0.002371
9,0.990309,0.00732,0.002371


In [15]:
# Count, per column, the rows where the Python and SAS predictions differ by more
# than the tolerance. A NaN difference (rows or columns that do not line up between
# the two frames) is counted as a mismatch: `NaN > tolerance` is False, so it would
# otherwise be reported as agreement.
tolerance = 0.00001
pred_diff = (python_pred - sas_pred).abs()
(pred_diff.gt(tolerance) | pred_diff.isna()).sum()

setosa_prob        0
versicolor_prob    0
virginica_prob     0
dtype: int64