In [1]:
# Testing ONNX input types 
import sys
import os

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.datasets import load_iris
import onnxruntime as rt
from mlisne.dataset import EstimatorDataset
from mlisne.qps import estimate_qps

In [2]:
model_out_path = "test_models"

## Generating Sklearn Logistic Regression Models

In [3]:
iris = load_iris()
X, y = iris.data, iris.target

In [4]:
X = np.array(X, dtype=np.float32)
y[y > 0] = 1 # change y into a "binary" recommendation 

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression 

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
clr = LogisticRegression()
clr.fit(X_train, y_train)

LogisticRegression()

In [7]:
clr.predict_proba(X_test)

array([[9.82363143e-01, 1.76368574e-02],
       [2.76526261e-02, 9.72347374e-01],
       [9.85912678e-01, 1.40873216e-02],
       [4.11916325e-03, 9.95880837e-01],
       [9.74995556e-01, 2.50044441e-02],
       [6.02257620e-04, 9.99397742e-01],
       [1.01974562e-03, 9.98980254e-01],
       [9.94070222e-01, 5.92977754e-03],
       [9.78988802e-01, 2.10111978e-02],
       [9.84074105e-03, 9.90159259e-01],
       [3.07861304e-04, 9.99692139e-01],
       [4.30432480e-02, 9.56956752e-01],
       [9.67283304e-01, 3.27166964e-02],
       [9.80984715e-01, 1.90152853e-02],
       [2.93292035e-02, 9.70670797e-01],
       [9.77933385e-01, 2.20666145e-02],
       [9.85043732e-01, 1.49562681e-02],
       [2.10545670e-02, 9.78945433e-01],
       [2.77591366e-02, 9.72240863e-01],
       [9.84134796e-02, 9.01586520e-01],
       [9.47765787e-01, 5.22342130e-02],
       [2.50357447e-03, 9.97496426e-01],
       [1.06592593e-02, 9.89340741e-01],
       [9.83453561e-01, 1.65464387e-02],
       [8.375432

In [8]:
import pickle 

with open(f"{model_out_path}/iris_logreg.pickle", "wb") as f:
    pickle.dump(clr, f)

In [9]:
from skl2onnx import convert_sklearn, to_onnx
from skl2onnx.common.data_types import FloatTensorType, DoubleTensorType, Int64TensorType

initial_type = [('float_input', FloatTensorType([None, 4]))]
onx = convert_sklearn(clr, initial_types=initial_type)
with open(f"{model_out_path}/logreg_iris.onnx", "wb") as f:
    f.write(onx.SerializeToString())
    
initial_type = [('double_input', DoubleTensorType([None, 4]))]
onx = convert_sklearn(clr, initial_types=initial_type)
with open(f"{model_out_path}/logreg_iris_double.onnx", "wb") as f:
    f.write(onx.SerializeToString())
    
onx = to_onnx(clr, X = X_train)
with open(f"{model_out_path}/logreg_iris_infertype.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [175]:
sess = rt.InferenceSession(f"{model_out_path}/logreg_iris.onnx")

In [176]:
input_name = sess.get_inputs()[0].name
input_name

'float_input'

In [177]:
label_name = sess.get_outputs()[1].name
label_name

'output_probability'

In [183]:
pred_onx = sess.run([label_name], {input_name: X})[0]

In [22]:
np.apply_along_axis(lambda x: (min(x), max(x)), axis=0, arr=X)

array([[4.3, 2. , 1. , 0.1],
       [7.9, 4.4, 6.9, 2.5]], dtype=float32)

## Generating Pytorch binary classification models

Training code adapted from: https://stackabuse.com/introduction-to-pytorch-for-classification/

In [51]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [52]:
data_path = "../examples/data"
churn = pd.read_csv(f"{data_path}/Kaggle_Churn_Modelling.csv")
churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### We will first simulate treatment effect data

In [53]:
categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

In [54]:
# Convert categorical to discrete types 
for category in categorical_cols:
    churn[category] = churn[category].astype('category')

In [55]:
# Take 3 categorical and 3 numerical columns to simulate outcome
treatment_cols = ['Geography', 'Gender', 'IsActiveMember', 'CreditScore', 'EstimatedSalary', 'NumOfProducts']

In [56]:
# Set coefficient distributions
means = [5,10,3,1,0.02,7]
sd = [2,5,1,1,0.001,3]

In [57]:
sd

[2, 5, 1, 1, 0.001, 3]

In [58]:
# Generate Y0 
churn['Y0'] = 0
for i in range(len(treatment_cols)):
    coefs = np.random.normal(means[i], sd[i], len(churn))
    if churn[treatment_cols[i]].dtype.name == 'category':
        churn['Y0'] += coefs * churn[treatment_cols[i]].cat.codes.values
    else:
        churn['Y0'] += coefs * churn[treatment_cols[i]]

err = np.random.normal(0,3) 
churn['Y0'] += err 

In [88]:
# Generate Y1 
treatment_effect = np.random.normal(100, 10, len(churn))
err_iv = np.random.normal(size=len(churn))
churn['Y1'] = churn['Y0'] + treatment_effect + err_iv

In [89]:
print(churn['Y1'].mean(), churn['Y0'].mean())

2775.104512581401 2675.0436909859122


In [90]:
treatment_effect.mean()

100.07302173915662

## Train Pytorch Model

In [61]:
# Get data as tensors
cat = []
for c in categorical_cols:
    cat.append(churn[c].cat.codes.values)
cat_data = np.stack(cat, 1)

cat_tensor = torch.tensor(cat_data).double()
num_tensor = torch.tensor(np.array(churn[numerical_cols]))
tot_tensor = torch.cat((cat_tensor, num_tensor), 1)
output = torch.tensor(churn['Exited'])

In [62]:
# Create categorical embeddings 
categorical_column_sizes = [len(churn[column].cat.categories) for column in categorical_cols]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_column_sizes]

In [63]:
categorical_column_sizes

[3, 2, 2, 2]

We will create two different models: one with categorical embeddings, and one without 

In [64]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split

train_length = round(0.8 * len(cat_tensor))
test_length = len(cat_tensor) - train_length
cat_dataset = TensorDataset(cat_tensor, num_tensor, output) 
cat_dataset_train, cat_dataset_test = random_split(cat_dataset, [train_length, test_length])

dataset = TensorDataset(tot_tensor, output) 
dataset_train, dataset_test = random_split(dataset, [train_length, test_length])

In [65]:
train_sampler = RandomSampler(cat_dataset_train)
train_dataloader = DataLoader(cat_dataset_train, sampler=train_sampler, batch_size=train_length)

In [66]:
# Model with categorical embeddings
class CatModel(nn.Module):

    def __init__(self, embedding_size, num_numerical_cols, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.embedding_dropout = nn.Dropout(p)
        self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)

        all_layers = []
        num_categorical_cols = sum((nf for ni, nf in embedding_size))
        input_size = num_categorical_cols + num_numerical_cols

        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i

        all_layers.append(nn.Linear(layers[-1], output_size))

        self.layers = nn.Sequential(*all_layers)
        
        self.m = nn.Softmax(dim=1)

    def forward(self, x_categorical, x_numerical):
        embeddings = []
        for i,e in enumerate(self.all_embeddings):
            embeddings.append(e(x_categorical[:,i]))
        x = torch.cat(embeddings, 1)
        x = self.embedding_dropout(x)

        x_numerical = self.batch_norm_num(x_numerical)
        x = torch.cat([x, x_numerical], 1)
        x = self.layers(x)
        x = self.m(x)
        
        return x[:,1]

In [67]:
cat_model = CatModel(categorical_embedding_sizes, num_tensor.shape[1], 2, [200,100,50], p=0.4)

In [68]:
categorical_embedding_sizes

[(3, 2), (2, 1), (2, 1), (2, 1)]

In [69]:
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(cat_model.parameters(), lr=0.001)

In [70]:
epochs = 300
aggregated_losses = []
cat_model.train()

for i in range(epochs):
    i += 1
    for step, batch in enumerate(train_dataloader):
        cat_train, num_train, output = batch
        y_pred = cat_model(cat_train.long(), num_train.float())
        single_loss = loss_function(y_pred.float(), output.float())
        aggregated_losses.append(single_loss)
        single_loss.backward()
        optimizer.step()
    optimizer.zero_grad()
    
    if i%25 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

epoch:   1 loss: 0.27313590
epoch:  26 loss: 0.17774700
epoch:  51 loss: 0.15001221
epoch:  76 loss: 0.13582264
epoch: 101 loss: 0.12679212
epoch: 126 loss: 0.12011691
epoch: 151 loss: 0.11477953
epoch: 176 loss: 0.11117698
epoch: 201 loss: 0.11223534
epoch: 226 loss: 0.11037559
epoch: 251 loss: 0.11042570
epoch: 276 loss: 0.10917238
epoch: 300 loss: 0.1087926328


In [71]:
# Model without categorical embeddings
class Model(nn.Module):

    def __init__(self, input_size, output_size, layers, p=0.4):
        super().__init__()

        all_layers=[]
        for i in layers:
            all_layers.append(nn.Linear(input_size, i))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(i))
            all_layers.append(nn.Dropout(p))
            input_size = i

        all_layers.append(nn.Linear(layers[-1], output_size))

        self.layers = nn.Sequential(*all_layers)
        
        self.m = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.layers(x)
        x = self.m(x)
        
        return x[:,1]

In [72]:
model = Model(tot_tensor.shape[1], 2, [200,100,50], p=0.4)
train_sampler = RandomSampler(dataset_train)
train_dataloader = DataLoader(dataset_train, sampler=train_sampler, batch_size=train_length)

In [73]:
tot_tensor.shape[1]

10

In [74]:
model.eval()
model(tot_tensor[0:1,].float())

tensor([0.], grad_fn=<SelectBackward>)

In [75]:
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [76]:
epochs = 300
aggregated_losses = []
model.train()

for i in range(epochs):
    i += 1
    for step, batch in enumerate(train_dataloader):
        train, output = batch
        y_pred = model(train.float())
        single_loss = loss_function(y_pred.float(), output.float())
        aggregated_losses.append(single_loss)
        single_loss.backward()
        optimizer.step()
    optimizer.zero_grad()
    
    if i%25 == 1:
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f}')

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

epoch:   1 loss: 0.29241750
epoch:  26 loss: 0.23277746
epoch:  51 loss: 0.20136672
epoch:  76 loss: 0.17731732
epoch: 101 loss: 0.16731592
epoch: 126 loss: 0.16514130
epoch: 151 loss: 0.16463365
epoch: 176 loss: 0.16350816
epoch: 201 loss: 0.16302647
epoch: 226 loss: 0.16283369
epoch: 251 loss: 0.16103180
epoch: 276 loss: 0.16060691
epoch: 300 loss: 0.1608925909


In [79]:
# Save trained models 
torch.save(cat_model.state_dict(), f"{model_out_path}/churn_categorical.pt")
torch.save(model.state_dict(), f"{model_out_path}/churn.pt")

In [80]:
# Save data and keep track of treatment columns, true treatment effects, etc. 
cat_model.eval()
with torch.no_grad():
    cat_out = cat_model(cat_tensor.long(), num_tensor.float())

rec_draws = np.random.uniform(size=len(churn))
churn['Z_cat'] = (rec_draws <= cat_out.numpy()).astype(int)

In [81]:
model.eval()
with torch.no_grad():
    out = model(tot_tensor.float())

# ML recommendation is a probability
rec_draws = np.random.uniform(size=len(churn))
churn['Z'] = (rec_draws <= out.numpy()).astype(int)

In [82]:
# Follow the ML recommendation Z 75% of the time 
treat_probs = np.random.uniform(size=len(churn))
D = []
for i in range(len(treat_probs)):
    if treat_probs[i] >= 0.75:
        D.append(0)
    else:
        D.append(churn["Z"][i])

churn['D'] = D 
treat_probs = np.random.uniform(size=len(churn))
D = []
for i in range(len(treat_probs)):
    if treat_probs[i] >= 0.75:
        if churn["Z_cat"][i] == 1:
            D.append(0)
        else:
            D.append(1)
    else:
        D.append(churn["Z_cat"][i])
churn['D_cat'] = D

In [83]:
churn.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Y0,Y1,Z_cat,Z,D,D_cat
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,2846.16569,2942.50609,0,0,0,0
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,3348.43358,3357.479332,1,0,0,1
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,3289.261374,3379.228072,1,1,1,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,2659.722696,2774.024431,1,0,0,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,2821.396257,2866.61182,0,0,0,1


In [91]:
churn.to_csv(f"{data_path}/churn_data.csv", index=False)

### Testing LightGBM

In [1]:
# Testing ONNX input types 
import sys
import os

import pandas as pd
import numpy as np
from pathlib import Path
import lightgbm as lgb
import onnxruntime as rt
from mlisne.dataset import EstimatorDataset
from mlisne.qps import estimate_qps

In [2]:
data_path = "../examples/data"
model_path = "../examples/models"

In [3]:
model = lgb.Booster(model_file = f"{model_path}/lgbm_example.txt")
data = pd.read_csv(f"{data_path}/lgbm_regression.test", header=None, sep='\t')

In [19]:
X = data.drop(0, axis=1)
X_np = np.array(X)
og_preds = model.predict(X_np)

X_dummy = np.array(X)[0,:]
print(X_dummy.shape)

(28,)


In [6]:
from mlisne.helpers import convert_to_onnx

In [8]:
f = "test_models/lgbm.onnx"
convert_to_onnx(model, X_dummy, f, "lightgbm")

The maximum opset needed by this model is only 1.


True

In [9]:
sess = rt.InferenceSession(f)

In [16]:
sess.get_outputs()[0].name

'variable'

In [17]:
sess.get_inputs()[0].name

'input'

In [20]:
out = sess.run(['variable'], {'input':X_np})

TypeError: run(): incompatible function arguments. The following argument types are supported:
    1. (self: onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession, arg0: List[str], arg1: Dict[str, object], arg2: onnxruntime.capi.onnxruntime_pybind11_state.RunOptions) -> List[object]

Invoked with: <onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession object at 0x000002248A065030>, 'variable', {'input': array([[ 0.644,  0.247, -0.447, ...,  0.581,  0.905,  0.796],
       [ 0.385,  1.8  ,  1.037, ...,  0.813,  1.149,  1.116],
       [ 1.214, -0.166,  0.004, ...,  1.058,  0.744,  0.696],
       ...,
       [ 0.815, -1.263,  0.057, ...,  0.477,  0.886,  0.836],
       [ 3.512, -1.094, -0.22 , ...,  2.   ,  1.626,  1.349],
       [ 0.904,  1.248,  0.325, ...,  0.904,  1.012,  0.961]])}, None