# Pytorch Example: Neural Network with Categorical Embeddings

In this example we demonstrate the conversion of a PyTorch model that takes numeric and categorical inputs separately, and then estimate the treatment effect using simulated historical data. The model input data is sourced from this Kaggle dataset: https://www.kaggle.com/kmalit/bank-customer-churn-prediction/data

## 1. Conversion

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from pathlib import Path
import onnxruntime as rt
from pathlib import Path

from mlisne import convert_to_onnx

Below we define the model architecture. The `estimate_qps` function currently only supports models which output 1D arrays of treatment probabilities, which is why we only return the second output column after the Softmax layer.

In [2]:
class CatModel(nn.Module):
    """Classifier over separately supplied categorical and numeric inputs.

    Every categorical column is looked up in its own embedding table. The
    concatenated embeddings (after dropout) are joined with the
    batch-normalised numeric columns and fed through a stack of
    ``Linear -> ReLU -> BatchNorm1d -> Dropout`` blocks, a final ``Linear``
    layer and a softmax over the class dimension.

    Args:
        embedding_size: Sequence of ``(num_embeddings, embedding_dim)`` pairs,
            one per categorical column, in column order.
        num_numerical_cols: Number of numeric input columns.
        output_size: Number of output classes. ``forward`` returns the
            probability of class index 1, so this must be at least 2.
        layers: Widths of the hidden layers, in order. May be empty, in which
            case the inputs feed the output layer directly.
        p: Dropout probability used for the embeddings and every hidden block.
    """

    def __init__(self, embedding_size, num_numerical_cols, output_size, layers, p=0.4):
        super().__init__()
        self.all_embeddings = nn.ModuleList(
            [nn.Embedding(num_levels, dim) for num_levels, dim in embedding_size]
        )
        self.embedding_dropout = nn.Dropout(p)
        self.batch_norm_num = nn.BatchNorm1d(num_numerical_cols)

        # Width of the concatenated embedding vectors (not the column count).
        total_embedding_dim = sum(dim for _, dim in embedding_size)
        input_size = total_embedding_dim + num_numerical_cols

        all_layers = []
        for width in layers:
            all_layers.append(nn.Linear(input_size, width))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(width))
            all_layers.append(nn.Dropout(p))
            input_size = width

        # input_size now holds the width of the last hidden layer, or the raw
        # input width when `layers` is empty (indexing layers[-1] would raise).
        all_layers.append(nn.Linear(input_size, output_size))

        self.layers = nn.Sequential(*all_layers)

        self.m = nn.Softmax(dim=1)

    def forward(self, x_categorical, x_numerical):
        """Return the softmax probability of class 1 for each row.

        Args:
            x_categorical: 2D integer tensor whose column ``i`` indexes the
                ``i``-th embedding table.
            x_numerical: 2D float tensor with ``num_numerical_cols`` columns.

        Returns:
            1D tensor holding column 1 of the softmax output.
        """
        embedded = [
            embedding(x_categorical[:, col])
            for col, embedding in enumerate(self.all_embeddings)
        ]
        x = torch.cat(embedded, 1)
        x = self.embedding_dropout(x)

        x_numerical = self.batch_norm_num(x_numerical)
        x = torch.cat([x, x_numerical], 1)
        x = self.layers(x)
        x = self.m(x)

        # Only the second class probability is returned (1D output).
        return x[:, 1]

Let's load in the state dict of the pretrained model

In [3]:
# The architecture built here must match the checkpoint for load_state_dict
# to accept it.
model = CatModel([(3, 2), (2, 1), (2, 1), (2, 1)], 6, 2, [200, 100, 50], p=0.4)
# map_location="cpu" lets the checkpoint deserialize on a CPU-only machine
# even if it was saved from another device; the inputs used below are CPU
# tensors.
model.load_state_dict(torch.load("models/churn_categorical.pt", map_location="cpu"))
# Switch dropout and batch norm to inference behaviour before exporting.
model.eval()

CatModel(
  (all_embeddings): ModuleList(
    (0): Embedding(3, 2)
    (1): Embedding(2, 1)
    (2): Embedding(2, 1)
    (3): Embedding(2, 1)
  )
  (embedding_dropout): Dropout(p=0.4, inplace=False)
  (batch_norm_num): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=11, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=50, bias=True)
    (9): ReLU(inplace=True)
    (10): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): Dropout(p=0.4, inplace=False)
    (12): Linear(in_features=50, out_features=2, bias=True)
  )
  (m): Softmax(dim=1)
)

Load in simulated data and preprocess inputs

In [4]:
# Read the simulated churn dataset and preview its first rows.
churn_data = pd.read_csv("data/churn_data.csv")
churn_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Y0,Y1,Z_cat,Z,D,D_cat
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,2846.16569,2959.735825,0,0,0,0
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,3348.43358,3443.613535,1,0,0,1
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,3289.261374,3370.080692,1,1,1,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,2659.722696,2761.695307,1,0,0,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,2821.396257,2915.473778,0,0,0,1


In [5]:
# Column groups feeding the model's two inputs.
categorical_cols = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

# Cast each categorical column to pandas' category dtype so that integer
# codes are available through the .cat accessor.
for category in categorical_cols:
    churn_data[category] = churn_data[category].astype('category')

# One vector of integer codes per categorical column, stacked column-wise.
cat = [churn_data[name].cat.codes.values for name in categorical_cols]
cat_data = np.stack(cat, axis=1)
num_data = np.array(churn_data[numerical_cols])

# The tensors keep the NumPy dtypes; any casting is left to the call sites.
cat_tensor, num_tensor = torch.tensor(cat_data), torch.tensor(num_data)

Create our dummy inputs and convert to ONNX

In [6]:
# Destination path of the exported ONNX model.
f = "models/churn_categorical.onnx"
# First row of each tensor, kept 2D (batch of one), as dummy input for the
# conversion.
cat_dummy = cat_tensor[0].unsqueeze(0)
num_dummy = num_tensor[0].unsqueeze(0)

**Important:** Your dummy data must match the input types your model expects; otherwise the conversion will fail. PyTorch embedding layers expect `long` (int64) indices, and this model expects `float32` for the continuous data.

In [7]:
# Inspect the dtypes the dummy inputs carry over from the NumPy arrays.
print(cat_dummy.dtype, num_dummy.dtype)

torch.int8 torch.float64


In [8]:
# Deliberate failure: the categorical dummy is passed without casting to
# long. The exception is printed instead of raised so the walkthrough can
# continue.
try:
    convert_to_onnx(model, (cat_dummy, num_dummy), f, "pytorch", input_type=2, input_names=("d_inputs", "c_inputs"))
except Exception as e:
    print(e)

Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.CharTensor instead (while checking arguments for embedding)


In [9]:
# Second deliberate failure: the categorical dummy is now long, but the
# numeric dummy is still passed without casting to float32.
try:
    convert_to_onnx(model, (cat_dummy.long(), num_dummy), f, "pytorch", input_type=2, input_names=("d_inputs", "c_inputs"))
except Exception as e:
    print(e)

expected scalar type Double but found Float


To convert a model with separate discrete and continuous inputs, we must pass the arguments `input_type=2`, the dummy inputs as a tuple, and the input names. 

In [10]:
# Both dummies are cast to the dtypes the model expects. The input names are
# listed in the same order as the dummy tuple: discrete first, then continuous.
convert_to_onnx(model, (cat_dummy.long(), num_dummy.float()), f, "pytorch", input_type=2, input_names=("d_inputs", "c_inputs"))

True

We can verify that the ONNX model's predictions match those of the original Pytorch model.

In [11]:
# Reference predictions from the PyTorch model, with gradient tracking off.
with torch.no_grad():
    torch_preds = model(cat_tensor.long(), num_tensor.float()).numpy()

# Run the exported model on the same data, cast to the dtypes used at export.
sess = rt.InferenceSession(f)
onnx_feed = {
    "c_inputs": num_data.astype(np.float32),
    "d_inputs": cat_data.astype(np.int64),
}
onnx_preds = sess.run(["output_0"], onnx_feed)[0]

# Raises AssertionError if the two prediction arrays differ at 5 decimals.
np.testing.assert_array_almost_equal(torch_preds, onnx_preds, decimal=5)

## 2. QPS Estimation

The QPS estimation procedure is nearly identical to the single-input case, albeit with a few extra arguments that need to be passed.

In [12]:
from mlisne import estimate_qps_onnx

In [13]:
# Generate the outcome based on the treatment assignment
churn_data['Y_cat'] = churn_data['Y0']
churn_data.loc[churn_data['D_cat'] == 1, 'Y_cat'] = churn_data.loc[churn_data['D_cat'] == 1, 'Y1']

In [14]:
# Preview the data again to check the newly added Y_cat column.
churn_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,...,IsActiveMember,EstimatedSalary,Exited,Y0,Y1,Z_cat,Z,D,D_cat,Y_cat
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,...,1,101348.88,1,2846.16569,2959.735825,0,0,0,0,2846.16569
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,...,1,112542.58,0,3348.43358,3443.613535,1,0,0,1,3443.613535
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,...,0,113931.57,1,3289.261374,3370.080692,1,1,1,1,3370.080692
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,...,0,93826.63,0,2659.722696,2761.695307,1,0,0,0,2659.722696
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,...,1,79084.1,0,2821.396257,2915.473778,0,0,0,1,2915.473778


For separate inputs, we need to pass `input_type=2` and `input_names` in the format (continuous name, discrete name). Since the input arrays do not have the dtypes the model expects, we also pass the expected dtypes through the `types` argument, in the same (continuous, discrete) order.

In [15]:
# Continuous and discrete inputs are supplied separately. input_names and
# types are both ordered (continuous, discrete).
qps = estimate_qps_onnx(
    X_c=num_data,
    X_d=cat_data,
    S=100,
    delta=0.8,
    ML_onnx=f,
    input_type=2,
    input_names=('c_inputs', 'd_inputs'),
    types=(np.float32, np.int64),
)
qps[:5]

array([0.3224629 , 0.25274602, 1.        , 0.05241935, 0.22616214],
      dtype=float32)

## 3. Treatment Effect Estimation

Following QPS estimation, we have everything we need to obtain a causal estimate of treatment effect. Our primary estimation function is `estimate_treatment_effect`.

In [16]:
from mlisne import estimate_treatment_effect

In [17]:
# No column indices are passed, so the columns are supplied in the order
# outcome (Y), instrument (Z), treatment (D), which is the order the function
# reports assuming.
fitted_model = estimate_treatment_effect(qps = qps, data = churn_data[['Y_cat', 'Z_cat', 'D_cat']])

Indices for ['Y', 'Z', 'D'] not explicitly passed. Assuming remaining columns in order ['Y', 'Z', 'D']...
We will fit on 9920 values out of 10000 from the dataset for which the QPS estimation is nondegenerate.
                          IV-2SLS Estimation Summary                          
Dep. Variable:                      Y   R-squared:                     -0.0037
Estimator:                    IV-2SLS   Adj. R-squared:                -0.0039
No. Observations:                9920   F-statistic:                    2.8669
Date:                Fri, Sep 18 2020   P-value (F-stat)                0.2385
Time:                        13:57:14   Distribution:                  chi2(2)
Cov. Estimator:                robust                                         
                                                                              
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI


Compare estimated LATE against true treatment effects

In [18]:
# Treatment effects 
ate = (churn_data.Y1 - churn_data.Y0).mean()
atet = (churn_data.loc[churn_data['D_cat'] == 1, 'Y1'] - churn_data.loc[churn_data['D_cat'] == 1, 'Y0']).mean()
late = (churn_data.loc[(churn_data['D_cat'] == churn_data['Z_cat']), 'Y1'] - churn_data.loc[(churn_data['D_cat'] == churn_data['Z_cat']), 'Y0']).mean()
print(f"ATE: {ate}")
print(f"ATET: {atet}")
print(f"LATE: {late}")

ATE: 100.0608215954891
ATET: 100.07063117908788
LATE: 100.02601896940502


In [22]:
# Show the first-stage results of the fitted model.
print(fitted_model.first_stage)

    First Stage Estimation Results   
                                    D
-------------------------------------
R-squared                      0.1650
Partial R-squared              0.1163
Shea's R-squared               0.1163
Partial F-statistic            1298.9
P-value (Partial F-stat)       0.0000
Partial F-stat Distn          chi2(1)
const                          0.2520
                             (42.221)
qps                            0.0284
                             (1.1479)
Z                              0.4889
                             (36.041)
-------------------------------------

T-stats reported in parentheses
T-stats use same covariance type as original model
