In [None]:
import pandas as pd

# Load the training set and lower-case every column label in one chained step,
# so later cells can address columns by their lower-case names.
df = pd.read_csv('./data/train.csv').rename(columns=str.lower)
df.head()

In [None]:
# Convert the applicant income column from int to float64
income = df["applicantincome"]
df["applicantincome"] = income.astype("float64")

In [None]:
# Number of missing values per column, largest count first
null_counts = df.isnull().sum()
null_counts.sort_values(ascending=False)

# Data Preparation

In [None]:
# Remove the identifier and gender columns from the frame (modifies df in place)
df.drop(columns=["loan_id", "gender"], inplace=True)

### Label Encoding

In [None]:
# Separate the label column ("loan_status") from the feature columns and
# encode the label as integers.
from sklearn.preprocessing import LabelEncoder

# NOTE: this encoder is instantiated but not used by the visible cells; the
# target is encoded with the explicit mapping below instead.
le = LabelEncoder()

# Explicit target encoding: 'Y' -> 0, 'N' -> 1
target_values = {'Y': 0, 'N': 1}

# Everything except the label is a feature
features = df.drop(columns='loan_status')

# Values missing from the mapping become NaN under Series.map
target = df['loan_status'].map(target_values)

### Numeric and categorical features

In [None]:
from pandas.api.types import is_numeric_dtype

# Partition the feature columns by dtype: numeric dtypes go to
# numerical_features, everything else to categorical_features.
# Column order within each list follows features.columns.
numerical_features = [
    name for name in features.columns if is_numeric_dtype(features[name])
]
categorical_features = [
    name for name in features.columns if not is_numeric_dtype(features[name])
]

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

### Missing data

In [None]:
# Fill gaps column by column: numerical columns get their own mean,
# categorical columns get the placeholder "-".
# NOTE: the means are computed over all rows of `features`, i.e. before the
# train/test split performed later in the notebook.
numerical_data = features[numerical_features].apply(
    lambda col: col.fillna(col.mean())
)
categorical_data = features[categorical_features].apply(
    lambda col: col.fillna("-")
)

### Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: imputed numerical columns followed by imputed categorical columns
X = pd.concat((numerical_data, categorical_data), axis="columns")
y = target

# Half of the rows go to the test set; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

### Numerical features: Missing data and MinMax Scaling

In [None]:
# Manual min-max scaling of every numerical column into
# [feature_min, feature_max]:
#     X_std    = (X - X.min) / (X.max - X.min)
#     X_scaled = X_std * (feature_max - feature_min) + feature_min
# The additive offset is feature_min (0 here). Adding 1 instead would shift
# the result into [1, 2] rather than the intended [0, 1].
feature_min, feature_max = 0, 1
column_min = numerical_data.min(axis=0)
column_span = numerical_data.max(axis=0) - column_min
numerical_data_std = (numerical_data - column_min) / column_span
numerical_scaled = numerical_data_std * (feature_max - feature_min) + feature_min
# True if any scaled column contains NaN (e.g. a constant column, whose span
# is 0 and therefore divides 0 by 0); False means no missing data remains.
numerical_scaled.isnull().sum().any()

### Categorical features: Feature Engineering with One-Hot-Encoding

In [None]:
# One-hot encode each categorical column: append one indicator column per
# distinct value, named "<feature>_<value>", then remove the original columns.
# categorical_data is modified in place, so this cell cannot be re-run without
# first re-running the cell that builds categorical_data.
for feature in categorical_features:
    indicators = pd.get_dummies(categorical_data[feature], prefix=feature)
    for name in indicators.columns:
        categorical_data[name] = indicators[name]

categorical_data.drop(columns=categorical_features, inplace=True)

# Train Model

In [None]:
from sklearn.model_selection import train_test_split

# Split X / y into equal-sized train and test halves with a fixed seed.
# This uses the same arguments as the split in the "Data Splitting" cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Numerical columns: mean imputation followed by min-max scaling
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: one-hot encoding; categories unseen during fit are
# ignored at transform time instead of raising
categorical_steps = [
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
column_transformers = [
    ('numeric', numeric_transformer, numerical_features),
    ('categorical', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Full model: preprocessing followed by a logistic-regression classifier
clf = LogisticRegression()
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', clf),
    ]
)

In [None]:
# Fit preprocessing + classifier on the training split. Pipeline.fit returns
# the pipeline itself, so `model` and `pipeline` name the same fitted object.
pipeline.fit(X_train, y_train)
model = pipeline

# Convert model to ONNX

In [None]:
from skl2onnx.common.data_types import FloatTensorType, StringTensorType

# Declare one named ONNX input per feature column, each of shape [None, 1]:
# string tensors for the categorical columns first, then float tensors for
# the numerical columns.
input_types = [
    (name, StringTensorType([None, 1])) for name in categorical_features
] + [
    (name, FloatTensorType([None, 1])) for name in numerical_features
]

In [None]:
# Display the (column name, tensor type) pairs declared as ONNX inputs
input_types

In [None]:
from pathlib import Path

from skl2onnx import to_onnx

# Convert the fitted scikit-learn pipeline to ONNX using the declared inputs
model_onnx = to_onnx(pipeline, initial_types=input_types)

# Create the output directory first: opening the file for writing fails with
# FileNotFoundError when "model/" does not exist yet (e.g. on a fresh checkout).
onnx_path = Path("model/loan_model.onnx")
onnx_path.parent.mkdir(parents=True, exist_ok=True)
onnx_path.write_bytes(model_onnx.SerializeToString())

# Test ONNX model

In [None]:
import numpy as np

# Build the ONNX feed dict: one (n_rows, 1) array per column of X_test,
# keyed by column name. Numerical columns are cast to float32 first.
input_data = {}
for name in X_test.columns:
    values = X_test[name].values
    if name in numerical_features:
        values = values.astype(np.float32)
    input_data[name] = values.reshape((values.shape[0], 1))

In [None]:
import onnxruntime as rt

# Load the exported model and run it on the prepared test inputs;
# output_names=None requests every output of the model.
sess = rt.InferenceSession("model/loan_model.onnx")
pred_onnx = sess.run(output_names=None, input_feed=input_data)

# Show the second model output (presumably the per-class probabilities
# emitted for the classifier -- confirm against the exported graph)
pred_onnx[1]