In [None]:
import pandas as pd
# Load the training data and lower-case every column name so the rest of the
# notebook can refer to columns with a single spelling.
df = pd.read_csv('./data/train.csv').rename(columns=str.lower)
df.head()

In [None]:
# Store the applicant income as float64 instead of its original dtype.
df = df.astype({"applicantincome": "float64"})

In [None]:
# Count the missing values per column and list the columns with the most gaps first.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

# Data Deletion

In [None]:
# Remove the "loan_id" and "gender" columns from the working frame.
df = df.drop(columns=["loan_id", "gender"])

# Data Preparation

### Label Encoding

In [None]:
## Label Encoding
# Remove label
# Target is "loan_status"
from sklearn.preprocessing import LabelEncoder
# NOTE(review): this encoder instance is not used by any cell visible here; the
# target is encoded with the explicit mapping below instead.
le = LabelEncoder()

# Explicit encoding of the target column. 'N' maps to 1, so scikit-learn's binary
# metrics (default pos_label=1) treat 'N' as the positive class.
target_values = {'Y': 0, 'N': 1}

target = df['loan_status']
features = df.drop('loan_status', axis=1)

# Series.map turns every value that is not a key of the mapping (including a
# missing value) into NaN. Fail loudly here instead of passing NaN labels on to
# the split and the classifier.
unmapped = target[~target.isin(list(target_values))]
if not unmapped.empty:
    raise ValueError(
        "loan_status contains values outside the expected mapping "
        f"{sorted(target_values)}: {sorted(unmapped.astype(str).unique())}"
    )

target = target.map(target_values)

### Numeric and categorical features

In [None]:
# Numeric and categorical features
from pandas.api.types import is_numeric_dtype

# Tag every feature column with whether pandas considers its dtype numeric, then
# partition the column names on that flag (column order is preserved).
dtype_flags = [(name, is_numeric_dtype(features[name])) for name in features.columns]

numerical_features = [name for name, is_numeric in dtype_flags if is_numeric]
categorical_features = [name for name, is_numeric in dtype_flags if not is_numeric]

print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

### Missing data

In [None]:
# Numeric gaps are filled with the mean of their own column; categorical gaps are
# filled with the placeholder string "-".
# NOTE(review): the means are computed on the full dataset, i.e. before the
# train/test split performed further down.
numerical_data = features[numerical_features].apply(lambda col: col.fillna(col.mean()))
categorical_data = features[categorical_features].apply(lambda col: col.fillna("-"))

### Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: the imputed numeric columns followed by the imputed (not yet
# one-hot encoded) categorical columns. The encoded target is the label vector.
X = pd.concat(objs=[numerical_data, categorical_data], axis="columns")
y = target

# Hold out half of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

### Numerical features: Missing data and MinMax Scaling

In [None]:
# MinMax scaling to the range [feature_min, feature_max], using the same two-step
# formula as scikit-learn's MinMaxScaler:
#   X_std    = (X - X.min) / (X.max - X.min)
#   X_scaled = X_std * (feature_max - feature_min) + feature_min
feature_min, feature_max = 0, 1

column_min = numerical_data.min(axis=0)
# A constant column has a zero range; divide by 1 instead so it maps to
# feature_min rather than to NaN (0 / 0).
column_range = (numerical_data.max(axis=0) - column_min).replace(0, 1)

numerical_data_std = (numerical_data - column_min) / column_range
# The offset has to be feature_min (0). Adding 1 here would shift every value
# into [1, 2] instead of [0, 1].
numerical_scaled = numerical_data_std * (feature_max - feature_min) + feature_min
numerical_scaled.isnull().sum().any()  # False means no missing data remains

### Categorical features: Feature Engineering with One-Hot-Encoding

In [None]:
# One indicator frame per categorical feature, with columns named
# "<feature>_<value>".
dummy_blocks = [
    pd.get_dummies(categorical_data[name]).add_prefix(f"{name}_")
    for name in categorical_features
]

# Append the indicator columns in feature order, then discard the original
# categorical columns so only the encoded ones remain.
categorical_data = pd.concat([categorical_data, *dummy_blocks], axis=1).drop(
    columns=categorical_features
)

In [None]:
# Preview the first rows of the one-hot encoded categorical frame.
categorical_data.head()

# Train Model

In [None]:
from sklearn.model_selection import train_test_split
# X and y have not changed since the split made in the "Data Splitting" section,
# so calling train_test_split again with the same arguments would only recreate
# the identical partition. Reuse that split and sanity-check it instead.
assert len(X_train) + len(X_test) == len(X)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)
X_train.shape, X_test.shape

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, accuracy_score

def loss(y_true, y_pred, retu=False):
    """Compute precision, recall, F1, log loss and accuracy for a set of predictions.

    Args:
        y_true: Reference labels, forwarded as the first argument of every metric.
        y_pred: Predicted labels, forwarded as the second argument of every metric.
        retu: When true, return the metrics instead of printing them.

    Returns:
        The tuple ``(precision, recall, f1, log_loss, accuracy)`` when ``retu`` is
        true; otherwise ``None`` after printing the five values.
    """
    # Evaluated in this order so the tuple lines up with the print format below.
    scorers = (precision_score, recall_score, f1_score, log_loss, accuracy_score)
    scores = tuple(scorer(y_true, y_pred) for scorer in scorers)

    if not retu:
        print('  pre: %.3f\n  rec: %.3f\n  f1: %.3f\n  loss: %.3f\n  acc: %.3f' % scores)
        return None
    return scores

#loss(clf.predict(X_test), y_test)

# Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Numeric branch: mean-impute missing values, then rescale with MinMaxScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: one-hot encode, ignoring categories that were not seen
# while fitting instead of raising on them.
categorical_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns through its own branch.
column_routes = [
    ('numeric', numeric_transformer, numerical_features),
    ('categorical', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_routes)

# Full model: preprocessing followed by a default-configured logistic regression.
clf = LogisticRegression()
pipeline = Pipeline([('preprocessor', preprocessor), ('classifier', clf)])

In [None]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
model = pipeline.fit(X_train, y_train)

In [None]:
# loss() takes (y_true, y_pred): pass the ground-truth labels first and the
# model's predictions second, otherwise precision and recall are reported swapped.
loss(y_test, model.predict(X_test))

# skl2onnx

In [None]:
!pip install skl2onnx onnxruntime

### Input preparation

In [None]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType, Int64TensorType

def convert_dataframe_schema(df, drop=None):
    """Build skl2onnx ``initial_types`` entries from a DataFrame's columns.

    Args:
        df: DataFrame whose column names and dtypes define the inputs.
        drop: Optional container of column names to leave out.

    Returns:
        A list of ``(column_name, tensor_type)`` tuples in column order. Columns
        whose dtype is ``float64`` get ``FloatTensorType([None, 1])``; every other
        dtype, integers included, gets ``StringTensorType([None, 1])``.
    """
    excluded = () if drop is None else drop

    def tensor_type_for(dtype):
        # Only float64 is declared as a float tensor; no Int64TensorType mapping
        # is applied to int64 columns.
        if dtype == 'float64':
            return FloatTensorType([None, 1])
        return StringTensorType([None, 1])

    return [
        (name, tensor_type_for(dtype))
        for name, dtype in df.dtypes.items()
        if name not in excluded
    ]

# Declare one ONNX input per feature column: the categorical columns first as
# string tensors, then the numerical columns as float tensors, each with shape
# [None, 1]. The list is built by hand from the two feature-name lists rather
# than with convert_dataframe_schema(X).
inputs = [
    *((name, StringTensorType([None, 1])) for name in categorical_features),
    *((name, FloatTensorType([None, 1])) for name in numerical_features),
]


In [None]:
# Display the declared (column name, tensor type) pairs.
inputs

In [None]:
# Write model
import os

onnx_path = "model/loan_model.onnx"

# Convert the pipeline object to ONNX using the declared input types.
model_onnx = convert_sklearn(pipeline, initial_types=inputs)

# open() does not create missing parent directories, so make sure "model/" exists
# before writing; exist_ok keeps re-runs from failing.
os.makedirs(os.path.dirname(onnx_path), exist_ok=True)
with open(onnx_path, "wb") as f:
    f.write(model_onnx.SerializeToString())

In [None]:
import numpy as np

In [None]:
# Prepare input data
# Build the ONNX feed in a single pass: one (n_rows, 1) array per column of
# X_test, with the numerical columns cast to float32 before reshaping.
input_data = {}
for name in X_test.columns:
    column_values = X_test[name].values
    if name in numerical_features:
        column_values = column_values.astype(np.float32)
    input_data[name] = column_values.reshape((column_values.shape[0], 1))

In [None]:
import onnxruntime as rt
# Load the exported ONNX file into an onnxruntime inference session.
sess = rt.InferenceSession("model/loan_model.onnx")
# None is passed as the output selection and input_data as the feed.
# NOTE(review): the next cell reads pred_onnx[0], presumably the predicted
# labels — confirm against the exported model's outputs.
pred_onnx = sess.run(None, input_data)

In [None]:
# Parity check between the exported ONNX model and the scikit-learn pipeline.
# loss() takes (y_true, y_pred): the scikit-learn predictions are the reference
# and the ONNX predictions are the values under test.
loss(model.predict(X_test), pred_onnx[0])