In [1]:
import pandas as pd
# Read in data
df = pd.read_csv('./data/train.csv')
df.columns = map(str.lower, df.columns)
df.head()

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [2]:
# Fix value from int to float
df["applicantincome"] = df["applicantincome"].astype("float64")

In [3]:
# show null values
df.isnull().sum().sort_values(ascending=False)

credit_history       50
self_employed        32
loanamount           22
dependents           15
loan_amount_term     14
gender               13
married               3
loan_status           0
property_area         0
coapplicantincome     0
applicantincome       0
education             0
loan_id               0
dtype: int64

# Data Deletion

In [4]:
# Drop some features
df.drop(["loan_id", "gender"], axis=1, inplace=True)

# Data Preparation

### Label Encoding

In [5]:
## Label Encoding
# Remove label
# Target is "loan_status"
from sklearn.preprocessing import LabelEncoder  
le = LabelEncoder()
# transform the target column

target_values = {'Y': 0 , 'N' : 1}

target = df['loan_status']
features = df.drop('loan_status', axis=1)

target = target.map(target_values)

### Numeric and categorical features

In [6]:
## Numeric and categorical features
from pandas.api.types import is_numeric_dtype

categorical_features = []
numerical_features = []

for column in features.columns:
    if is_numeric_dtype(features[column]):
        numerical_features.append(column)
    else :
        categorical_features.append(column)
        
print(f"Numerical features: {numerical_features}")
print(f"Categorical features: {categorical_features}")

Numerical features: ['applicantincome', 'coapplicantincome', 'loanamount', 'loan_amount_term', 'credit_history']
Categorical features: ['married', 'dependents', 'education', 'self_employed', 'property_area']


### Missing data

In [7]:
numerical_data = features[numerical_features]
numerical_data = numerical_data.apply(lambda x:x.fillna(x.mean()))
categorical_data = features[categorical_features]
categorical_data = categorical_data.apply(lambda x:x.fillna("-"))

### Data Splitting

In [8]:
X = pd.concat([numerical_data, categorical_data], axis=1)
y = target 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

### Numerical features: Missing data and MinMax Scaling

In [9]:
# MinMax Scaling
numerical_data_std = (numerical_data - numerical_data.min(axis=0)) / (numerical_data.max(axis=0) - numerical_data.min(axis=0))
numerical_scaled = numerical_data_std * (1 - 0) + 1
numerical_scaled.isnull().sum().any() # no more missing data 

False

### Categorical features: Feature Engineering with One-Hot-Encoding

In [10]:
for feature in categorical_features:
    one_hot = pd.get_dummies(categorical_data[feature])
    for column in one_hot.columns:
        categorical_data[f"{feature}_{column}"] = one_hot[column]

categorical_data.drop(categorical_features, axis=1, inplace=True)

In [11]:
categorical_data.head()

Unnamed: 0,married_-,married_No,married_Yes,dependents_-,dependents_0,dependents_1,dependents_2,dependents_3+,education_Graduate,education_Not Graduate,self_employed_-,self_employed_No,self_employed_Yes,property_area_Rural,property_area_Semiurban,property_area_Urban
0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1
1,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0
2,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,1
3,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,1
4,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,1


# Train Model

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score, log_loss, accuracy_score

def loss(y_true, y_pred, retu=False):
    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    loss = log_loss(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)
    
    if retu:
        return pre, rec, f1, loss, acc
    else:
        print('  pre: %.3f\n  rec: %.3f\n  f1: %.3f\n  loss: %.3f\n  acc: %.3f' % (pre, rec, f1, loss, acc))
        
#loss(clf.predict(X_test), y_test)    

# Pipeline

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features)])

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', clf)])

In [23]:
model = pipeline.fit(X_train,y_train)

In [24]:
loss(model.predict(X_test), y_test)

  pre: 0.440
  rec: 0.898
  f1: 0.591
  loss: 6.863
  acc: 0.801


# skl2onnx

In [26]:
!pip install skl2onnx onnxruntime

Collecting skl2onnx
  Downloading skl2onnx-1.7.0-py2.py3-none-any.whl (191 kB)
[K     |████████████████████████████████| 191 kB 1.4 MB/s eta 0:00:01
[?25hCollecting onnxruntime
  Downloading onnxruntime-1.5.1-cp37-cp37m-manylinux2014_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 2.7 MB/s eta 0:00:01     |████████████████████████▎       | 2.9 MB 2.7 MB/s eta 0:00:01
[?25hCollecting onnxconverter-common>=1.5.1
  Downloading onnxconverter_common-1.7.0-py2.py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 1.6 MB/s eta 0:00:01
Collecting onnx>=1.2.1
  Downloading onnx-1.7.0-cp37-cp37m-manylinux1_x86_64.whl (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 10.0 MB/s eta 0:00:01     |███████████████████████████▏    | 6.3 MB 10.0 MB/s eta 0:00:01
Installing collected packages: onnx, onnxconverter-common, skl2onnx, onnxruntime
Successfully installed onnx-1.7.0 onnxconverter-common-1.7.0 onnxruntime-1.5.1 skl2onnx-1.7.0


### Input preparation

In [27]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, StringTensorType, Int64TensorType

def convert_dataframe_schema(df, drop=None):
    inputs = []
    for k, v in zip(df.columns, df.dtypes):
        if drop is not None and k in drop:
            continue
        #if v == 'int64':
        #    t = Int64TensorType([None, 1])
        if v == 'float64':
            t = FloatTensorType([None, 1])
        else:
            t = StringTensorType([None, 1])
        inputs.append((k, t))
    return inputs

inputs = []
for categorical in categorical_features:
    inputs.append((categorical, StringTensorType([None, 1])))
for numerical in numerical_features:
    inputs.append((numerical, FloatTensorType([None, 1])))
#inputs = convert_dataframe_schema(X)


In [28]:
inputs

[('married', StringTensorType(shape=[None, 1])),
 ('dependents', StringTensorType(shape=[None, 1])),
 ('education', StringTensorType(shape=[None, 1])),
 ('self_employed', StringTensorType(shape=[None, 1])),
 ('property_area', StringTensorType(shape=[None, 1])),
 ('applicantincome', FloatTensorType(shape=[None, 1])),
 ('coapplicantincome', FloatTensorType(shape=[None, 1])),
 ('loanamount', FloatTensorType(shape=[None, 1])),
 ('loan_amount_term', FloatTensorType(shape=[None, 1])),
 ('credit_history', FloatTensorType(shape=[None, 1]))]

In [30]:
# Write model
model_onnx = convert_sklearn(pipeline, initial_types=inputs)
with open("model/loan_model.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())

In [31]:
import numpy as np

In [32]:
# Prepare input data
input_data = {column: X_test[column].values for column in X_test.columns}
for numeric in numerical_features:
    input_data[numeric] = input_data[numeric].astype(np.float32)
for categorical in input_data:
    input_data[categorical] = input_data[categorical].reshape((input_data[categorical].shape[0], 1))

In [33]:
import onnxruntime as rt
sess = rt.InferenceSession("model/loan_model.onnx")
pred_onnx = sess.run(None, input_data)

In [34]:
loss(pred_onnx[0], model.predict(X_test))

  pre: 1.000
  rec: 1.000
  f1: 1.000
  loss: 0.000
  acc: 1.000
