In [1]:
from optbinning import OptimalBinning

from os import listdir
from os.path import isfile, join
from datetime import datetime, timedelta
import time
import sys

import pandas as pd
import numpy as np
import random

from scipy import stats
from scipy.stats import chi2_contingency
from scipy.stats import chi2

from optbinning import OptimalBinning
from catboost import CatBoostClassifier, Pool, cv

import scikitplot as skplt
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from bayes_opt import BayesianOptimization
import catboost
import pickle
import csv
import scipy as sc

## Create a dataset

In [41]:
def create_df(seed):
    np.random.seed(seed)
    x1=np.random.normal(0,1,1000)
    x2=np.random.normal(0,1,1000)
    x3=np.random.normal(0,1,1000)
    x4=np.random.normal(0,1,1000)/5

    z=1 + 2*x1 + x2 + 2*x2*x3 +x4
    pr=1/(1+np.exp(-z))
    y=sc.stats.binom.rvs(1, pr, size=1000)
    y1= z+x4
    df = pd.DataFrame(data={'y':y, 'y1': y1,'x1':x1, 'x2':x2, 'x3':x3, 'x4':x4})
    return [df,z]

In [42]:
df = create_df(30)[0]

In [43]:
df['x5'] = 'B'
df.loc[df.x1>0.1,'x5'] = 'B'
df.loc[df.x1>0.4,'x5'] = 'C'
df.loc[df.x1>0.6,'x5'] = 'D'

In [44]:
#data = df['data'].copy()
#data.columns = df['feature_names'].copy()
#df_view = pd.DataFrame(data, columns = list(df['feature_names']))
#df_view['target'] = df['target'].copy()

In [45]:
df_view = df.copy()

In [46]:
df_view.shape

(1000, 7)

In [47]:
df_types = df_view.dtypes
df_types.value_counts()

float64    5
object     1
int64      1
dtype: int64

In [48]:
categorical_cols = df_types[df_types=='object']
categorical_cols = list(categorical_cols.index)
categorical_cols = [x for x in categorical_cols if x!='uuid']
categorical_cols

['x5']

In [49]:
numerical_cols = df_types[df_types!='object']
numerical_cols = list(numerical_cols.index)
numerical_cols = [x for x in numerical_cols if x!='default']
to_cat = [x for x in numerical_cols if 'status' in x] 
categorical_cols = categorical_cols + to_cat
#numerical_cols = list(set(numerical_cols)-set(ordinal_cols))
numerical_cols = list(set(numerical_cols)-set(categorical_cols))

In [50]:
#numerical_cols.pop('y')
pos = numerical_cols.index("y")
numerical_cols.pop(pos)
numerical_cols

['x4', 'x3', 'x2', 'x1', 'y1']

In [51]:
df_view= df_view[df_view.y.isnull()==False]
random.seed(1)
df_view['random'] = np.random.randint(1, df_view.shape[0], df_view.shape[0])/df_view.shape[0]
porc_train=0.6
porc_test=0.2

df_view['flag_train'] = 0 #train
df_view['flag_train'] = ((df_view['random'] > porc_train)).astype('int') #eval
df_view.loc[df_view['random'] > (1-porc_test),'flag_train']=2 #test

df_view.flag_train.value_counts()

0    613
2    199
1    188
Name: flag_train, dtype: int64

In [52]:
#print(categorical_cols)
predictors = numerical_cols + categorical_cols 
end_num = len(numerical_cols)
end_cat = len(predictors)
predictors

['x4', 'x3', 'x2', 'x1', 'y1', 'x5']

In [53]:
l = predictors.copy()
l.extend(['y', 'flag_train'])
#print(l)
for x in categorical_cols:
    df_view[x] = df_view[x].astype(str)
df_view2 = df_view[l].copy()

cat_features = [df_view2.columns.get_loc(c) for c in categorical_cols  if c in df_view2]
cat_features

[5]

In [54]:

X_train = df_view2[(df_view2['flag_train'] ==0)]
y_train = X_train.pop('y')

X_val = df_view2[(df_view2['flag_train'] ==1)]
y_val = X_val.pop('y')

X_test = df_view2[(df_view2['flag_train'] ==2)]
y_test = X_test.pop('y')

x_train = X_train[predictors].values

__Save dataset for test__

In [56]:
df_view2[(df_view2['flag_train'] == 2)].to_csv('X_test.csv', index = False)


In [None]:
def catboost_classifier(depth, l2_leaf_reg, num_boost_round, subsample):
    params = {
        "loss_function": "Logloss",
        "eval_metric" : "AUC", 
        "depth" : int(depth),
        "min_data_in_leaf": 100,
        "l2_leaf_reg" : int(l2_leaf_reg),
        "learning_rate" : 0.01,
        "random_state" : 42,
        "logging_level" : "Silent",
        "thread_count": 24,
        "num_boost_round": int(num_boost_round),
        "subsample": float(subsample)
    }
    train_data = catboost.Pool(data=x_train, 
                               label=y_train, 
                               cat_features=cat_features)
    cv_result = catboost.cv(
                       train_data,
                       params,
                       early_stopping_rounds=20,
                       stratified=True,
                       nfold=3)
    return cv_result['test-AUC-mean'].iloc[-1]

catboostBO = BayesianOptimization(catboost_classifier, {
                                                'depth': (3,  10),
                                                'l2_leaf_reg': (2, 9),
                                                'num_boost_round': (100, 1000),
                                                'subsample':(0.2,.9)
                                                })


In [None]:
model = CatBoostClassifier(thread_count=5, 
                           max_depth=10, 
                           loss_function='Logloss' ,
                           verbose=100,
                           eval_metric= "AUC",#"CrossEntropy",
                           #early_stopping_rounds=20,
                           random_seed=42,
                           learning_rate=0.01,
                           l2_leaf_reg=5.9,
                           min_data_in_leaf=100,
                           iterations=900,
                           subsample=0.4)

model.fit(x_train,  y_train,  eval_set = (X_val[predictors].values, y_val),   cat_features = cat_features, plot=True)

In [None]:
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

filename = "predictors.csv"
with open(filename, 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(predictors)

filename = "to_cat.csv"
with open(filename, 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(categorical_cols)   

## Create pyhon file

In [6]:
%%writefile model.py
import numpy as np
import csv
import os
import pandas as pd
import pickle
default_model_path = '/opt/ml'

model_cache = {}

def load_model(algorithm, model_path):
    if model_cache.get(algorithm) is None:
        model_filename = os.path.join(model_path, 'model.pkl')
        with open(model_filename, newline='') as file:
            model_cache[algorithm] = pickle.load(open(model_filename, 'rb'))
    
    return model_cache[algorithm]


def __read_csv_list(filename):
    with open(filename, newline='') as file:
        reader = csv.reader(file)
        return list(reader)[0]
        
    return []

predictors_cache = {}

def load_predictors(algorithm, model_path):
    if predictors_cache.get(algorithm) is None:
        predictors_filename = os.path.join(model_path, 'predictors.csv')
        predictors_cache[algorithm] = __read_csv_list(predictors_filename)
            
    return predictors_cache[algorithm]

to_cat_cache = {}

def load_to_cat(algorithm, model_path):
    if to_cat_cache.get(algorithm) is None:
        to_cat_filename = os.path.join(model_path, 'to_cat.csv')
        to_cat_cache[algorithm] = __read_csv_list(to_cat_filename)
            
    return to_cat_cache[algorithm]


def predict(data, model_path = default_model_path):
    algorithm = "algorithm_catboost"
    
    model = load_model(algorithm, model_path)
    predictors = load_predictors(algorithm, model_path)
    to_cat = load_to_cat(algorithm, model_path)
    
    if data.shape[0] == 0:
        return pd.DataFrame()

    for x in to_cat:
        data[x] = data[x].astype(str)

    y_pred_probs = model.predict_proba( data[predictors].values )
    probabilities = [item[1] for item in y_pred_probs]
    data['pd'] = probabilities
    
    
    return data

Overwriting model.py


__Test model__

In [57]:
import model
import os
import pandas as pd
#model_path = os.path.dirname(os.getcwd())
model_path = os.getcwd()
ret = None
with open(os.path.join(model_path,'X_test.csv'), newline='') as file:
    data = pd.read_csv(file, sep=',', low_memory=False, error_bad_lines=False)
    ret = model.predict(data, model_path)
ret

Unnamed: 0,x4,x3,x2,x1,y1,x5,y,flag_train,pd
0,-0.010777,-0.732013,1.134899,1.527905,3.507635,D,1,2,0.624604
1,0.507330,0.359118,0.526648,-0.100697,2.718173,B,1,2,0.592694
2,0.165429,-0.450724,1.590835,0.303793,2.095225,B,1,2,0.592271
3,-0.337947,0.538282,1.398086,-0.764048,1.699226,B,1,2,0.587154
4,-0.204668,0.223539,-0.050285,1.064482,2.646861,D,1,2,0.623080
...,...,...,...,...,...,...,...,...,...
194,0.270106,-1.220430,-1.789519,0.855495,5.829651,D,1,2,0.617781
195,-0.046588,1.232337,0.883816,0.783989,5.536935,D,1,2,0.618870
196,-0.302814,-1.168124,-1.160325,-1.800726,-1.656599,B,1,2,0.433109
197,-0.068465,2.190936,-1.399620,1.002345,-4.664816,D,0,2,0.432097


# Create web server app
__Bottle__ is the webservice api

__bjoern__ is the WSGI server

In [64]:
%%writefile app.py
import pandas as pd
import pickle
import csv
import sys
import os
import io
import bjoern
import bottle
from bottle import run, request, post, get

# adds the model.py path to the list
model_path = os.path.dirname(os.getcwd())
if 'MODEL_PATH' in os.environ:
    model_path = os.environ['MODEL_PATH']

sys.path.insert(0,model_path)

import model

@get('/ping')
def ping():
    return "Ok"

@post('/invocations')
def invoke():
    # load image from POST and convert it to json
    try:
        req = request.body

        data = pd.read_csv(req, sep=',', low_memory=False, error_bad_lines=False)
        predictions = model.predict(data, model_path)

        return predictions.to_csv(sep=',', index=False)
    except Exception as e:
        print(e)
        return bottle.HTTPResponse(status=500)
    

if __name__ == '__main__':
    if len(sys.argv) < 2 or ( not sys.argv[1] in [ "serve", "train"] ):
        raise Exception("Invalid argument: you must inform 'train' for fake training mode or 'serve' predicting mode") 

    train = (sys.argv[1] == "train")
    
    if train:
        print( "Fake training completed" )
       
    else:
        print("Server started")
        if 'PORT' in os.environ: 
            port = int(os.environ['PORT'])
        else:
            port = 8080
        
        print(f"Port: {port}")
        print(f"Model path: {model_path}")
        bjoern.run(bottle.app(), "0.0.0.0", port)
        
        

Overwriting app.py


## Create Dockerfile

In [66]:
%%writefile Dockerfile
FROM python:3.9.1

RUN apt-get update -y && apt-get install -y libev-dev
RUN pip install bottle
RUN pip install bjoern
RUN pip install pandas==1.2.2
RUN pip install numpy==1.20.1
RUN pip install catboost==0.24.4

RUN mkdir -p /opt/program
RUN mkdir -p /opt/ml

ENV PYTHONUNBUFFERED=TRUE
ENV PYTHONDONTWRITEBYTECODE=TRUE
ENV PATH="/opt/program:${PATH}"
ENV MODEL_PATH='/opt/ml'

COPY app.py /opt/program
COPY model.py /opt/ml
COPY model.pkl /opt/ml
COPY predictors.csv /opt/ml
COPY to_cat.csv /opt/ml

WORKDIR /opt/program

ENTRYPOINT ["python", "app.py"]

Overwriting Dockerfile


## Build docker image

In [67]:
!docker build -t containerdocker:v0.0.1 .

Sending build context to Docker daemon  12.74MB
Step 1/20 : FROM python:3.9.1
 ---> 2a93c239d591
Step 2/20 : RUN apt-get update -y && apt-get install -y libev-dev
 ---> Using cache
 ---> 6596e41441ae
Step 3/20 : RUN pip install bottle
 ---> Using cache
 ---> 9c477085055a
Step 4/20 : RUN pip install bjoern
 ---> Using cache
 ---> 8458496a66ce
Step 5/20 : RUN pip install pandas==1.2.2
 ---> Using cache
 ---> f114805d12d7
Step 6/20 : RUN pip install numpy==1.20.1
 ---> Using cache
 ---> 7e192144e093
Step 7/20 : RUN pip install catboost==0.24.4
 ---> Using cache
 ---> 9d83467dc052
Step 8/20 : RUN mkdir -p /opt/program
 ---> Using cache
 ---> 6efac8035fb6
Step 9/20 : RUN mkdir -p /opt/ml
 ---> Using cache
 ---> df2caa0db5de
Step 10/20 : ENV PYTHONUNBUFFERED=TRUE
 ---> Using cache
 ---> 2a4e8d9623a2
Step 11/20 : ENV PYTHONDONTWRITEBYTECODE=TRUE
 ---> Using cache
 ---> cd203f5fb80e
Step 12/20 : ENV PATH="/opt/program:${PATH}"
 ---> Using cache
 ---> 623ce389700e
Step 13/20 : ENV MODEL_PATH='/

In [68]:
!docker run -p 8081:8080 -d --rm --name test_img containerdocker:v0.0.1 serve

28b195383cdb04232d3741a3e48af88022c672e9a4e32cf2b80c76a7d798587b


In [69]:
import urllib.request
contents = urllib.request.urlopen("http://localhost:8081/ping").read()
contents

b'Ok'

In [91]:
with open(os.path.join(model_path,'X_test.csv'), newline='') as file:
    #data = pd.read_csv(file, sep=',', low_memory=False, error_bad_lines=False)
    data = file.read().encode('utf-8')

    # create request
    req = urllib.request.Request("http://localhost:8081/invocations", data = data)
    

# execute request
resp = str(urllib.request.urlopen(req).read(),'utf-8')
resp = StringIO(resp) 
res = pd.read_csv(resp)
res

Unnamed: 0,x4,x3,x2,x1,y1,x5,y,flag_train,pd
0,-0.010777,-0.732013,1.134899,1.527905,3.507635,D,1,2,0.624604
1,0.507330,0.359118,0.526648,-0.100697,2.718173,B,1,2,0.592694
2,0.165429,-0.450724,1.590835,0.303793,2.095225,B,1,2,0.592271
3,-0.337947,0.538282,1.398086,-0.764048,1.699226,B,1,2,0.587154
4,-0.204668,0.223539,-0.050285,1.064482,2.646861,D,1,2,0.623080
...,...,...,...,...,...,...,...,...,...
194,0.270106,-1.220430,-1.789519,0.855495,5.829651,D,1,2,0.617781
195,-0.046588,1.232337,0.883816,0.783989,5.536935,D,1,2,0.618870
196,-0.302814,-1.168124,-1.160325,-1.800726,-1.656599,B,1,2,0.433109
197,-0.068465,2.190936,-1.399620,1.002345,-4.664816,D,0,2,0.432097
