In [10]:
#imports
import numpy as np
import pandas as pd
import random
import string

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import os
import csv
import datetime
from datetime import timedelta,datetime, date

from scipy import stats
import pickle

import catboost

import sklearn
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [2]:
# All data and model files below are addressed relative to the "streamlit" folder.
# The guard keeps this cell idempotent: on a second run we are already inside that
# folder, so the plain os.chdir("streamlit") would raise FileNotFoundError.
if os.path.isdir("streamlit"):
  os.chdir("streamlit")
elif os.path.basename(os.getcwd()) != "streamlit":
  raise FileNotFoundError(
      "expected a 'streamlit' directory under {}".format(os.getcwd()))

In [3]:
# Category vocabularies, one label per line. Each line is stripped of surrounding
# whitespace; the files are closed as soon as they have been read.
# NOTE(review): no explicit encoding is passed, so the platform default is used --
# confirm it matches how these text files were saved.
with open("exptypes.txt", "r") as exptypes_file:
  exptypes = [line.strip() for line in exptypes_file]
with open("prodtypes.txt", "r") as prodtypes_file:
  prodtypes = [line.strip() for line in prodtypes_file]

In [4]:
# Load the expense records to score; later cells read its EXP_TYPE and AMOUNT columns.
df = pd.read_csv("test.csv")

In [5]:
# Recode EXP_TYPE as a categorical over the full exptypes list, so that every
# expense type is a known category even when it does not occur in this file.
df.EXP_TYPE = pd.Categorical(df.EXP_TYPE, exptypes)

In [6]:
def transform_to_input_format(df, categories=None):
  """Turn a table of expenses into the single-row feature matrix fed to the models.

  For every expense type the function computes three numbers:

  * AMOUNT_prop  -- share of the total AMOUNT spent on that type,
  * n_prop       -- share of the rows that belong to that type,
  * AMOUNT_per_n -- mean AMOUNT per row of that type.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns "EXP_TYPE" and "AMOUNT". "EXP_TYPE" may be a
      categorical or a plain label column; rows whose label is missing or is
      not listed in `categories` are ignored.
  categories : sequence of labels, optional
      Expense types, in feature order. Defaults to the notebook-level
      `exptypes` list.

  Returns
  -------
  numpy.ndarray of shape (1, 3 * len(categories))
      Laid out as [AMOUNT_prop..., n_prop..., AMOUNT_per_n...]. Expense types
      that do not occur in `df` contribute 0.0 to all three parts, and the
      proportions are 0.0 (not NaN) when the corresponding total is zero.
  """
  if categories is None:
    categories = exptypes  # notebook-level list of expense types
  categories = list(categories)
  nc = len(categories)

  # Group on plain labels and reindex explicitly, instead of relying on how
  # groupby treats unobserved categories of a categorical (that behaviour is
  # version dependent and can leave NaN rows or drop rows altogether).
  keys = df["EXP_TYPE"].astype(object).to_numpy()
  grouped = df["AMOUNT"].groupby(keys, sort=False)
  n = grouped.size().reindex(categories, fill_value=0).to_numpy(dtype=float)
  amount = grouped.sum().reindex(categories, fill_value=0).to_numpy(dtype=float)

  total_amount = amount.sum()
  total_n = n.sum()
  # Guard every division: an empty input or a zero total must give 0.0, not NaN.
  amount_prop = amount / total_amount if total_amount != 0 else np.zeros(nc)
  n_prop = n / total_n if total_n != 0 else np.zeros(nc)
  amount_per_n = np.divide(amount, n, out=np.zeros(nc), where=n > 0)

  return np.concatenate([amount_prop, n_prop, amount_per_n]).reshape(1, 3 * nc)

In [8]:
# Build the single-row feature matrix for the loaded expenses.
datax = transform_to_input_format(df)

In [11]:
def load_pretrained_models(n_models=10):
  """Load the pre-trained classifiers and their threshold statistics from disk.

  Files are read from the current working directory:

  * catboost_p0.cbm ... catboost_p{n_models-1}.cbm -- CatBoost classifiers,
  * logregr_models.pickle -- the logistic-regression models,
  * catboost_thr.pickle / logregr_thr.pickle -- threshold statistics.

  Parameters
  ----------
  n_models : int, optional
      Number of CatBoost model files to load (default 10).

  Returns
  -------
  tuple
      (cb_models, lr_models, cb_thrstats, lr_thrstats)
  """
  def cbload(i):
    model = catboost.CatBoostClassifier()
    model.load_model("catboost_p{}.cbm".format(i))
    return model

  def unpickle(path):
    # NOTE: unpickling can execute arbitrary code -- only load files you trust.
    # The context manager closes the handle even if loading fails.
    with open(path, "rb") as fh:
      return pickle.load(fh)

  cb_models = [cbload(i) for i in range(n_models)]
  lr_models = unpickle("logregr_models.pickle")
  cb_thrstats = unpickle("catboost_thr.pickle")
  lr_thrstats = unpickle("logregr_thr.pickle")
  return (cb_models, lr_models, cb_thrstats, lr_thrstats)

# Load everything once; the cells below read these four names as globals.
cb_models, lr_models, cb_thrstats, lr_thrstats = load_pretrained_models()

In [65]:
def predictions(datax, cb=None, lr=None, n_products=None):
  """Score one feature row with every CatBoost and logistic-regression model.

  Parameters
  ----------
  datax : array-like
      Feature matrix passed unchanged to each model's `predict_proba`; only
      the first row of the result is used.
  cb, lr : indexable collections of fitted models, optional
      Models addressed as `cb[i]` / `lr[i]`. Default to the notebook-level
      `cb_models` and `lr_models`.
  n_products : int, optional
      How many models (indices 0 .. n_products-1) to score. Defaults to
      `len(prodtypes)`.

  Returns
  -------
  pandas.DataFrame
      One row per model index with columns "cb" and "lr", each holding the
      second entry (index 1) of that model's predicted probability vector.
  """
  # Resolve the defaults lazily so the notebook globals are only needed when
  # the caller does not supply the models explicitly.
  if cb is None:
    cb = cb_models
  if lr is None:
    lr = lr_models
  if n_products is None:
    n_products = len(prodtypes)

  def positive_proba(model):
    return model.predict_proba(datax)[0][1]

  return pd.DataFrame({
      "cb": [positive_proba(cb[i]) for i in range(n_products)],
      "lr": [positive_proba(lr[i]) for i in range(n_products)],
    })
  
# Score the feature row with every model: one row per model index, columns "cb" and "lr".
pr = predictions(datax)
pr

Unnamed: 0,cb,lr
0,0.825178,0.166101
1,0.513856,0.207607
2,0.009735,0.044351
3,0.080404,0.114877
4,0.013476,0.100944
5,0.021562,0.047452
6,0.009385,0.036396
7,0.014083,0.021882
8,0.008065,0.033759
9,0.079762,0.000835


In [48]:
def find_approp_thresholds(thrstats, target, what):
  """Pick, for every model, the score threshold whose metric just reaches `target`.

  Parameters
  ----------
  thrstats : indexable
      `thrstats[0]` holds the candidate thresholds, `thrstats[1][i]` the
      precision and `thrstats[2][i]` the recall of model `i` at each of them.
  target : float
      Value the chosen metric has to reach.
  what : int
      Metric that must reach `target`: 1 -- precision, 2 -- recall.

  Returns
  -------
  pandas.DataFrame
      One row per model with columns "thr", "prec" and "recall".

  Selection rule: among the thresholds whose metric is >= `target`, take the
  one with the smallest metric value (the first one on ties). If no threshold
  reaches the target, fall back to the threshold with the largest metric.
  Undefined (NaN) metric values never satisfy the target and are skipped by
  the fallback, so a NaN entry is reported only when the whole curve is NaN.

  Raises
  ------
  ValueError
      If `what` is neither 1 nor 2.
  """
  if what not in (1, 2):
    raise ValueError("what must be 1 (precision) or 2 (recall)")

  thresholds = thrstats[0]
  rows = []
  for i in range(len(thrstats[1])):
    metric = np.asarray(thrstats[what][i], dtype=float)
    meets_target = metric >= target  # NaN compares False
    if meets_target.any():
      candidates = np.flatnonzero(meets_target)
      j = candidates[np.argmin(metric[candidates])]
    elif np.isnan(metric).all():
      j = 0  # nothing usable on this curve; report its first entry
    else:
      # plain argmax would return the position of a NaN, hence nanargmax
      j = np.nanargmax(metric)
    rows.append({"thr": thresholds[j], "prec": thrstats[1][i][j],
                 "recall": thrstats[2][i][j]})
  return pd.DataFrame(rows, columns=["thr", "prec", "recall"])
      

In [51]:
# CatBoost thresholds for a target of 0.9 on metric 1 (precision).
find_approp_thresholds(cb_thrstats, 0.9, 1)

Unnamed: 0,thr,prec,recall
0,0.49,0.900015,0.596045
1,0.45,0.901465,0.756062
2,0.4,0.902136,0.725309
3,0.53,0.901634,0.445041
4,0.38,0.903598,0.444886
5,0.5,0.902088,0.136383
6,0.42,0.902931,0.574913
7,0.39,0.902317,0.655975
8,0.35,0.900002,0.391265
9,0.35,0.902749,0.631253


In [52]:
# Logistic-regression thresholds for a target of 0.9 on metric 1 (precision).
find_approp_thresholds(lr_thrstats, 0.9, 1)

Unnamed: 0,thr,prec,recall
0,0.99,0.9,0.009351
1,0.87,0.901049,0.187999
2,0.78,0.900551,0.259826
3,0.97,0.917081,0.051772
4,0.97,0.911664,0.116204
5,0.74,0.55,0.017124
6,0.95,0.906032,0.149094
7,0.98,0.917483,0.121473
8,0.94,0.9,0.026849
9,0.46,0.902733,0.673388


In [68]:
# Metric the threshold rule has to satisfy: 1 -- precision, 2 -- recall
# (same coding as the `what` argument of find_approp_thresholds).
targmetr = 1
# Value the target metric has to reach.
targmetrval = 0.9
# The remaining metric; used below as a positional column index into the
# threshold tables, whose columns are thr, prec, recall.
metrother = 2 if targmetr == 1 else 1

# NOTE(review): this cell re-defines find_approp_thresholds, which is already
# defined in an earlier cell; the later definition is the one in effect from here
# on. Keep the two copies in sync or drop one of them.
def find_approp_thresholds(thrstats, target, what):
  """Pick, for every model, the score threshold whose metric just reaches `target`.

  Parameters
  ----------
  thrstats : indexable
      `thrstats[0]` holds the candidate thresholds, `thrstats[1][i]` the
      precision and `thrstats[2][i]` the recall of model `i` at each of them.
  target : float
      Value the chosen metric has to reach.
  what : int
      Metric that must reach `target`: 1 -- precision, 2 -- recall.

  Returns
  -------
  pandas.DataFrame
      One row per model with columns "thr", "prec" and "recall".

  Selection rule: among the thresholds whose metric is >= `target`, take the
  one with the smallest metric value (the first one on ties). If no threshold
  reaches the target, fall back to the threshold with the largest metric.
  Undefined (NaN) metric values never satisfy the target and are skipped by
  the fallback, so a NaN entry is reported only when the whole curve is NaN.

  Raises
  ------
  ValueError
      If `what` is neither 1 nor 2.
  """
  if what not in (1, 2):
    raise ValueError("what must be 1 (precision) or 2 (recall)")

  thresholds = thrstats[0]
  rows = []
  for i in range(len(thrstats[1])):
    metric = np.asarray(thrstats[what][i], dtype=float)
    meets_target = metric >= target  # NaN compares False
    if meets_target.any():
      candidates = np.flatnonzero(meets_target)
      j = candidates[np.argmin(metric[candidates])]
    elif np.isnan(metric).all():
      j = 0  # nothing usable on this curve; report its first entry
    else:
      # plain argmax would return the position of a NaN, hence nanargmax
      j = np.nanargmax(metric)
    rows.append({"thr": thresholds[j], "prec": thrstats[1][i][j],
                 "recall": thrstats[2][i][j]})
  return pd.DataFrame(rows, columns=["thr", "prec", "recall"])
# Per-model threshold tables (columns thr, prec, recall) for the configured target:
# cbt for the CatBoost models, lrt for the logistic-regression models.
cbt = find_approp_thresholds(cb_thrstats, targmetrval, targmetr)
lrt = find_approp_thresholds(lr_thrstats, targmetrval, targmetr)

print(cbt)
print(lrt)

    thr      prec    recall
0  0.49  0.900015  0.596045
1  0.45  0.901465  0.756062
2  0.40  0.902136  0.725309
3  0.53  0.901634  0.445041
4  0.38  0.903598  0.444886
5  0.50  0.902088  0.136383
6  0.42  0.902931  0.574913
7  0.39  0.902317  0.655975
8  0.35  0.900002  0.391265
9  0.35  0.902749  0.631253
    thr      prec    recall
0  0.99  0.900000  0.009351
1  0.87  0.901049  0.187999
2  0.78  0.900551  0.259826
3  0.97  0.917081  0.051772
4  0.97  0.911664  0.116204
5  0.74  0.550000  0.017124
6  0.95  0.906032  0.149094
7  0.98  0.917483  0.121473
8  0.94  0.900000  0.026849
9  0.46  0.902733  0.673388


In [74]:
# For every product pick the model whose threshold rule scores higher on the
# non-target metric (CatBoost only on a strict win, otherwise logistic regression)
# and recommend the product when that model's score reaches its threshold.
# NOTE(review): the comparison looks only at the non-target metric; it does not
# check whether the chosen rule actually reached the target value -- confirm
# that this is intended.
r = []
for i in range(pr.shape[0]):
  use_cb = cbt.iloc[i, metrother] > lrt.iloc[i, metrother]
  if use_cb:
    score, cutoff = pr.cb[i], cbt.thr[i]
  else:
    score, cutoff = pr.lr[i], lrt.thr[i]
  if score >= cutoff:
    r.append(prodtypes[i])
print(r)

['Автокредитование', 'Вклады']


In [77]:
# CatBoost view: per product, the score next to the threshold rule it is compared with.
pd.DataFrame({
    "product": prodtypes,
    "score": pr["cb"],
    "score_threshold": cbt["thr"],
    "rule precision": cbt["prec"],
    "rule recall": cbt["recall"],
  })

Unnamed: 0,product,score,score_threshold,rule precision,rule recall
0,Автокредитование,0.825178,0.49,0.900015,0.596045
1,Вклады,0.513856,0.45,0.901465,0.756062
2,Инвестиции,0.009735,0.4,0.902136,0.725309
3,Кредиты,0.080404,0.53,0.901634,0.445041
4,"Лайфстайл (бронирование столиков, покупка биле...",0.013476,0.38,0.903598,0.444886
5,Накопительные счета,0.021562,0.5,0.902088,0.136383
6,Прайват обслуживание,0.009385,0.42,0.902931,0.574913
7,Премиальное обслуживание,0.014083,0.39,0.902317,0.655975
8,Страхование,0.008065,0.35,0.900002,0.391265
9,Услуги для бизнеса,0.079762,0.35,0.902749,0.631253


In [78]:
# Logistic-regression view: per product, the score next to the threshold rule it is compared with.
pd.DataFrame({
    "product": prodtypes,
    "score": pr["lr"],
    "score_threshold": lrt["thr"],
    "rule precision": lrt["prec"],
    "rule recall": lrt["recall"],
  })

Unnamed: 0,product,score,score_threshold,rule precision,rule recall
0,Автокредитование,0.166101,0.99,0.9,0.009351
1,Вклады,0.207607,0.87,0.901049,0.187999
2,Инвестиции,0.044351,0.78,0.900551,0.259826
3,Кредиты,0.114877,0.97,0.917081,0.051772
4,"Лайфстайл (бронирование столиков, покупка биле...",0.100944,0.97,0.911664,0.116204
5,Накопительные счета,0.047452,0.74,0.55,0.017124
6,Прайват обслуживание,0.036396,0.95,0.906032,0.149094
7,Премиальное обслуживание,0.021882,0.98,0.917483,0.121473
8,Страхование,0.033759,0.94,0.9,0.026849
9,Услуги для бизнеса,0.000835,0.46,0.902733,0.673388
