In [None]:
import pandas as pd
import numpy as np
import math
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import Lasso, LogisticRegression, RidgeClassifier, Ridge
from sklearn.feature_selection import RFE
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler, BorderlineSMOTE, SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier
#import xgboost as xgb

In [None]:
# Let wide/long frames render in full (up to 150 rows and 150 columns).
for _option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option_name, 150)

In [None]:
df = pd.read_csv('data/bank_data_train.zip', compression='zip')

In [None]:
df.describe()

In [None]:
# Monetary columns that are log-transformed below.
cols_to_log = ['REST_AVG_CUR', 'TURNOVER_CC', 'TURNOVER_PAYM', 'REST_AVG_PAYM', 'LDEAL_AMT_MONTH', 'CLNT_SALARY_VALUE']

# log(1 + x), vectorised over all columns at once; log1p is also more accurate
# than log(x + 1) for values close to zero.
# NOTE: this cell is not idempotent - re-running it applies the transform again.
df[cols_to_log] = np.log1p(df[cols_to_log])

# Summary statistics of the transformed columns (cell output).
df[cols_to_log].describe()

In [None]:
df[cols_to_log].hist(figsize=(20,7));

In [None]:
df[cols_to_log].hist(figsize=(20,7));

In [None]:
df.set_index('ID', inplace=True)
df.drop_duplicates(inplace=True)
df.head()

In [None]:
binary = ['CNT_ACCEPTS_TK', 'PRC_ACCEPTS_A_MTP', 'PRC_ACCEPTS_A_AMOBILE', 'PRC_ACCEPTS_A_ATM', 'PRC_ACCEPTS_A_TK', 
          'PRC_ACCEPTS_A_EMAIL_LINK', 'PRC_ACCEPTS_MTP', 'PRC_ACCEPTS_A_POS', 'CNT_ACCEPTS_MTP']#, 'PRC_ACCEPTS_TK']
df['PRC_ACCEPTS_TK'] = df['PRC_ACCEPTS_TK'].fillna(1).astype(int)
df.drop(binary, axis=1, inplace=True)

In [None]:
# Share of non-null values per column, highest first.
# mean() of the boolean mask equals sum() / len(df).
# sort_values takes keyword arguments only: the former positional `0` was the
# `axis` argument, which pandas 2 no longer accepts positionally.
df_not_null = df.notnull().mean().sort_values(ascending=False)
df_not_null.plot(kind='bar', figsize=(20,5), title='Features with not null values');

In [None]:
#df_not_null = pd.DataFrame(df_not_null, columns=['not_null'])
#features_drop = list(df_not_null[df_not_null['not_null'] < 0.1].index)
#df.drop(features_drop, axis=1, inplace=True)
#df.fillna(df.mean(), inplace=True)

In [None]:
to_int = ['APP_REGISTR_RGN_CODE']
df[to_int] = df[to_int].fillna(-1).astype(int)

In [None]:
# Columns treated as categorical (stored as upper-case strings from here on).
category = ['APP_CAR', 'CLNT_JOB_POSITION_TYPE', 'APP_EMP_TYPE', 'APP_TRAVEL_PASS', 'APP_KIND_OF_PROP_HABITATION', 
            'APP_DRIVING_LICENSE', 'APP_POSITION_TYPE', 'APP_MARITAL_STATUS', 'APP_EDUCATION', 'CLNT_JOB_POSITION', 
            'CLNT_TRUST_RELATION', 'APP_REGISTR_RGN_CODE', 'APP_COMP_TYPE', 'PACK', 'TARGET']

# Missing values must be replaced using the ORIGINAL null mask: astype(str)
# turns NaN into the literal string 'nan', so a fillna() after the cast never
# fills anything.
_category_raw = df[category]
df[category] = _category_raw.astype(str).where(_category_raw.notna(), 'NULL')

# Russian relation labels (as found in CLNT_TRUST_RELATION) -> English buckets.
relations = {'мать': 'mother', 'отец': 'father', 'сын': 'son', 'сестра': 'sister', 'брат': 'brother', 'дочь': 'daughter',
             'друг': 'friend', 'жена': 'relative', 'муж': 'relative', 'близкий ро': 'relative', 'дальний ро': 'other'}

# Lower-case first so the mapping keys match regardless of the source casing;
# replace() only touches values that equal a key exactly.
df['CLNT_TRUST_RELATION'] = df['CLNT_TRUST_RELATION'].str.lower().replace(relations)

# Finally normalise every categorical column to upper case.
for col in category:
    df[col] = df[col].str.upper()

In [None]:
# Substring -> canonical form for job titles. Entries are applied in insertion
# order, so a later entry sees the output of the earlier ones.
job_list = {
    'ЗАМЕСТИТЕЛЬ': 'ЗАМ. ',
    'ЗАМ ': 'ЗАМ. ',
    'ГЕНЕРАЛЬНЫЙ': 'ГЕН. ',
    'ГЕНЕРАЛЬНОГО': 'ГЕН. ',
    'ГЕН ': 'ГЕН. ',
    'ПРОГРАМИСТ': 'ПРОГРАММИСТ',
    'ГЛ.': 'ГЛАВНЫЙ ',
    'КОМЕРЧЕСКИЙ': 'КОМ.',
    'КОММЕРЧЕСКИЙ': 'КОМ.',
    'ИНДИВИДУАЛЬНЫЙ ПРЕДПРИНИМАТЕЛЬ': 'ИП',
}


def replace_job(job):
    """Return `job` with every `job_list` key replaced by its canonical value.

    The replacements are plain substring substitutions applied one after the
    other in the dictionary's insertion order. Whitespace is not collapsed here.
    """
    normalised = job
    for pattern in job_list:
        normalised = normalised.replace(pattern, job_list[pattern])
    return normalised

In [None]:
# Two passes over the job titles:
#   1. canonicalise known abbreviations / misspellings (replace_job);
#   2. put a space after every dot, collapse whitespace runs, trim the ends.
df['CLNT_JOB_POSITION'] = (
    df['CLNT_JOB_POSITION']
    .map(replace_job)
    .map(lambda title: re.sub(r'\s+', ' ', str(title).replace('.', '. ')).strip())
)

In [None]:
# The 100 most frequent job titles; value_counts() is already sorted by
# frequency, so its index can be read directly (no reset_index()['index'],
# whose column name depends on the pandas version).
top = df['CLNT_JOB_POSITION'].value_counts().head(100).index.values

# Keep the frequent titles and bucket everything else as "ПРОЧЕЕ".
# The previous version appended .value_counts() to the assignment, which
# replaced the column with a frequency table instead of the bucketed titles.
df['CLNT_JOB_POSITION'] = df['CLNT_JOB_POSITION'].where(df['CLNT_JOB_POSITION'].isin(top), "ПРОЧЕЕ")

# Frequency table of the bucketed titles (cell output).
df['CLNT_JOB_POSITION'].value_counts()

In [None]:
cnt_prod = ['CR_PROD_CNT_CC','CR_PROD_CNT_CCFP','CR_PROD_CNT_IL','CR_PROD_CNT_PIL','CR_PROD_CNT_TOVR','CR_PROD_CNT_VCU']

In [None]:
df.shape

# Drop the columns listed in `corr_features`; errors='ignore' skips labels that
# are not present in df.
# NOTE(review): `corr_features` is not assigned in any cell visible in this
# notebook, so on a fresh kernel this line raises NameError - confirm where the
# list is supposed to come from.
df.drop(corr_features, axis=1, inplace=True, errors='ignore')

In [None]:
df['TARGET'].value_counts().plot(kind='barh')

In [None]:
na_avg = ['DEAL_YQZ_IR_MAX', 'DEAL_YQZ_IR_MIN', 'DEAL_YWZ_IR_MAX', 'DEAL_YWZ_IR_MIN']

In [None]:
%%time
# Impute missing values column by column:
#   * float64 columns listed in `na_avg` get the column median;
#   * every other float64 column gets 0;
#   * non-float columns that still contain nulls are only reported (printed).
for col in df.columns:
    if not df[col].isnull().any():
        continue
    if df[col].dtype == np.float64:
        fill_value = df[col].median() if col in na_avg else 0
        # Assign the result back: `df[col].fillna(..., inplace=True)` works on a
        # temporary Series and is not guaranteed to modify df.
        df[col] = df[col].fillna(fill_value)
    else:
        print(col)

In [None]:
# One boolean column 'null': True for every df column that still contains NaN.
# (A set is not a valid `columns` argument for the DataFrame constructor, hence
# to_frame with an explicit name.)
with_null = df.isnull().any().to_frame('null')

In [None]:
col = sorted(with_null[with_null['null']].index)

In [None]:
df[col].min()

In [None]:
set(df[col].min().index)

In [None]:
df.describe()

In [None]:
# Columns currently excluded from the model. The list is mutated later by the
# stepwise selection loop (append / remove), so every name must occur only
# once: 'TURNOVER_CC' used to be listed twice, and list.remove() drops a single
# occurrence, which left the column excluded after it was "added back".
not_imp = [
    'AMOUNT_RUB_CLO_PRC', 'AMOUNT_RUB_SUP_PRC', 'APP_CAR', 'APP_COMP_TYPE', 'APP_EDUCATION', 'APP_EMP_TYPE',
    'APP_KIND_OF_PROP_HABITATION', 'APP_MARITAL_STATUS', 'APP_POSITION_TYPE', 'APP_REGISTR_RGN_CODE',
    'APP_TRAVEL_PASS', 'AVG_PCT_MONTH_TO_PCLOSE', 'CLNT_JOB_POSITION_TYPE', 'CLNT_JOB_POSITION',
    'CLNT_SALARY_VALUE', 'CLNT_SETUP_TENOR', 'CLNT_TRUST_RELATION', 'CNT_TRAN_ATM_TENDENCY1M',
    'CNT_TRAN_CLO_TENDENCY3M', 'TURNOVER_DYNAMIC_CC_3M', 'CNT_TRAN_CLO_TENDENCY1M', 'TURNOVER_CC',
    'CNT_TRAN_ATM_TENDENCY3M', 'CNT_TRAN_AUT_TENDENCY1M', 'CNT_TRAN_AUT_TENDENCY3M',
    'CNT_TRAN_MED_TENDENCY1M', 'CNT_TRAN_MED_TENDENCY3M', 'CNT_TRAN_SUP_TENDENCY1M',
    'CNT_TRAN_SUP_TENDENCY3M', 'CR_PROD_CNT_CC', 'CR_PROD_CNT_IL', 'CR_PROD_CNT_PIL', 'CR_PROD_CNT_TOVR',
    'CR_PROD_CNT_VCU', 'DEAL_GRACE_DAYS_ACC_AVG', 'DEAL_GRACE_DAYS_ACC_MAX', 'DEAL_GRACE_DAYS_ACC_S1X1',
    'DEAL_YQZ_IR_MAX', 'DEAL_YQZ_IR_MIN',
    'DEAL_YWZ_IR_MIN',
    'LDEAL_ACT_DAYS_ACC_PCT_AVG',
    'LDEAL_ACT_DAYS_PCT_CURR',
    'LDEAL_ACT_DAYS_PCT_TR',
    'LDEAL_ACT_DAYS_PCT_TR3',
    'LDEAL_AMT_MONTH',
    'LDEAL_DELINQ_PER_MAXYQZ',
    'LDEAL_DELINQ_PER_MAXYWZ',
    'LDEAL_GRACE_DAYS_PCT_MED',
    'LDEAL_TENOR_MAX',
    'LDEAL_TENOR_MIN',
    'LDEAL_USED_AMT_AVG_YQZ',
    'LDEAL_USED_AMT_AVG_YWZ',
    'LDEAL_YQZ_CHRG',
    'LDEAL_YQZ_COM',
    'LDEAL_YQZ_PC',
    'MAX_PCLOSE_DATE',
    'MED_DEBT_PRC_YQZ',
    'MED_DEBT_PRC_YWZ',
    'PRC_ACCEPTS_TK',
    'REST_DYNAMIC_CC_1M',
    'REST_DYNAMIC_CUR_3M',
    'REST_DYNAMIC_FDEP_1M',
    'REST_DYNAMIC_FDEP_3M',
    'REST_DYNAMIC_IL_3M',
    'REST_DYNAMIC_PAYM_3M',
    'SUM_TRAN_ATM_TENDENCY3M',
    'SUM_TRAN_AUT_TENDENCY1M',
    'SUM_TRAN_AUT_TENDENCY3M',
    'SUM_TRAN_CLO_TENDENCY1M',
    'SUM_TRAN_CLO_TENDENCY3M',
    'SUM_TRAN_MED_TENDENCY3M',
    'SUM_TRAN_SUP_TENDENCY3M',
    'TRANS_CNT_TENDENCY3M',
    'TRANS_COUNT_ATM_PRC',
    'TRANS_COUNT_NAS_PRC',
    'TRANS_COUNT_SUP_PRC',
    'TURNOVER_DYNAMIC_CC_1M',
    'TURNOVER_DYNAMIC_PAYM_1M',
    'TURNOVER_DYNAMIC_PAYM_3M',
    'TURNOVER_PAYM', 'REST_DYNAMIC_CC_3M',
    'APP_DRIVING_LICENSE', 'SUM_TRAN_MED_TENDENCY1M',
    'TRANS_AMOUNT_TENDENCY3M',
    'REST_DYNAMIC_SAVE_3M',
    'AMOUNT_RUB_NAS_PRC',
    'LDEAL_ACT_DAYS_PCT_TR4', 'CR_PROD_CNT_CCFP',
]


%%time
# One-hot encode the columns that are still in play, into a SEPARATE frame.
# Rebinding `df` here removed the `not_imp` columns (and one-hot encoded the
# string TARGET), so every later `df.drop(not_imp, axis=1)` / `df['TARGET']`
# in the notebook failed with KeyError.
df_encoded = pd.get_dummies(df.drop(not_imp, axis=1))
df_encoded.shape

In [None]:
df.describe(include=[object])

In [None]:
# Feature matrix: selected columns, one-hot encoded (first level dropped), as floats.
X_raw = pd.get_dummies(df.drop(not_imp, axis=1).drop('TARGET', axis=1), drop_first=True).to_numpy(dtype=float)
y = df['TARGET'].to_numpy()

# Split BEFORE scaling so the scaler statistics come from the training rows
# only; fitting it on all rows leaks test-set information into training.
X_train, X_test, y_train, y_test = train_test_split(X_raw, y, test_size=0.20, stratify=y, random_state=42)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Scaled copy of the full data set, used by later cells that score every row.
X = sc.transform(X_raw)

In [None]:
%%time
# Class-weighted random forest with hand-tuned hyper-parameters.
rf = RandomForestClassifier(n_estimators=51, max_depth=24, class_weight='balanced_subsample', n_jobs=-1, random_state=42,
                           min_samples_leaf=13, min_samples_split=162)
rf.fit(X_train, y_train)

# ROC AUC is a ranking metric and needs a continuous score: use the predicted
# probability of the positive class (second column, i.e. rf.classes_[1]).
# Scoring the hard labels from predict() collapses the curve to a single point.
# Earlier runs scored on hard labels gave 0.7337-0.7368; those numbers are not
# comparable with the value returned here.
roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

In [None]:
# Per-row diagnostics on the full data set: positive-class probability,
# predicted label and true label.
# (Built from a dict: a set is not a valid `columns` argument.)
proba = pd.DataFrame({'proba': rf.predict_proba(X)[:, 1]})
proba['predict'] = rf.predict(X)
proba['y'] = y

In [None]:
proba['diff'] = proba['predict'].astype(int) + proba['y'].astype(int)

In [None]:
proba[(proba['diff'] == 1)].sort_values('proba')

In [None]:
# ROC AUC over ALL rows (training rows included, so this is optimistic).
# Uses the positive-class probability rather than hard labels.
roc_auc_score(y, rf.predict_proba(X)[:, 1])

In [None]:
proba[(proba['diff'] == 1)].median()

In [None]:
plt.boxplot(proba[(proba['diff'] == 1)]['proba'].values);

In [None]:
y

In [None]:
def calc_score(df, max_depth=10, n_estimators=55, min_samples_leaf=10, min_samples_split=270, random_state=21): #10,270
    """Fit a class-weighted random forest on `df` and return its hold-out ROC AUC.

    Parameters
    ----------
    df : DataFrame containing a 'TARGET' column; every other column is used as
        a feature (non-numeric ones are one-hot encoded, first level dropped).
    max_depth, n_estimators : forwarded to RandomForestClassifier.
    min_samples_leaf, min_samples_split : accepted for backward compatibility
        but currently NOT passed to the classifier.
    random_state : seed for both the train/test split and the forest.

    Returns
    -------
    float : ROC AUC on a stratified 20% hold-out, computed from the predicted
        probability of the positive class.
    """
    X = pd.get_dummies(df.drop('TARGET', axis=1), drop_first=True).to_numpy(dtype=float)
    y = df['TARGET'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=random_state)

    # Fit the scaler on the training part only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rf = RandomForestClassifier(n_estimators=n_estimators,
                                max_depth=max_depth,
                                class_weight='balanced_subsample',
                                n_jobs=-1,
                                random_state=random_state)
    rf.fit(X_train, y_train)

    # AUC must be computed from scores, not from hard 0/1 predictions.
    return roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])

In [None]:
def drop_feature():
    """Find the active feature whose removal gives the highest score.

    Reads the notebook-level `df` and `not_imp`; each candidate is scored with
    `calc_score_2` on `df` without `not_imp` and without that candidate.

    Returns
    -------
    (feature_name, score) for the best removal.

    Raises
    ------
    ValueError : if fewer than two active features remain (removing the last
        one would leave nothing to train on).
    """
    excluded = list(dict.fromkeys(not_imp))
    candidates = list(df.drop(excluded, axis=1).drop('TARGET', axis=1).columns)
    if len(candidates) < 2:
        raise ValueError('drop_feature needs at least two active features')

    max_auc = 0.0
    worst_feature = None
    # Walk from the last column to the first (the order the old pop() loop
    # used), but include the first column too - it used to be skipped.
    for i in reversed(candidates):
        curr_auc = calc_score_2(df.drop(excluded + [i], axis=1))
        if curr_auc > max_auc:
            print('drop', i, round(curr_auc, 4))
            max_auc = curr_auc
            worst_feature = i
    return worst_feature, max_auc

In [None]:
def add_feature():
    """Find the excluded feature whose re-inclusion gives the highest score.

    Reads the notebook-level `df` and `not_imp`; each candidate is scored with
    `calc_score_2` on `df` with every `not_imp` column dropped except that
    candidate.

    Returns
    -------
    (feature_name, score) for the best addition.

    Raises
    ------
    ValueError : if `not_imp` is empty (nothing can be added back).
    """
    excluded = list(dict.fromkeys(not_imp))
    if not excluded:
        raise ValueError('add_feature needs at least one excluded feature')

    max_auc = 0.0
    best_feature = None
    # Same evaluation order as the old pop() loop (last to first), but the
    # first entry is no longer skipped.
    for i in reversed(excluded):
        # Keep every excluded column dropped except the candidate itself.
        # The old `set(not_imp) - set(i)` subtracted the CHARACTERS of the
        # name, so the candidate was never actually restored.
        still_excluded = [name for name in excluded if name != i]
        curr_auc = calc_score_2(df.drop(still_excluded, axis=1))
        if curr_auc > max_auc:
            print('add', i, round(curr_auc, 4))
            max_auc = curr_auc
            best_feature = i
    return best_feature, max_auc

In [None]:
calc_score(df)

In [None]:
calc_score(df.drop(not_imp, axis=1))

In [None]:
df.drop(not_imp, axis=1).corr()

In [None]:
df.drop(not_imp, axis=1)

In [None]:
score = [] #0.7611
for i in range(1, 20):
    score.append([i, calc_score(df.drop(not_imp, axis=1), max_depth=i)])
    print(score[-1])
pd.DataFrame(score).set_index(0).plot(figsize=(20,10), grid=True)

In [None]:
def calc_score_2(df, random_state=21):
    """Grid-search a class-weighted random forest on `df`; return hold-out ROC AUC.

    Parameters
    ----------
    df : DataFrame containing a 'TARGET' column; every other column is used as
        a feature (non-numeric ones are one-hot encoded, first level dropped).
    random_state : seed for the train/test split and for the forest.

    Returns
    -------
    float : ROC AUC of the refitted best estimator on a stratified 20%
        hold-out, computed from the positive-class probability.

    Side effects: prints the best parameter combination found.
    """
    X = pd.get_dummies(df.drop('TARGET', axis=1), drop_first=True).to_numpy(dtype=float)
    y = df['TARGET'].values

    # The hold-out must come from THIS frame: the function used to score
    # against the notebook-level X_test / y_test, which belong to another
    # feature set and were part of the data the grid had been fitted on.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=random_state)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rf = RandomForestClassifier(random_state=random_state, class_weight='balanced_subsample', n_jobs=-1)
    param_grid={'n_estimators': range(50, 101, 10),
                'max_depth': range(4, 21, 2)}
    grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='roc_auc')
    grid.fit(X_train, y_train)
    print(grid.best_params_)

    # Score with probabilities, not hard labels.
    return roc_auc_score(y_test, grid.predict_proba(X_test)[:, 1])

In [None]:
%%time
calc_score_2(df.drop(not_imp, axis=1))

In [None]:
%%time
calc_score_2(df.drop(not_imp, axis=1))

In [None]:
pd.DataFrame(score).sort_values(1)

In [None]:
pd.DataFrame(score).set_index(0).plot(figsize=(20,10), grid=True)

In [None]:
%%time
# Greedy stepwise selection: at every step either drop the active feature
# whose removal scores best, or re-add the excluded feature whose inclusion
# scores best, whichever gives the higher score.
scores = []
features = []

# Loop on the number of columns still ACTIVE (present in df and not excluded).
# `len(df.columns)` never changes because only `not_imp` is modified, so the
# old condition could not terminate the loop.
while len(df.columns.difference(not_imp)) > 3:
    worst, drop_score = drop_feature()
    best, add_score = add_feature()
    # Do not re-add/re-touch the feature handled in the previous step.
    repeats_last_step = len(features) > 0 and features[-1] == best
    if drop_score > add_score or repeats_last_step:
        scores.append(drop_score)
        not_imp.append(worst)
        features.append(worst)
        print('------------------------')
        print('delete', worst, drop_score)
        print('------------------------')
    else:
        scores.append(add_score)
        # Remove EVERY occurrence: list.remove() would leave a duplicate entry
        # behind and the feature would stay excluded.
        not_imp[:] = [name for name in not_imp if name != best]
        features.append(best)
        print('------------------------')
        print('append', best, add_score)
        print('------------------------')

In [None]:
features

In [None]:
pd.DataFrame(scores, features).plot(figsize=(15,5))

In [None]:
pd.DataFrame(scores, features)

In [None]:
# Score reached after each selection step, indexed by the feature touched in
# that step. `scores` and `features` are appended to in lockstep by the
# selection loop; the previously used name `important` is not defined in the
# visible notebook, and a set is not a valid `columns` argument.
df_scores = pd.DataFrame({"scores": scores}, index=features)
df_scores.plot(figsize=(15,5), rot=90);

In [None]:
df

df.drop(not_imp,  axis=1).drop('TARGET', axis=1)

In [None]:
curr_auc

In [None]:
df_scores.sort_values("scores")

In [None]:
not_important

In [None]:
scores

In [None]:
set(['LDEAL_GRACE_DAYS_PCT_MED',
 'REST_DYNAMIC_FDEP_3M',
 'AMOUNT_RUB_NAS_PRC',
 'AMOUNT_RUB_NAS_PRC'])

In [None]:
%%time
# Recursive feature elimination down to 10 features with a random forest.
# A fixed seed makes the selected subset reproducible between runs.
estimator = RandomForestClassifier(random_state=42, verbose=True)
selector = RFE(estimator, n_features_to_select=10)
selector.fit(X_train, y_train)

# Hold-out ROC AUC from positive-class probabilities (not hard labels).
roc_auc_score(y_test, selector.predict_proba(X_test)[:, 1])

In [None]:
corr[(corr['value'] > 0.5)]

In [None]:
# Feature importances of `rf`, labelled with the columns it was trained on.
# The names must come from the same encoded frame that produced X (selected
# columns, one-hot encoded, first level dropped); the raw df columns have a
# different length and cannot be paired with the importances.
rf_feature_names = pd.get_dummies(df.drop(not_imp, axis=1).drop('TARGET', axis=1), drop_first=True).columns
if len(rf_feature_names) != len(rf.feature_importances_):
    # Happens when `not_imp` was modified after `rf` was fitted.
    raise ValueError('feature names do not match the fitted model: got %d names for %d importances'
                     % (len(rf_feature_names), len(rf.feature_importances_)))

coef = pd.DataFrame({"feature": rf_feature_names, "rf_coef": rf.feature_importances_})
coef = coef.sort_values("rf_coef", ascending=False)
coef

In [None]:
# Importance table built the same way as the `rf` one, but reading from `grid`.
# NOTE(review): `grid` is only assigned as a local variable inside
# calc_score_2 in the visible notebook, so this cell raises NameError on a
# fresh kernel. A grid-search object presumably also needs
# `grid.best_estimator_.feature_importances_` - confirm before reviving.
# NOTE(review): the feature names are taken from the raw df columns, not from
# the one-hot encoded matrix a model is fitted on - lengths likely differ.
coef = pd.DataFrame()
coef["feature"] = df.drop("TARGET", axis=1).columns
coef["rf_coef"] = np.abs(grid.feature_importances_)
#coef["lasso_coef"] = np.abs(lasso.coef_)

#coef['ridge'] = np.abs(ridge.coef_)
#coef['xgb_coef'] = np.abs(xgb_clf.feature_importances_)
#coef['ranking'] = selector.ranking_
#coef['support'] = selector.support_
# Sort by importance, largest first, then display.
coef.sort_values("rf_coef", ascending=False, inplace=True)
#coef[(coef["rf_coef"] > 0.01) | (coef["xgb_coef"] > 0)]
coef

In [None]:
# Audible "run finished" signal. `winsound` exists on Windows only, so the
# import is guarded and the beep is skipped on other platforms instead of
# failing the whole run.
frequency = 2500  # Set Frequency To 2500 Hertz
duration = 1000  # Set Duration To 1000 ms == 1 second
try:
    import winsound
except ImportError:
    winsound = None

if winsound is not None:
    winsound.Beep(frequency, duration)

pip install seglearn