In [1]:
%load_ext autoreload
%autoreload 2

import os
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append(os.path.abspath('..'))
# ---------------------------------
from time import sleep
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
# ---------------------------------
from tools import CV, Tuning, CVGetScore
from hyperopt import hp

In [2]:
# Training labels: read the training CSV indexed by `id` and keep only the
# `target` column as a plain NumPy array.
y_train = (
    pd.read_csv('../data/train.csv', index_col='id')
    .loc[:, 'target']
    .to_numpy()
)
print(y_train.shape)

(600000,)


In [3]:
def mk_mdf(mdict, rule_dict=None):
    """Build averaged meta-features from saved prediction arrays.

    Every array referenced by ``mdict`` is loaded with ``np.load`` and its
    columns are named ``f'{name}{i}'`` (``i`` being the column position).
    Each entry of ``rule_dict`` then yields one output column holding the
    row-wise mean of the listed source columns.

    Parameters
    ----------
    mdict : dict
        Maps a name prefix to the path of a ``.npy`` file. A 2-D array
        contributes one column per array column; a 1-D array is treated as a
        single column named ``f'{name}0'``. The arrays are joined column-wise,
        so they are expected to share the same number of rows.
    rule_dict : dict or None, optional
        Maps an output column name to the list of source column names to
        average. ``None`` (the default) or an empty dict produces an empty
        frame.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``rule_dict``, in insertion order.

    Raises
    ------
    KeyError
        If a rule names a column that was not produced from ``mdict``.
    """
    # Avoid a shared mutable default argument; None stands for "no rules".
    rules_by_column = {} if rule_dict is None else rule_dict

    frames = []
    for name, path in mdict.items():
        marr = np.load(path)
        if marr.ndim == 1:
            # A bare prediction vector is handled as a one-column matrix so
            # the column-naming logic below applies unchanged.
            marr = marr.reshape(-1, 1)
        column_names = [f'{name}{i}' for i in range(marr.shape[1])]
        frames.append(pd.DataFrame(data=marr, columns=column_names))
    features = pd.concat(frames, axis=1)

    averaged = {
        colname: np.mean(features[rules].values, axis=1)
        for colname, rules in rules_by_column.items()
    }
    return pd.DataFrame(averaged)

In [4]:
! ls ../tmp/xdeepfm

3973270008.csv	4293006264.csv	4293006264predict.npy  4293006264stacking1.npy


In [5]:
# Saved prediction arrays used as model inputs, keyed by base-model name.
# NOTE(review): the "stacking1" files are presumably out-of-fold predictions on
# the training set and the "predict" files test-set predictions -- confirm
# against the scripts that wrote them.
train_dict = {
    'cin': '../tmp/cin/1960993645stacking1.npy',
    'xdeepfm': '../tmp/xdeepfm/4293006264stacking1.npy',
}
test_dict = {
    'cin': '../tmp/cin/1960993645predict.npy',
    'xdeepfm': '../tmp/xdeepfm/4293006264predict.npy',
}

# Each base model contributes columns 0..4, which mk_mdf averages into a
# single feature named after the model.
rule_dict = {
    'cin': [f'cin{i}' for i in range(5)],
    'xdeepfm': [f'xdeepfm{i}' for i in range(5)],
}

# Build the train frame first, then the test frame, with the same rules.
x_train_df, x_test_df = (
    mk_mdf(paths, rule_dict) for paths in (train_dict, test_dict)
)
x_train, x_test = x_train_df.values, x_test_df.values

In [9]:
# Preview the first rows of the averaged features built from train_dict.
x_train_df.head()

Unnamed: 0,cin,xdeepfm
0,0.177138,0.048773
1,0.218902,0.085085
2,0.320728,0.269822
3,0.135259,0.028764
4,0.242454,0.122328


In [8]:
# Preview the first rows of the averaged features built from test_dict.
x_test_df.head()

Unnamed: 0,cin,xdeepfm
0,0.238673,0.119105
1,0.303313,0.242635
2,0.259058,0.148813
3,0.239284,0.113369
4,0.253513,0.13489


In [11]:
from sklearn.linear_model import LogisticRegression

In [None]:
nflod = 5

# Options handed to the CV helper: no extra fit keyword arguments.
# NOTE(review): use_proba presumably makes the helper score predicted
# probabilities rather than class labels -- confirm in tools.
cv_fit_param = {
    'fit_params': {},
    'use_proba': True,
}

# Estimator arguments held constant for every trial.
model_fix_param = {
    'penalty': 'l2',
    'max_iter': 1000,
    'n_jobs': -1,
}

# Arguments to tune, each written as (hyperopt sampler, sampler arguments).
# hp.loguniform takes its bounds in log space, so (-10, 5) spans
# exp(-10)..exp(5) for C.
ss = {
    'solver': (hp.choice, ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga')),
    'C': (hp.loguniform, (-10, 5)),
}

# cv get score
def neg_auc(y_true, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y_true`` with its sign flipped.

    The negation turns "higher AUC is better" into a quantity that a
    minimiser can drive down.
    """
    auc = roc_auc_score(y_true, y_pred)
    return -auc

# Objective wrapper: cross-validates LogisticRegression on the stacked
# features and reports neg_auc.
# NOTE(review): how CVGetScore uses random_state and the parameter dicts is
# defined in tools -- confirm there.
gs = CVGetScore(
    x=x_train,
    y=y_train,
    model=LogisticRegression,
    model_fix_params=model_fix_param,
    model_search_space=ss,
    cv_fit_params=cv_fit_param,
    metrics_func=neg_auc,
    split_method=StratifiedKFold,
    nfolds=nflod,
    random_state=2333,
)

# Run 50 evaluations over the search space exposed by the objective wrapper.
tuning = Tuning(gs, verbose=1)
search_space = gs.GET_SEARCH_SPACE()
tuning.fmin(search_space, max_evals=50)

In [None]:
# Plot the tuning log. Scores are negated AUC (see neg_auc), so the interval
# [-0.7900, -0.7895] corresponds to AUC between 0.7895 and 0.7900.
# NOTE(review): score_interval presumably restricts which scores are shown --
# confirm in tools.
tuning.log.plot(score_interval=[-0.7900, -0.7895])

In [12]:
%%time

nflod = 40

# Final stacking model settings.
# NOTE(review): C and solver look like values picked from the tuning run
# above -- confirm against the tuning log.
# random_state is fixed because the 'saga' solver shuffles the data, so an
# unseeded fit is not reproducible on Restart & Run All. The seed matches the
# one used in the tuning cell.
model_param = {
    'max_iter': 1000,
    'n_jobs': -1,
    'C': 0.027669607403232415,
    'solver': 'saga',
    'random_state': 2333,
}

model = LogisticRegression(**model_param)
cv = CV(model, nflod)

# Cross-validate on the averaged features, scoring with ROC AUC.
# NOTE(review): use_proba presumably scores predicted probabilities -- confirm
# in tools.CV.
score = cv.fit(
    x=x_train,
    y=y_train,
    metrics_func=roc_auc_score,
    split_method=StratifiedKFold,
    use_proba=True,
    verbose=True,
)

folds 0 is done, score is 0.7819049703789698
folds 1 is done, score is 0.7916823815158789
folds 2 is done, score is 0.7916385668805289
folds 3 is done, score is 0.7878708879672627
folds 4 is done, score is 0.7926082139705828
folds 5 is done, score is 0.791173898067763
folds 6 is done, score is 0.7908437109757648
folds 7 is done, score is 0.7903241862395407
folds 8 is done, score is 0.7889320784354413
folds 9 is done, score is 0.7845134892994144
folds 10 is done, score is 0.7892035539160702
folds 11 is done, score is 0.800677818250817
folds 12 is done, score is 0.7874174649109031
folds 13 is done, score is 0.7912118123322192
folds 14 is done, score is 0.7864969778417121
folds 15 is done, score is 0.7900172793237918
folds 16 is done, score is 0.7914613512854162
folds 17 is done, score is 0.792580932057638
folds 18 is done, score is 0.7958508475035706
folds 19 is done, score is 0.7951185588980865
folds 20 is done, score is 0.7890375256578505
folds 21 is done, score is 0.7872753594435845
f

In [13]:
# Value returned by cv.fit, computed with roc_auc_score.
# NOTE(review): presumably aggregated over the folds -- confirm in tools.CV.
score

0.7896885676276588

In [14]:
# Coefficients of the first fitted model held by cv, one weight per input
# column (cin, xdeepfm).
# NOTE(review): cv.model[0] is presumably the first fold's model -- confirm in
# tools.CV.
cv.model[0].coef_

array([[8.49387973, 1.87813581]])

In [15]:
# Fill the sample submission (indexed by `id`) with predictions for x_test.
submission = pd.read_csv('../data/sample_submission.csv', index_col='id')
submission['target'] = cv.predict(x=x_test, use_proba=True)

# to_csv does not create missing parent directories, so create the output
# folder first rather than fail after the long CV fit.
submission_path = '../tmp/submission/cinx2.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path)