# target encoder & ohe & LogisticRegression

In [2]:
%load_ext autoreload
%autoreload 2

import os
import warnings
warnings.filterwarnings('ignore')

import sys
sys.path.append(os.path.abspath('..'))
# ---------------------------------
import numpy as np
import pandas as pd
import scipy

from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, SGDRegressor
from sklearn.metrics import roc_auc_score
# ---------------------------------
from tools import Tuning

In [None]:
# Load the raw train/test tables, using the `id` column as the index.
train_df = pd.read_csv('../data/train.csv', index_col='id')
test_df = pd.read_csv('../data/test.csv', index_col='id')

# ord_5: expose the first two characters of the string as separate
# categorical features (ord_5_0, ord_5_1).
for i in range(2):
    train_df[f'ord_5_{i}'] = train_df['ord_5'].str[i]
    test_df[f'ord_5_{i}'] = test_df['ord_5'].str[i]

# fillna: impute every feature column with the most frequent value of its own
# table. The result is assigned back rather than using
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, is deprecated in pandas 2.x and does not update the frame under
# copy-on-write (and warnings are silenced in this notebook).
for col in test_df.columns:
    train_df[col] = train_df[col].fillna(train_df[col].mode()[0])
    test_df[col] = test_df[col].fillna(test_df[col].mode()[0])

# target: keep both the Series (index-aligned with train_df) and the raw array.
target = train_df['target']
y_train = target.values

# drop: remove the label and the original ord_5 column, which has been
# replaced by its two per-character columns above.
train_df.drop(['target', 'ord_5'], axis=1, inplace=True)
test_df.drop(['ord_5'], axis=1, inplace=True)

## One-Hot Encoding (OHE)

In [None]:
# Columns that get one-hot encoded: bin_0..4, nom_0..4, ord_0..4, the two
# characters split out of ord_5, and the day/month columns.
ohe_features = (
    [f'bin_{i}' for i in range(5)]
    + [f'nom_{i}' for i in range(5)]
    + [f'ord_{i}' for i in range(5)]
    + ['ord_5_0', 'ord_5_1', 'day', 'month']
)


# Fit the encoder on the training rows only; categories that appear only in
# the test rows are ignored (handle_unknown='ignore') instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', dtype='float32')
ohe_x_train = ohe.fit_transform(train_df[ohe_features])
ohe_x_test = ohe.transform(test_df[ohe_features])

In [None]:
# Notebook display: show the shape of the one-hot encoded training matrix.
ohe_x_train.shape

## Target Encoder

In [None]:
# Columns that are target-encoded instead of one-hot encoded: nom_5 .. nom_9.
target_features = [f'nom_{i}' for i in range(5, 10)]


def transform(transformer, x_train, y_train, cv):
    """Compute out-of-fold encodings of ``x_train``.

    For every split produced by ``cv`` the transformer is fitted on the
    training part only and then applied to the held-out part, so a row is
    never encoded by a transformer that was fitted on that row's own target.

    Args:
        transformer: Object exposing ``fit(X, y)`` and ``transform(X)``. It is
            refitted on every fold and is left fitted on the last fold.
        x_train: ``pandas.DataFrame`` holding the columns to encode.
        y_train: Targets aligned row-by-row with ``x_train``; a
            ``pandas.Series`` or any sequence of the same length.
        cv: Splitter whose ``split(X, y)`` yields positional
            ``(train_idx, valid_idx)`` pairs.

    Returns:
        ``pandas.DataFrame`` with the index and columns of ``x_train``
        containing the out-of-fold encodings. Rows that never appear in a
        validation fold, and columns the transformer does not return, stay NaN.
    """
    if not isinstance(y_train, pd.Series):
        # Give plain arrays/lists the same index as x_train so they can be
        # sliced consistently with the features.
        y_train = pd.Series(y_train, index=x_train.index)

    oof = pd.DataFrame(index=x_train.index, columns=x_train.columns)
    for train_idx, valid_idx in cv.split(x_train, y_train):
        # The splitter yields row positions, not index labels, so select with
        # .iloc; .loc only worked when the index happened to be 0..n-1.
        fold_x = x_train.iloc[train_idx]
        fold_y = y_train.iloc[train_idx]
        transformer.fit(fold_x, fold_y)

        encoded = transformer.transform(x_train.iloc[valid_idx])
        if isinstance(encoded, pd.DataFrame):
            # Align on column names so a dropped/reordered column cannot shift
            # values into the wrong output column.
            encoded = encoded.reindex(columns=oof.columns).to_numpy()
        oof.iloc[valid_idx] = encoded
    return oof


# One encoder instance is reused: first refitted per fold inside transform(),
# then refitted on the full training data for the test set.
te = TargetEncoder(smoothing=0.2, drop_invariant=True)
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Training rows receive out-of-fold encodings.
target_x_train = transform(
    te, train_df[target_features], target, cv
).astype('float')

# Test rows are encoded by an encoder fitted on all training rows.
target_x_test = (
    te.fit(train_df[target_features], target)
    .transform(test_df[target_features])
    .astype('float')
)

## Merge

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.sparse` is loaded on every SciPy version, so the attribute
# access below should not depend on another package having imported it first.
import scipy.sparse

# Concatenate the one-hot block and the target-encoded block column-wise and
# convert the result to CSR.
x_train = scipy.sparse.hstack([ohe_x_train, target_x_train]).tocsr()
x_test = scipy.sparse.hstack([ohe_x_test, target_x_test]).tocsr()

# Drop the intermediate objects now that the merged matrices exist.
del train_df, test_df, ohe_x_train, target_x_train, ohe_x_test, target_x_test

## Tuning

In [None]:
def neg_auc(y_true, y_pred):
    """Return the negated ROC AUC of ``y_pred`` against ``y_true``.

    NOTE(review): presumably negated so that a minimizing tuner maximizes
    AUC -- confirm against the ``Tuning`` helper.
    """
    auc = roc_auc_score(y_true, y_pred)
    return -auc

# Estimator class (not an instance) handed to the tuner.
model = LogisticRegression

# Keyword arguments kept fixed for every model built during tuning.
model_fix_params = dict(
    penalty='l2',
    random_state=2333,
    max_iter=10000,
    n_jobs=-1,
)

# Cross-validation settings handed to the tuner.
# NOTE(review): 'nflod' looks like a typo for 'nfold'; it is kept as-is because
# the key is read by the project-local Tuning helper -- confirm there.
cv_params = dict(
    split_method=StratifiedKFold,
    nflod=10,
    shuffle=True,
    random_state=23333,
    fit_params=None,
)

In [None]:
# Build the tuner from the merged training matrix, labels, estimator class,
# fixed estimator params, CV settings and the metric function.
tu = Tuning(x_train, y_train, model, model_fix_params, cv_params, neg_auc)

# Search space: regularisation strength C and the solver.
# NOTE(review): `tu.hp` is presumably hyperopt's `hp` module, in which case
# loguniform('C', -10, 0) samples C between exp(-10) and exp(0) -- confirm.
sd = {
    'C': tu.hp.loguniform('C', -10, 0),
    'solver': tu.hp.choice('solver', ['liblinear', 'sag', 'saga'])
}

# Run the search over the space defined above.
final = tu.fmin(sd)

In [None]:
# Notebook display: show the tuner's `best_param` attribute after the search.
tu.best_param

# Submission

In [None]:
# `y_pred` was never assigned in any earlier cell, so the submission could not
# be written. Fit the final model on the full training matrix and score the
# test matrix before filling the submission file.
# NOTE(review): assumes `tu.best_param` is a dict of estimator keyword
# arguments (with 'solver' resolved to its name) -- TODO confirm against
# tools.Tuning.
final_params = {**model_fix_params, **tu.best_param}
final_model = model(**final_params)
final_model.fit(x_train, y_train)

# Probability of the positive class; the tuning metric above is ROC AUC, which
# ranks by score rather than by hard label.
y_pred = final_model.predict_proba(x_test)[:, 1]

submission = pd.read_csv('../data/sample_submission.csv', index_col='id')
submission['target'] = y_pred
submission.to_csv('logit.csv')