In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe, fmin
import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split
import seaborn as sns
import warnings

In [4]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

In [5]:

data_dir = Path('/content/drive/MyDrive/dacon/input')
feature_dir = Path('../build/feature')
val_dir = Path('/content/drive/MyDrive/dacon/build/val')
tst_dir = Path('/content/drive/MyDrive/dacon/build/tst')
sub_dir = Path('/content/drive/MyDrive/dacon/build/sub')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [6]:
algo_name = 'lgbcv'
feature_name = 'stacking1-hyperopt'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

In [7]:
model_names = ['bla_emb','gru_emb','xgboost_feature','cnn_emb','rf_feature', 'lgbcv_feature', 'mta_emb', 'lr_tfidf', 'lstm_emb','svm_feature', 'gNB_feature','knn_feature']
trn = []
tst = []
feature_names = []
for model in model_names:
    trn.append(np.loadtxt(val_dir / f'{model}.val.csv', delimiter=','))
    tst.append(np.loadtxt(tst_dir / f'{model}.tst.csv', delimiter=','))
    feature_names += [f'{model}_class0', f'{model}_class1', f'{model}_class2', f'{model}_class3', f'{model}_class4']
    
trn = np.hstack(trn)
tst = np.hstack(tst)
feature_names

['bla_emb_class0',
 'bla_emb_class1',
 'bla_emb_class2',
 'bla_emb_class3',
 'bla_emb_class4',
 'gru_emb_class0',
 'gru_emb_class1',
 'gru_emb_class2',
 'gru_emb_class3',
 'gru_emb_class4',
 'xgboost_feature_class0',
 'xgboost_feature_class1',
 'xgboost_feature_class2',
 'xgboost_feature_class3',
 'xgboost_feature_class4',
 'cnn_emb_class0',
 'cnn_emb_class1',
 'cnn_emb_class2',
 'cnn_emb_class3',
 'cnn_emb_class4',
 'rf_feature_class0',
 'rf_feature_class1',
 'rf_feature_class2',
 'rf_feature_class3',
 'rf_feature_class4',
 'lgbcv_feature_class0',
 'lgbcv_feature_class1',
 'lgbcv_feature_class2',
 'lgbcv_feature_class3',
 'lgbcv_feature_class4',
 'mta_emb_class0',
 'mta_emb_class1',
 'mta_emb_class2',
 'mta_emb_class3',
 'mta_emb_class4',
 'lr_tfidf_class0',
 'lr_tfidf_class1',
 'lr_tfidf_class2',
 'lr_tfidf_class3',
 'lr_tfidf_class4',
 'lstm_emb_class0',
 'lstm_emb_class1',
 'lstm_emb_class2',
 'lstm_emb_class3',
 'lstm_emb_class4',
 'svm_feature_class0',
 'svm_feature_class1',
 'sv

In [8]:
y = pd.read_csv(trn_file, index_col=0, usecols=['index', target_col]).values.flatten()
print(y.shape, trn.shape, tst.shape)

(54879,) (54879, 60) (19617, 60)


In [9]:
X_trn, X_val, y_trn, y_val = train_test_split(trn, y, test_size=.2, random_state=seed)

In [10]:
params = {
    "objective": "multiclass",
    "n_estimators": 1000,
    "subsample_freq": 1,
    "random_state": seed,
    "n_jobs": -1,
}

space = {
    "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
    "num_leaves": hp.choice("num_leaves", [15, 31, 63, 127]),
    "colsample_bytree": hp.quniform("colsample_bytree", .5, .9, 0.1),
    "subsample": hp.quniform("subsample", .5, .9, 0.1),
    "min_child_samples": hp.choice('min_child_samples', [10, 25, 100])
}

In [11]:
def objective(hyperparams):
    model = lgb.LGBMClassifier(**params, **hyperparams)
    model.fit(X=X_trn, y=y_trn,
              eval_set=[(X_val, y_val)],
              eval_metric="multi_logloss",
              early_stopping_rounds=10,
              verbose=False)
    score = model.best_score_["valid_0"]["multi_logloss"]

    return {'loss': score, 'status': STATUS_OK, 'model': model}

trials = Trials()
best = fmin(fn=objective, space=space, trials=trials,
            algo=tpe.suggest, max_evals=10, verbose=1)

hyperparams = space_eval(space, best)
n_best = trials.best_trial['result']['model'].best_iteration_
params.update(hyperparams)
print(params)

100%|██████████| 10/10 [03:35<00:00, 21.51s/it, best loss: 0.42638896910208113]
{'objective': 'multiclass', 'n_estimators': 1000, 'subsample_freq': 1, 'random_state': 42, 'n_jobs': -1, 'colsample_bytree': 0.8, 'learning_rate': 0.019823206580601896, 'min_child_samples': 100, 'num_leaves': 31, 'subsample': 0.8}


In [12]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [13]:
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(**params)
    clf.fit(trn[i_trn], y[i_trn],
            eval_set=[(trn[i_val], y[i_val])],
            eval_metric='multiclass',
            early_stopping_rounds=10)
    
    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 1.53673
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's multi_logloss: 1.50503
[3]	valid_0's multi_logloss: 1.47493
[4]	valid_0's multi_logloss: 1.44612
[5]	valid_0's multi_logloss: 1.4185
[6]	valid_0's multi_logloss: 1.39222
[7]	valid_0's multi_logloss: 1.36687
[8]	valid_0's multi_logloss: 1.34266
[9]	valid_0's multi_logloss: 1.31924
[10]	valid_0's multi_logloss: 1.29664
[11]	valid_0's multi_logloss: 1.27492
[12]	valid_0's multi_logloss: 1.25386
[13]	valid_0's multi_logloss: 1.23364
[14]	valid_0's multi_logloss: 1.21405
[15]	valid_0's multi_logloss: 1.19509
[16]	valid_0's multi_logloss: 1.17675
[17]	valid_0's multi_logloss: 1.15926
[18]	valid_0's multi_logloss: 1.14202
[19]	valid_0's multi_logloss: 1.12556
[20]	valid_0's multi_logloss: 1.10948
[21]	valid_0's multi_logloss: 1.09384
[22]	valid_0's multi_logloss: 1.07866
[23]	valid_0's multi_logloss: 1.06393
[24]	valid_0's multi_logloss: 1.04957
[25]	valid_

In [14]:
print(f'{accuracy_score(y, np.argmax(p_val, axis=1)) * 100:.4f}%')

84.3674%


In [15]:
print(p_val.shape, p_tst.shape)

(54879, 5) (19617, 5)


In [16]:
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

In [17]:
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [18]:
sub[sub.columns] = p_tst
sub.head()


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0068,0.5702,0.407,0.0142,0.0018
1,0.0058,0.9731,0.0015,0.0028,0.0168
2,0.9947,0.0015,0.0016,0.0008,0.0014
3,0.0023,0.0156,0.9768,0.0036,0.0018
4,0.8411,0.0212,0.0406,0.0532,0.0438


In [19]:
sub.to_csv(sub_file)