In [1]:
# (Re)load the autoreload extension and enable mode 2 so that edited local
# modules are re-imported automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import warnings

In [3]:
# Notebook-wide plotting and display configuration.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully-qualified option name. The bare 'max_columns' pattern is
# ambiguous on newer pandas (it also matches 'styler.render.max_columns')
# and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# NOTE: this silences *every* warning for the rest of the session, including
# deprecation warnings from the libraries used below.
warnings.simplefilter('ignore')

In [4]:

# Input data and build-artifact directories (absolute Google Drive paths).
data_dir = Path('/content/drive/MyDrive/dacon/input')
# NOTE(review): feature_dir is a relative path while every other directory
# here is an absolute Drive path -- confirm this is intended.
feature_dir = Path('../build/feature')
val_dir = Path('/content/drive/MyDrive/dacon/build/val')
tst_dir = Path('/content/drive/MyDrive/dacon/build/tst')
sub_dir = Path('/content/drive/MyDrive/dacon/build/sub')

# Competition files read from data_dir.
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

# Name of the label column, number of CV folds, number of target classes,
# and the random seed shared by the CV splitter and the model.
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [5]:
# Identifier of this run: '<algorithm>_<feature set>'; used to name outputs.
algo_name = 'lgbcv'
feature_name = 'stacking1'
model_name = f'{algo_name}_{feature_name}'

# Output locations derived from the run identifier.
# NOTE(review): feature_file is not referenced by any later cell visible in
# this notebook.
feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

In [6]:
# Build the stacking feature matrices by horizontally concatenating the saved
# validation (`*.val.csv`) and test (`*.tst.csv`) prediction files of each
# base model listed below.
model_names = ['cnn_emb', 'lgbcv_feature', 'mta_emb', 'lr_tfidf', 'lstm_emb', 'rf_feature', 'svm_feature']
trn_parts = []
tst_parts = []
feature_names = []
for model in model_names:
    model_val = np.loadtxt(val_dir / f'{model}.val.csv', delimiter=',')
    model_tst = np.loadtxt(tst_dir / f'{model}.tst.csv', delimiter=',')

    # Fail fast on a malformed prediction file: each one must contribute
    # exactly n_class columns, otherwise feature_names would be misaligned
    # with the stacked matrix.
    for split, preds in (('val', model_val), ('tst', model_tst)):
        if preds.ndim != 2 or preds.shape[1] != n_class:
            raise ValueError(
                f'{model}.{split}.csv has shape {preds.shape}; '
                f'expected (n_rows, {n_class})'
            )
    # All base models must cover the same rows so hstack lines them up.
    if trn_parts and model_val.shape[0] != trn_parts[0].shape[0]:
        raise ValueError(
            f'{model}.val.csv has {model_val.shape[0]} rows; '
            f'expected {trn_parts[0].shape[0]}'
        )
    if tst_parts and model_tst.shape[0] != tst_parts[0].shape[0]:
        raise ValueError(
            f'{model}.tst.csv has {model_tst.shape[0]} rows; '
            f'expected {tst_parts[0].shape[0]}'
        )

    trn_parts.append(model_val)
    tst_parts.append(model_tst)
    # One feature name per class column, derived from n_class instead of a
    # hard-coded list of five suffixes.
    feature_names += [f'{model}_class{k}' for k in range(n_class)]

trn = np.hstack(trn_parts)
tst = np.hstack(tst_parts)
feature_names

['cnn_emb_class0',
 'cnn_emb_class1',
 'cnn_emb_class2',
 'cnn_emb_class3',
 'cnn_emb_class4',
 'lgbcv_feature_class0',
 'lgbcv_feature_class1',
 'lgbcv_feature_class2',
 'lgbcv_feature_class3',
 'lgbcv_feature_class4',
 'mta_emb_class0',
 'mta_emb_class1',
 'mta_emb_class2',
 'mta_emb_class3',
 'mta_emb_class4',
 'lr_tfidf_class0',
 'lr_tfidf_class1',
 'lr_tfidf_class2',
 'lr_tfidf_class3',
 'lr_tfidf_class4',
 'lstm_emb_class0',
 'lstm_emb_class1',
 'lstm_emb_class2',
 'lstm_emb_class3',
 'lstm_emb_class4',
 'rf_feature_class0',
 'rf_feature_class1',
 'rf_feature_class2',
 'rf_feature_class3',
 'rf_feature_class4',
 'svm_feature_class0',
 'svm_feature_class1',
 'svm_feature_class2',
 'svm_feature_class3',
 'svm_feature_class4']

In [7]:
# Read only the index and label columns of the training file and keep the
# labels as a flat 1-D array.
target_frame = pd.read_csv(
    trn_file,
    index_col=0,
    usecols=['index', target_col],
)
y = target_frame.to_numpy().ravel()
y.shape

(54879,)

In [8]:

# Stratified, shuffled K-fold splitter; the fixed seed keeps the fold
# assignment reproducible across runs.
cv = StratifiedKFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=seed,
)

In [9]:

# Train one LightGBM classifier per CV fold on the stacked features.
#   p_val: out-of-fold predictions; each row is filled by the fold in which
#          that row was held out.
#   p_tst: test predictions averaged over the n_fold models.
# NOTE(review): the held-out fold is also the early-stopping eval set, so the
# OOF score computed from p_val may be slightly optimistic.
# NOTE(review): predict_proba columns follow clf.classes_; downstream argmax
# assumes labels are 0..n_class-1 -- confirm.
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=1000,
                             num_leaves=64,
                             learning_rate=0.1,
                             min_child_samples=10,
                             subsample=.5,
                             subsample_freq=1,
                             colsample_bytree=.8,
                             random_state=seed,
                             n_jobs=-1)
    # Early stopping is requested through the callback API: the
    # `early_stopping_rounds=` keyword of fit() was deprecated in
    # LightGBM 3.3 and removed in 4.0 (TypeError), while the callback
    # works on both old and new releases.
    clf.fit(trn[i_trn], y[i_trn],
            eval_set=[(trn[i_val], y[i_val])],
            eval_metric='multiclass',
            callbacks=[lgb.early_stopping(stopping_rounds=10)])

    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    # Each fold contributes 1/n_fold of the final test prediction.
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 1.41848
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's multi_logloss: 1.30062
[3]	valid_0's multi_logloss: 1.20481
[4]	valid_0's multi_logloss: 1.12439
[5]	valid_0's multi_logloss: 1.0555
[6]	valid_0's multi_logloss: 0.996733
[7]	valid_0's multi_logloss: 0.946724
[8]	valid_0's multi_logloss: 0.902497
[9]	valid_0's multi_logloss: 0.863341
[10]	valid_0's multi_logloss: 0.82941
[11]	valid_0's multi_logloss: 0.798796
[12]	valid_0's multi_logloss: 0.771709
[13]	valid_0's multi_logloss: 0.748102
[14]	valid_0's multi_logloss: 0.726752
[15]	valid_0's multi_logloss: 0.707642
[16]	valid_0's multi_logloss: 0.690635
[17]	valid_0's multi_logloss: 0.675235
[18]	valid_0's multi_logloss: 0.661439
[19]	valid_0's multi_logloss: 0.649119
[20]	valid_0's multi_logloss: 0.637964
[21]	valid_0's multi_logloss: 0.62835
[22]	valid_0's multi_logloss: 0.61951
[23]	valid_0's multi_logloss: 0.61129
[24]	valid_0's multi_logloss: 0.604

In [10]:
# Out-of-fold accuracy: the predicted class is the column with the highest
# probability in each row of p_val.
oof_pred = p_val.argmax(axis=1)
print(f'{accuracy_score(y, oof_pred) * 100:.4f}%')

80.6392%


In [11]:
# Sanity check: show the shapes of the out-of-fold and test prediction arrays.
print(p_val.shape, p_tst.shape)

(54879, 5) (19617, 5)


In [12]:
# Persist the out-of-fold and test predictions as comma-separated text with
# six decimal places.
for out_path, preds in ((p_val_file, p_val), (p_tst_file, p_tst)):
    np.savetxt(out_path, preds, fmt='%.6f', delimiter=',')

In [13]:
# Load the sample submission (first CSV column becomes the index) to use as
# a template, then show its shape and first rows.
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [14]:
# Overwrite every column of the submission template with the averaged test
# probabilities.
# NOTE(review): assumes the template's column order matches the class order
# of p_tst's columns -- confirm.
sub[sub.columns] = p_tst
sub.head()


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0072,0.7541,0.2171,0.0184,0.0032
1,0.1744,0.56,0.0153,0.0278,0.2225
2,0.9961,0.0009,0.001,0.001,0.0011
3,0.0127,0.0117,0.9581,0.0045,0.013
4,0.7508,0.0483,0.0322,0.0925,0.0763


In [15]:
# The submission directory is only ever written to by this notebook, so
# create it if needed; otherwise to_csv fails on a missing directory after
# all the training work is done.
sub_file.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_file)