In [25]:
# (Re)load the autoreload extension and enable mode 2 so that imported
# modules are re-imported automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Draw matplotlib figures directly in the notebook output.
%matplotlib inline

In [26]:

import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
from nltk.tokenize import word_tokenize
import warnings
import tensorflow as tf

In [27]:
# Notebook-wide display defaults: large figures with the fivethirtyeight
# theme, wide DataFrame previews and 4-digit float precision.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
# Use the fully-qualified option name: the bare 'max_columns' pattern is
# ambiguous in recent pandas (it also matches 'styler.render.max_columns')
# and makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')

In [28]:
# Pin TensorFlow to the first physical GPU when the runtime exposes one.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('No GPU detected')
else:
    try:
        # Only the first device is made visible to TensorFlow.
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as err:
        # Raised when visibility is changed after the GPUs were initialized;
        # report it and carry on with the current device configuration.
        print(err)

1 Physical GPUs, 1 Logical GPU


In [29]:
# Input location and output folders for this experiment.
data_dir = Path('/content/drive/MyDrive/dacon/input')
# Note: unlike the other folders this one is relative to the working directory.
feature_dir = Path('../build/feature')
val_dir, tst_dir, sub_dir = (
    Path('/content/drive/MyDrive/dacon/build/val'),
    Path('/content/drive/MyDrive/dacon/build/tst'),
    Path('/content/drive/MyDrive/dacon/build/sub'),
)

# Competition files inside data_dir.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Label column, CV fold count, number of target classes and the random seed.
target_col = 'author'
n_fold, n_class = 5, 5
seed = 42

In [30]:
# Artefacts of this run are named "<algorithm>_<feature set>".
algo_name, feature_name = 'lgbcv', 'feature'
model_name = '_'.join((algo_name, feature_name))

# Feature file, out-of-fold predictions, test predictions and submission.
feature_file = feature_dir.joinpath(feature_name + '.csv')
p_val_file = val_dir.joinpath(model_name + '.val.csv')
p_tst_file = tst_dir.joinpath(model_name + '.tst.csv')
sub_file = sub_dir.joinpath(model_name + '.csv')

In [31]:
# Read the training set, using the first CSV column as the row index,
# and preview its first rows.
train = pd.read_csv(filepath_or_buffer=trn_file, index_col=0)
train.head(n=5)

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [32]:
# Read the test set, using the first CSV column as the row index,
# and preview its first rows.
test = pd.read_csv(filepath_or_buffer=tst_file, index_col=0)
test.head(n=5)

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


In [33]:

def strip_apostrophes(text_col):
    """Remove ``'s`` / ``’s`` substrings and all remaining apostrophes.

    Both the straight (') and the typographic (’) apostrophe are handled.
    The two-character ``'s`` forms are removed first, so "odin's" becomes
    "odin" rather than "odins".

    NOTE(review): ``'s`` is removed wherever it occurs, so an opening single
    quote directly followed by an "s" loses that letter as well.

    Args:
        text_col: pandas Series of strings.

    Returns:
        A new Series with the substrings removed.
    """
    # regex=False: every pattern is a plain substring, and the default value
    # of `regex` differs between pandas versions.
    for pattern in ("'s", '’s', "'", '’'):
        text_col = text_col.str.replace(pattern, '', regex=False)
    return text_col


# One shared routine, so train and test get the same removals in the same order.
train['text'] = strip_apostrophes(train['text'])
test['text'] = strip_apostrophes(test['text'])

In [34]:
def strip_accented_chars(text_col):
    """Drop the accented letters á ä é í ó ú ý and spell out ü as `` Umlaut ``.

    Args:
        text_col: pandas Series of strings.

    Returns:
        A new Series with the characters replaced in a single pass.
    """
    table = str.maketrans({
        'á': '', 'ä': '', 'é': '', 'í': '', 'ó': '', 'ú': '', 'ý': '',
        'ü': ' Umlaut ',
    })
    return text_col.str.translate(table)


# The same mapping must be applied to both splits. Previously 'ü' became
# ' Umlaut ' in train but was deleted in test, so the two texts were
# tokenized differently. Test now follows the train-side rule, which leaves
# the training features unchanged.
train['text'] = strip_accented_chars(train['text'])
test['text'] = strip_accented_chars(test['text'])

In [35]:

import re
def alpha_num(text):
    """Replace every ASCII digit in ``text`` with the placeholder `` num ``.

    Digits are substituted one at a time, so "42" produces two placeholders.
    """
    digit = re.compile(r'[0-9]')
    return digit.sub(' num ', text)

def remove_word(text):
    """Return ``text`` without the whitespace-separated tokens in ``del_word``.

    Tokens are compared case-insensitively and as whole tokens (a token with
    attached punctuation, e.g. "the,", is kept). The surviving tokens are
    re-joined with single spaces.
    """
    kept = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in del_word
    ]
    return " ".join(kept)


# Words dropped by remove_word (looked up when the function is called).
del_word = ['the', 'and', 'to', 'of', 'a']

# Same pipeline for both splits: lower-case, mask digits with alpha_num,
# then drop the del_word tokens with remove_word.
train['text'] = train['text'].str.lower().map(lambda s: remove_word(alpha_num(s)))
test['text'] = test['text'].str.lower().map(lambda s: remove_word(alpha_num(s)))

In [None]:
# Download the NLTK 'punkt' and 'stopwords' packages.
# Presumably these back word_tokenize and stopwords.words('english'), which
# the TF-IDF vectorizer uses — so this cell has to run before that one.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# TF-IDF over NLTK word tokens: 1- to 4-grams, English stop words removed,
# terms seen in fewer than 5 documents ignored, at most 300 features kept,
# sublinear term-frequency scaling.
vec = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 4),
    max_features=300,
    min_df=5,
    sublinear_tf=True,
)
# The vocabulary is fitted on the training text only and reused for test.
trn = vec.fit_transform(train['text'])
tst = vec.transform(test['text'])
print(trn.shape, tst.shape)

NameError: ignored

In [37]:
# Author labels as a NumPy array, aligned with the rows of the training frame.
y = train['author'].to_numpy()
print(y.shape)

(54879,)


In [38]:
# 80/20 hold-out split of the training features and labels.
X_trn, X_val, y_trn, y_val = train_test_split(
    trn, y,
    test_size=0.2,
    random_state=seed,
)

In [39]:
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe, fmin
# Fixed LightGBM settings shared by every hyperopt trial.
params = dict(
    objective="multiclass",
    n_estimators=1000,
    subsample_freq=1,
    random_state=seed,
    n_jobs=-1,
)

# Hyperparameters explored by hyperopt.
space = dict(
    learning_rate=hp.loguniform("learning_rate", np.log(0.01), np.log(0.3)),
    num_leaves=hp.choice("num_leaves", [15, 31, 63, 127]),
    colsample_bytree=hp.quniform("colsample_bytree", 0.5, 0.9, 0.1),
    subsample=hp.quniform("subsample", 0.5, 0.9, 0.1),
    min_child_samples=hp.choice('min_child_samples', [10, 25, 100]),
)

In [40]:
def objective(hyperparams):
    """Hyperopt objective: fit one LightGBM model and report its hold-out loss.

    The classifier is built from the fixed ``params`` plus the sampled
    ``hyperparams``, trained on (X_trn, y_trn) and early-stopped after 10
    rounds without improvement on (X_val, y_val).

    Args:
        hyperparams: dict of sampled hyperparameter values.

    Returns:
        dict with the best validation multi_logloss under ``'loss'``, the
        hyperopt status under ``'status'`` and the fitted model under
        ``'model'``.
    """
    candidate = lgb.LGBMClassifier(**params, **hyperparams)
    candidate.fit(
        X=X_trn,
        y=y_trn,
        eval_set=[(X_val, y_val)],
        eval_metric="multi_logloss",
        early_stopping_rounds=10,
        verbose=False,
    )
    best_logloss = candidate.best_score_["valid_0"]["multi_logloss"]

    return dict(loss=best_logloss, status=STATUS_OK, model=candidate)

# Run 10 TPE-guided trials, keeping every trial result in `trials`.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
    verbose=1,
)

# Translate the raw hyperopt result back into actual hyperparameter values.
hyperparams = space_eval(space, best)
# Iteration at which the best trial's model was early-stopped.
n_best = trials.best_trial['result']['model'].best_iteration_
# Merge the winning values into `params` (this mutates `params` in place).
params.update(hyperparams)
print(params)

100%|██████████| 10/10 [21:52<00:00, 131.27s/it, best loss: 0.871382258860049]
{'objective': 'multiclass', 'n_estimators': 1000, 'subsample_freq': 1, 'random_state': 42, 'n_jobs': -1, 'colsample_bytree': 0.6000000000000001, 'learning_rate': 0.018126866760856356, 'min_child_samples': 25, 'num_leaves': 63, 'subsample': 0.7000000000000001}


In [41]:
# Shuffled, seeded, stratified K-fold splitter used for the CV loop.
cv = StratifiedKFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=seed,
)

In [42]:
# Out-of-fold class probabilities for the training rows, and fold-averaged
# class probabilities for the test rows.
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    # NOTE(review): these hyperparameters are hard-coded; the values found by
    # the hyperopt search (stored in `params`) are not used here — confirm
    # whether that is intentional.
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=1000,
                             num_leaves=64,
                             learning_rate=0.1,
                             min_child_samples=10,
                             subsample=.5,
                             subsample_freq=1,
                             colsample_bytree=.8,
                             random_state=seed,
                             n_jobs=-1)

    # Train on the fold's training rows; stop after 10 rounds without
    # improvement on the held-out rows of the same fold.
    clf.fit(trn[i_trn], y[i_trn],
            eval_set=[(trn[i_val], y[i_val])],
            eval_metric='multiclass',
            early_stopping_rounds=10)
    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    # Average the test predictions over the folds. Dividing by n_class only
    # gave the right result because n_class happens to equal n_fold.
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 1.51167
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's multi_logloss: 1.46373
[3]	valid_0's multi_logloss: 1.42176
[4]	valid_0's multi_logloss: 1.38232
[5]	valid_0's multi_logloss: 1.3495
[6]	valid_0's multi_logloss: 1.31969
[7]	valid_0's multi_logloss: 1.29285
[8]	valid_0's multi_logloss: 1.26781
[9]	valid_0's multi_logloss: 1.24627
[10]	valid_0's multi_logloss: 1.22488
[11]	valid_0's multi_logloss: 1.2058
[12]	valid_0's multi_logloss: 1.18889
[13]	valid_0's multi_logloss: 1.17354
[14]	valid_0's multi_logloss: 1.15897
[15]	valid_0's multi_logloss: 1.1453
[16]	valid_0's multi_logloss: 1.13258
[17]	valid_0's multi_logloss: 1.12044
[18]	valid_0's multi_logloss: 1.11004
[19]	valid_0's multi_logloss: 1.09988
[20]	valid_0's multi_logloss: 1.08997
[21]	valid_0's multi_logloss: 1.08102
[22]	valid_0's multi_logloss: 1.07238
[23]	valid_0's multi_logloss: 1.06398
[24]	valid_0's multi_logloss: 1.05595
[25]	valid_0'

In [43]:
# Accuracy of the out-of-fold predictions: the most probable class per row
# is compared with the true label.
cv_pred = p_val.argmax(axis=1)
cv_acc = accuracy_score(y, cv_pred) * 100
print(f'Accuracy (CV): {cv_acc:8.4f}%')

Accuracy (CV):  66.0253%


In [44]:
# Persist the out-of-fold and the test probabilities as comma-separated text.
for out_file, probs in ((p_val_file, p_val), (p_tst_file, p_tst)):
    # np.savetxt does not create missing folders; make sure the target exists
    # so a run on a fresh environment does not fail at the very end.
    out_file.parent.mkdir(parents=True, exist_ok=True)
    np.savetxt(out_file, probs, fmt='%.6f', delimiter=',')

In [45]:
# Load the sample submission (first CSV column as index) to use as a template.
sub = pd.read_csv(filepath_or_buffer=sample_file, index_col=0)
print(sub.shape)
sub.head(n=5)

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [46]:
# Overwrite every column of the template with the predicted test probabilities.
sub[sub.columns.tolist()] = p_tst
sub.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.037,0.8178,0.0884,0.0398,0.0169
1,0.0172,0.9203,0.0088,0.0136,0.0401
2,0.8446,0.0588,0.0482,0.0182,0.0302
3,0.0127,0.0125,0.9002,0.0623,0.0122
4,0.5684,0.162,0.1275,0.094,0.0481


In [47]:
# Write the submission file. to_csv does not create missing folders, so the
# output directory is created first.
sub_file.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(sub_file)