In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:

import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
from nltk.tokenize import word_tokenize
import warnings

In [3]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

In [4]:
data_dir = Path('../input')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [5]:
algo_name = 'lgbcv'
feature_name = 'feature'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

In [6]:
train = pd.read_csv(trn_file, index_col=0)
train.head()

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [7]:
test = pd.read_csv(tst_file, index_col=0)
test.head()

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


In [8]:
from tqdm import tqdm 
import re 

text_list = list(train['text'])
clear_text_list = [] 
for i in tqdm(range(len(text_list))): 
    clear_text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', text_list[i]) 
    clear_text_list.append(clear_text.lower()) 

train['text'] = clear_text_list 
train

100%|██████████| 54879/54879 [00:01<00:00, 33274.26it/s]


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,he was almost choking there was so much so muc...,3
1,“your sister asked for it i suppose”,2
2,she was engaged one day as she walked in peru...,1
3,the captain was in the porch keeping himself c...,4
4,“have mercy gentlemen” odin flung up his hands...,3
...,...,...
54874,“is that you mr smith” odin whispered “i hardl...,2
54875,i told my plan to the captain and between us w...,4
54876,your sincere wellwisher friend and sister luc...,1
54877,“then you wanted me to lend you money”,3


In [9]:
from tqdm import tqdm 
import re 

text_list = list(test['text'])
clear_text_list = [] 
for i in tqdm(range(len(text_list))): 
    clear_text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', text_list[i]) 
    clear_text_list.append(clear_text.lower()) 

test['text'] = clear_text_list 
test


100%|██████████| 19617/19617 [00:00<00:00, 45607.29it/s]


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“not at all i think she is one of the most cha...
1,no replied he with sudden consciousness not to...
2,as the lady had stated her intention of scream...
3,“and then suddenly in the silence i heard a so...
4,his conviction remained unchanged so far as i ...
...,...
19612,at the end of another day or two odin growing ...
19613,all afternoon we sat together mostly in silenc...
19614,odin having carried his thanks to odin procee...
19615,soon after this upon odins leaving the room ma...


In [60]:
vec = TfidfVectorizer(analyzer='char',sublinear_tf = True, max_features=80)
X = vec.fit_transform(train['text'])
X_tst = vec.transform(test['text'])
print(X.shape, X_tst.shape)

(54879, 66) (19617, 66)


In [None]:
# max feature               mid_df           ngram               accuracy
     230                       1               1 3                  59
     300                       1               1 4                  61.5  
    320                        3               1 4                  61.5
    350                        3               1 5                  61.57

    
#analyzer='char'
    330                        5                 X                  


In [61]:
y = train['author'].values
print(X[0].shape, X_tst[0].shape, y.shape)

(1, 66) (1, 66) (54879,)


In [62]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [63]:
p = np.zeros((X.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=1000,
                             num_leaves=64,
                             learning_rate=0.1,
                             min_child_samples=10,
                             subsample=.5,
                             subsample_freq=1,
                             colsample_bytree=.8,
                             random_state=seed,
                             n_jobs=-1)
    
    clf.fit(X[i_trn], y[i_trn],
            eval_set=[(X[i_val], y[i_val])],
            eval_metric='multiclass',
            early_stopping_rounds=10)
    p[i_val, :] = clf.predict_proba(X[i_val])
    p_tst += clf.predict_proba(X_tst) / n_class

training model for CV #1
[1]	valid_0's multi_logloss: 1.51991
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 1.47771
[3]	valid_0's multi_logloss: 1.45085
[4]	valid_0's multi_logloss: 1.42608
[5]	valid_0's multi_logloss: 1.40192
[6]	valid_0's multi_logloss: 1.3822
[7]	valid_0's multi_logloss: 1.36689
[8]	valid_0's multi_logloss: 1.3549
[9]	valid_0's multi_logloss: 1.34234
[10]	valid_0's multi_logloss: 1.33228
[11]	valid_0's multi_logloss: 1.32284
[12]	valid_0's multi_logloss: 1.31452
[13]	valid_0's multi_logloss: 1.30615
[14]	valid_0's multi_logloss: 1.30063
[15]	valid_0's multi_logloss: 1.29513
[16]	valid_0's multi_logloss: 1.2904
[17]	valid_0's multi_logloss: 1.2861
[18]	valid_0's multi_logloss: 1.28236
[19]	valid_0's multi_logloss: 1.27904
[20]	valid_0's multi_logloss: 1.2749
[21]	valid_0's multi_logloss: 1.27238
[22]	valid_0's multi_logloss: 1.26993
[23]	valid_0's multi_logloss: 1.26734
[24]	valid_0's multi_logloss: 1.26555
[25]	valid_0's m

KeyboardInterrupt: 

In [41]:
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p, axis=1)) * 100:8.4f}%')


Accuracy (CV):  61.5700%


In [29]:
np.savetxt(p_val_file, p, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

(54879, 5) (19617, 5)


In [73]:
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

In [74]:
sub[sub.columns] = p_tst
sub.head()


ValueError: arrays must all be same length

In [30]:
sub.to_csv(sub_file)

NameError: name 'model' is not defined