In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:

import lightgbm as lgb
from matplotlib import pyplot as plt
from matplotlib import rcParams
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
from nltk.tokenize import word_tokenize
import warnings

In [4]:
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
pd.set_option('max_columns', 100)
pd.set_option("display.precision", 4)
warnings.simplefilter('ignore')

In [5]:
data_dir = Path('../input')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

In [6]:
algo_name = 'lgbcv'
feature_name = 'feature'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir / f'{feature_name}.csv'
p_val_file = val_dir / f'{model_name}.val.csv'
p_tst_file = tst_dir / f'{model_name}.tst.csv'
sub_file = sub_dir / f'{model_name}.csv'

In [7]:
df = pd.read_csv(feature_file, index_col=0)
print(df.shape)
df.head()

(74496, 13)


Unnamed: 0_level_0,text,author,count_sent,count_word,count_unique_word,count_letters,count_punctuations,count_words_upper,count_words_title,count_stopwords,mean_word_len,word_unique_percent,punct_percent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,"almost choke . much , much want say , strange ...",3.0,1,46,39,240,8,0,4,25,4.2391,84.7826,17.3913
1,"“ sister ask , suppose ? ”",2.0,1,7,7,38,2,1,2,2,4.5714,100.0,28.5714
2,"engage one day walk , peruse jane ’ last lette...",1.0,1,57,50,320,9,0,4,26,4.614,87.7193,15.7895
3,"captain porch , keep carefully way treacherous...",4.0,1,58,49,319,18,0,7,26,4.5172,84.4828,31.0345
4,"“ mercy , gentlemen ! ” odin fling hand . “ ’ ...",3.0,1,39,36,228,13,0,4,16,4.8718,92.3077,33.3333


In [8]:
vec = TfidfVectorizer(dtype=np.float32, tokenizer=word_tokenize, stop_words=stopwords.words('english'),min_df=50, max_features=473)
X_tfidf = vec.fit_transform(df['text'])
print(X_tfidf.shape)

(74496, 473)


In [9]:
X_tfidf=X_tfidf.todense()
vector = pd.DataFrame(data=X_tfidf)


In [10]:
vector

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472
0,0.0000,0.0000,0.0000,0.0000,0.0000,0.3060,0.0,0.0,0.2162,0.0,0.0000,0.0000,0.1503,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3099,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.2866,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000
1,0.0000,0.0000,0.0000,0.0000,0.0000,0.1311,0.0,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.2767,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.4429,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.2004,0.2070
2,0.0000,0.0000,0.0000,0.0000,0.0000,0.3070,0.0,0.0,0.0964,0.0,0.0000,0.1715,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.1785,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.1992,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0000,0.2175,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.1055,0.0000,0.0000
3,0.0000,0.0000,0.1477,0.0000,0.0000,0.4493,0.0,0.0,0.2821,0.0,0.0000,0.0000,0.1961,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.1735,0.0000,0.0,0.0000,0.1673,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.4624,0.0,0.1692,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0763,0.0789
4,0.3055,0.0000,0.0000,0.0000,0.0000,0.2710,0.0,0.0,0.1276,0.0,0.2528,0.0000,0.1331,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0000,0.2881,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.2794,0.2072,0.2141
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74491,0.0000,0.0843,0.1502,0.0000,0.0000,0.3300,0.0,0.0,0.2151,0.0,0.0000,0.0000,0.1495,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0946,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0000,0.0872,0.0,0.0,0.0,...,0.0,0.0,0.0000,0.1682,0.0,0.0000,0.0000,0.1045,0.1185,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0779,0.0000,0.0,0.0,0.0000,0.1231,0.0000,0.0,0.0,0.0,0.1016,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000
74492,0.0000,0.0000,0.0000,0.0799,0.0798,0.4801,0.0,0.0,0.1966,0.0,0.0000,0.0700,0.1230,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0889,0.0,0.0794,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,...,0.1,0.0,0.0000,0.0000,0.0,0.1775,0.0000,0.0000,0.0000,0.0,0.0887,0.0000,0.0,0.0000,0.0967,0.0,0.0000,0.1024,0.0000,0.0931,0.0,0.0,0.0839,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0734,0.0,0.0,0.0723,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0763,0.0,0.0,0.0,0.0,0.1291,0.0000,0.0000
74493,0.0000,0.0000,0.1735,0.0000,0.0000,0.2933,0.0,0.0,0.1105,0.0,0.0000,0.0000,0.1152,0.0000,0.0,0.0000,0.2772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.2833,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000
74494,0.0000,0.0000,0.1497,0.0000,0.0000,0.2530,0.0,0.0,0.1906,0.0,0.0000,0.0000,0.0000,0.1068,0.0,0.5573,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2049,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0000,0.0000,0.0,0.1434,0.0000,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0000,0.1553,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.1849,0.0,0.0,0.0,0.0,0.0000,0.0000,0.0000


In [11]:
y = df[target_col].values[:54879]
df.drop('text',axis=1,inplace=True)
df.drop(target_col, axis=1, inplace=True)
trn = df.iloc[:54879]
tst = df.iloc[54879:]
feature_name = df.columns.tolist()
print(y.shape, trn.shape, tst.shape)

(54879,) (54879, 11) (19617, 11)


In [23]:
tmp1=vector[:54879]
tmp2=vector[54879:]
tmp1.reset_index(drop=True, inplace=True)
trn.reset_index(drop=True, inplace=True)
tmp2.reset_index(drop=True, inplace=True)
tst.reset_index(drop=True, inplace=True)
x_trn=pd.concat([trn,tmp1], axis=1)
x_tst=pd.concat([tst,tmp2], axis=1)

In [24]:
x_tst

Unnamed: 0,count_sent,count_word,count_unique_word,count_letters,count_punctuations,count_words_upper,count_words_title,count_stopwords,mean_word_len,word_unique_percent,punct_percent,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,...,423,424,425,426,427,428,429,430,431,432,433,434,435,436,437,438,439,440,441,442,443,444,445,446,447,448,449,450,451,452,453,454,455,456,457,458,459,460,461,462,463,464,465,466,467,468,469,470,471,472
0,1,89,68,456,9,5,9,49,4.1348,76.4045,10.1124,0.0,0.0000,0.0000,0.0000,0.0000,0.1552,0.0000,0.0000,0.2435,0.0,0.0,0.1733,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0000,0.0,0.3505,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.2189,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.191,0.0,0.0,0.0,0.0000,0.0791,0.0817
1,1,43,36,221,20,5,5,21,4.1628,83.7209,46.5116,0.0,0.0000,0.0000,0.0000,0.0000,0.3810,0.3918,0.3287,0.1025,0.0,0.0,0.0000,0.1069,0.0000,0.0,0.5994,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0000,0.1671,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.1442,0.0,0.0,0.0,0.0,0.0,0.0000,0.000,0.0,0.0,0.0,0.0000,0.0000,0.0000
2,1,64,55,375,10,0,2,32,4.8750,85.9375,15.6250,0.0,0.0000,0.0000,0.0000,0.0000,0.4102,0.0000,0.0000,0.1288,0.0,0.0,0.4582,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.1811,0.0,0.0,0.0,0.0,0.0,0.0000,0.000,0.0,0.0,0.0,0.0000,0.0000,0.0000
3,1,240,150,1218,28,11,21,121,4.0792,62.5000,11.6667,0.0,0.0000,0.0000,0.0000,0.0000,0.3830,0.0000,0.0000,0.2645,0.0,0.0,0.0000,0.0501,0.0539,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.1088,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0846,0.0,0.3618,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0865,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.115,0.0000,0.0,0.0,0.0000,0.1081,0.0,0.1353,0.0,0.0,0.0,0.0,0.0,0.0933,0.000,0.0,0.0,0.0,0.0000,0.0390,0.0000
4,1,91,71,510,13,4,8,44,4.6154,78.0220,14.2857,0.0,0.0000,0.0000,0.0000,0.0000,0.2411,0.0000,0.2913,0.1817,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000,0.0,0.0,0.0,0.0000,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19612,1,154,98,861,29,1,10,71,4.5974,63.6364,18.8312,0.0,0.0843,0.1502,0.0000,0.0000,0.3300,0.0000,0.0000,0.2151,0.0,0.0,0.0000,0.1495,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0946,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1682,0.0,0.0000,0.0,0.1045,0.1185,0.0,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0000,0.0779,0.0000,0.0,0.0,0.0000,0.1231,0.0000,0.0,0.0,0.0,0.1016,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000,0.0,0.0,0.0,0.0000,0.0000,0.0000
19613,1,258,162,1377,39,5,15,137,4.3411,62.7907,15.1163,0.0,0.0000,0.0000,0.0799,0.0798,0.4801,0.0000,0.0000,0.1966,0.0,0.0,0.0700,0.1230,0.0000,0.0,0.0000,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0889,0.0,0.0794,0.0,0.0000,0.0,0.0,0.0,...,0.1,0.0,0.0,0.0000,0.0,0.1775,0.0,0.0000,0.0000,0.0,0.0887,0.0,0.0,0.0,0.0967,0.0,0.0000,0.1024,0.0000,0.0931,0.0,0.0,0.0839,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.000,0.0734,0.0,0.0,0.0723,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0763,0.000,0.0,0.0,0.0,0.1291,0.0000,0.0000
19614,1,59,44,320,9,0,3,35,4.4237,74.5763,15.2542,0.0,0.0000,0.1735,0.0000,0.0000,0.2933,0.0000,0.0000,0.1105,0.0,0.0,0.0000,0.1152,0.0000,0.0,0.0000,0.2772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0000,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.0000,0.000,0.0,0.0,0.0,0.0000,0.0000,0.0000
19615,1,63,59,347,15,3,9,30,4.5238,93.6508,23.8095,0.0,0.0000,0.1497,0.0000,0.0000,0.2530,0.0000,0.0000,0.1906,0.0,0.0,0.0000,0.0000,0.1068,0.0,0.5573,0.0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2049,0.0,0.0,0.0000,0.0,0.0000,0.0,0.0000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0000,0.0,0.1434,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0000,0.0,0.0000,0.0000,0.1553,0.0000,0.0,0.0,0.0000,0.0000,0.0000,0.0,0.0,0.0,0.0000,0.000,0.0000,0.0,0.0,0.0000,0.0000,0.0,0.0000,0.0,0.0,0.0,0.0,0.0,0.1849,0.000,0.0,0.0,0.0,0.0000,0.0000,0.0000


In [25]:
trn=x_trn.iloc[:].values
tst=x_tst.iloc[:].values
print(trn.shape, tst.shape)

(54879, 484) (19617, 484)


In [26]:
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [27]:
p_val = np.zeros((trn.shape[0], n_class))
p_tst = np.zeros((tst.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
    print(f'training model for CV #{i}')
    clf = lgb.LGBMClassifier(objective='multiclass',
                             n_estimators=1000,
                             num_leaves=64,
                             learning_rate=0.1,
                             min_child_samples=10,
                             subsample=.5,
                             subsample_freq=1,
                             colsample_bytree=.8,
                             random_state=seed,
                             n_jobs=-1)
    clf.fit(trn[i_trn], y[i_trn],
            eval_set=[(trn[i_val], y[i_val])],
            eval_metric='multiclass',
            early_stopping_rounds=10)
    
    p_val[i_val, :] = clf.predict_proba(trn[i_val])
    p_tst += clf.predict_proba(tst) / n_fold

training model for CV #1
[1]	valid_0's multi_logloss: 1.47331
Training until validation scores don't improve for 10 rounds
[2]	valid_0's multi_logloss: 1.39509
[3]	valid_0's multi_logloss: 1.33761
[4]	valid_0's multi_logloss: 1.28995
[5]	valid_0's multi_logloss: 1.24196
[6]	valid_0's multi_logloss: 1.20435
[7]	valid_0's multi_logloss: 1.17159
[8]	valid_0's multi_logloss: 1.14297
[9]	valid_0's multi_logloss: 1.11619
[10]	valid_0's multi_logloss: 1.09323
[11]	valid_0's multi_logloss: 1.07315
[12]	valid_0's multi_logloss: 1.05554
[13]	valid_0's multi_logloss: 1.0385
[14]	valid_0's multi_logloss: 1.02311
[15]	valid_0's multi_logloss: 1.00974
[16]	valid_0's multi_logloss: 0.996945
[17]	valid_0's multi_logloss: 0.984435
[18]	valid_0's multi_logloss: 0.97447
[19]	valid_0's multi_logloss: 0.964102
[20]	valid_0's multi_logloss: 0.954582
[21]	valid_0's multi_logloss: 0.945267
[22]	valid_0's multi_logloss: 0.936619
[23]	valid_0's multi_logloss: 0.928954
[24]	valid_0's multi_logloss: 0.922094
[25]

[40]	valid_0's multi_logloss: 0.861264
[41]	valid_0's multi_logloss: 0.858175
[42]	valid_0's multi_logloss: 0.855534
[43]	valid_0's multi_logloss: 0.853026
[44]	valid_0's multi_logloss: 0.849952
[45]	valid_0's multi_logloss: 0.847589
[46]	valid_0's multi_logloss: 0.845052
[47]	valid_0's multi_logloss: 0.84269
[48]	valid_0's multi_logloss: 0.840266
[49]	valid_0's multi_logloss: 0.838133
[50]	valid_0's multi_logloss: 0.835792
[51]	valid_0's multi_logloss: 0.833314
[52]	valid_0's multi_logloss: 0.831386
[53]	valid_0's multi_logloss: 0.829366
[54]	valid_0's multi_logloss: 0.827885
[55]	valid_0's multi_logloss: 0.826375
[56]	valid_0's multi_logloss: 0.824731
[57]	valid_0's multi_logloss: 0.822928
[58]	valid_0's multi_logloss: 0.821719
[59]	valid_0's multi_logloss: 0.820288
[60]	valid_0's multi_logloss: 0.819181
[61]	valid_0's multi_logloss: 0.817582
[62]	valid_0's multi_logloss: 0.816672
[63]	valid_0's multi_logloss: 0.815476
[64]	valid_0's multi_logloss: 0.814619
[65]	valid_0's multi_loglo

[104]	valid_0's multi_logloss: 0.78132
[105]	valid_0's multi_logloss: 0.780725
[106]	valid_0's multi_logloss: 0.780638
[107]	valid_0's multi_logloss: 0.780289
[108]	valid_0's multi_logloss: 0.780006
[109]	valid_0's multi_logloss: 0.780045
[110]	valid_0's multi_logloss: 0.780111
[111]	valid_0's multi_logloss: 0.78003
[112]	valid_0's multi_logloss: 0.779503
[113]	valid_0's multi_logloss: 0.779482
[114]	valid_0's multi_logloss: 0.779336
[115]	valid_0's multi_logloss: 0.778557
[116]	valid_0's multi_logloss: 0.778062
[117]	valid_0's multi_logloss: 0.777656
[118]	valid_0's multi_logloss: 0.777626
[119]	valid_0's multi_logloss: 0.77741
[120]	valid_0's multi_logloss: 0.777282
[121]	valid_0's multi_logloss: 0.777439
[122]	valid_0's multi_logloss: 0.777602
[123]	valid_0's multi_logloss: 0.777357
[124]	valid_0's multi_logloss: 0.777114
[125]	valid_0's multi_logloss: 0.776719
[126]	valid_0's multi_logloss: 0.776504
[127]	valid_0's multi_logloss: 0.776414
[128]	valid_0's multi_logloss: 0.77678
[129

[14]	valid_0's multi_logloss: 1.02502
[15]	valid_0's multi_logloss: 1.01162
[16]	valid_0's multi_logloss: 0.998847
[17]	valid_0's multi_logloss: 0.987906
[18]	valid_0's multi_logloss: 0.977599
[19]	valid_0's multi_logloss: 0.967502
[20]	valid_0's multi_logloss: 0.958315
[21]	valid_0's multi_logloss: 0.949276
[22]	valid_0's multi_logloss: 0.941005
[23]	valid_0's multi_logloss: 0.934164
[24]	valid_0's multi_logloss: 0.927009
[25]	valid_0's multi_logloss: 0.921063
[26]	valid_0's multi_logloss: 0.915207
[27]	valid_0's multi_logloss: 0.908977
[28]	valid_0's multi_logloss: 0.902772
[29]	valid_0's multi_logloss: 0.897636
[30]	valid_0's multi_logloss: 0.892637
[31]	valid_0's multi_logloss: 0.88816
[32]	valid_0's multi_logloss: 0.883179
[33]	valid_0's multi_logloss: 0.878459
[34]	valid_0's multi_logloss: 0.87477
[35]	valid_0's multi_logloss: 0.870739
[36]	valid_0's multi_logloss: 0.867095
[37]	valid_0's multi_logloss: 0.86351
[38]	valid_0's multi_logloss: 0.859674
[39]	valid_0's multi_logloss: 

In [28]:
print(f'{accuracy_score(y, np.argmax(p_val, axis=1)) * 100:.4f}%')

70.2564%


In [29]:
print(p_val.shape, p_tst.shape)

(54879, 5) (19617, 5)


In [73]:
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

In [74]:
imp = pd.DataFrame({'feature': df.columns, 'importance': clf.feature_importances_})
imp = imp.sort_values('importance').set_index('feature')
imp.plot(kind='barh')

ValueError: arrays must all be same length

In [30]:
model.predict_proba(tst)

NameError: name 'model' is not defined