# 소설 작가 분류 AI 경진대회.
> 월간 데이콘 9 | 소설 문체 | NLP |Logloss

[참고]
- https://dacon.io/competitions/official/235670/codeshare/1901?page=2&dtype=recent&ptype=pub
- https://www.kaggle.com/marcospinaci/0-335-log-loss-in-a-dozen-lines
- https://www.kaggle.com/sudalairajkumar/simple-feature-engg-notebook-spooky-author


## 1. 라이브러리 및 데이터

In [1]:
import pandas as pd 
import numpy as np

import re
# nltk?
import nltk
import nltk.data
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, preprocessing, pipeline, model_selection, naive_bayes
from sklearn.metrics import log_loss  #?
from sklearn.preprocessing import LabelEncoder 
from sklearn.pipeline import Pipeline #?
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer #?
from sklearn.naive_bayes import MultinomialNB, BernoulliNB #?
from sklearn.calibration import CalibratedClassifierCV #?
from sklearn.linear_model import SGDClassifier, LogisticRegression
import xgboost as xgb

import time

# keras
from keras import backend as K #?
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [2]:
pd.set_option('display.max_columns',200)
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test_x.csv')

## 2. 데이터 전처리
Data Cleansing & Pre-Processing

In [3]:
X_train = train['text'].str.replace('[^a-zA-Z0-9]',' ')
Y_train = LabelEncoder().fit_transform(train['author'])
y_train = train['author']
X_test = test['text'].str.replace('^[a-zA-Z0-9]',' ')

In [4]:
# 구두점 비율(문장안에 각 부호가 얼마나 있는지 확인)
punctuations=[{"id":1,"p":"[;:]"},
              {"id":2,"p":"[,.]"},
              {"id":3,"p":"[?]"},
              {"id":4,"p":"[!]"},
              {"id":5,"p":"[''\']"},
              {"id":6,"p":"[""\"]"},
              {"id":7,"p":"[:;,.?! \' "" '' \"]"}]
for p in punctuations:
    punctuation = p['p']
    _train = [sentence.split() for sentence in train['text']]
    train['punc_' + str(p['id'])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _train]
        
    _test = [sentence.split() for sentence in test['text']]
    test['punc_' + str(p['id'])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _test]


In [46]:
# 구두점 비율(문장 안에 각 부호가 얼마나 있는지)
punctuations = [{"id":1, "p" : "[;:]"},
                {"id":2, "p" : "[,.]"},
                {"id":3, "p" : "[?]"},
                {"id":4, "p" : "[!]"},
                {"id":5, "p" : "[‘’\']"},
                {"id":6, "p" : "[“”\"]"},
                {"id":7, "p" : "[;:,.?!\'“”‘’\"]"}]

for p in punctuations:
    punctuation = p["p"]
    _train =  [sentence.split() for sentence in train['text']]
    train['punc_' + str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _train]

    _test =  [sentence.split() for sentence in test['text']]
    test['punc_' + str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _test]

## Pipeline
- TfidfVectorizer
- CountVectorizer

#### TfidfVectorizer - word

In [17]:
# 1
start = time.localtime()
print('%04d%02d%02d%02d:%02d' % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour,start.tm_min))

# tfidf_MNB_
cv_scores=[]
pred_full_test=0
pred_train = np.zeros([train.shape[0],5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    
    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                           ('tfidf',TfidfTransformer()),
                           ('clf',MultinomialNB()),
                          ])
    parameters = {'vect__ngram_range':[(1,2)],
                  'vect__max_df':(0.25,0.3),
#                   'vect__min_df':[1],
                  'vect__analyzer':['word'],
                  'clf__alpha':[0.024, 0.031],
                 }
    
    gs_clf = GridSearchCV(classifier, parameters, n_jobs =-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r'%(param_name, best_parameters[param_name]))
        
    pred_test_y = gs_clf.predict_proba(val_X)   
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
        
print('cv socres:',cv_scores)
print('Mean cv score',np.mean(cv_scores))
pred_full_test = pred_full_test/5

train['tfidf_MNB_0'] = pred_train[:,0]
train['tfidf_MNB_1'] = pred_train[:,1]
train['tfidf_MNB_2'] = pred_train[:,2]
train['tfidf_MNB_3'] = pred_train[:,3]
train['tfidf_MNB_4'] = pred_train[:,4]

test['tfidf_MNB_0'] = pred_full_test[:,0]
test['tfidf_MNB_1'] = pred_full_test[:,1]
test['tfidf_MNB_2'] = pred_full_test[:,2]
test['tfidf_MNB_3'] = pred_full_test[:,3]
test['tfidf_MNB_4'] = pred_full_test[:,4]

end = time.localtime()

print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2021011109:22
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    5.8s remaining:    9.7s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    6.0s finished


	clf__alpha: 0.031
	vect__analyzer: 'word'
	vect__max_df: 0.3
	vect__ngram_range: (1, 2)


KeyboardInterrupt: 

In [16]:
# 2
## MultinomiaaNB(alpha = 0.05 )


start = time.localtime()
print('%04d%02d%02d%02d:%02d' % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour,start.tm_min))

# tfidf_MNB_
cv_scores=[]
pred_full_test=0
pred_train = np.zeros([train.shape[0],5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    
    # 위와 clf 부분 다름
    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf',TfidfTransformer()),
                          ('clf',CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method = 'isotonic')),
                          ])
    # clf_apha 주석
    parameters = {'vect__ngram_range':[(1,2)],
                 'vect__max_df': (0.4, 0.5),
                 #'vect__min_df':[1],
                  'vect__analyzer':['word'],
                 #'clf__alpha' :(0.016, 0.018),
                 }

    
    gs_clf = GridSearchCV(classifier, parameters, n_jobs = -1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' %(param_name, best_parameters[param_name]))
            
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index,:] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
    
print('cv socre:',cv_scores)
print('Mean cv socre:', np.mean(cv_scores))
pred_full_test = pred_full_test/5
    
train['tfidf_MNB_0'] = pred_train[:,0]
train['tfidf_MNB_1'] = pred_train[:,1]
train['tfidf_MNB_2'] = pred_train[:,2]
train['tfidf_MNB_3'] = pred_train[:,3]
train['tfidf_MNB_4'] = pred_train[:,4]

test['tfidf_MNB_0'] = pred_full_test[:,0]
test['tfidf_MNB_1'] = pred_full_test[:,1]
test['tfidf_MNB_2'] = pred_full_test[:,2]
test['tfidf_MNB_3'] = pred_full_test[:,3]
test['tfidf_MNB_4'] = pred_full_test[:,4]

end = time.localtime()

print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2021011109:22
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    5.0s finished


KeyboardInterrupt: 

In [15]:
# 3
## MultinomiaaNB(alpha),BernoulliNB(alpha) 0.5 -> 0.02
## 'vect__max_df'(0.4,0.5) -> (0.03, 0.4)

start = time.localtime()
print('%04d%02d%02d%02d:%02d' % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour,start.tm_min))

# tfidf_MNB_
cv_scores=[]
pred_full_test=0
pred_train = np.zeros([train.shape[0],5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    
    # 위와 clf 부분 다름
    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf',TfidfTransformer()),
                          ('clf',CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method = 'isotonic')),
                          ])
    # clf_apha 주석
    parameters = {'vect__ngram_range':[(1,2)],
                 'vect__max_df': (0.03, 0.4),
                 #'vect__min_df':[1],
                  'vect__analyzer':['word'],
                 #'clf__alpha' :(0.016, 0.018),
                 }

    
    gs_clf = GridSearchCV(classifier, parameters, n_jobs = -1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' %(param_name, best_parameters[param_name]))
            
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index,:] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
    
print('cv socre:',cv_scores)
print('Mean cv socre:', np.mean(cv_scores))
pred_full_test = pred_full_test/5
    
train['tfidf_MNB_0'] = pred_train[:,0]
train['tfidf_MNB_1'] = pred_train[:,1]
train['tfidf_MNB_2'] = pred_train[:,2]
train['tfidf_MNB_3'] = pred_train[:,3]
train['tfidf_MNB_4'] = pred_train[:,4]

test['tfidf_MNB_0'] = pred_full_test[:,0]
test['tfidf_MNB_1'] = pred_full_test[:,1]
test['tfidf_MNB_2'] = pred_full_test[:,2]
test['tfidf_MNB_3'] = pred_full_test[:,3]
test['tfidf_MNB_4'] = pred_full_test[:,4]

end = time.localtime()

print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2021011109:21
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    5.7s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    5.6s finished


	vect__analyzer: 'word'
	vect__max_df: 0.03
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.3s finished


KeyboardInterrupt: 

In [32]:
# 4 
## clf, SGDClassifier(loss = 'modified_huber', alpha = 0.00001, max_iter = 10000, tol=1e-4),method='sigmoid'

start = time.localtime()
print('%04d/%02d/%02d/%02d/%02d' % (start.tm_year, start.tm_mon,start.tm_mday, start.tm_hour, start.tm_min))
# stidf_CBNB_
cv_csores =[]
pred_ful_test = 0
pred_train = np.zeros([train.shape[0],5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    
    classifier = Pipeline([('vect', TfidfVectorizer(lowercase = False)),
                         ('tfidf',TfidfTransformer()),
                         ('clf',CalibratedClassifierCV(SGDClassifier(loss = 'modified_huber', alpha = 0.00001, max_iter = 10000, tol=1e-4),method='sigmoid')),
#                            ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
                           ])
    parameters = {'vect__ngram_range':[(1,2)],
#                  'vect__max_df':(0/03, 0.4),
#                   'vect__min_df':[1],
                  'vect__analyzer':['word'],
#                   'clf__alpha':(0.016,0.018)
                 }
    
    gs_clf = GridSearchCV(classifier, parameters, n_jobs = -1, verbose = 1, cv =2)
    gs_clf.fit(dev_X,dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r'%(param_name, best_parameters[param_name]))
        
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, :] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
    
print('cv score : ', cv_scores)
print('Meanc cv score : ', np.mean(cv_scores))
pred_full_test = pred_full_test/5

train["tfidf_CBNB_0"] = pred_train[ : , 0]
train["tfidf_CBNB_1"] = pred_train[ : , 1]
train["tfidf_CBNB_2"] = pred_train[ : , 2]
train["tfidf_CBNB_3"] = pred_train[ : , 3]
train["tfidf_CBNB_4"] = pred_train[ : , 4]
test["tfidf_CBNB_0"] = pred_full_test[ : , 0]
test["tfidf_CBNB_1"] = pred_full_test[ : , 1]
test["tfidf_CBNB_2"] = pred_full_test[ : , 2]
test["tfidf_CBNB_3"] = pred_full_test[ : , 3]
test["tfidf_CBNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print('%04d/%02d/%02d/%02d/%02d' % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print('%04d/%02d/%02d/%02d/%02d' % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

    

2021/01/11/09/36
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.2s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.4s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.3s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.1s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.3s finished


	vect__analyzer: 'word'
	vect__ngram_range: (1, 2)
cv score :  [0.6028549775135393, 0.6172834527914917, 0.6170012178396332, 0.601880893347187, 0.619873449366342, 0.6028883684967539, 0.6184402906278125, 0.6182194795364018, 0.6022628815022257, 0.6202862741321026, 0.6024717063000936, 0.6188515838634626, 0.6168023618902848, 0.6011816322805669, 0.6210410841780997]
Meanc cv score :  0.6120893102443998
2021/01/11/09/36
2021/01/11/09/37


In [None]:
# 5
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_L_0"] = pred_train[ : , 0]
train["tfidf_L_1"] = pred_train[ : , 1]
train["tfidf_L_2"] = pred_train[ : , 2]
train["tfidf_L_3"] = pred_train[ : , 3]
train["tfidf_L_4"] = pred_train[ : , 4]
test["tfidf_L_0"] = pred_full_test[ : , 0]
test["tfidf_L_1"] = pred_full_test[ : , 1]
test["tfidf_L_2"] = pred_full_test[ : , 2]
test["tfidf_L_3"] = pred_full_test[ : , 3]
test["tfidf_L_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))


### CountVectorzier - word

In [None]:
# 5

start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
                  'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_MNB_0"] = pred_train[ : , 0]
train["count_MNB_1"] = pred_train[ : , 1]
train["count_MNB_2"] = pred_train[ : , 2]
train["count_MNB_3"] = pred_train[ : , 3]
train["count_MNB_4"] = pred_train[ : , 4]
test["count_MNB_0"] = pred_full_test[ : , 0]
test["count_MNB_1"] = pred_full_test[ : , 1]
test["count_MNB_2"] = pred_full_test[ : , 2]
test["count_MNB_3"] = pred_full_test[ : , 3]
test["count_MNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 6
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CMNB_0"] = pred_train[ : , 0]
train["count_CMNB_1"] = pred_train[ : , 1]
train["count_CMNB_2"] = pred_train[ : , 2]
train["count_CMNB_3"] = pred_train[ : , 3]
train["count_CMNB_4"] = pred_train[ : , 4]
test["count_CMNB_0"] = pred_full_test[ : , 0]
test["count_CMNB_1"] = pred_full_test[ : , 1]
test["count_CMNB_2"] = pred_full_test[ : , 2]
test["count_CMNB_3"] = pred_full_test[ : , 3]
test["count_CMNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 7 
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
                  'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CBNB_0"] = pred_train[ : , 0]
train["count_CBNB_1"] = pred_train[ : , 1]
train["count_CBNB_2"] = pred_train[ : , 2]
train["count_CBNB_3"] = pred_train[ : , 3]
train["count_CBNB_4"] = pred_train[ : , 4]
test["count_CBNB_0"] = pred_full_test[ : , 0]
test["count_CBNB_1"] = pred_full_test[ : , 1]
test["count_CBNB_2"] = pred_full_test[ : , 2]
test["count_CBNB_3"] = pred_full_test[ : , 3]
test["count_CBNB_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 9 
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CH_0"] = pred_train[ : , 0]
train["count_CH_1"] = pred_train[ : , 1]
train["count_CH_2"] = pred_train[ : , 2]
train["count_CH_3"] = pred_train[ : , 3]
train["count_CH_4"] = pred_train[ : , 4]
test["count_CH_0"] = pred_full_test[ : , 0]
test["count_CH_1"] = pred_full_test[ : , 1]
test["count_CH_2"] = pred_full_test[ : , 2]
test["count_CH_3"] = pred_full_test[ : , 3]
test["count_CH_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 10
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 2)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['word'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_L_0"] = pred_train[ : , 0]
train["count_L_1"] = pred_train[ : , 1]
train["count_L_2"] = pred_train[ : , 2]
train["count_L_3"] = pred_train[ : , 3]
train["count_L_4"] = pred_train[ : , 4]
test["count_L_0"] = pred_full_test[ : , 0]
test["count_L_1"] = pred_full_test[ : , 1]
test["count_L_2"] = pred_full_test[ : , 2]
test["count_L_3"] = pred_full_test[ : , 3]
test["count_L_4"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

### TfidfVectorizer - char

In [None]:
# 11
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 3)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_MNB_0_char"] = pred_train[ : , 0]
train["tfidf_MNB_1_char"] = pred_train[ : , 1]
train["tfidf_MNB_2_char"] = pred_train[ : , 2]
train["tfidf_MNB_3_char"] = pred_train[ : , 3]
train["tfidf_MNB_4_char"] = pred_train[ : , 4]
test["tfidf_MNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_MNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_MNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_MNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_MNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [37]:
# 12
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 6), (1, 7)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CMNB_0_char"] = pred_train[ : , 0]
train["tfidf_CMNB_1_char"] = pred_train[ : , 1]
train["tfidf_CMNB_2_char"] = pred_train[ : , 2]
train["tfidf_CMNB_3_char"] = pred_train[ : , 3]
train["tfidf_CMNB_4_char"] = pred_train[ : , 4]
test["tfidf_CMNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_CMNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_CMNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_CMNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_CMNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


2021/01/11 10:18
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.0min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  1.0min finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   59.8s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   59.2s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   59.8s finished


	vect__analyzer: 'char'
	vect__ngram_range: (1, 7)


  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


cv score :  [0.7757343584422367, 0.7817726107614854, 0.813803751371872, 0.7902246807326817, 0.8351873041845956]
Mean cv score :  0.7993445410985742
2021/01/11 10:18
2021/01/11 10:33


  proba /= np.sum(proba, axis=1)[:, np.newaxis]


In [None]:
# 13 
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CBNB_0_char"] = pred_train[ : , 0]
train["tfidf_CBNB_1_char"] = pred_train[ : , 1]
train["tfidf_CBNB_2_char"] = pred_train[ : , 2]
train["tfidf_CBNB_3_char"] = pred_train[ : , 3]
train["tfidf_CBNB_4_char"] = pred_train[ : , 4]
test["tfidf_CBNB_0_char"] = pred_full_test[ : , 0]
test["tfidf_CBNB_1_char"] = pred_full_test[ : , 1]
test["tfidf_CBNB_2_char"] = pred_full_test[ : , 2]
test["tfidf_CBNB_3_char"] = pred_full_test[ : , 3]
test["tfidf_CBNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 14
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CH_0_char"] = pred_train[ : , 0]
train["tfidf_CH_1_char"] = pred_train[ : , 1]
train["tfidf_CH_2_char"] = pred_train[ : , 2]
train["tfidf_CH_3_char"] = pred_train[ : , 3]
train["tfidf_CH_4_char"] = pred_train[ : , 4]
test["tfidf_CH_0_char"] = pred_full_test[ : , 0]
test["tfidf_CH_1_char"] = pred_full_test[ : , 1]
test["tfidf_CH_2_char"] = pred_full_test[ : , 2]
test["tfidf_CH_3_char"] = pred_full_test[ : , 3]
test["tfidf_CH_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 15
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_L_0_char"] = pred_train[ : , 0]
train["tfidf_L_1_char"] = pred_train[ : , 1]
train["tfidf_L_2_char"] = pred_train[ : , 2]
train["tfidf_L_3_char"] = pred_train[ : , 3]
train["tfidf_L_4_char"] = pred_train[ : , 4]
test["tfidf_L_0_char"] = pred_full_test[ : , 0]
test["tfidf_L_1_char"] = pred_full_test[ : , 1]
test["tfidf_L_2_char"] = pred_full_test[ : , 2]
test["tfidf_L_3_char"] = pred_full_test[ : , 3]
test["tfidf_L_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

### CountVectorizer - chaar

In [None]:
# 16
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 3)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_MNB_0_char"] = pred_train[ : , 0]
train["count_MNB_1_char"] = pred_train[ : , 1]
train["count_MNB_2_char"] = pred_train[ : , 2]
train["count_MNB_3_char"] = pred_train[ : , 3]
train["count_MNB_4_char"] = pred_train[ : , 4]
test["count_MNB_0_char"] = pred_full_test[ : , 0]
test["count_MNB_1_char"] = pred_full_test[ : , 1]
test["count_MNB_2_char"] = pred_full_test[ : , 2]
test["count_MNB_3_char"] = pred_full_test[ : , 3]
test["count_MNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 17
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 6), (1, 7)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CMNB_0_char"] = pred_train[ : , 0]
train["count_CMNB_1_char"] = pred_train[ : , 1]
train["count_CMNB_2_char"] = pred_train[ : , 2]
train["count_CMNB_3_char"] = pred_train[ : , 3]
train["count_CMNB_4_char"] = pred_train[ : , 4]
test["count_CMNB_0_char"] = pred_full_test[ : , 0]
test["count_CMNB_1_char"] = pred_full_test[ : , 1]
test["count_CMNB_2_char"] = pred_full_test[ : , 2]
test["count_CMNB_3_char"] = pred_full_test[ : , 3]
test["count_CMNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 18
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CBNB_0_char"] = pred_train[ : , 0]
train["count_CBNB_1_char"] = pred_train[ : , 1]
train["count_CBNB_2_char"] = pred_train[ : , 2]
train["count_CBNB_3_char"] = pred_train[ : , 3]
train["count_CBNB_4_char"] = pred_train[ : , 4]
test["count_CBNB_0_char"] = pred_full_test[ : , 0]
test["count_CBNB_1_char"] = pred_full_test[ : , 1]
test["count_CBNB_2_char"] = pred_full_test[ : , 2]
test["count_CBNB_3_char"] = pred_full_test[ : , 3]
test["count_CBNB_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 19 
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CH_0_char"] = pred_train[ : , 0]
train["count_CH_1_char"] = pred_train[ : , 1]
train["count_CH_2_char"] = pred_train[ : , 2]
train["count_CH_3_char"] = pred_train[ : , 3]
train["count_CH_4_char"] = pred_train[ : , 4]
test["count_CH_0_char"] = pred_full_test[ : , 0]
test["count_CH_1_char"] = pred_full_test[ : , 1]
test["count_CH_2_char"] = pred_full_test[ : , 2]
test["count_CH_3_char"] = pred_full_test[ : , 3]
test["count_CH_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 20
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_L_0_char"] = pred_train[ : , 0]
train["count_L_1_char"] = pred_train[ : , 1]
train["count_L_2_char"] = pred_train[ : , 2]
train["count_L_3_char"] = pred_train[ : , 3]
train["count_L_4_char"] = pred_train[ : , 4]
test["count_L_0_char"] = pred_full_test[ : , 0]
test["count_L_1_char"] = pred_full_test[ : , 1]
test["count_L_2_char"] = pred_full_test[ : , 2]
test["count_L_3_char"] = pred_full_test[ : , 3]
test["count_L_4_char"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

### TfidfVectorizer - char_wb

In [None]:
# 21
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 4)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_MNB_0_char_wb"] = pred_train[ : , 0]
train["tfidf_MNB_1_char_wb"] = pred_train[ : , 1]
train["tfidf_MNB_2_char_wb"] = pred_train[ : , 2]
train["tfidf_MNB_3_char_wb"] = pred_train[ : , 3]
train["tfidf_MNB_4_char_wb"] = pred_train[ : , 4]
test["tfidf_MNB_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_MNB_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_MNB_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_MNB_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_MNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 22
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 6), (1, 7)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CMNB_0_char_wb"] = pred_train[ : , 0]
train["tfidf_CMNB_1_char_wb"] = pred_train[ : , 1]
train["tfidf_CMNB_2_char_wb"] = pred_train[ : , 2]
train["tfidf_CMNB_3_char_wb"] = pred_train[ : , 3]
train["tfidf_CMNB_4_char_wb"] = pred_train[ : , 4]
test["tfidf_CMNB_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_CMNB_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_CMNB_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_CMNB_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_CMNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 23
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CBNB_0_char_wb"] = pred_train[ : , 0]
train["tfidf_CBNB_1_char_wb"] = pred_train[ : , 1]
train["tfidf_CBNB_2_char_wb"] = pred_train[ : , 2]
train["tfidf_CBNB_3_char_wb"] = pred_train[ : , 3]
train["tfidf_CBNB_4_char_wb"] = pred_train[ : , 4]
test["tfidf_CBNB_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_CBNB_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_CBNB_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_CBNB_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_CBNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 24
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 5)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_CH_0_char_wb"] = pred_train[ : , 0]
train["tfidf_CH_1_char_wb"] = pred_train[ : , 1]
train["tfidf_CH_2_char_wb"] = pred_train[ : , 2]
train["tfidf_CH_3_char_wb"] = pred_train[ : , 3]
train["tfidf_CH_4_char_wb"] = pred_train[ : , 4]
test["tfidf_CH_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_CH_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_CH_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_CH_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_CH_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 25
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# tfidf_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 5)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["tfidf_L_0_char_wb"] = pred_train[ : , 0]
train["tfidf_L_1_char_wb"] = pred_train[ : , 1]
train["tfidf_L_2_char_wb"] = pred_train[ : , 2]
train["tfidf_L_3_char_wb"] = pred_train[ : , 3]
train["tfidf_L_4_char_wb"] = pred_train[ : , 4]
test["tfidf_L_0_char_wb"] = pred_full_test[ : , 0]
test["tfidf_L_1_char_wb"] = pred_full_test[ : , 1]
test["tfidf_L_2_char_wb"] = pred_full_test[ : , 2]
test["tfidf_L_3_char_wb"] = pred_full_test[ : , 3]
test["tfidf_L_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

### CountVectorizer - char_wb

In [None]:
# 26
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_MNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', MultinomialNB()),
    ])
    parameters = {'vect__ngram_range': [(1, 4)],
#                   'vect__max_df': (0.25, 0.3),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': [0.024, 0.031],
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_MNB_0_char_wb"] = pred_train[ : , 0]
train["count_MNB_1_char_wb"] = pred_train[ : , 1]
train["count_MNB_2_char_wb"] = pred_train[ : , 2]
train["count_MNB_3_char_wb"] = pred_train[ : , 3]
train["count_MNB_4_char_wb"] = pred_train[ : , 4]
test["count_MNB_0_char_wb"] = pred_full_test[ : , 0]
test["count_MNB_1_char_wb"] = pred_full_test[ : , 1]
test["count_MNB_2_char_wb"] = pred_full_test[ : , 2]
test["count_MNB_3_char_wb"] = pred_full_test[ : , 3]
test["count_MNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 27
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CMNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 6), (1, 7)],
#                   'vect__max_df': (0.4, 0.5),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CMNB_0_char_wb"] = pred_train[ : , 0]
train["count_CMNB_1_char_wb"] = pred_train[ : , 1]
train["count_CMNB_2_char_wb"] = pred_train[ : , 2]
train["count_CMNB_3_char_wb"] = pred_train[ : , 3]
train["count_CMNB_4_char_wb"] = pred_train[ : , 4]
test["count_CMNB_0_char_wb"] = pred_full_test[ : , 0]
test["count_CMNB_1_char_wb"] = pred_full_test[ : , 1]
test["count_CMNB_2_char_wb"] = pred_full_test[ : , 2]
test["count_CMNB_3_char_wb"] = pred_full_test[ : , 3]
test["count_CMNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 28
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CBNB_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(BernoulliNB(alpha = 0.02), method='isotonic')),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.03, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CBNB_0_char_wb"] = pred_train[ : , 0]
train["count_CBNB_1_char_wb"] = pred_train[ : , 1]
train["count_CBNB_2_char_wb"] = pred_train[ : , 2]
train["count_CBNB_3_char_wb"] = pred_train[ : , 3]
train["count_CBNB_4_char_wb"] = pred_train[ : , 4]
test["count_CBNB_0_char_wb"] = pred_full_test[ : , 0]
test["count_CBNB_1_char_wb"] = pred_full_test[ : , 1]
test["count_CBNB_2_char_wb"] = pred_full_test[ : , 2]
test["count_CBNB_3_char_wb"] = pred_full_test[ : , 3]
test["count_CBNB_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 29
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_CH_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', CalibratedClassifierCV(SGDClassifier(loss='modified_huber', alpha=0.00001, max_iter=10000, tol=1e-4), method='sigmoid')),
    ])
    parameters = {'vect__ngram_range': [(1, 6)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_CH_0_char_wb"] = pred_train[ : , 0]
train["count_CH_1_char_wb"] = pred_train[ : , 1]
train["count_CH_2_char_wb"] = pred_train[ : , 2]
train["count_CH_3_char_wb"] = pred_train[ : , 3]
train["count_CH_4_char_wb"] = pred_train[ : , 4]
test["count_CH_0_char_wb"] = pred_full_test[ : , 0]
test["count_CH_1_char_wb"] = pred_full_test[ : , 1]
test["count_CH_2_char_wb"] = pred_full_test[ : , 2]
test["count_CH_3_char_wb"] = pred_full_test[ : , 3]
test["count_CH_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
# 30
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
# count_L_
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([train.shape[0], 5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]

    classifier = Pipeline([('vect', CountVectorizer(lowercase=False)),
                          ('tfidf', TfidfTransformer()),
                          ('clf', LogisticRegression(C=50, max_iter=200)),
    ])
    parameters = {'vect__ngram_range': [(1, 7)],
#                   'vect__max_df': (0.3, 0.4),
#                   'vect__min_df': [1],
                  'vect__analyzer' : ['char_wb'],
#                   'clf__alpha': (0.016, 0.018),
    }
    gs_clf = GridSearchCV(classifier, parameters, n_jobs=-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5

train["count_L_0_char_wb"] = pred_train[ : , 0]
train["count_L_1_char_wb"] = pred_train[ : , 1]
train["count_L_2_char_wb"] = pred_train[ : , 2]
train["count_L_3_char_wb"] = pred_train[ : , 3]
train["count_L_4_char_wb"] = pred_train[ : , 4]
test["count_L_0_char_wb"] = pred_full_test[ : , 0]
test["count_L_1_char_wb"] = pred_full_test[ : , 1]
test["count_L_2_char_wb"] = pred_full_test[ : , 2]
test["count_L_3_char_wb"] = pred_full_test[ : , 3]
test["count_L_4_char_wb"] = pred_full_test[ : , 4]    

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

### Keras

In [None]:
def preprocessFastText(text):
    text = text.replace("'","'")
    signs = set(';:,.?!\' "" '' \"')
    prods = set(text) % signs
    if not prods:
        return text
    
    for sign in prods:
        text = text.replcae(sign,' {} '.format(sign))
    return text

def creat+_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
        ngrams = []
        for n in range(2, n_gram_max+1):
            for w_index in range(len(q)-n+1):
                ngrams.append('--'.join(q[w_index:windex+n]))
        return q + ngrams
    
    docs= []
    for docs in df.text:
        doc = preprocessFastText(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
        
    return docs

docs = create_docs(train)
tokenizer = Tokenizer(lower = False, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_coints.items() if v >=2])

tokenizer = Tokenizer(num_words = num_words, lower=False, filters='')
tokenzier.fit_no_texts(docs)
docs = tokenizer.text_to_sequences(docs)

maxlen = max([max(len(l) for l in docs)])

docs = pad_sequences(sequences=docs, maxlen=maxlen)

docs_test = create_docs(test)
docs_test = tokenizer.texts_to_sequences(docs_test)
docs_test = pad_sequences(sequences = docs_test, maxlen=maxlen)

xtrain_pad = docs
xtest_pad = docs_test
                

In [None]:
input_dim = np.max(docs) + 1
embedding_dims = 20

def initFastText(embedding_dims, input_dim):
    model = Sequential()
    model.add(Embedding(input_dim = input_dim, ouput_dim = embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(5, activation='softmax'))
    
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [None]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))

ytrain_enc = np_utils.to_categorical(Y_train)
earlyStopping=EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([xtrain_pad.shape[0], 5])

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)

for dev_index, val_index in kf.split(xtrain_pad):
    dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
    dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
    
    model = initFastText(embedding_dims,input_dim)
    model.fit(dev_X, dev_y,
              batch_size=32, 
              epochs=40, 
              verbose=1, 
              validation_data=(val_X, val_y),
              callbacks=[earlyStopping])
    
    pred_val_y = model.predict(val_X)
    pred_test_y = model.predict(xtest_pad)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print('')
    print('')    
    print('')    
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5 

train["ff_0"] = pred_train[:,0]
train["ff_1"] = pred_train[:,1]
train["ff_2"] = pred_train[:,2]
train["ff_3"] = pred_train[:,3]
train["ff_4"] = pred_train[:,4]
test["ff_0"] = pred_full_test[:,0]
test["ff_1"] = pred_full_test[:,1]
test["ff_2"] = pred_full_test[:,2]
test["ff_3"] = pred_full_test[:,3]
test["ff_4"] = pred_full_test[:,4]

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

In [None]:
max_len = 70
nb_words = 10000

texts_1 = []
for text in train['text']:
    texts_1.append(text)
    
test_texts_1 = []
for text in test['text']:
    test_texts_1.append(text)
    
    
tokenizer = Tokenzier(num_words = nb_words)
tokenizer.fit_on_texts(texts_1)
sequences_1 = tokenzier.texts_to_sequences(texts_1)
word_index = tokenizer.word_index

test_sequences_1 = tokenzier.texts_to_sequences(test_texts_1)

x_train_pad = pad_sequences(sequences_1, maxlen=max_len)
xtest_pad = pad_sequences(test_sequences_1, maxlen = max_len)
del test_sequences_1
del sequences_1
nb_words_cnt = min(nb_words, len(word_index)) + 1

In [None]:
def initNN(nb_words_cnt, max_len):
    model = Sequential()
    model.add(Embedding(nb_words_cnt, 32, input_length=max_len))
    model.add(Dropout(0.3))
    model.add(Conv1D(64,5,padding='valid', activation='relu'))
    model.add(Dropout(0.3))
    model.add(MaxPolling1D())
    model.add(Flatten())
    model.add(Dense(800, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(5, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model


In [None]:
start = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))

ytrain_enc = np_utils.to_categorical(Y_train)
earlyStopping=EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')
cv_scores = []
pred_full_test = 0
pred_train = np.zeros([xtrain_pad.shape[0], 5])

kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=32143233)

for dev_index, val_index in kf.split(xtrain_pad):
    dev_X, val_X = xtrain_pad[dev_index], xtrain_pad[val_index]
    dev_y, val_y = ytrain_enc[dev_index], ytrain_enc[val_index]
    
    model = initNN(nb_words_cnt, max_len)
    model.fit(dev_X, dev_y,
              batch_size=32,
              epochs=3,
              verbose=1,
              validation_data=(val_X, val_y),
              callbacks=[earlyStopping])
    
    pred_val_y = model.predict(val_X)
    pred_test_y = model.predict(xtest_pad)
    pred_full_test = pred_full_test + pred_test_y
    pred_train[val_index,:] = pred_val_y
    cv_scores.append(metrics.log_loss(val_y, pred_val_y))
    print('')
    print('')
    print('')
print("cv score : ", cv_scores)
print("Mean cv score : ", np.mean(cv_scores))
pred_full_test = pred_full_test / 5 

train["nn_0"] = pred_train[:,0]
train["nn_1"] = pred_train[:,1]
train["nn_2"] = pred_train[:,2]
train["nn_3"] = pred_train[:,3]
train["nn_4"] = pred_train[:,4]

test["nn_0"] = pred_full_test[:,0]
test["nn_1"] = pred_full_test[:,1]
test["nn_2"] = pred_full_test[:,2]
test["nn_3"] = pred_full_test[:,3]
test["nn_4"] = pred_full_test[:,4]

end = time.localtime()
print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

### 모델 학습 및 검증

### Model Tunning & Evaluation

In [None]:
start = time.localtime()
print('%04d/%02d/%02d/ %02d:%02' % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))

# Final Model
# XGBoost
def runXGB(train_X, train_y, test_X, test_y=None, test_X3 = None, seed_val=0, child = 1, colsample=0.3):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta']= 0.1
    parma['max_depth'] = 5
#     param['silent']=1
    param['num_class'] = 5
    param['eval_metric']= 'mlogloss'
    param['min_child_weight'] = child
    param['subsample'] = 0.8
    param['colsample_bytree'] = colsample
    param['seed'] = seed_val
    num_rounds = 2000
    
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label = train_y)
    
    if test_y in not None:
        xgtest = xgb.DMatrix(test_X, label = test_y)
        watchlist = [(xgtrain,'train'), (xgtest,'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=20)
        
    else: 
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
        
    pred_test_y = model.predict(xgtest, ntree_limit = model.best_ntree_limit)
    if test_X2 is not None:
        xgtest2 = xgb.DMatrix(test_X2)
        pred_test_y2 = model.predict(xgtest2, ntree_limit = model.best_ntree_limit)
    return pred_test_y, pred_test_y2, model

def do(train ,test, Y_train):
    drop_columns = ['index', 'text']
    x_train = train.drop(drop_columns+['author'], axis = 1)
    x_test = test.drop(drop_columns, axis = 1)
    y_train =Y_train
    
    kf = model_selection.KFold(n_splits = 5, shuffle=True, random_state = 32143233)
    cv_scores=[]
    pred_full_test=0
    pred_train=np.zeros([x_train.shape[0],5])
    for dev_index, val_index in kf.split(x_train):
        dev_X, val_X = x_train.loc[dev_index], x_train.loc[val_index]
        dev_y, val_y = y_train[dev_index], y_train[val_index]
        pred_val_y, pred_test_y, model = runXGB(dev_X, dev_y, val_X, val_y ,x_test, seed_val=0, colsample=0.7)
        pred_full_test = pred_full_test + pred_test_y
        pred_train[val_index, : ] = pred_val_y
        cv_scores.append(metrics.log_loss(val_y, pred_val_y))
        
    print('cv score : ', cv_scores)
    print('Mean cv score : ', np.mean(cv_scores))
    return pred_full_test/5
result = do(train, test, Y_train)

end = time.localtime()
print('%04d/%02d/%02d %02d:%02d' % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print('%04d/%02d/%02d %02d:%02d' % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))
    
        