# 소설 작가 분류 AI 경진대회.
> 월간 데이콘 9 | 소설 문체 | NLP |Logloss

[참고]
- https://dacon.io/competitions/official/235670/codeshare/1901?page=2&dtype=recent&ptype=pub
- https://www.kaggle.com/marcospinaci/0-335-log-loss-in-a-dozen-lines
- https://www.kaggle.com/sudalairajkumar/simple-feature-engg-notebook-spooky-author


## 1. 라이브러리 및 데이터

In [16]:
import pandas as pd 
import numpy as np

import re
# nltk?
import nltk
import nltk.data
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn import metrics, preprocessing, pipeline, model_selection, naive_bayes
from sklearn.metrics import log_loss  #?
from sklearn.preprocessing import LabelEncoder 
from sklearn.pipeline import Pipeline #?
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer #?
from sklearn.naive_bayes import MultinomialNB, BernoulliNB #?
from sklearn.calibration import CalibratedClassifierCV #?
from sklearn.linear_model import SGDClassifier, LogisticRegression
import xgboost as xgb

import time

# keras
from keras import backend as K #?
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D, Conv1D, MaxPooling1D, Flatten
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence, text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping

In [18]:
pd.set_option('display.max_columns',200)
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test_x.csv')

## 2. 데이터 전처리
Data Cleansing & Pre-Processing

In [29]:
X_train = train['text'].str.replace('[^a-zA-Z0-9]',' ')
Y_train = LabelEncoder().fit_transform(train['author'])
y_train = train['author']
X_test = test['text'].str.replace('^[a-zA-Z0-9]',' ')

In [50]:
# 구두점 비율(문장안에 각 부호가 얼마나 있는지 확인)
punctuations=[{"id":1,"p":"[;:]"},
              {"id":2,"p":"[,.]"},
              {"id":3,"p":"[?]"},
              {"id":4,"p":"[!]"},
              {"id":5,"p":"[''\']"},
              {"id":6,"p":"[""\"]"},
              {"id":7,"p":"[:;,.?! \' "" '' \"]"}]
for p in punctuations:
    punctuation = p['p']
    _train = [sentence.split() for sentence in train['text']]
    train['punc_' + str(p['id'])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _train]
        
    _test = [sentence.split() for sentence in test['text']]
    test['punc_' + str(p['id'])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _test]


In [46]:
# 구두점 비율(문장 안에 각 부호가 얼마나 있는지)
punctuations = [{"id":1, "p" : "[;:]"},
                {"id":2, "p" : "[,.]"},
                {"id":3, "p" : "[?]"},
                {"id":4, "p" : "[!]"},
                {"id":5, "p" : "[‘’\']"},
                {"id":6, "p" : "[“”\"]"},
                {"id":7, "p" : "[;:,.?!\'“”‘’\"]"}]

for p in punctuations:
    punctuation = p["p"]
    _train =  [sentence.split() for sentence in train['text']]
    train['punc_' + str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _train]

    _test =  [sentence.split() for sentence in test['text']]
    test['punc_' + str(p["id"])] = [len([word for word in sentence if bool(re.search(punctuation, word))]) * 100 / len(sentence) for sentence in _test]

## Pipeline
- TfidfVectorizer
- CountVectorizer

#### TfidfVectorizer - word

In [76]:
start = time.localtime()
print('%04d%02d%02d%02d:%02d' % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour,start.tm_min))

# tfidf_MNB_
cv_scores=[]
pred_full_test=0
pred_train = np.zeros([train.shape[0],5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    
    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                           ('tfidf',TfidfTransformer()),
                           ('clf',MultinomialNB()),
                          ])
    parameters = {'vect__ngram_range':[(1,2)],
                  'vect__max_df':(0.25,0.3),
#                   'vect__min_df':[1],
                  'vect__analyzer':['word'],
                  'clf__alpha':[0.024, 0.031],
                 }
    
    gs_clf = GridSearchCV(classifier, parameters, n_jobs =-1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r'%(param_name, best_parameters[param_name]))
        
    pred_test_y = gs_clf.predict_proba(val_X)   
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index, : ] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
        
print('cv socres:',cv_scores)
print('Mean cv score',np.mean(cv_scores))
pred_full_test = pred_full_test/5

train['tfidf_MNB_0'] = pred_train[:,0]
train['tfidf_MNB_1'] = pred_train[:,1]
train['tfidf_MNB_2'] = pred_train[:,2]
train['tfidf_MNB_3'] = pred_train[:,3]
train['tfidf_MNB_4'] = pred_train[:,4]

test['tfidf_MNB_0'] = pred_full_test[:,0]
test['tfidf_MNB_1'] = pred_full_test[:,1]
test['tfidf_MNB_2'] = pred_full_test[:,2]
test['tfidf_MNB_3'] = pred_full_test[:,3]
test['tfidf_MNB_4'] = pred_full_test[:,4]

end = time.localtime()

print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2021010719:20
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    6.5s finished


	vect__analyzer: 'word'
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


KeyboardInterrupt: 

In [85]:
start = time.localtime()
print('%04d%02d%02d%02d:%02d' % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour,start.tm_min))

# tfidf_MNB_
cv_scores=[]
pred_full_test=0
pred_train = np.zeros([train.shape[0],5])

kf = model_selection.KFold(n_splits = 5, shuffle = True, random_state = 32143233)
for dev_index, val_index in kf.split(train):
    dev_X, val_X = X_train[dev_index], X_train[val_index]
    dev_y, val_y = y_train[dev_index], y_train[val_index]
    
    # 위와 clf 부분 다름
    classifier = Pipeline([('vect', TfidfVectorizer(lowercase=False)),
                          ('tfidf',TfidfTransformer()),
                          ('clf',CalibratedClassifierCV(MultinomialNB(alpha = 0.05), method = 'isotonic')),
                          ])
    parameters = {'vect__ngram_range':[(1,2)],
                 'vect__max_df': (0.4, 0.5),
                 #'vect__min_df':[1],
                  'vect__analyzer':['word'],
                 #'clf__alpha' :(0.016, 0.018),
                 }

    
    gs_clf = GridSearchCV(classifier, parameters, n_jobs = -1, verbose=1, cv=2)
    gs_clf.fit(dev_X, dev_y)
    best_parameters = gs_clf.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' %(param_name, best_parameters[param_name]))
            
    pred_test_y = gs_clf.predict_proba(val_X)
    pred_test_y2 = gs_clf.predict_proba(X_test)
    pred_full_test = pred_full_test + pred_test_y2
    pred_train[val_index,:] = pred_test_y
    cv_scores.append(metrics.log_loss(val_y, pred_test_y))
    
print('cv socre:',cv_scores)
print('Mean cv socre:', np.mean(cv_scores))
pred_full_test = pred_full_test/5
    
train['tfidf_MNB_0'] = pred_train[:,0]
train['tfidf_MNB_1'] = pred_train[:,1]
train['tfidf_MNB_2'] = pred_train[:,2]
train['tfidf_MNB_3'] = pred_train[:,3]
train['tfidf_MNB_4'] = pred_train[:,4]

test['tfidf_MNB_0'] = pred_full_test[:,0]
test['tfidf_MNB_1'] = pred_full_test[:,1]
test['tfidf_MNB_2'] = pred_full_test[:,2]
test['tfidf_MNB_3'] = pred_full_test[:,3]
test['tfidf_MNB_4'] = pred_full_test[:,4]

end = time.localtime()

print("%04d/%02d/%02d %02d:%02d" % (start.tm_year, start.tm_mon, start.tm_mday, start.tm_hour, start.tm_min))
print("%04d/%02d/%02d %02d:%02d" % (end.tm_year, end.tm_mon, end.tm_mday, end.tm_hour, end.tm_min))

2021010719:25
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    6.6s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    5.2s finished


	vect__analyzer: 'word'
	vect__max_df: 0.4
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    5.4s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    7.0s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    5.3s finished


	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)
cv socre: [0.5976665473926792, 0.5975338255728783, 0.6173897988183089, 0.5963371295574684, 0.6096579691870107]
Mean cv socre: 0.6037170541056691
2021/01/07 19:25
2021/01/07 19:26
