In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터 확인

In [2]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

## 전처리

### train_test_split

In [3]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)

### 벡터라이즈

#### tfidfvectorizer

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터 셋 변환.
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

#### 여러 모델들 적용

In [6]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, SGDClassifier
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


models = []
models.append(('LogisticRegression', LogisticRegression(random_state=13)))
models.append(('MultinomialNB', MultinomialNB()))
models.append(('RandomForestClassifier', RandomForestClassifier(random_state=13, n_jobs=-1)))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=13)))
models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=13)))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier(random_state=13)))
models.append(('LGBMClassifier', LGBMClassifier(random_state=13)))
models.append(('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)))
models.append(('LinearSVC', LinearSVC(C=1, loss='hinge', random_state=13)))
models.append(('XgBoost', XGBClassifier(learning_rate=0.1, max_depth=3, random_state=13, n_jobs=-1)))
models.append(('RidgeClassifier', RidgeClassifier(random_state=13)))
models.append(('SGDClassifier', SGDClassifier(random_state=13, loss='modified_huber')))
# models.append(('RidgeClassifierCV', RidgeClassifierCV(cv=3)))

In [7]:
models

[('LogisticRegression',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=13, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('MultinomialNB', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)),
 ('RandomForestClassifier',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=-1, oob_score=False, random_state=13, verbose=0,
                

In [8]:
from sklearn.metrics import accuracy_score

train_score = []
test_score = []
names = []

for name, model in models:
    clf = model
    clf.fit(X_train_tfidf_vect, y_train)
    
    train_pred = clf.predict(X_train_tfidf_vect)
    test_pred = clf.predict(X_test_tfidf_vect)
    
    names.append(name)
    train_score.append(accuracy_score(y_train, train_pred))
    test_score.append(accuracy_score(y_test, test_pred))

In [9]:
result = pd.DataFrame({'model name': names,
                       'train score': train_score,
                       'test score': test_score}) 
result.sort_values(by='train score', ascending=False).reset_index(drop=True)

Unnamed: 0,model name,train score,test score
0,RandomForestClassifier,0.991982,0.614674
1,DecisionTreeClassifier,0.991982,0.486091
2,RidgeClassifier,0.882051,0.731718
3,LinearSVC,0.868281,0.730199
4,SGDClassifier,0.867682,0.732932
5,LogisticRegression,0.825407,0.72206
6,MultinomialNB,0.758454,0.679422
7,LGBMClassifier,0.729975,0.649903
8,GradientBoostingClassifier,0.580216,0.560313
9,XgBoost,0.527658,0.517128


In [10]:
result['diff'] = result['train score'] - result['test score']

In [11]:
result.round(2).sort_values(by='train score', ascending=False)

Unnamed: 0,model name,train score,test score,diff
2,RandomForestClassifier,0.99,0.61,0.38
3,DecisionTreeClassifier,0.99,0.49,0.51
10,RidgeClassifier,0.88,0.73,0.15
8,LinearSVC,0.87,0.73,0.14
11,SGDClassifier,0.87,0.73,0.13
0,LogisticRegression,0.83,0.72,0.1
1,MultinomialNB,0.76,0.68,0.08
6,LGBMClassifier,0.73,0.65,0.08
5,GradientBoostingClassifier,0.58,0.56,0.02
9,XgBoost,0.53,0.52,0.01


##### stop words 필터링을 추가하고 ngram을 기본 (1,1)에서 (1,2)로 변경하여 피처 벡터화

In [12]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.701


##### min-df 조정
- min_df, max_df 는 아무런 결과의 차이를 가지고 오지 못했고,
- sublinear_tf도 영향이 없었다.

In [13]:
tfidf_vect = TfidfVectorizer(stop_words='english', sublinear_tf = True)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.722


In [14]:
# sublinear_tf : 높은 TF값들에 대해서 스무딩 처리, TF값에 대해 아웃라이어 처리

In [15]:
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=200000)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.722


In [16]:
X_train_tfidf_vect.shape

(38415, 30690)

In [17]:
# max_features = 20000개 일 때 정확도 0.722
X_train_tfidf_vect.shape

(38415, 30690)

In [18]:
# max_features = 40000개 일 때 정확도 0.723
# max_features = 80000개 일 때 정확도 0.723
# max_features = 200000개 일 때 정확도 0.723
# max_features 파라미터는 아무런 변화를 가지고 오지 못함

#### GridSearchCV로 TfidfVectorizer parameter 조정
- ngram_range=(1, 2) 가 최적

In [19]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])
parameters = {
#     'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__alpha': (1e-2, 1e-3)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:   18.3s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None))]


In [20]:
grid_search_tune.best_score_

0.6995704902421409

##### GridSearchCV로 LogisticRegression C 하이퍼 파라미터 튜닝

In [21]:
from sklearn.model_selection import GridSearchCV

# 최적 C 값 도출 튜닝 수행. CV는 3 Fold셋으로 설정.
params = { 'C': [0.01, 0.1, 1, 5, 10], 'random_state': [13]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter:', grid_cv_lr.best_params_)

# 최적 C 값으로 학습된 grid_cv로 예측 수행하고 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   17.4s remaining:   15.2s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   18.3s finished


Logistic Regression best C parameter: {'C': 5, 'random_state': 13}
TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.727


In [22]:
grid_cv_lr.best_params_

{'C': 5, 'random_state': 13}

In [23]:
grid_cv_lr.best_score_

0.7121957568658077

## 모델 하이퍼파라미터 튜닝

### MultinomialNB

In [24]:
from sklearn.naive_bayes import MultinomialNB

params = {'alpha': [0.01, 0.1, 0.5, 1.0],
         'fit_prior': ['True', 'False']}
clf = MultinomialNB()
grid_cv = GridSearchCV(clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('MultinomialNB best parameters:', grid_cv.best_params_)
print('MultinomialNB best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


MultinomialNB best parameters: {'alpha': 0.1, 'fit_prior': 'True'}
MultinomialNB best accuracy score: 0.7180788754392816


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    0.3s finished


In [25]:
clf = MultinomialNB(alpha=0.1, fit_prior='True')
clf.fit(X_train_tfidf_vect, y_train)

train_pred = clf.predict(X_train_tfidf_vect)
test_pred = clf.predict(X_test_tfidf_vect)

print('MultinomialNB train accuracy score', accuracy_score(y_train, train_pred))
print('MultinomialNB test accuracy score', accuracy_score(y_test, test_pred))

MultinomialNB train accuracy score 0.8374853572823115
MultinomialNB test accuracy score 0.7308066083576288


In [26]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5)

In [27]:
from sklearn.model_selection import cross_validate
cross_validate(clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.0175879 , 0.01331615, 0.01286483, 0.01333785, 0.01395202]),
 'score_time': array([0.00237393, 0.00212479, 0.00227022, 0.00236893, 0.00264502]),
 'test_score': array([0.72367565, 0.72302486, 0.72380581, 0.7271899 , 0.73070415]),
 'train_score': array([0.8453729 , 0.84494989, 0.84511259, 0.84387609, 0.84442926])}

### LinearSVC

In [28]:
import sklearn
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(C=1, loss='hinge', random_state=13)
svm_clf.fit(X_train_tfidf_vect, y_train)

train_pred = svm_clf.predict(X_train_tfidf_vect)
test_pred = svm_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, test_pred))

LinearSVC train accuracy score: 0.8682806195496551
LinearSVC test accuracy score: 0.7301992225461613


In [29]:
params = {
    'C': [0.001, 0.01, 1], 
    'loss': ['squared_hinge', 'hinge'], 
    'penalty': ['l1', 'l2'], 
    'random_state': [13]
    }

grid_cv = GridSearchCV(LinearSVC(), param_grid=params, refit=True, verbose=2)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.2s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.2s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.2s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.2s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.2s
[CV] C=0.001, loss=hinge, penalty=l1, random_state=13 ................
[CV] . C=0.001, loss=hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=hinge, penalty=l1, random_state=13 ................
[CV] . C=0.001, loss=hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=hinge, penalty=l1, rand

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   19.1s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 1],
                         'loss': ['squared_hinge', 'hinge'],
                         'penalty': ['l1', 'l2'], 'random_state': [13]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [30]:
print('LinearSVC Best parameters:', grid_cv.best_params_)
print('LinearSVC Best accruacy score:', grid_cv.best_score_)

LinearSVC Best parameters: {'C': 1, 'loss': 'hinge', 'penalty': 'l2', 'random_state': 13}
LinearSVC Best accruacy score: 0.7220096316543018


In [31]:
svm_clf = LinearSVC(C=1, loss='hinge', penalty='l2', random_state=13)
svm_clf.fit(X_train_tfidf_vect, y_train)

train_pred = svm_clf.predict(X_train_tfidf_vect)
test_pred = svm_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, test_pred)) 

LinearSVC train accuracy score: 0.8682806195496551
LinearSVC test accuracy score: 0.7301992225461613


In [32]:
svm_clf.decision_function(X_test_tfidf_vect) > 0

array([[ True, False, False, False, False],
       [False, False, False,  True, False],
       [False, False, False, False,  True],
       ...,
       [False, False, False, False, False],
       [False,  True, False, False, False],
       [False, False, False,  True, False]])

In [33]:
# 교차검증

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

In [34]:
from sklearn.model_selection import cross_validate, cross_val_score

cross_validate(svm_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([1.96901417, 2.654037  , 2.18455291, 2.05403781, 2.56171584,
        2.57312608, 2.47893095, 2.5213201 , 1.95154691, 2.12115717]),
 'score_time': array([0.00145912, 0.00115585, 0.001369  , 0.00162792, 0.00137615,
        0.00139999, 0.00120807, 0.00129294, 0.00137401, 0.00132799]),
 'test_score': array([0.7272254 , 0.7313899 , 0.72462259, 0.72280062, 0.72904737,
        0.71908357, 0.72064567, 0.71960427, 0.74017183, 0.72533194]),
 'train_score': array([0.87140254, 0.87137362, 0.87157609, 0.87151824, 0.8721835 ,
        0.87175334, 0.87236073, 0.87192688, 0.86969977, 0.87262104])}

### SGDClassifier

In [35]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=13, loss='modified_huber')
sgd_clf.fit(X_train_tfidf_vect, y_train)

train_pred = sgd_clf.predict(X_train_tfidf_vect)
test_pred = sgd_clf.predict(X_test_tfidf_vect)

print('SGDClassifier train accuracy score:', accuracy_score(y_train, train_pred))
print('SGDClassifier test accuracy score:', accuracy_score(y_test, test_pred))

SGDClassifier train accuracy score: 0.8676818950930626
SGDClassifier test accuracy score: 0.7329324586977648


In [36]:
# 교차 검증

from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train_tfidf_vect, y_train, cv=5, scoring='accuracy')

array([0.72458675, 0.72146297, 0.71977092, 0.728101  , 0.72263439])

In [37]:
# 오차 행렬

from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train_tfidf_vect, y_train, cv=3)

In [39]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train, y_train_pred)

array([[6675,  427,  796,  935,  396],
       [ 557, 3305,  556,  566,  128],
       [ 706,  236, 5636, 1062,  386],
       [ 629,  240,  866, 8576,  196],
       [ 784,  118,  778,  581, 3280]])

In [40]:
params = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    'loss': ['log', 'modified_huber', 'hinge', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'elasticnet'],
    'n_jobs': [-1],
    'random_state': [13],
}

grid_cv = GridSearchCV(sgd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   57.7s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='modified_huber', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5, random_state=13,
                                     shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                   1000.0],
                         'loss': ['log', 'modified_huber', 'hinge',
             

In [41]:
print('SGDClassifier best parameters:', grid_cv.best_params_)
print('SGDClassifier best accuracy score:', grid_cv.best_score_)

SGDClassifier best parameters: {'alpha': 0.0001, 'loss': 'modified_huber', 'n_jobs': -1, 'penalty': 'l2', 'random_state': 13}
SGDClassifier best accuracy score: 0.7151373161525446


In [42]:
from sklearn.model_selection import cross_validate

cross_validate(sgd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.31299591, 0.28304791, 0.28077602, 0.2831018 , 0.32168198]),
 'score_time': array([0.00261998, 0.00223494, 0.00217199, 0.00218797, 0.00200605]),
 'test_score': array([0.72458675, 0.72146297, 0.71977092, 0.728101  , 0.72263439]),
 'train_score': array([0.88419237, 0.88370428, 0.88428999, 0.88432253, 0.88484316])}

### RidgeClassifier

In [43]:
rd_clf = RidgeClassifier()
params = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
    'normalize': [True, False],
    'max_iter': [100, 300],
    'random_state': [13]
}

grid_cv = GridSearchCV(rd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('RidgeClassifier best parameters:', grid_cv.best_params_)
print('RidgeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   19.8s finished


RidgeClassifier best parameters: {'alpha': 1.0, 'max_iter': 100, 'normalize': False, 'random_state': 13}
RidgeClassifier best accuracy score: 0.71308082780164


In [44]:
rd_clf = RidgeClassifier(alpha=1.0, max_iter=100, normalize='False', random_state=13)
rd_clf.fit(X_train_tfidf_vect, y_train)

train_pred = rd_clf.predict(X_train_tfidf_vect)
test_pred = rd_clf.predict(X_test_tfidf_vect)
print('RidgeClassifier best train accuracy score:', accuracy_score(y_train, train_pred))
print('RidgeClassifier best test accuracy score:', accuracy_score(y_test, test_pred))

RidgeClassifier best train accuracy score: 0.8799427307041521
RidgeClassifier best test accuracy score: 0.7144679300291545


In [45]:
cross_validate(rd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([0.38962412, 0.37851286, 0.36588693, 0.40881324, 0.4203701 ,
        0.40804815, 0.37102795, 0.37114191, 0.41792321, 0.39581203]),
 'score_time': array([0.00161481, 0.0022068 , 0.00217605, 0.00236201, 0.00239205,
        0.00225687, 0.00222993, 0.00233579, 0.0026598 , 0.00258279]),
 'test_score': array([0.70041645, 0.70640292, 0.7038001 , 0.70458095, 0.70458095,
        0.70450404, 0.70320229, 0.70190055, 0.71465764, 0.69617287]),
 'train_score': array([0.88519943, 0.88485234, 0.88502589, 0.88473664, 0.88531513,
        0.88534737, 0.88413259, 0.88471105, 0.88404581, 0.88581015])}

### LogisticRegression

In [46]:
clf = LogisticRegression()
params = {
    'C': [0.01, 0.1, 0.5, 1.0],
    'max_iter': [100, 200, 500],
}

grid_cv = GridSearchCV(clf, param_grid=params, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('LogisticRegression best parameters:', grid_cv.best_params_)
print('LogisticRegression best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   24.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.6min finished


LogisticRegression best parameters: {'C': 1.0, 'max_iter': 200}
LogisticRegression best accuracy score: 0.7113627489262007


In [47]:
lr_clf = LogisticRegression(C=1.0, max_iter=500)
lr_clf.fit(X_train_tfidf_vect, y_train)

train_pred = lr_clf.predict(X_train_tfidf_vect)
test_pred = lr_clf.predict(X_test_tfidf_vect)

print('LogisticRegression train accuracy score:', accuracy_score(y_train, train_pred))
print('LogisticRegression test accuracy score:', accuracy_score(y_test, test_pred))

LogisticRegression train accuracy score: 0.8322530261616556
LogisticRegression test accuracy score: 0.722424684159378


### LGBMClassifier

In [48]:
clf = LGBMClassifier()
params = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8],
    'num_leaves': [6,8], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['dart'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [13],
#     'colsample_bytree' : [0.64, 0.65, 0.66],
#     'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

grid_cv = GridSearchCV(clf, param_grid=params, verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('LGBMClassifier best parameters:', grid_cv.best_params_)
print('LGBMClassifier best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.8min finished


LGBMClassifier best parameters: {'boosting_type': 'dart', 'learning_rate': 0.01, 'max_bin': 255, 'n_estimators': 8, 'num_leaves': 8, 'objective': 'binary', 'random_state': 13, 'reg_alpha': 1, 'reg_lambda': 1}
LGBMClassifier best accuracy score: 0.34457894051802684


In [49]:
lgb_clf = LGBMClassifier(boosting_type='dart', learning_rate=0.01, max_bin=510, n_estimators=8, num_leaves=8,
                        objective='binary', random_state=13, reg_alpha=1, reg_lambda=1)
lgb_clf.fit(X_train_tfidf_vect, y_train)
pred = lgb_clf.predict(X_test_tfidf_vect)
print('LGBMClassifier best accuracy score:', accuracy_score(y_test, pred))

LGBMClassifier best accuracy score: 0.3485787172011662


In [50]:
lgb_clf = LGBMClassifier(random_state=13)
lgb_clf.fit(X_train_tfidf_vect, y_train)

train_pred = lgb_clf.predict(X_train_tfidf_vect)
test_pred = lgb_clf.predict(X_test_tfidf_vect)

print('LGBMClassifier train accuracy score:', accuracy_score(y_train, train_pred))
print('LGBMClassifier test accuracy score:', accuracy_score(y_test, test_pred))

LGBMClassifier train accuracy score: 0.729975270076793
LGBMClassifier test accuracy score: 0.6499028182701652


In [51]:
cross_validate(lgb_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([21.22280693, 20.44541788, 20.26144385, 19.98406816, 20.01709628]),
 'score_time': array([1.21222615, 1.21647286, 1.25959206, 1.20136285, 1.26162386]),
 'test_score': array([0.64388911, 0.63607966, 0.64011454, 0.63881296, 0.64792399]),
 'train_score': array([0.73815567, 0.73861122, 0.73796043, 0.74206039, 0.74036835])}

### DecisionTreeClassifier

In [52]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train_tfidf_vect, y_train)

train_pred = dt_clf.predict(X_train_tfidf_vect)
test_pred = dt_clf.predict(X_test_tfidf_vect)

print('DecisionTreeClassifier train score :', accuracy_score(y_train, train_pred))
print('DecisionTreeClassifier test score :', accuracy_score(y_test, test_pred))

DecisionTreeClassifier train score : 0.9919822985812834
DecisionTreeClassifier test score : 0.4842687074829932


In [53]:
params = {
    'max_depth': [5, 10, 20, 120],
    'min_samples_split': [16, 24],
    'min_samples_leaf': [16, 32],
    'random_state': [13]
}

grid_cv = GridSearchCV(dt_clf, param_grid=params, verbose=1, n_jobs=-1, cv=5, scoring='accuracy')
grid_cv.fit(X_train_tfidf_vect, y_train)

print('DecisionTreeClassifier best parameters:', grid_cv.best_params_)
print('DecisionTreeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   23.6s finished


DecisionTreeClassifier best parameters: {'max_depth': 120, 'min_samples_leaf': 32, 'min_samples_split': 16, 'random_state': 13}
DecisionTreeClassifier best accuracy score: 0.4535207601197449


## 전처리

In [54]:
abc = pd.merge(X_test, y_test, left_index = True, right_index=True)
abc['pred'] = pred

# 예측이 틀린 결과 -> 데이터 프레임 화
error_df = abc[abc['author'] != abc['pred']]

# CountVectorizer로 단어 갯수 카운트
cv = CountVectorizer(stop_words='english')   
cv_fit=cv.fit_transform(error_df['text'])    
word_list = cv.get_feature_names();    
count_list = cv_fit.toarray().sum(axis=0)

di = dict(zip(word_list,count_list))

# 예측이 틀린 결과 중 가장 많이 나온 단어 20개 정렬
pd.DataFrame(list(di.items())).sort_values(by=1, ascending=False)[:20]

NameError: name 'CountVectorizer' is not defined

In [None]:
# from catboost import CatBoostClassifier

# cb = CatBoostClassifier(silent=True, random_state=13, n_estimators=300).fit(X_train_tfidf_vect, y_train)
# accuracy_score(y_train, cb.predict(X_train_tfidf_vect))
# accuracy_score(y_test, cb.predict(X_test_tfidf_vect))

In [None]:
# accuracy_score(y_train, cb.predict(X_train_tfidf_vect))

## 모델 검증