In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터 확인

In [2]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

## 전처리

### train_test_split

In [3]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

### 벡터라이즈

#### tfidfvectorizer

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터 셋 변환.
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

#### 여러 모델들 적용

In [6]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, SGDClassifier
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


models = []
models.append(('LogisticRegression', LogisticRegression(random_state=13)))
models.append(('MultinomialNB', MultinomialNB()))
models.append(('RandomForestClassifier', RandomForestClassifier(random_state=13, n_jobs=-1)))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=13)))
models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=13)))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier(random_state=13)))
models.append(('LGBMClassifier', LGBMClassifier(random_state=13)))
models.append(('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)))
models.append(('LinearSVC', LinearSVC(C=1, loss='hinge', random_state=13)))
models.append(('XgBoost', XGBClassifier(learning_rate=0.1, max_depth=3, random_state=13, n_jobs=-1)))
models.append(('RidgeClassifier', RidgeClassifier(random_state=13)))
models.append(('SGDClassifier', SGDClassifier(random_state=13, loss='modified_huber')))
# models.append(('RidgeClassifierCV', RidgeClassifierCV(cv=3)))

In [7]:
models

[('LogisticRegression',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=13, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('MultinomialNB', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)),
 ('RandomForestClassifier',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=-1, oob_score=False, random_state=13, verbose=0,
                

In [8]:
from sklearn.metrics import accuracy_score

train_score = []
test_score = []
names = []

for name, model in models:
    clf = model
    clf.fit(X_train_tfidf_vect, y_train)
    
    train_pred = clf.predict(X_train_tfidf_vect)
    test_pred = clf.predict(X_test_tfidf_vect)
    
    names.append(name)
    train_score.append(accuracy_score(y_train, train_pred))
    test_score.append(accuracy_score(y_test, test_pred))

In [9]:
result = pd.DataFrame({'model name': names,
                       'train score': train_score,
                       'test score': test_score})
result['diff'] = result['train score'] - result['test score']
result.round(2).sort_values(by='train score', ascending=False).reset_index(drop=True)

Unnamed: 0,model name,train score,test score,diff
0,RandomForestClassifier,0.99,0.62,0.37
1,DecisionTreeClassifier,0.99,0.49,0.5
2,RidgeClassifier,0.88,0.73,0.15
3,LinearSVC,0.87,0.73,0.13
4,SGDClassifier,0.86,0.73,0.13
5,LogisticRegression,0.83,0.72,0.1
6,MultinomialNB,0.76,0.69,0.07
7,LGBMClassifier,0.73,0.65,0.07
8,GradientBoostingClassifier,0.58,0.56,0.02
9,XgBoost,0.53,0.52,0.01


##### stop words 필터링을 추가하고 ngram을 기본 (1,1)에서 (1,2)로 변경하여 피처 벡터화

In [12]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.708


##### min-df 조정
- min_df, max_df 는 아무런 결과의 차이를 가지고 오지 못했고,
- sublinear_tf도 영향이 없었다.

In [13]:
tfidf_vect = TfidfVectorizer(stop_words='english', sublinear_tf = True)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.724


In [14]:
# sublinear_tf : 높은 TF값들에 대해서 스무딩 처리, TF값에 대해 아웃라이어 처리

In [15]:
tfidf_vect = TfidfVectorizer(stop_words='english', max_features=200000)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test ,pred)))

TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.723


In [16]:
X_train_tfidf_vect.shape

(43903, 32118)

In [17]:
# max_features = 20000개 일 때 정확도 0.722
X_train_tfidf_vect.shape

(43903, 32118)

In [18]:
# max_features = 40000개 일 때 정확도 0.723
# max_features = 80000개 일 때 정확도 0.723
# max_features = 200000개 일 때 정확도 0.723
# max_features 파라미터는 아무런 변화를 가지고 오지 못함

#### GridSearchCV로 TfidfVectorizer parameter 조정
- ngram_range=(1, 2) 가 최적

In [19]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])
parameters = {
#     'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__estimator__alpha': (1e-2, 1e-3)
}

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(X_train, y_train)

print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  12 out of  12 | elapsed:   25.9s finished


Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None))]


In [20]:
grid_search_tune.best_score_

0.7048948827832544

##### GridSearchCV로 LogisticRegression C 하이퍼 파라미터 튜닝

In [21]:
from sklearn.model_selection import GridSearchCV

# 최적 C 값 도출 튜닝 수행. CV는 3 Fold셋으로 설정.
params = { 'C': [0.01, 0.1, 1, 5, 10], 'random_state': [13]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter:', grid_cv_lr.best_params_)

# 최적 C 값으로 학습된 grid_cv로 예측 수행하고 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:   24.1s remaining:   21.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   25.6s finished


Logistic Regression best C parameter: {'C': 5, 'random_state': 13}
TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.726


In [22]:
grid_cv_lr.best_params_

{'C': 5, 'random_state': 13}

In [23]:
grid_cv_lr.best_score_

0.7178551750593618

## 모델 하이퍼파라미터 튜닝

### MultinomialNB

In [24]:
from sklearn.naive_bayes import MultinomialNB

params = {'alpha': [0.01, 0.1, 0.5, 1.0],
         'fit_prior': ['True', 'False']}
clf = MultinomialNB()
grid_cv = GridSearchCV(clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('MultinomialNB best parameters:', grid_cv.best_params_)
print('MultinomialNB best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


MultinomialNB best parameters: {'alpha': 0.1, 'fit_prior': 'True'}
MultinomialNB best accuracy score: 0.7235268361870119


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    0.3s finished


In [25]:
mu_clf = MultinomialNB(alpha=0.1, fit_prior='True')
mu_clf.fit(X_train_tfidf_vect, y_train)

mu_train_pred = mu_clf.predict(X_train_tfidf_vect)
mu_test_pred = mu_clf.predict(X_test_tfidf_vect)

print('MultinomialNB train accuracy score', accuracy_score(y_train, mu_train_pred))
print('MultinomialNB test accuracy score', accuracy_score(y_test, mu_test_pred))

MultinomialNB train accuracy score 0.8349087761656379
MultinomialNB test accuracy score 0.731231778425656


In [26]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5)

In [27]:
from sklearn.model_selection import cross_validate
cross_validate(mu_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.02347112, 0.02090478, 0.01915121, 0.01864433, 0.01804709]),
 'score_time': array([0.00352192, 0.00471091, 0.00376391, 0.0030489 , 0.00392294]),
 'test_score': array([0.73271837, 0.73032684, 0.73157955, 0.73280182, 0.7308656 ]),
 'train_score': array([0.84072661, 0.84160925, 0.84300438, 0.84110127, 0.84198389])}

### LinearSVC

In [28]:
import sklearn
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

svc_clf = LinearSVC(C=1, loss='hinge', random_state=13)
svc_clf.fit(X_train_tfidf_vect, y_train)

svc_train_pred = svc_clf.predict(X_train_tfidf_vect)
svc_test_pred = svc_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, svc_train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, svc_test_pred))

LinearSVC train accuracy score: 0.8660228230417055
LinearSVC test accuracy score: 0.7331450437317785


In [29]:
params = {
    'C': [0.001, 0.01, 1], 
    'loss': ['squared_hinge', 'hinge'], 
    'penalty': ['l1', 'l2'], 
    'random_state': [13]
    }

grid_cv = GridSearchCV(LinearSVC(), param_grid=params, refit=True, verbose=2)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.4s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.3s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.3s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.4s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.3s
[CV] C=0.001, loss=hinge, penalty=l1, random_state=13 ................
[CV] . C=0.001, loss=hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=hinge, penalty=l1, random_state=13 ................
[CV] . C=0.001, loss=hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=hinge, penalty=l1, rand

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   30.4s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 1],
                         'loss': ['squared_hinge', 'hinge'],
                         'penalty': ['l1', 'l2'], 'random_state': [13]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [30]:
print('LinearSVC Best parameters:', grid_cv.best_params_)
print('LinearSVC Best accruacy score:', grid_cv.best_score_)

LinearSVC Best parameters: {'C': 1, 'loss': 'hinge', 'penalty': 'l2', 'random_state': 13}
LinearSVC Best accruacy score: 0.7300185583960399


In [31]:
svc_svm_clf = LinearSVC(C=1, loss='hinge', penalty='l2', random_state=13)
svc_svm_clf.fit(X_train_tfidf_vect, y_train)

svc_train_pred = svc_svm_clf.predict(X_train_tfidf_vect)
svc_test_pred = svc_svm_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, svc_train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, svc_test_pred)) 

LinearSVC train accuracy score: 0.8660228230417055
LinearSVC test accuracy score: 0.7331450437317785


In [32]:
svc_svm_clf.decision_function(X_test_tfidf_vect) > 0

array([[ True, False, False, False, False],
       [False, False, False,  True, False],
       [False, False, False, False,  True],
       ...,
       [False, False, False, False, False],
       [False, False, False,  True, False],
       [False, False, False, False,  True]])

In [33]:
# 교차검증

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

In [34]:
from sklearn.model_selection import cross_validate, cross_val_score

cross_validate(svc_svm_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([3.01112199, 2.50579286, 3.23305893, 2.76333189, 2.923774  ,
        3.14577007, 3.1926558 , 2.72132301, 3.21674228, 2.77025533]),
 'score_time': array([0.0015521 , 0.00157309, 0.0016098 , 0.00181127, 0.0015738 ,
        0.00156212, 0.00163913, 0.00179935, 0.00162673, 0.00146294]),
 'test_score': array([0.73445684, 0.72102027, 0.74083352, 0.72642369, 0.73986333,
        0.74350797, 0.7214123 , 0.738041  , 0.73302961, 0.73781321]),
 'train_score': array([0.86862219, 0.86834379, 0.86874873, 0.86966315, 0.86842305,
        0.86968846, 0.86865082, 0.8676385 , 0.86849898, 0.86829651])}

### SGDClassifier

In [35]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=13, loss='modified_huber')
sgd_clf.fit(X_train_tfidf_vect, y_train)

sgd_train_pred = sgd_clf.predict(X_train_tfidf_vect)
sgd_test_pred = sgd_clf.predict(X_test_tfidf_vect)

print('SGDClassifier train accuracy score:', accuracy_score(y_train, sgd_train_pred))
print('SGDClassifier test accuracy score:', accuracy_score(y_test, sgd_test_pred))

SGDClassifier train accuracy score: 0.8592351319955356
SGDClassifier test accuracy score: 0.7332361516034985


In [36]:
# 교차 검증

from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train_tfidf_vect, y_train, cv=5, scoring='accuracy')

array([0.73397107, 0.73727366, 0.72679649, 0.73405467, 0.73075171])

In [37]:
# 오차 행렬

from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train_tfidf_vect, y_train, cv=3)

In [38]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train, y_train_pred)

array([[7559,  471,  915, 1111,  456],
       [ 606, 3830,  634,  635,  136],
       [ 723,  237, 6604, 1223,  435],
       [ 639,  238,  940, 9981,  236],
       [ 835,  135,  870,  669, 3785]])

In [39]:
params = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    'loss': ['log', 'modified_huber', 'hinge', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'elasticnet'],
    'n_jobs': [-1],
    'random_state': [13],
}

grid_cv = GridSearchCV(sgd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  1.2min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='modified_huber', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5, random_state=13,
                                     shuffle=True, tol=0.001,
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                   1000.0],
                         'loss': ['log', 'modified_huber', 'hinge',
             

In [40]:
print('SGDClassifier best parameters:', grid_cv.best_params_)
print('SGDClassifier best accuracy score:', grid_cv.best_score_)

SGDClassifier best parameters: {'alpha': 0.0001, 'loss': 'modified_huber', 'n_jobs': -1, 'penalty': 'l2', 'random_state': 13}
SGDClassifier best accuracy score: 0.7233901899433525


In [41]:
from sklearn.model_selection import cross_validate

cross_validate(sgd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.50120807, 0.42848897, 0.54985309, 0.47018981, 0.46314192]),
 'score_time': array([0.00359988, 0.00411582, 0.00498295, 0.00413322, 0.00345588]),
 'test_score': array([0.73397107, 0.73727366, 0.72679649, 0.73405467, 0.73075171]),
 'train_score': array([0.87383976, 0.87557656, 0.87660156, 0.87595023, 0.87572246])}

### RidgeClassifier

In [42]:
rd_clf = RidgeClassifier()
params = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
    'normalize': [True, False],
    'max_iter': [100, 300],
    'random_state': [13]
}

grid_cv = GridSearchCV(rd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('RidgeClassifier best parameters:', grid_cv.best_params_)
print('RidgeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:   28.2s finished


RidgeClassifier best parameters: {'alpha': 1.0, 'max_iter': 100, 'normalize': False, 'random_state': 13}
RidgeClassifier best accuracy score: 0.7193358294665587


In [43]:
rd_clf = RidgeClassifier(alpha=1.0, max_iter=100, normalize='False', random_state=13)
rd_clf.fit(X_train_tfidf_vect, y_train)

rd_train_pred = rd_clf.predict(X_train_tfidf_vect)
rd_test_pred = rd_clf.predict(X_test_tfidf_vect)
print('RidgeClassifier best train accuracy score:', accuracy_score(y_train, rd_train_pred))
print('RidgeClassifier best test accuracy score:', accuracy_score(y_test, rd_test_pred))

RidgeClassifier best train accuracy score: 0.8744504931325877
RidgeClassifier best test accuracy score: 0.7160167638483965


In [44]:
cross_validate(rd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([0.46430492, 0.46808791, 0.45691228, 0.48221111, 0.54432106,
        0.45497799, 0.51907086, 0.50296211, 0.4991529 , 0.48341298]),
 'score_time': array([0.00313401, 0.00292993, 0.00259399, 0.00349116, 0.00264287,
        0.00299501, 0.00276208, 0.00331402, 0.00330305, 0.00297499]),
 'test_score': array([0.71509907, 0.69801867, 0.71805967, 0.70888383, 0.71799544,
        0.7191344 , 0.70410023, 0.7166287 , 0.70523918, 0.71617312]),
 'train_score': array([0.87975805, 0.87983397, 0.87866977, 0.87965986, 0.87958393,
        0.87907777, 0.88006479, 0.87935616, 0.87973578, 0.87996356])}

### LogisticRegression

In [45]:
clf = LogisticRegression()
params = {
    'C': [0.01, 0.1, 0.5, 1.0],
    'max_iter': [100, 200, 500],
}

grid_cv = GridSearchCV(clf, param_grid=params, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('LogisticRegression best parameters:', grid_cv.best_params_)
print('LogisticRegression best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.9min finished


LogisticRegression best parameters: {'C': 1.0, 'max_iter': 200}
LogisticRegression best accuracy score: 0.718834740777808


In [46]:
lr_clf = LogisticRegression(C=1.0, max_iter=500)
lr_clf.fit(X_train_tfidf_vect, y_train)

lr_train_pred = lr_clf.predict(X_train_tfidf_vect)
lr_test_pred = lr_clf.predict(X_test_tfidf_vect)

print('LogisticRegression train accuracy score:', accuracy_score(y_train, lr_train_pred))
print('LogisticRegression test accuracy score:', accuracy_score(y_test, lr_test_pred))

LogisticRegression train accuracy score: 0.8322438102179806
LogisticRegression test accuracy score: 0.7260386297376094


### LGBMClassifier

In [47]:
clf = LGBMClassifier()
params = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8],
    'num_leaves': [6,8], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['dart'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [13],
#     'colsample_bytree' : [0.64, 0.65, 0.66],
#     'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

grid_cv = GridSearchCV(clf, param_grid=params, verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('LGBMClassifier best parameters:', grid_cv.best_params_)
print('LGBMClassifier best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  2.5min finished


LGBMClassifier best parameters: {'boosting_type': 'dart', 'learning_rate': 0.01, 'max_bin': 510, 'n_estimators': 8, 'num_leaves': 8, 'objective': 'binary', 'random_state': 13, 'reg_alpha': 1, 'reg_lambda': 1}
LGBMClassifier best accuracy score: 0.34489685096134515


In [48]:
lgb_clf = LGBMClassifier(boosting_type='dart', learning_rate=0.01, max_bin=510, n_estimators=8, num_leaves=8,
                        objective='binary', random_state=13, reg_alpha=1, reg_lambda=1)
lgb_clf.fit(X_train_tfidf_vect, y_train)
pred = lgb_clf.predict(X_test_tfidf_vect)
print('LGBMClassifier best accuracy score:', accuracy_score(y_test, pred))

LGBMClassifier best accuracy score: 0.3497631195335277


In [49]:
lgb_clf = LGBMClassifier(random_state=13)
lgb_clf.fit(X_train_tfidf_vect, y_train)

lgb_train_pred = lgb_clf.predict(X_train_tfidf_vect)
lgb_test_pred = lgb_clf.predict(X_test_tfidf_vect)

print('LGBMClassifier train accuracy score:', accuracy_score(y_train, train_pred))
print('LGBMClassifier test accuracy score:', accuracy_score(y_test, test_pred))

LGBMClassifier train accuracy score: 0.8592351319955356
LGBMClassifier test accuracy score: 0.7332361516034985


In [50]:
cross_validate(lgb_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([32.22999191, 31.499861  , 28.85051799, 30.26241589, 29.88290715]),
 'score_time': array([1.86452985, 1.96915913, 1.56499314, 1.56113005, 1.61052489]),
 'test_score': array([0.6523175 , 0.64605398, 0.64149869, 0.64840547, 0.65056948]),
 'train_score': array([0.73173509, 0.7339844 , 0.73506634, 0.73350796, 0.73564331])}

### DecisionTreeClassifier

In [51]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train_tfidf_vect, y_train)

dt_train_pred = dt_clf.predict(X_train_tfidf_vect)
dt_test_pred = dt_clf.predict(X_test_tfidf_vect)

print('DecisionTreeClassifier train score :', accuracy_score(y_train, dt_train_pred))
print('DecisionTreeClassifier test score :', accuracy_score(y_test, dt_test_pred))

DecisionTreeClassifier train score : 0.9914812199621894
DecisionTreeClassifier test score : 0.49152696793002915


In [52]:
params = {
    'max_depth': [5, 10, 20, 120],
    'min_samples_split': [16, 24],
    'min_samples_leaf': [16, 32],
    'random_state': [13]
}

grid_cv = GridSearchCV(dt_clf, param_grid=params, verbose=1, n_jobs=-1, cv=5, scoring='accuracy')
grid_cv.fit(X_train_tfidf_vect, y_train)

print('DecisionTreeClassifier best parameters:', grid_cv.best_params_)
print('DecisionTreeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   33.6s finished


DecisionTreeClassifier best parameters: {'max_depth': 120, 'min_samples_leaf': 32, 'min_samples_split': 16, 'random_state': 13}
DecisionTreeClassifier best accuracy score: 0.4575086922764231


## 전처리

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

abc = pd.merge(X_test, y_test, left_index = True, right_index=True)
abc['pred'] = sgd_test_pred

# 예측이 틀린 결과 -> 데이터 프레임 화
error_df = abc[abc['author'] != abc['pred']]

# CountVectorizer로 단어 갯수 카운트
cv = CountVectorizer(stop_words='english')   
cv_fit=cv.fit_transform(error_df['text'])    
word_list = cv.get_feature_names();    
count_list = cv_fit.toarray().sum(axis=0)

di = dict(zip(word_list,count_list))

# 예측이 틀린 결과 중 가장 많이 나온 단어 20개 정렬
pd.DataFrame(list(di.items())).sort_values(by=1, ascending=False)[:20]

Unnamed: 0,0,1
4204,odin,1397
5281,said,731
3762,man,206
1182,come,175
1720,did,161
4012,mr,161
6227,think,158
1458,cried,157
3478,know,150
6283,time,142


In [54]:
# from catboost import CatBoostClassifier

# cb = CatBoostClassifier(silent=True, random_state=13, n_estimators=300).fit(X_train_tfidf_vect, y_train)
# accuracy_score(y_train, cb.predict(X_train_tfidf_vect))
# accuracy_score(y_test, cb.predict(X_test_tfidf_vect))

In [55]:
# accuracy_score(y_train, cb.predict(X_train_tfidf_vect))

## 모델 검증

In [56]:
t_models = []

t_models.append(('MultinomialNB', MultinomialNB(alpha=0.5, fit_prior='True')))
t_models.append(('LinearSVC', LinearSVC(C=1, loss='hinge', penalty='l2', random_state=13)))
t_models.append(('SGDClassifier', SGDClassifier(alpha=0.0001, loss='modified_huber', n_jobs=-1, penalty='l2', random_state=13)))
t_models.append(('RidgeClassifier', RidgeClassifier(alpha=1.0, max_iter=100, normalize='False', random_state=13)))
t_models.append(('LogisticRegression', LogisticRegression(C=1.0, max_iter=500)))

In [57]:
from sklearn.metrics import accuracy_score

train_score = []
test_score = []
names = []

for name, model in t_models:
    clf = model
    clf.fit(X_train_tfidf_vect, y_train)
    
    train_pred = clf.predict(X_train_tfidf_vect)
    test_pred = clf.predict(X_test_tfidf_vect)
    
    names.append(name)
    train_score.append(accuracy_score(y_train, train_pred))
    test_score.append(accuracy_score(y_test, test_pred))

In [58]:
result = pd.DataFrame({'model name': names,
                       'train score': train_score,
                       'test score': test_score})
result['diff'] = result['train score'] - result['test score']
result.round(2).sort_values(by='train score', ascending=False).reset_index(drop=True)

Unnamed: 0,model name,train score,test score,diff
0,LinearSVC,0.87,0.73,0.13
1,RidgeClassifier,0.87,0.72,0.16
2,SGDClassifier,0.86,0.73,0.13
3,LogisticRegression,0.83,0.73,0.11
4,MultinomialNB,0.8,0.72,0.08
