In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터 확인

In [2]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

In [3]:
pd.set_option('max_colwidth', 180)

## train data
- 실제 작가와 작가가 작성한 Text가 라벨링된 데이터프레임

In [4]:
train.drop(columns='index')

Unnamed: 0,text,author
0,"He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of n...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in perusing Jane’s last letter, and dwelling on some passages which proved that Jane had not written in spirits, when, instead of being ...",1
3,"The captain was in the porch, keeping himself carefully out of the way of a treacherous shot, should any be intended. He turned and spoke to us, “Doctor's watch on the lookout....",4
4,"“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are ...",3
...,...,...
54874,"“Is that you, Mr. Smith?” odin whispered. “I hardly dared hope that you would come.”",2
54875,"I told my plan to the captain, and between us we settled on the details of its accomplishment.",4
54876,"""Your sincere well-wisher, friend, and sister, ""LUCY odin.",1
54877,“Then you wanted me to lend you money?”,3


In [5]:
train.text[0]

'He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.'

In [6]:
train.author.unique(), train.author.value_counts()

(array([3, 2, 1, 4, 0]),
 3    15063
 0    13235
 2    11554
 4     7805
 1     7222
 Name: author, dtype: int64)

### text_x
- train data를 학습시켜 어떤 작가가 작성했는지 분석해야할 데이터프레임

In [7]:
test_x

Unnamed: 0,index,text
0,0,"“Not at all. I think she is one of the most charming young ladies I ever met, and might have been most useful in such work as we have been doing. She had a decided genius that ..."
1,1,"""No,"" replied he, with sudden consciousness, ""not to find it in YOU; for I cannot be ignorant that to you, to your goodness, I owe it all.--I feel it--I would express it if I c..."
2,2,"As the lady had stated her intention of screaming, of course she would have screamed at this additional boldness, but that the exertion was rendered unnecessary by a hasty knoc..."
3,3,“And then suddenly in the silence I heard a sound which sent my heart into my mouth. It was the clank of the levers and the swish of the leaking cylinder. He had set the engine...
4,4,"His conviction remained unchanged. So far as I know--and I believe his honest heart was transparent to me--he never wavered again, in his solemn certainty of finding her. His p..."
...,...,...
19612,19612,"At the end of another day or two, odin growing visibly stronger every twelve hours, Mrs. odin, urged equally by her own and her daughter's wishes, began to talk of removing to ..."
19613,19613,"All afternoon we sat together, mostly in silence, watching my lord’s door. My own mind was busy with the scene that had just passed, and its singular resemblance to my vision. ..."
19614,19614,"odin, having carried his thanks to odin, proceeded with his happiness to Lucy; and such was the excess of it by the time he reached Bartlett's Buildings, that she was able to ..."
19615,19615,"Soon after this, upon odin's leaving the room, ""Mama,"" said odin, ""I have an alarm on the subject of illness which I cannot conceal from you. I am sure odin is not well. We hav..."


### submission
- 제출 형태
- 가로 index : 작가 명

In [8]:
submission

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0
...,...,...,...,...,...,...
19612,19612,0,0,0,0,0
19613,19613,0,0,0,0,0
19614,19614,0,0,0,0,0
19615,19615,0,0,0,0,0


## 전처리

### train_test_split

In [9]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [10]:
X

0        He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of n...
1                                                                                                                                                     “Your sister asked for it, I suppose?”
2         She was engaged one day as she walked, in perusing Jane’s last letter, and dwelling on some passages which proved that Jane had not written in spirits, when, instead of being ...
3        The captain was in the porch, keeping himself carefully out of the way of a treacherous shot, should any be intended. He turned and spoke to us, “Doctor's watch on the lookout....
4        “Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are ...
                                                       

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

### TfidVectorize

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터 셋 변환.
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

#### Multinomial Naive Bayes 적용

In [13]:
from sklearn.naive_bayes import MultinomialNB
mu_clf = MultinomialNB().fit(X_train_tfidf_vect, y_train)

In [14]:
# 정식으로 pipeline을 만들고

from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

In [15]:
# 학습 후 train, test accuracy score 적용

from sklearn.metrics import accuracy_score

mu_clf.fit(X_train_tfidf_vect, y_train)

train_pred = mu_clf.predict(X_train_tfidf_vect)
test_pred = mu_clf.predict(X_test_tfidf_vect)

print('MultinomialNB train accuracy score:', accuracy_score(y_train, train_pred))
print('MultinomialNB test accuracy score:', accuracy_score(y_test, test_pred))
# predicted = text_clf.predict(X_test)

# np.mean(predicted == y_test)


MultinomialNB train accuracy score: 0.7595152950823406
MultinomialNB test accuracy score: 0.6868622448979592


In [16]:
from sklearn.metrics import classification_report

# print(confusion_matrix(y_test, pred))
# print('-'*50)
print(classification_report(y_train, train_pred))
print('-----------'*5)
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.73      0.85      0.79     10512
           1       0.94      0.59      0.73      5841
           2       0.85      0.68      0.76      9222
           3       0.66      0.96      0.78     12034
           4       0.97      0.49      0.65      6294

    accuracy                           0.76     43903
   macro avg       0.83      0.72      0.74     43903
weighted avg       0.80      0.76      0.75     43903

-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.67      0.79      0.72      2723
           1       0.89      0.50      0.64      1381
           2       0.76      0.57      0.65      2332
           3       0.61      0.92      0.73      3029
           4       0.93      0.37      0.53      1511

    accuracy                           0.69     10976
   macro avg       0.77      0.63      0.66     10976
weighted avg       0.

#### TfidfVectorize 튜닝

##### stop words 필터링을 추가하고 ngram을 기본 (1,1)에서 (1,2)로 변경하여 피처 벡터화

In [17]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,1))
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

mu_clf = MultinomialNB()
mu_clf.fit(X_train_tfidf_vect, y_train)

train_pred = mu_clf.predict(X_train_tfidf_vect)
test_pred = mu_clf.predict(X_test_tfidf_vect)

print('TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_train, train_pred)))
print('TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, test_pred)))

TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 0.760
TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 0.687


In [18]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

mu_clf = MultinomialNB()
mu_clf.fit(X_train_tfidf_vect, y_train)

train_pred = mu_clf.predict(X_train_tfidf_vect)
test_pred = mu_clf.predict(X_test_tfidf_vect)

print('TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_train, train_pred)))
print('TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, test_pred)))

TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 0.821
TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 0.624


In [19]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1,3))
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

mu_clf = MultinomialNB()
mu_clf.fit(X_train_tfidf_vect, y_train)

train_pred = mu_clf.predict(X_train_tfidf_vect)
test_pred = mu_clf.predict(X_test_tfidf_vect)

print('TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_train, train_pred)))
print('TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, test_pred)))

TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 0.861
TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 0.607


##### min-df 조정
- min_df, max_df 는 아무런 결과의 차이를 가지고 오지 못했고,
- sublinear_tf도 영향이 없었다.

In [20]:
tfidf_vect = TfidfVectorizer(stop_words='english', sublinear_tf = False)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

mu_clf = MultinomialNB()
mu_clf.fit(X_train_tfidf_vect, y_train)

train_pred = mu_clf.predict(X_train_tfidf_vect)
test_pred = mu_clf.predict(X_test_tfidf_vect)

print('TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_train, train_pred)))
print('TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, test_pred)))

TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 0.760
TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 0.687


In [21]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), min_df=0.001)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

mu_clf = MultinomialNB()
mu_clf.fit(X_train_tfidf_vect, y_train)

train_pred = mu_clf.predict(X_train_tfidf_vect)
test_pred = mu_clf.predict(X_test_tfidf_vect)

print('TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_train, train_pred)))
print('TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, test_pred)))

TF-IDF Vectorized MultinomialNB train 의 예측 정확도는 0.674
TF-IDF Vectorized MultinomialNB test 의 예측 정확도는 0.648


#### GridSearch로 최적의 TFIDF 파라미터 셋업
- train 데이터, test 데이터간의 성능 차이가 최소로 좁혀짐

In [22]:
# Pipeline & Gridsearch setup
# TFIDF pipeline setup
tvc_pipe = Pipeline([
     ('tvec', TfidfVectorizer()),
     ('mb', MultinomialNB())
])

# Fit
tvc_pipe.fit(X_train, y_train)

# Setting params for TFIDF Vectorizer gridsearch
tf_params = {
#     'tvec__max_features':[2000, 20000, 200000],
    'tvec__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tvec__max_df': [0.001, 0.01, 0.1, 0.5, 1.0],
    'tvec__min_df': [0.001, 0.01, 0.1],
    'tvec__smooth_idf': [True, False]
}

In [23]:
from sklearn.model_selection import GridSearchCV

# Setting up GridSearch for TFIDFVectorizer
tvc_gs = GridSearchCV(tvc_pipe, param_grid=tf_params, cv = 5, verbose =1, n_jobs = -1)

# Fitting TVC GS
tvc_gs.fit(X_train, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  5.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [24]:
tvc_gs.best_params_

{'tvec__max_df': 0.1,
 'tvec__min_df': 0.001,
 'tvec__ngram_range': (1, 2),
 'tvec__smooth_idf': False}

In [25]:
tvc_gs.best_score_

0.6925268550678508

In [26]:
# Scoring Training data on TFIDFVectorizer
print(tvc_gs.score(X_train, y_train))

# Scoring Test data on TFIDFVectorizer
print(tvc_gs.score(X_test, y_test))

0.745803248069608
0.7043549562682215


In [27]:
# sublinear_tf : 높은 TF값들에 대해서 스무딩 처리, TF값에 대해 아웃라이어 처리

#### 여러 모델들 적용

In [28]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=0.001, 
                             max_df=0.1, smooth_idf=False)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [29]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


models = []
models.append(('LogisticRegression', LogisticRegression(random_state=13)))
models.append(('MultinomialNB', MultinomialNB()))
models.append(('RandomForestClassifier', RandomForestClassifier(random_state=13, n_jobs=-1)))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=13)))
models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=13)))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier(random_state=13)))
models.append(('LGBMClassifier', LGBMClassifier(random_state=13)))
models.append(('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)))
models.append(('LinearSVC', LinearSVC(C=1, loss='hinge', random_state=13)))
models.append(('XgBoost', XGBClassifier(learning_rate=0.1, max_depth=3, random_state=13, n_jobs=-1)))
models.append(('RidgeClassifier', RidgeClassifier(random_state=13)))
models.append(('SGDClassifier', SGDClassifier(random_state=13, loss='modified_huber')))

In [30]:
models

[('LogisticRegression',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=13, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('MultinomialNB', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)),
 ('RandomForestClassifier',
  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                         criterion='gini', max_depth=None, max_features='auto',
                         max_leaf_nodes=None, max_samples=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, n_estimators=100,
                         n_jobs=-1, oob_score=False, random_state=13, verbose=0,
                

In [31]:
from sklearn.metrics import accuracy_score

train_score = []
test_score = []
names = []

for name, model in models:
    clf = model
    clf.fit(X_train_tfidf_vect, y_train)
    
    train_pred = clf.predict(X_train_tfidf_vect)
    test_pred = clf.predict(X_test_tfidf_vect)
    
    names.append(name)
    train_score.append(accuracy_score(y_train, train_pred))
    test_score.append(accuracy_score(y_test, test_pred))

In [32]:
result = pd.DataFrame({'model name': names,
                       'train score': train_score,
                       'test score': test_score}) 
result.sort_values(by='train score', ascending=False).reset_index(drop=True)

Unnamed: 0,model name,train score,test score
0,RandomForestClassifier,0.973487,0.583181
1,DecisionTreeClassifier,0.973487,0.483692
2,SGDClassifier,0.724916,0.662263
3,LinearSVC,0.724347,0.656159
4,LogisticRegression,0.72298,0.66199
5,RidgeClassifier,0.713505,0.653426
6,LGBMClassifier,0.702868,0.631378
7,MultinomialNB,0.678382,0.652332
8,GradientBoostingClassifier,0.554723,0.536625
9,KNeighborsClassifier,0.517322,0.326348


In [33]:
result['diff'] = result['train score'] - result['test score']

In [34]:
result.round(2).sort_values(by='train score', ascending=False)

Unnamed: 0,model name,train score,test score,diff
2,RandomForestClassifier,0.97,0.58,0.39
3,DecisionTreeClassifier,0.97,0.48,0.49
0,LogisticRegression,0.72,0.66,0.06
8,LinearSVC,0.72,0.66,0.07
11,SGDClassifier,0.72,0.66,0.06
10,RidgeClassifier,0.71,0.65,0.06
6,LGBMClassifier,0.7,0.63,0.07
1,MultinomialNB,0.68,0.65,0.03
5,GradientBoostingClassifier,0.55,0.54,0.02
7,KNeighborsClassifier,0.52,0.33,0.19


##### GridSearchCV로 LogisticRegression C 하이퍼 파라미터 튜닝

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()

# 최적 C 값 도출 튜닝 수행. CV는 3 Fold셋으로 설정.
params = { 'C': [0.01, 0.1, 1, 5, 10], 'random_state': [13]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter:', grid_cv_lr.best_params_)

# 최적 C 값으로 학습된 grid_cv로 예측 수행하고 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:    3.6s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    3.9s finished


Logistic Regression best C parameter: {'C': 1, 'random_state': 13}
TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.662


In [36]:
grid_cv_lr.best_params_

{'C': 1, 'random_state': 13}

In [37]:
grid_cv_lr.best_score_

0.6548982680731412

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

mu_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('mnb', MultinomialNB())
])

In [39]:
from sklearn.model_selection import GridSearchCV

grid_params = {
    'mnb__alpha': np.linspace(0.5, 1.5, 6),
    'mnb__fit_prior': [True, False],
}

grid = GridSearchCV(mu_pipeline, grid_params)
grid.fit(X_train, y_train)

print("Best Scores: ", grid.best_score_)
print("Best Params: ", grid.best_params_)

Best Scores:  0.7287884563352381
Best Params:  {'mnb__alpha': 0.5, 'mnb__fit_prior': False}


## 모델 하이퍼파라미터 튜닝

### MultinomialNB

In [40]:
from sklearn.naive_bayes import MultinomialNB

params = {'alpha': np.linspace(0.5, 1.5, 6),
         'fit_prior': ['True', 'False']}
mu_clf = MultinomialNB()
grid_cv = GridSearchCV(mu_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('MultinomialNB best parameters:', grid_cv.best_params_)
print('MultinomialNB best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


MultinomialNB best parameters: {'alpha': 0.5, 'fit_prior': 'True'}
MultinomialNB best accuracy score: 0.6430084542275784


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    0.4s finished


In [41]:
mu_clf = MultinomialNB(alpha=0.5, fit_prior='True')
mu_clf.fit(X_train_tfidf_vect, y_train)

mu_train_pred = mu_clf.predict(X_train_tfidf_vect)
mu_test_pred = mu_clf.predict(X_test_tfidf_vect)

print('MultinomialNB train accuracy score', accuracy_score(y_train, mu_train_pred))
print('MultinomialNB test accuracy score', accuracy_score(y_test, mu_test_pred))

MultinomialNB train accuracy score 0.6818896202992962
MultinomialNB test accuracy score 0.6526056851311953


In [42]:
# 교차검증

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)

In [43]:
from sklearn.model_selection import cross_validate
cross_validate(mu_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([0.01508307, 0.01333499, 0.01340222, 0.01505375, 0.01422095]),
 'score_time': array([0.00204682, 0.00213504, 0.00205684, 0.0022893 , 0.0022831 ]),
 'test_score': array([0.64229587, 0.64286528, 0.64730668, 0.64669704, 0.64555809]),
 'train_score': array([0.68526849, 0.68820113, 0.68754627, 0.68633089, 0.68695726])}

### LinearSVC

In [44]:
import sklearn
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(C=1, loss='hinge', random_state=13)
svm_clf.fit(X_train_tfidf_vect, y_train)

svm_train_pred = svm_clf.predict(X_train_tfidf_vect)
svm_test_pred = svm_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, svm_train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, svm_test_pred))

LinearSVC train accuracy score: 0.7243468555679566
LinearSVC test accuracy score: 0.6561588921282799


In [45]:
params = {
    'C': [0.001, 0.01, 1], 
    'loss': ['squared_hinge', 'hinge'], 
    'penalty': ['l1', 'l2'], 
    'random_state': [13]
    }

grid_cv = GridSearchCV(LinearSVC(), param_grid=params, cv=skf, verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 out of  60 | elapsed:    1.5s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    7.3s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=13, shuffle=True),
             error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 1],
                         'loss': ['squared_hinge', 'hinge'],
                         'penalty': ['l1', 'l2'], 'random_state': [13]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [46]:
print('LinearSVC Best parameters:', grid_cv.best_params_)
print('LinearSVC Best accruacy score:', grid_cv.best_score_)

LinearSVC Best parameters: {'C': 1, 'loss': 'squared_hinge', 'penalty': 'l2', 'random_state': 13}
LinearSVC Best accruacy score: 0.6551261200474517


In [47]:
svm_clf = LinearSVC(C=1, loss='hinge', penalty='l2', random_state=13)
svm_clf.fit(X_train_tfidf_vect, y_train)

svm_train_pred = svm_clf.predict(X_train_tfidf_vect)
svm_test_pred = svm_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, svm_train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, svm_test_pred)) 

LinearSVC train accuracy score: 0.7243468555679566
LinearSVC test accuracy score: 0.6561588921282799


In [48]:
svm_clf.decision_function(X_test_tfidf_vect) > 0

array([[ True, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False,  True],
       ...,
       [False, False, False, False, False],
       [False, False, False,  True, False],
       [False, False, False, False,  True]])

In [49]:
# 교차검증

from sklearn.model_selection import cross_validate, cross_val_score

cross_validate(svm_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([1.08990192, 0.93925619, 1.21452188, 1.0140028 , 1.0888052 ]),
 'score_time': array([0.00151014, 0.0015738 , 0.00241709, 0.00215816, 0.00256896]),
 'test_score': array([0.6523175 , 0.65072315, 0.65357021, 0.65432802, 0.65592255]),
 'train_score': array([0.73025454, 0.73210523, 0.73125107, 0.72821228, 0.73026222])}

### SGDClassifier

In [50]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=13, loss='modified_huber')
sgd_clf.fit(X_train_tfidf_vect, y_train)

sgd_train_pred = sgd_clf.predict(X_train_tfidf_vect)
sgd_test_pred = sgd_clf.predict(X_test_tfidf_vect)

print('SGDClassifier train accuracy score:', accuracy_score(y_train, sgd_train_pred))
print('SGDClassifier test accuracy score:', accuracy_score(y_test, sgd_test_pred))

SGDClassifier train accuracy score: 0.7249162927362595
SGDClassifier test accuracy score: 0.6622631195335277


In [51]:
# 교차 검증

from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train_tfidf_vect, y_train, cv=skf, scoring='accuracy')

array([0.65379797, 0.65653115, 0.65983373, 0.65979499, 0.65763098])

In [52]:
# 오차 행렬

from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train_tfidf_vect, y_train, cv=3)

In [53]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train, y_train_pred)

array([[6824,  560, 1129, 1456,  543],
       [ 707, 3571,  687,  696,  180],
       [ 870,  351, 5822, 1561,  618],
       [ 808,  299, 1230, 9356,  341],
       [ 949,  193, 1118,  881, 3153]])

In [54]:
params = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    'loss': ['log', 'modified_huber', 'hinge', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'elasticnet'],
    'n_jobs': [-1],
    'random_state': [13],
}

grid_cv = GridSearchCV(sgd_clf, param_grid=params, cv=skf, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed:  1.4min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=13, shuffle=True),
             error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='modified_huber', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5, r...
                                     validation_fraction=0.1, verbose=0,
                                     warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                   1000.0],
                         'loss': ['log', 'modified_huber', 'hinge',
                 

In [55]:
print('SGDClassifier best parameters:', grid_cv.best_params_)
print('SGDClassifier best accuracy score:', grid_cv.best_score_)

SGDClassifier best parameters: {'alpha': 0.0001, 'loss': 'modified_huber', 'n_jobs': -1, 'penalty': 'elasticnet', 'random_state': 13}
SGDClassifier best accuracy score: 0.658292165809437


In [56]:
from sklearn.model_selection import cross_validate

cross_validate(sgd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([0.41827679, 0.40968609, 0.35633516, 0.35890007, 0.28312492]),
 'score_time': array([0.00209022, 0.00173092, 0.00182366, 0.00162101, 0.00182986]),
 'test_score': array([0.65379797, 0.65653115, 0.65983373, 0.65979499, 0.65763098]),
 'train_score': array([0.73461079, 0.73364273, 0.73384204, 0.7325684 , 0.73222675])}

### RidgeClassifier

In [57]:
rd_clf = RidgeClassifier()
params = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
    'normalize': [True, False],
    'max_iter': [100, 300],
    'random_state': [13]
}

grid_cv = GridSearchCV(rd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('RidgeClassifier best parameters:', grid_cv.best_params_)
print('RidgeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    3.2s finished


RidgeClassifier best parameters: {'alpha': 1.0, 'max_iter': 100, 'normalize': False, 'random_state': 13}
RidgeClassifier best accuracy score: 0.6458328880065934


In [58]:
rd_clf = RidgeClassifier(alpha=1.0, max_iter=100, normalize='False', random_state=13)
rd_clf.fit(X_train_tfidf_vect, y_train)

rd_train_pred = rd_clf.predict(X_train_tfidf_vect)
rd_test_pred = rd_clf.predict(X_test_tfidf_vect)
print('RidgeClassifier best train accuracy score:', accuracy_score(y_train, rd_train_pred))
print('RidgeClassifier best test accuracy score:', accuracy_score(y_test, rd_test_pred))

RidgeClassifier best train accuracy score: 0.6930050338245678
RidgeClassifier best test accuracy score: 0.6486880466472303


In [59]:
# 교차검증

cross_validate(rd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([0.11398983, 0.10546613, 0.10799384, 0.11834884, 0.10841703]),
 'score_time': array([0.00214505, 0.00169706, 0.00208998, 0.00235009, 0.00216389]),
 'test_score': array([0.64457351, 0.64571233, 0.64810386, 0.64191344, 0.64179954]),
 'train_score': array([0.69956153, 0.70055805, 0.69916292, 0.6994562 , 0.70085129])}

### LogisticRegression

In [60]:
clf = LogisticRegression()
params = {
    'C': [0.01, 0.1, 0.5, 1.0],
    'max_iter': [100, 200, 500],
}

grid_cv = GridSearchCV(clf, param_grid=params, scoring='accuracy', cv=skf, verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('LogisticRegression best parameters:', grid_cv.best_params_)
print('LogisticRegression best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   24.8s finished


LogisticRegression best parameters: {'C': 1.0, 'max_iter': 500}
LogisticRegression best accuracy score: 0.6598865924797769


In [61]:
lr_clf = LogisticRegression(C=1.0, max_iter=500)
lr_clf.fit(X_train_tfidf_vect, y_train)

lr_train_pred = lr_clf.predict(X_train_tfidf_vect)
lr_test_pred = lr_clf.predict(X_test_tfidf_vect)

print('LogisticRegression train accuracy score:', accuracy_score(y_train, lr_train_pred))
print('LogisticRegression test accuracy score:', accuracy_score(y_test, lr_test_pred))

LogisticRegression train accuracy score: 0.7232990911782794
LogisticRegression test accuracy score: 0.6621720116618076


### LGBMClassifier

In [62]:
clf = LGBMClassifier()
params = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8],
    'num_leaves': [6,8], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['dart'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [13],
#     'colsample_bytree' : [0.64, 0.65, 0.66],
#     'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

grid_cv = GridSearchCV(clf, param_grid=params, verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('LGBMClassifier best parameters:', grid_cv.best_params_)
print('LGBMClassifier best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  2.0min finished


LGBMClassifier best parameters: {'boosting_type': 'dart', 'learning_rate': 0.01, 'max_bin': 510, 'n_estimators': 8, 'num_leaves': 8, 'objective': 'binary', 'random_state': 13, 'reg_alpha': 1, 'reg_lambda': 1.4}
LGBMClassifier best accuracy score: 0.35207185528705465


In [63]:
lgb_clf = LGBMClassifier(boosting_type='dart', learning_rate=0.01, max_bin=510, n_estimators=8, num_leaves=8,
                        objective='binary', random_state=13, reg_alpha=1, reg_lambda=1)
lgb_clf.fit(X_train_tfidf_vect, y_train)

# lgb_train_pred = lgb_clf.predict(X_train_tfidf_vect)
lgb_test_pred = lgb_clf.predict(X_test_tfidf_vect)

# print('LGBMClassifier best accuracy score:', accuracy_score(X_test, lgb_train_pred))
print('LGBMClassifier best accuracy score:', accuracy_score(y_test, lgb_test_pred))

LGBMClassifier best accuracy score: 0.360149416909621


In [64]:
lgb_clf = LGBMClassifier(boosting_type='dart', learning_rate=0.01, max_bin=255, n_estimators=8, num_leaves=8,
                         objective='binary', random_state=13, reg_alpha=1, reg_lambda=1)
lgb_clf.fit(X_train_tfidf_vect, y_train)

lgb_train_pred = lgb_clf.predict(X_train_tfidf_vect)
lgb_test_pred = lgb_clf.predict(X_test_tfidf_vect)

print('LGBMClassifier train accuracy score:', accuracy_score(y_train, lgb_train_pred))
print('LGBMClassifier test accuracy score:', accuracy_score(y_test, lgb_test_pred))

LGBMClassifier train accuracy score: 0.3563765574106553
LGBMClassifier test accuracy score: 0.360149416909621


In [65]:
cross_validate(lgb_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([2.11260891, 2.08829188, 2.05946708, 2.10079503, 1.91776419]),
 'score_time': array([0.02086806, 0.024997  , 0.0226748 , 0.02295709, 0.02157378]),
 'test_score': array([0.35781802, 0.35804578, 0.35155449, 0.34635535, 0.35501139]),
 'train_score': array([0.35920506, 0.35595923, 0.35171687, 0.34897361, 0.35976426])}

### DecisionTreeClassifier

In [66]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train_tfidf_vect, y_train)

dt_train_pred = dt_clf.predict(X_train_tfidf_vect)
dt_test_pred = dt_clf.predict(X_test_tfidf_vect)

print('DecisionTreeClassifier train score :', accuracy_score(y_train, dt_train_pred))
print('DecisionTreeClassifier test score :', accuracy_score(y_test, dt_test_pred))

DecisionTreeClassifier train score : 0.9734870054438193
DecisionTreeClassifier test score : 0.48533163265306123


In [67]:
params = {
    'max_depth': [5, 10, 20, 120],
    'min_samples_split': [16, 24],
    'min_samples_leaf': [16, 32],
    'random_state': [13]
}

grid_cv = GridSearchCV(dt_clf, param_grid=params, verbose=1, n_jobs=-1, cv=5, scoring='accuracy')
grid_cv.fit(X_train_tfidf_vect, y_train)

print('DecisionTreeClassifier best parameters:', grid_cv.best_params_)
print('DecisionTreeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   21.3s finished


DecisionTreeClassifier best parameters: {'max_depth': 120, 'min_samples_leaf': 32, 'min_samples_split': 16, 'random_state': 13}
DecisionTreeClassifier best accuracy score: 0.455936974607891


## 결과

In [74]:
abc = pd.merge(X_test, y_test, left_index = True, right_index=True)
abc['pred'] = svm_test_pred

# 예측이 틀린 결과 -> 데이터 프레임 화
error_df = abc[abc['author'] != abc['pred']]

# CountVectorizer로 단어 갯수 카운트
cv = CountVectorizer(stop_words='english')   
cv_fit=cv.fit_transform(error_df['text'])    
word_list = cv.get_feature_names();    
count_list = cv_fit.toarray().sum(axis=0)

di = dict(zip(word_list,count_list))

# 예측이 틀린 결과 중 가장 많이 나온 단어 20개 정렬
pd.DataFrame(list(di.items())).sort_values(by=1, ascending=False)[:20]

Unnamed: 0,0,1
5550,odin,1842
6981,said,966
4966,man,268
5290,mr,262
4597,know,221
1601,come,213
2328,did,212
1971,cried,202
8234,think,196
4773,like,195


In [69]:
# from catboost import CatBoostClassifier

# cb = CatBoostClassifier(silent=True, random_state=13, n_estimators=300).fit(X_train_tfidf_vect, y_train)
# accuracy_score(y_train, cb.predict(X_train_tfidf_vect))
# accuracy_score(y_test, cb.predict(X_test_tfidf_vect))

In [70]:
# accuracy_score(y_train, cb.predict(X_train_tfidf_vect))

## 모델 비교

In [71]:
t_models = []

t_models.append(('MultinomialNB', MultinomialNB(alpha=0.5, fit_prior='True')))
t_models.append(('LinearSVC', LinearSVC(C=1, loss='hinge', penalty='l2', random_state=13)))
t_models.append(('SGDClassifier', SGDClassifier(alpha=0.0001, loss='modified_huber', n_jobs=-1, penalty='l2', random_state=13)))
t_models.append(('RidgeClassifier', RidgeClassifier(alpha=1.0, max_iter=100, normalize='False', random_state=13)))
t_models.append(('LogisticRegression', LogisticRegression(C=1.0, max_iter=500)))

In [72]:
from sklearn.metrics import accuracy_score

train_score = []
test_score = []
names = []

for name, model in t_models:
    clf = model
    clf.fit(X_train_tfidf_vect, y_train)
    
    train_pred = clf.predict(X_train_tfidf_vect)
    test_pred = clf.predict(X_test_tfidf_vect)
    
    names.append(name)
    train_score.append(accuracy_score(y_train, train_pred))
    test_score.append(accuracy_score(y_test, test_pred))

In [73]:
result = pd.DataFrame({'model name': names,
                       'train score': train_score,
                       'test score': test_score})
result['diff'] = result['train score'] - result['test score']
result.round(2).sort_values(by='train score', ascending=False).reset_index(drop=True)

Unnamed: 0,model name,train score,test score,diff
0,LinearSVC,0.72,0.66,0.07
1,SGDClassifier,0.72,0.66,0.06
2,LogisticRegression,0.72,0.66,0.06
3,RidgeClassifier,0.69,0.65,0.04
4,MultinomialNB,0.68,0.65,0.03
