In [1]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터 확인

In [2]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [3]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

### train data
- 실제 작가와 작가가 작성한 Text가 라벨링된 데이터프레임

In [4]:
train

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [5]:
train.author.unique(), train.author.value_counts()

(array([3, 2, 1, 4, 0]), 3    15063
 0    13235
 2    11554
 4     7805
 1     7222
 Name: author, dtype: int64)

### text_x
- train data를 학습시켜 어떤 작가가 작성했는지 분석해야할 데이터프레임

In [6]:
test_x

Unnamed: 0,index,text
0,0,“Not at all. I think she is one of the most ch...
1,1,"""No,"" replied he, with sudden consciousness, ""..."
2,2,As the lady had stated her intention of scream...
3,3,“And then suddenly in the silence I heard a so...
4,4,His conviction remained unchanged. So far as I...
...,...,...
19612,19612,"At the end of another day or two, odin growing..."
19613,19613,"All afternoon we sat together, mostly in silen..."
19614,19614,"odin, having carried his thanks to odin, proc..."
19615,19615,"Soon after this, upon odin's leaving the room,..."


### submission
- 제출 형태
- 가로 index : 작가 명

In [7]:
submission

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0
...,...,...,...,...,...,...
19612,19612,0,0,0,0,0
19613,19613,0,0,0,0,0
19614,19614,0,0,0,0,0
19615,19615,0,0,0,0,0


## 전처리

### nltk.stopwords + train_test_split

In [8]:
X = pd.read_csv('open/word.csv')
X.text = X.text.astype('str')
X

Unnamed: 0,text,author
0,he was almost choking there was so much so muc...,3
1,your sister asked for it i suppose,2
2,she was engaged one day as she walked in perus...,1
3,the captain was in the porch keeping himself c...,4
4,have mercy gentlemen odin flung up his hands ...,3
...,...,...
54874,is that you mr smith odin whispered i hardly ...,2
54875,i told my plan to the captain and between us w...,4
54876,your sincere well wisher friend and sister luc...,1
54877,then you wanted me to lend you money,3


In [9]:
X

Unnamed: 0,text,author
0,he was almost choking there was so much so muc...,3
1,your sister asked for it i suppose,2
2,she was engaged one day as she walked in perus...,1
3,the captain was in the porch keeping himself c...,4
4,have mercy gentlemen odin flung up his hands ...,3
...,...,...
54874,is that you mr smith odin whispered i hardly ...,2
54875,i told my plan to the captain and between us w...,4
54876,your sincere well wisher friend and sister luc...,1
54877,then you wanted me to lend you money,3


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X.text, X.author, test_size=0.2, random_state=13)

### 벡터라이즈

#### countvectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Count Vectorization으로 feature extraction 변환 수행. 
cnt_vect = CountVectorizer(stop_words='english')
cnt_vect.fit(X_train)

CountVectorizer(stop_words='english')

In [12]:
X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)
print('X_train의 CountVectorizer Shape:', X_train_cnt_vect.shape, X_test_cnt_vect.shape)

X_train의 CountVectorizer Shape: (43903, 31653) (10976, 31653)


In [13]:
print(cnt_vect.vocabulary_)



In [14]:
len(cnt_vect.vocabulary_)

31653

In [15]:
tfidf_vect = CountVectorizer(stop_words='english', ngram_range=(1,2))
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

##### GridSearchCV로 LogisticRegression C 하이퍼 파라미터 튜닝

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF-IDF Logistic Regression의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

TF-IDF Logistic Regression의 예측 정확도는 0.724


In [17]:
from sklearn.model_selection import GridSearchCV

# 최적 C 값 도출 튜닝 수행. CV는 3 Fold셋으로 설정.
params = { 'C': [0.01, 0.1, 1, 5, 10], 'random_state': [13]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter:', grid_cv_lr.best_params_)

# 최적 C 값으로 학습된 grid_cv로 예측 수행하고 정확도 평가
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized Logistic Regression 의 예측 정확도는 {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  5.5min finished


Logistic Regression best C parameter: {'C': 1, 'random_state': 13}
TF-IDF Vectorized Logistic Regression 의 예측 정확도는 0.724


In [18]:
grid_cv_lr.best_params_

{'C': 1, 'random_state': 13}

In [19]:
grid_cv_lr.best_score_

0.7037787022519657

## 모델 하이퍼파라미터 튜닝

### MultinomialNB

In [20]:
from sklearn.naive_bayes import MultinomialNB

params = {'alpha': [0.01, 0.1, 0.5, 1.0],
         'fit_prior': ['True', 'False']}
clf = MultinomialNB()
grid_cv = GridSearchCV(clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('MultinomialNB best parameters:', grid_cv.best_params_)
print('MultinomialNB best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    1.6s finished


MultinomialNB best parameters: {'alpha': 0.1, 'fit_prior': 'True'}
MultinomialNB best accuracy score: 0.7303601195674864


In [21]:
clf = MultinomialNB(alpha=0.1, fit_prior='True')
clf.fit(X_train_tfidf_vect, y_train)

train_pred = clf.predict(X_train_tfidf_vect)
test_pred = clf.predict(X_test_tfidf_vect)

print('MultinomialNB train accuracy score', accuracy_score(y_train, train_pred))
print('MultinomialNB test accuracy score', accuracy_score(y_test, test_pred))

MultinomialNB train accuracy score 0.9581577568731067
MultinomialNB test accuracy score 0.7459001457725948


In [22]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5)

In [23]:
from sklearn.model_selection import cross_validate
cross_validate(clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.10292792, 0.09514403, 0.08898497, 0.08664775, 0.08704114]),
 'score_time': array([0.01515794, 0.01451492, 0.01370311, 0.0131681 , 0.01491499]),
 'test_score': array([0.73522378, 0.73294613, 0.7324906 , 0.73712984, 0.7381549 ]),
 'train_score': array([0.95968339, 0.96002506, 0.96102158, 0.95979842, 0.95988384])}

### LinearSVC

In [24]:
import sklearn
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(C=1, loss='hinge', random_state=13)
svm_clf.fit(X_train_tfidf_vect, y_train)

train_pred = svm_clf.predict(X_train_tfidf_vect)
test_pred = svm_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, test_pred))

LinearSVC train accuracy score: 0.9870623875361593
LinearSVC test accuracy score: 0.7036260932944607


In [25]:
params = {
    'C': [0.001, 0.01, 1], 
    'loss': ['squared_hinge', 'hinge'], 
    'penalty': ['l1', 'l2'], 
    'random_state': [13]
    }

grid_cv = GridSearchCV(LinearSVC(), param_grid=params, refit=True, verbose=2)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.8s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.9s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.8s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.8s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.8s
[CV] C=0.001, loss=hinge, penalty=l1, random_state=13 ................
[CV] . C=0.001, loss=hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=hinge, penalty=l1, random_state=13 ................
[CV] . C=0.001, loss=hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=hinge, penalty=l1, rand

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  2.6min finished


GridSearchCV(estimator=LinearSVC(),
             param_grid={'C': [0.001, 0.01, 1],
                         'loss': ['squared_hinge', 'hinge'],
                         'penalty': ['l1', 'l2'], 'random_state': [13]},
             verbose=2)

In [26]:
print('LinearSVC Best parameters:', grid_cv.best_params_)
print('LinearSVC Best accruacy score:', grid_cv.best_score_)

LinearSVC Best parameters: {'C': 0.01, 'loss': 'squared_hinge', 'penalty': 'l2', 'random_state': 13}
LinearSVC Best accruacy score: 0.6971734063424887


In [27]:
svm_clf = LinearSVC(C=1, loss='hinge', penalty='l2', random_state=13)
svm_clf.fit(X_train_tfidf_vect, y_train)

train_pred = svm_clf.predict(X_train_tfidf_vect)
test_pred = svm_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, test_pred)) 

LinearSVC train accuracy score: 0.9870623875361593
LinearSVC test accuracy score: 0.7036260932944607


In [28]:
svm_clf.decision_function(X_test_tfidf_vect) > 0

array([[ True, False, False, False, False],
       [False, False, False, False, False],
       [False, False, False, False,  True],
       ...,
       [False, False,  True, False, False],
       [False, False, False,  True, False],
       [False, False, False, False,  True]])

In [29]:
# 교차검증

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

In [30]:
from sklearn.model_selection import cross_validate, cross_val_score

cross_validate(svm_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([17.05398083, 19.38982701, 14.79069591, 17.01095915, 19.97363114,
        15.45635104, 16.09372306, 15.15638781, 16.57379007, 14.7106111 ]),
 'score_time': array([0.00415897, 0.00485516, 0.00412893, 0.00413299, 0.00429797,
        0.00415397, 0.00407791, 0.00416732, 0.00426579, 0.00411892]),
 'test_score': array([0.70576179, 0.68708722, 0.70986108, 0.6929385 , 0.69111617,
        0.70387244, 0.69225513, 0.6929385 , 0.6952164 , 0.68997722]),
 'train_score': array([0.98772525, 0.98716846, 0.98739623, 0.9872194 , 0.98719409,
        0.98727001, 0.9872194 , 0.98709286, 0.9872194 , 0.98711816])}

### SGDClassifier

In [31]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=13, loss='modified_huber')
sgd_clf.fit(X_train_tfidf_vect, y_train)

train_pred = sgd_clf.predict(X_train_tfidf_vect)
test_pred = sgd_clf.predict(X_test_tfidf_vect)

print('SGDClassifier train accuracy score:', accuracy_score(y_train, train_pred))
print('SGDClassifier test accuracy score:', accuracy_score(y_test, test_pred))

SGDClassifier train accuracy score: 0.98599184565975
SGDClassifier test accuracy score: 0.7383381924198251


In [32]:
# 교차 검증

from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train_tfidf_vect, y_train, cv=5, scoring='accuracy')

array([0.72634096, 0.7209885 , 0.71643321, 0.72266515, 0.7214123 ])

In [33]:
# 오차 행렬

from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train_tfidf_vect, y_train, cv=3)

In [34]:
from sklearn.metrics import classification_report, confusion_matrix
confusion_matrix(y_train, y_train_pred)

array([[7491,  495,  962,  984,  580],
       [ 600, 3731,  695,  621,  194],
       [ 739,  335, 6481, 1117,  550],
       [ 681,  302, 1172, 9612,  267],
       [ 828,  204,  858,  615, 3789]])

In [35]:
params = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    'loss': ['log', 'modified_huber', 'hinge', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'elasticnet'],
    'n_jobs': [-1],
    'random_state': [13],
}

grid_cv = GridSearchCV(sgd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 18.6min finished


GridSearchCV(cv=3,
             estimator=SGDClassifier(loss='modified_huber', random_state=13),
             n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                   1000.0],
                         'loss': ['log', 'modified_huber', 'hinge',
                                  'squared_hinge', 'perceptron'],
                         'n_jobs': [-1], 'penalty': ['l2', 'elasticnet'],
                         'random_state': [13]},
             scoring='accuracy', verbose=1)

In [36]:
print('SGDClassifier best parameters:', grid_cv.best_params_)
print('SGDClassifier best accuracy score:', grid_cv.best_score_)

SGDClassifier best parameters: {'alpha': 0.0001, 'loss': 'hinge', 'n_jobs': -1, 'penalty': 'elasticnet', 'random_state': 13}
SGDClassifier best accuracy score: 0.7163746467210714


In [37]:
from sklearn.model_selection import cross_validate

cross_validate(sgd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([1.05610394, 1.25250316, 1.08370686, 1.10896802, 0.93880081]),
 'score_time': array([0.03086996, 0.02968979, 0.03088999, 0.03800392, 0.03698611]),
 'test_score': array([0.72634096, 0.7209885 , 0.71643321, 0.72266515, 0.7214123 ]),
 'train_score': array([0.98787085, 0.98735835, 0.98789932, 0.98770037, 0.98778578])}

### RidgeClassifier

In [38]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, SGDClassifier
rd_clf = RidgeClassifier()
params = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
    'normalize': [True, False],
    'max_iter': [100, 300],
    'random_state': [13]
}

grid_cv = GridSearchCV(rd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('RidgeClassifier best parameters:', grid_cv.best_params_)
print('RidgeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 21.6min finished


RidgeClassifier best parameters: {'alpha': 1.0, 'max_iter': 100, 'normalize': False, 'random_state': 13}
RidgeClassifier best accuracy score: 0.6614581266717651


In [39]:
rd_clf = RidgeClassifier(alpha=1.0, max_iter=100, normalize='False', random_state=13)
rd_clf.fit(X_train_tfidf_vect, y_train)

train_pred = rd_clf.predict(X_train_tfidf_vect)
test_pred = rd_clf.predict(X_test_tfidf_vect)
print('RidgeClassifier best train accuracy score:', accuracy_score(y_train, train_pred))
print('RidgeClassifier best test accuracy score:', accuracy_score(y_test, test_pred))

RidgeClassifier best train accuracy score: 0.9765619661526547
RidgeClassifier best test accuracy score: 0.6416727405247813


In [40]:
cross_validate(rd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([2.70073295, 2.72607589, 2.77982092, 2.74330807, 2.74756098,
        2.72330189, 2.70191693, 2.646837  , 2.75050712, 2.58716583]),
 'score_time': array([0.01098537, 0.01041603, 0.01049709, 0.01054311, 0.01037478,
        0.01036811, 0.01039505, 0.01050496, 0.01036692, 0.01022911]),
 'test_score': array([0.63356866, 0.62514234, 0.63721248, 0.61958998, 0.63280182,
        0.63917995, 0.62323462, 0.6498861 , 0.62961276, 0.62984055]),
 'train_score': array([0.9771968 , 0.97712087, 0.97744989, 0.97691899, 0.97734923,
        0.97750108, 0.97707084, 0.97734923, 0.97707084, 0.9769443 ])}

### LogisticRegression

In [41]:
clf = LogisticRegression()
params = {
    'C': [0.01, 0.1, 0.5, 1.0],
    'max_iter': [100, 200, 500],
}

grid_cv = GridSearchCV(clf, param_grid=params, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('LogisticRegression best parameters:', grid_cv.best_params_)
print('LogisticRegression best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 18.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 31.3min finished


LogisticRegression best parameters: {'C': 1.0, 'max_iter': 100}
LogisticRegression best accuracy score: 0.7127986600806929


In [42]:
lr_clf = LogisticRegression(C=1.0, max_iter=500)
lr_clf.fit(X_train_tfidf_vect, y_train)

train_pred = lr_clf.predict(X_train_tfidf_vect)
test_pred = lr_clf.predict(X_test_tfidf_vect)

print('LogisticRegression train accuracy score:', accuracy_score(y_train, train_pred))
print('LogisticRegression test accuracy score:', accuracy_score(y_test, test_pred))

LogisticRegression train accuracy score: 0.9789991572329909
LogisticRegression test accuracy score: 0.7223032069970845


### DecisionTreeClassifier

In [44]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train_tfidf_vect, y_train)

train_pred = dt_clf.predict(X_train_tfidf_vect)
test_pred = dt_clf.predict(X_test_tfidf_vect)

print('DecisionTreeClassifier train score :', accuracy_score(y_train, train_pred))
print('DecisionTreeClassifier test score :', accuracy_score(y_test, test_pred))

DecisionTreeClassifier train score : 0.9925289843518666
DecisionTreeClassifier test score : 0.4987244897959184


In [45]:
params = {
    'max_depth': [5, 10, 20, 120],
    'min_samples_split': [16, 24],
    'min_samples_leaf': [16, 32],
    'random_state': [13]
}

grid_cv = GridSearchCV(dt_clf, param_grid=params, verbose=1, n_jobs=-1, cv=5, scoring='accuracy')
grid_cv.fit(X_train_tfidf_vect, y_train)

print('DecisionTreeClassifier best parameters:', grid_cv.best_params_)
print('DecisionTreeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 11.4min finished


DecisionTreeClassifier best parameters: {'max_depth': 120, 'min_samples_leaf': 16, 'min_samples_split': 16, 'random_state': 13}
DecisionTreeClassifier best accuracy score: 0.47766670843213727


## 모델 검증

In [46]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5)
kf = SGDClassifier(random_state=13, loss='modified_huber')

In [47]:
for train_idx, test_idx in kfold.split(X):
    print(len(train_idx), len(test_idx))

43903 10976
43903 10976
43903 10976
43903 10976
43904 10975


In [48]:
# 교차 검증 구현하기

kf = KFold(n_splits=2)

print(kf.get_n_splits(X_train_tfidf_vect))
print(kf)
for train_idx, test_idx in kf.split(X_train_tfidf_vect):
    print('--- idx')
    print(train_idx, test_idx)
    print('--- train data')
    print(X_train_tfidf_vect)
    print('--- val data')
    print(X[test_idx])

2
KFold(n_splits=2, random_state=None, shuffle=False)
--- idx
[21952 21953 21954 ... 43900 43901 43902] [    0     1     2 ... 21949 21950 21951]
--- train data
  (0, 322212)	1
  (0, 326733)	1
  (0, 403309)	1
  (1, 178516)	1
  (1, 178828)	1
  (1, 203234)	1
  (1, 203243)	1
  (1, 263074)	1
  (1, 307601)	1
  (1, 307790)	1
  (1, 322212)	1
  (1, 326733)	1
  (1, 398145)	1
  (1, 398494)	1
  (1, 403309)	1
  (1, 405260)	1
  (1, 452521)	1
  (1, 452543)	1
  (2, 15942)	1
  (2, 16202)	1
  (2, 58545)	1
  (2, 59150)	1
  (2, 124342)	1
  (2, 124349)	1
  (2, 128482)	1
  :	:
  (43902, 259708)	1
  (43902, 259923)	1
  (43902, 264444)	1
  (43902, 266240)	1
  (43902, 267048)	1
  (43902, 282442)	1
  (43902, 282447)	1
  (43902, 302707)	1
  (43902, 302791)	1
  (43902, 330563)	1
  (43902, 330695)	1
  (43902, 372761)	1
  (43902, 372770)	1
  (43902, 372894)	1
  (43902, 372896)	1
  (43902, 408187)	1
  (43902, 408189)	1
  (43902, 417261)	1
  (43902, 417262)	1
  (43902, 439721)	1
  (43902, 439725)	1
  (43902, 461577)

KeyError: "None of [Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,\n                9,\n            ...\n            21942, 21943, 21944, 21945, 21946, 21947, 21948, 21949, 21950,\n            21951],\n           dtype='int64', length=21952)] are in the [columns]"

In [None]:
from sklearn.model_selection import cross_val_score

skfold = StratifiedKFold(n_splits=5)
cross_val_score(sgd_clf, X_train_tfidf_vect, y_train, cv=skfold, scoring=None)

In [None]:
cross_val_score(sgd_clf, X_train_tfidf_vect, y_train, cv=5, scoring='accuracy')

In [None]:
# train score 와 같이

from sklearn.model_selection import cross_validate

cross_validate(sgd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

In [None]:
test_x.text

In [None]:
X_train

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터 셋 변환.
tfidf_vect_test = TfidfVectorizer(stop_words='english', max_features=30000)
tfidf_vect_test.fit(test_x.text)
X_train_tfidf_test = tfidf_vect_test.transform(test_x.text)

In [None]:
pred

In [None]:
pred = lr_clf.predict(X_train_tfidf_test)

In [None]:
pred

In [None]:
test_x['author'] = pred

In [None]:
test_x