# count vectorizer만 적용하여 진행
- stopwords 적용하여 진행한 것과 단순한 비교를 위해

In [5]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

## 데이터 확인

In [6]:
# 파일 불러오기

train = pd.read_csv('./open/train.csv', encoding='utf-8')
test_x = pd.read_csv('./open/test_x.csv', encoding='utf-8')
submission = pd.read_csv('./open/sample_submission.csv', encoding='utf-8')

## 전처리

### train_test_split

In [7]:
X = train.loc[:, 'text']
y = train.loc[:, 'author']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

### 벡터라이즈

#### tfidfvectorizer

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# TF-IDF Vectorization 적용하여 학습 데이터셋과 테스트 데이터 셋 변환.
tfidf_vect = CountVectorizer(stop_words='english')
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

#### Multinomial Naive Bayes 적용

In [6]:
from sklearn.naive_bayes import MultinomialNB
mu_clf = MultinomialNB().fit(X_train_tfidf_vect, y_train)

In [7]:
# 정식으로 pipeline을 만들고

from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

In [8]:
# 학습 후 train, test accuracy score 적용

from sklearn.metrics import accuracy_score

mu_clf.fit(X_train_tfidf_vect, y_train)

train_pred = mu_clf.predict(X_train_tfidf_vect)
test_pred = mu_clf.predict(X_test_tfidf_vect)

print('MultinomialNB train accuracy score:', accuracy_score(y_train, train_pred))
print('MultinomialNB test accuracy score:', accuracy_score(y_test, test_pred))
# predicted = text_clf.predict(X_test)

# np.mean(predicted == y_test)


MultinomialNB train accuracy score: 0.7595152950823406
MultinomialNB test accuracy score: 0.6868622448979592


In [9]:
from sklearn.metrics import classification_report

# print(confusion_matrix(y_test, pred))
# print('-'*50)
print(classification_report(y_train, train_pred))
print('-----------'*5)
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

           0       0.73      0.85      0.79     10512
           1       0.94      0.59      0.73      5841
           2       0.85      0.68      0.76      9222
           3       0.66      0.96      0.78     12034
           4       0.97      0.49      0.65      6294

    accuracy                           0.76     43903
   macro avg       0.83      0.72      0.74     43903
weighted avg       0.80      0.76      0.75     43903

-------------------------------------------------------
              precision    recall  f1-score   support

           0       0.67      0.79      0.72      2723
           1       0.89      0.50      0.64      1381
           2       0.76      0.57      0.65      2332
           3       0.61      0.92      0.73      3029
           4       0.93      0.37      0.53      1511

    accuracy                           0.69     10976
   macro avg       0.77      0.63      0.66     10976
weighted avg       0.

#### 여러 모델들 적용

In [12]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier, RidgeClassifierCV, SGDClassifier
from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


models = []
models.append(('LogisticRegression', LogisticRegression(random_state=13)))
models.append(('MultinomialNB', MultinomialNB()))
models.append(('RandomForestClassifier', RandomForestClassifier(random_state=13, n_jobs=-1)))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier(random_state=13)))
models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=13)))
models.append(('GradientBoostingClassifier', GradientBoostingClassifier(random_state=13)))
models.append(('LGBMClassifier', LGBMClassifier(random_state=13)))
models.append(('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=5, n_jobs=-1)))
models.append(('LinearSVC', LinearSVC(C=1, loss='hinge', random_state=13)))
models.append(('XgBoost', XGBClassifier(learning_rate=0.1, max_depth=3, random_state=13, n_jobs=-1)))
models.append(('RidgeClassifier', RidgeClassifier(random_state=13)))
models.append(('SGDClassifier', SGDClassifier(random_state=13, loss='modified_huber')))
# models.append(('RidgeClassifierCV', RidgeClassifierCV(cv=3)))

In [11]:
models

[('LogisticRegression', LogisticRegression(random_state=13)),
 ('MultinomialNB', MultinomialNB()),
 ('RandomForestClassifier',
  RandomForestClassifier(n_jobs=-1, random_state=13)),
 ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=13)),
 ('AdaBoostClassifier', AdaBoostClassifier(random_state=13)),
 ('GradientBoostingClassifier', GradientBoostingClassifier(random_state=13)),
 ('LGBMClassifier', LGBMClassifier(random_state=13)),
 ('KNeighborsClassifier', KNeighborsClassifier(n_jobs=-1)),
 ('LinearSVC', LinearSVC(C=1, loss='hinge', random_state=13)),
 ('XgBoost',
  XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
                colsample_bynode=None, colsample_bytree=None, gamma=None,
                gpu_id=None, importance_type='gain', interaction_constraints=None,
                learning_rate=0.1, max_delta_step=None, max_depth=3,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                n_estimators=100, n_jobs=-

In [None]:
from sklearn.metrics import accuracy_score

train_score = []
test_score = []
names = []

for name, model in models:
    clf = model
    clf.fit(X_train_tfidf_vect, y_train)
    
    train_pred = clf.predict(X_train_tfidf_vect)
    test_pred = clf.predict(X_test_tfidf_vect)
    
    names.append(name)
    train_score.append(accuracy_score(y_train, train_pred))
    test_score.append(accuracy_score(y_test, test_pred))

In [None]:
result = pd.DataFrame({'model name': names,
                       'train score': train_score,
                       'test score': test_score})
result['diff'] = result['train score'] - result['test score']
result.round(2).sort_values(by='train score', ascending=False).reset_index(drop=True)

## 모델 하이퍼파라미터 튜닝 및 분류 알고리즘 모델 적요

In [13]:
tfidf_vect = CountVectorizer(stop_words='english')
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

### MultinomialNB

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

params = {'alpha': [0.01, 0.1, 0.5, 1.0],
         'fit_prior': ['True', 'False']}
clf = MultinomialNB()
grid_cv = GridSearchCV(clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('MultinomialNB best parameters:', grid_cv.best_params_)
print('MultinomialNB best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


MultinomialNB best parameters: {'alpha': 0.1, 'fit_prior': 'True'}
MultinomialNB best accuracy score: 0.7232079814629525


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    1.7s finished


In [15]:
mu_clf = MultinomialNB(alpha=0.1, fit_prior='True')
mu_clf.fit(X_train_tfidf_vect, y_train)

mu_train_pred = mu_clf.predict(X_train_tfidf_vect)
mu_test_pred = mu_clf.predict(X_test_tfidf_vect)

print('MultinomialNB train accuracy score', accuracy_score(y_train, mu_train_pred))
print('MultinomialNB test accuracy score', accuracy_score(y_test, mu_test_pred))

MultinomialNB train accuracy score 0.8094891009725986
MultinomialNB test accuracy score 0.7311406705539358


In [16]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5)

In [17]:
from sklearn.model_selection import cross_validate
cross_validate(mu_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.01932073, 0.01523209, 0.01498818, 0.01482701, 0.01631212]),
 'score_time': array([0.00283217, 0.00265217, 0.00246096, 0.00296497, 0.00250483]),
 'test_score': array([0.73260449, 0.72554379, 0.72429108, 0.73359909, 0.73382688]),
 'train_score': array([0.81364956, 0.81541484, 0.81604123, 0.81425277, 0.81416735])}

### LinearSVC

In [18]:
import sklearn
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

svc_clf = LinearSVC(C=1, loss='hinge', random_state=13)
svc_clf.fit(X_train_tfidf_vect, y_train)

svc_train_pred = svc_clf.predict(X_train_tfidf_vect)
svc_test_pred = svc_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, svc_train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, svc_test_pred))

LinearSVC train accuracy score: 0.8979113044666651
LinearSVC test accuracy score: 0.7117346938775511


In [19]:
params = {
    'C': [0.001, 0.01, 1], 
    'loss': ['squared_hinge', 'hinge'], 
    'penalty': ['l1', 'l2'], 
    'random_state': [13]
    }

grid_cv = GridSearchCV(LinearSVC(), param_grid=params, refit=True, verbose=2)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l1, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.4s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.4s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.4s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.4s
[CV] C=0.001, loss=squared_hinge, penalty=l2, random_state=13 ........
[CV]  C=0.001, loss=squared_hinge, penalty=l2, random_state=13, total=   0.3s
[CV] C=0.001, loss=hinge, penalty=l1, random_state=13 ................
[CV] . C=0.001, loss=hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=hinge, penalty=l1, random_state=13 ................
[CV] . C=0.001, loss=hinge, penalty=l1, random_state=13, total=   0.0s
[CV] C=0.001, loss=hinge, penalty=l1, rand

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:  1.1min finished


GridSearchCV(estimator=LinearSVC(),
             param_grid={'C': [0.001, 0.01, 1],
                         'loss': ['squared_hinge', 'hinge'],
                         'penalty': ['l1', 'l2'], 'random_state': [13]},
             verbose=2)

In [20]:
print('LinearSVC Best parameters:', grid_cv.best_params_)
print('LinearSVC Best accruacy score:', grid_cv.best_score_)

LinearSVC Best parameters: {'C': 1, 'loss': 'hinge', 'penalty': 'l2', 'random_state': 13}
LinearSVC Best accruacy score: 0.704234437627939


In [21]:
svc_svm_clf = LinearSVC(C=1, loss='hinge', penalty='l2', random_state=13)
svc_svm_clf.fit(X_train_tfidf_vect, y_train)

svc_train_pred = svc_svm_clf.predict(X_train_tfidf_vect)
svc_test_pred = svc_svm_clf.predict(X_test_tfidf_vect)

print('LinearSVC train accuracy score:', accuracy_score(y_train, svc_train_pred))
print('LinearSVC test accuracy score:', accuracy_score(y_test, svc_test_pred)) 

LinearSVC train accuracy score: 0.8979113044666651
LinearSVC test accuracy score: 0.7117346938775511


In [22]:
svc_svm_clf.decision_function(X_test_tfidf_vect) > 0

array([[ True, False, False, False, False],
       [False, False, False,  True, False],
       [False, False, False, False,  True],
       ...,
       [False, False, False, False, False],
       [False, False, False,  True, False],
       [False, False, False, False,  True]])

In [23]:
# 교차검증

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=13)

In [24]:
from sklearn.model_selection import cross_validate, cross_val_score

cross_validate(svc_svm_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([5.5141511 , 5.211555  , 5.40136385, 5.23832273, 5.35703301,
        5.15500402, 5.47970581, 5.34518981, 5.41233492, 5.20682979]),
 'score_time': array([0.00160193, 0.00153708, 0.00156641, 0.00153708, 0.00162101,
        0.00159597, 0.00179029, 0.00166917, 0.00221896, 0.00164723]),
 'test_score': array([0.70690048, 0.70576179, 0.72170348, 0.70410023, 0.71822323,
        0.71343964, 0.70205011, 0.7143508 , 0.71230068, 0.70797267]),
 'train_score': array([0.90215631, 0.90309273, 0.90048593, 0.90155139, 0.90279149,
        0.90031129, 0.90380381, 0.90101992, 0.90142485, 0.90129831])}

### SGDClassifier

In [25]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=13, loss='modified_huber')
sgd_clf.fit(X_train_tfidf_vect, y_train)

sgd_train_pred = sgd_clf.predict(X_train_tfidf_vect)
sgd_test_pred = sgd_clf.predict(X_test_tfidf_vect)

print('SGDClassifier train accuracy score:', accuracy_score(y_train, sgd_train_pred))
print('SGDClassifier test accuracy score:', accuracy_score(y_test, sgd_test_pred))

SGDClassifier train accuracy score: 0.8951096735986152
SGDClassifier test accuracy score: 0.7173833819241983


In [26]:
# 교차 검증

from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train_tfidf_vect, y_train, cv=5, scoring='accuracy')

array([0.7190525 , 0.71654709, 0.70937251, 0.71366743, 0.71753986])

In [27]:
# 오차 행렬

from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train_tfidf_vect, y_train, cv=3)

In [28]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train, y_train_pred)

array([[7406,  523,  874, 1176,  533],
       [ 663, 3716,  608,  676,  178],
       [ 829,  306, 6154, 1336,  597],
       [ 707,  296,  956, 9784,  291],
       [ 914,  184,  728,  626, 3842]])

In [29]:
params = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    'loss': ['log', 'modified_huber', 'hinge', 'squared_hinge', 'perceptron'],
    'penalty': ['l2', 'elasticnet'],
    'n_jobs': [-1],
    'random_state': [13],
}

grid_cv = GridSearchCV(sgd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)

Fitting 3 folds for each of 80 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   57.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  4.5min finished


GridSearchCV(cv=3,
             estimator=SGDClassifier(loss='modified_huber', random_state=13),
             n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                   1000.0],
                         'loss': ['log', 'modified_huber', 'hinge',
                                  'squared_hinge', 'perceptron'],
                         'n_jobs': [-1], 'penalty': ['l2', 'elasticnet'],
                         'random_state': [13]},
             scoring='accuracy', verbose=1)

In [30]:
print('SGDClassifier best parameters:', grid_cv.best_params_)
print('SGDClassifier best accuracy score:', grid_cv.best_score_)

SGDClassifier best parameters: {'alpha': 0.0001, 'loss': 'hinge', 'n_jobs': -1, 'penalty': 'l2', 'random_state': 13}
SGDClassifier best accuracy score: 0.7073092822185862


In [31]:
from sklearn.model_selection import cross_validate

cross_validate(sgd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skfold, return_train_score=True)

{'fit_time': array([0.53370309, 0.49738312, 0.47573876, 0.47913599, 0.52821708]),
 'score_time': array([0.00333405, 0.00287819, 0.00293088, 0.00274682, 0.00280905]),
 'test_score': array([0.7190525 , 0.71654709, 0.70937251, 0.71366743, 0.71753986]),
 'train_score': array([0.90333694, 0.90826263, 0.90752235, 0.90456396, 0.9052188 ])}

### RidgeClassifier

In [32]:
rd_clf = RidgeClassifier()
params = {
    'alpha': [0.01, 0.1, 0.5, 1.0],
    'normalize': [True, False],
    'max_iter': [100, 300],
    'random_state': [13]
}

grid_cv = GridSearchCV(rd_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('RidgeClassifier best parameters:', grid_cv.best_params_)
print('RidgeClassifier best accuracy score:', grid_cv.best_score_)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:  2.1min finished


RidgeClassifier best parameters: {'alpha': 1.0, 'max_iter': 100, 'normalize': True, 'random_state': 13}
RidgeClassifier best accuracy score: 0.6771520199732993


In [33]:
rd_clf = RidgeClassifier(alpha=1.0, max_iter=100, normalize='False', random_state=13)
rd_clf.fit(X_train_tfidf_vect, y_train)

rd_train_pred = rd_clf.predict(X_train_tfidf_vect)
rd_test_pred = rd_clf.predict(X_test_tfidf_vect)
print('RidgeClassifier best train accuracy score:', accuracy_score(y_train, rd_train_pred))
print('RidgeClassifier best test accuracy score:', accuracy_score(y_test, rd_test_pred))

RidgeClassifier best train accuracy score: 0.8224950458966358
RidgeClassifier best test accuracy score: 0.6926020408163265


In [34]:
cross_validate(rd_clf, X_train_tfidf_vect, y_train, scoring=None, cv=skf, return_train_score=True)

{'fit_time': array([0.36988807, 0.38548708, 0.35606241, 0.35790896, 0.37560391,
        0.3464241 , 0.35900211, 0.35351133, 0.36649895, 0.34965181]),
 'score_time': array([0.00187206, 0.00188708, 0.0020349 , 0.00190687, 0.00231695,
        0.00206494, 0.00187492, 0.0020318 , 0.0019021 , 0.00228   ]),
 'test_score': array([0.6905033 , 0.68002733, 0.69665224, 0.68200456, 0.69794989,
        0.6977221 , 0.6856492 , 0.70091116, 0.68246014, 0.6881549 ]),
 'train_score': array([0.82653371, 0.82698927, 0.82633124, 0.8268418 , 0.82580417,
        0.8252727 , 0.82628502, 0.82636094, 0.8265381 , 0.82722142])}

### LogisticRegression

In [35]:
clf = LogisticRegression()
params = {
    'C': [0.01, 0.1, 0.5, 1.0],
    'max_iter': [100, 200, 500],
}

grid_cv = GridSearchCV(clf, param_grid=params, scoring='accuracy', verbose=1, n_jobs=-1)
grid_cv.fit(X_train_tfidf_vect, y_train)
print('LogisticRegression best parameters:', grid_cv.best_params_)
print('LogisticRegression best accuracy score:', grid_cv.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.6min finished


LogisticRegression best parameters: {'C': 1.0, 'max_iter': 100}
LogisticRegression best accuracy score: 0.7143476116765879


In [36]:
lr_clf = LogisticRegression(C=1.0, max_iter=500)
lr_clf.fit(X_train_tfidf_vect, y_train)

lr_train_pred = lr_clf.predict(X_train_tfidf_vect)
lr_test_pred = lr_clf.predict(X_test_tfidf_vect)

print('LogisticRegression train accuracy score:', accuracy_score(y_train, lr_train_pred))
print('LogisticRegression test accuracy score:', accuracy_score(y_test, lr_test_pred))

LogisticRegression train accuracy score: 0.8864314511536797
LogisticRegression test accuracy score: 0.7166545189504373


## 모델 결과

In [38]:
t_models = []

t_models.append(('MultinomialNB', MultinomialNB(alpha=0.5, fit_prior='True')))
t_models.append(('LinearSVC', LinearSVC(C=1, loss='hinge', penalty='l2', random_state=13)))
t_models.append(('SGDClassifier', SGDClassifier(alpha=0.0001, loss='modified_huber', n_jobs=-1, penalty='l2', random_state=13)))
t_models.append(('RidgeClassifier', RidgeClassifier(alpha=1.0, max_iter=100, normalize='False', random_state=13)))
t_models.append(('LogisticRegression', LogisticRegression(C=1.0, max_iter=500)))

In [39]:
from sklearn.metrics import accuracy_score

train_score = []
test_score = []
names = []

for name, model in t_models:
    clf = model
    clf.fit(X_train_tfidf_vect, y_train)
    
    train_pred = clf.predict(X_train_tfidf_vect)
    test_pred = clf.predict(X_test_tfidf_vect)
    
    names.append(name)
    train_score.append(accuracy_score(y_train, train_pred))
    test_score.append(accuracy_score(y_test, test_pred))

In [40]:
result = pd.DataFrame({'model name': names,
                       'train score': train_score,
                       'test score': test_score})
result['diff'] = result['train score'] - result['test score']
result.round(2).sort_values(by='train score', ascending=False).reset_index(drop=True)

Unnamed: 0,model name,train score,test score,diff
0,LinearSVC,0.9,0.71,0.19
1,SGDClassifier,0.9,0.72,0.18
2,LogisticRegression,0.89,0.72,0.17
3,RidgeClassifier,0.82,0.69,0.13
4,MultinomialNB,0.8,0.73,0.07
