## Vectorizers + Models
#### Currency exchange topic

### Load

Tokens

In [112]:
import warnings
warnings.filterwarnings('ignore')

In [113]:
import pandas as pd
import numpy as np

In [114]:
tokens_izq = pd.read_pickle('tokens/topic_tokens_izq_new.pkl')
tokens_der = pd.read_pickle('tokens/topic_tokens_der_new.pkl')

In [115]:
tokens_izq_ex = tokens_izq[tokens_izq.topics == 'exchange'].drop('topics', axis=1)
tokens_izq_ex.insert(loc=0, column='publication', value='left')

In [116]:
tokens_der_ex = tokens_der[tokens_der.topics == 'exchange'].drop('topics', axis=1)
tokens_der_ex.insert(loc=0, column='publication', value='right')

In [117]:
tokens = tokens_izq_ex.append(tokens_der_ex)
tokens.reset_index(inplace=True)
tokens.drop('index', axis=1, inplace=True)
tokens.fillna(0, inplace=True)

In [118]:
tokens.sample(5)

Unnamed: 0,publication,abajo,abarca,abastecimiento,abc,abierta,abiertamente,abierto,abiertos,abre,...,vuelva,vuelvan,vulgarmente,woyecheszen,xx,yaguarete,york,yuanes,zamora,zonas
10,left,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,left,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,right,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,right,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
53,right,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Train - Test Split

In [119]:
from sklearn.model_selection import train_test_split

In [120]:
# data balance
tokens.publication.value_counts(normalize=True)

right    0.535714
left     0.464286
Name: publication, dtype: float64

In [121]:
X = tokens.drop('publication', axis=1)

In [122]:
y = tokens['publication']

In [123]:
train, test, y_train, y_test = train_test_split(X, y, train_size=0.75, stratify=tokens.publication)

Shapes and balance

In [124]:
# shape train
display(train.shape[0],
        train.shape[0] / X.shape[0])

42

0.75

In [125]:
# shape test
display(test.shape[0],
        test.shape[0] / X.shape[0])

14

0.25

In [126]:
# data balance - test
display(y_train.value_counts(normalize=True), 
        y_test.value_counts(normalize=True))

right    0.52381
left     0.47619
Name: publication, dtype: float64

right    0.571429
left     0.428571
Name: publication, dtype: float64

Vectorización con TF-IDF

In [127]:
from sklearn.feature_extraction.text import TfidfTransformer

In [128]:
tfidf_vector = TfidfTransformer().fit(train)
train_tfidf = tfidf_vector.transform(train)

In [129]:
test_tfidf = tfidf_vector.transform(test)

#### Models calling

In [130]:
from sklearn.naive_bayes import MultinomialNB

In [131]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [132]:
from sklearn.neighbors import KNeighborsClassifier

In [133]:
from sklearn.tree import DecisionTreeClassifier

In [134]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [135]:
from xgboost import XGBClassifier

#### K-Folds + Cross Validation

In [136]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [137]:
from sklearn.metrics import accuracy_score

In [138]:
folds = StratifiedKFold(n_splits=5, random_state=19, shuffle=True)

In [139]:
estimators = [
    ('MNB', MultinomialNB()),
    ('LR', LogisticRegression()),
    ('SGD', SGDClassifier()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('ADA', AdaBoostClassifier()),
    ('GDB', GradientBoostingClassifier()),
    ('XGB', XGBClassifier())
]
results = []

In [140]:
results = []
scoring = 'accuracy'
print("model\tCV mean\t CV std")
for name, model in estimators:
 	cv_results = cross_val_score(model, X, y, cv=folds, scoring='accuracy')
 	results.append(cv_results)
 	msg = "%s:\t%f (%f)" % (name, cv_results.mean(), cv_results.std())
 	print(msg)

model	CV mean	 CV std
MNB:	0.963636 (0.072727)
LR:	0.928788 (0.035727)
SGD:	0.946970 (0.072029)
KNN:	0.463636 (0.018182)
CART:	0.928788 (0.067692)
ADA:	0.928788 (0.035727)
GDB:	0.928788 (0.035727)
XGB:	0.928788 (0.035727)


### GridSearchCV (Vectorization + Models)

In [141]:
from sklearn.model_selection import GridSearchCV

### 1. Naive Bayes

CountVectorizer

In [142]:
parameters = {
        'alpha': (1e-2, 1e-3, 1e-1),
    }

In [143]:
grid_mnb = GridSearchCV(MultinomialNB(), parameters, cv=folds, scoring='accuracy')

In [144]:
grid_mnb.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=MultinomialNB(),
             param_grid={'alpha': (0.01, 0.001, 0.1)}, scoring='accuracy')

In [145]:
mnb_train_score_cv = grid_mnb.score(train, y_train)
mnb_test_score_cv = accuracy_score(grid_mnb.predict(test), y_test)

In [146]:
mnb_cv_fold_test_mean = grid_mnb.cv_results_['mean_test_score'].mean()
mnb_cv_fold_test_std = grid_mnb.cv_results_['std_test_score'].mean()

In [147]:
print(f'''Best parameters: {grid_mnb.best_params_}
Mean cv test score: {grid_mnb.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_mnb.cv_results_['std_test_score'].mean()}
Train score: {mnb_train_score_cv}
Test test score: {mnb_test_score_cv}''')

Best parameters: {'alpha': 0.01}
Mean cv test score: 0.9444444444444443
Std cv test score: 0.08176554381711859
Train score: 1.0
Test test score: 1.0


Tfidfvectorizer

In [148]:
grid_mnb.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=MultinomialNB(),
             param_grid={'alpha': (0.01, 0.001, 0.1)}, scoring='accuracy')

In [149]:
mnb_train_score_td = grid_mnb.score(train_tfidf, y_train)
mnb_test_score_td = accuracy_score(grid_mnb.predict(test_tfidf), y_test)

In [150]:
mnb_td_fold_test_mean = grid_mnb.cv_results_['mean_test_score'].mean()
mnb_td_fold_test_std = grid_mnb.cv_results_['std_test_score'].mean()

In [151]:
print(f'''Best parameters: {grid_mnb.best_params_}
Mean cv test score: {grid_mnb.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_mnb.cv_results_['std_test_score'].mean()}
Train score: {mnb_train_score_td}
Test test score: {mnb_test_score_td}''')

Best parameters: {'alpha': 0.01}
Mean cv test score: 0.9518518518518518
Std cv test score: 0.07200113521818723
Train score: 1.0
Test test score: 1.0


### 2. LogisticRegression

CountVectorizer

In [152]:
parameters = {
        'C': [1, 10, 100, 1000],
        'penalty': ['l1', 'l2',],
        'solver': ['saga']
    }

In [153]:
grid_log = GridSearchCV(LogisticRegression(), parameters, cv=folds, scoring='accuracy')

In [154]:
grid_log.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 100, 1000], 'penalty': ['l1', 'l2'],
                         'solver': ['saga']},
             scoring='accuracy')

In [155]:
log_cv_train_score = grid_log.score(train, y_train)
log_cv_test_score = accuracy_score(grid_log.predict(test), y_test)

In [156]:
log_cv_fold_test_mean = grid_log.cv_results_['mean_test_score'].mean()
log_cv_fold_test_std = grid_log.cv_results_['std_test_score'].mean()

In [157]:
print(f'''Best parameters: {grid_log.best_params_}
Mean cv test score: {grid_log.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_log.cv_results_['std_test_score'].mean()}
Train score: {log_cv_train_score}
Test test score: {log_cv_test_score}''')

Best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
Mean cv test score: 0.8635416666666667
Std cv test score: 0.17048558389891702
Train score: 1.0
Test test score: 1.0


Tfidfvectorizer

In [158]:
grid_log.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 100, 1000], 'penalty': ['l1', 'l2'],
                         'solver': ['saga']},
             scoring='accuracy')

In [159]:
log_td_train_score = grid_log.score(train_tfidf, y_train)
log_td_test_score = accuracy_score(grid_log.predict(test_tfidf), y_test)

In [160]:
log_td_fold_test_mean = grid_log.cv_results_['mean_test_score'].mean()
log_td_fold_test_std = grid_log.cv_results_['std_test_score'].mean()

In [161]:
print(f'''Best parameters: {grid_log.best_params_}
Mean cv test score: {grid_log.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_log.cv_results_['std_test_score'].mean()}
Train score: {log_td_train_score}
Test test score: {log_td_test_score}''')

Best parameters: {'C': 1000, 'penalty': 'l2', 'solver': 'saga'}
Mean cv test score: 0.8604166666666666
Std cv test score: 0.10151172476361903
Train score: 1.0
Test test score: 1.0


### 3. SGDClassifier

CountVectorizer

In [162]:
parameters ={
    'penalty': ('l2', 'elasticnet', 'l1'),
    'max_iter': [50, 80],
    'tol': [1e-4],
    'loss': ['hinge', 'log', 'modified_huber'],
}

In [163]:
grid_sgd = GridSearchCV(SGDClassifier(), parameters, cv=folds, scoring='accuracy')

In [164]:
grid_sgd.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=SGDClassifier(),
             param_grid={'loss': ['hinge', 'log', 'modified_huber'],
                         'max_iter': [50, 80],
                         'penalty': ('l2', 'elasticnet', 'l1'),
                         'tol': [0.0001]},
             scoring='accuracy')

In [165]:
sgd_cv_train_score = grid_sgd.score(train, y_train)
sgd_cv_test_score = accuracy_score(grid_sgd.predict(test), y_test)

In [166]:
sgd_cv_fold_test_mean = grid_sgd.cv_results_['mean_test_score'].mean()
sgd_cv_fold_test_std = grid_sgd.cv_results_['std_test_score'].mean()

In [167]:
print(f'''Best parameters: {grid_sgd.best_params_}
Mean cv test score: {grid_sgd.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_sgd.cv_results_['std_test_score'].mean()}
Train score: {sgd_cv_train_score}
Test test score: {sgd_cv_test_score}''')

Best parameters: {'loss': 'hinge', 'max_iter': 50, 'penalty': 'l2', 'tol': 0.0001}
Mean cv test score: 0.9001543209876542
Std cv test score: 0.11970815625310943
Train score: 1.0
Test test score: 0.9285714285714286


Tfidfvectorizer

In [168]:
grid_sgd.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=SGDClassifier(),
             param_grid={'loss': ['hinge', 'log', 'modified_huber'],
                         'max_iter': [50, 80],
                         'penalty': ('l2', 'elasticnet', 'l1'),
                         'tol': [0.0001]},
             scoring='accuracy')

In [169]:
sgd_td_train_score = grid_sgd.score(train_tfidf, y_train)
sgd_td_test_score = accuracy_score(grid_sgd.predict(test_tfidf), y_test)

In [170]:
sgd_td_fold_test_mean = grid_sgd.cv_results_['mean_test_score'].mean()
sgd_td_fold_test_std = grid_sgd.cv_results_['std_test_score'].mean()

In [171]:
print(f'''Best parameters: {grid_sgd.best_params_}
Mean cv test score: {grid_sgd.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_sgd.cv_results_['std_test_score'].mean()}
Train score: {log_td_train_score}
Test test score: {log_td_test_score}''')

Best parameters: {'loss': 'hinge', 'max_iter': 50, 'penalty': 'l2', 'tol': 0.0001}
Mean cv test score: 0.9208333333333333
Std cv test score: 0.10841773961556156
Train score: 1.0
Test test score: 1.0


### 4. KNeighborsClassifier

CountVectorizer

In [172]:
parameters ={
        'n_neighbors': range(1,4),
        'weights' : ['uniform', 'distance'],
        'p' : [1, 2, 3]
}

In [173]:
grid_knn = GridSearchCV(KNeighborsClassifier(), parameters, cv=folds, scoring='accuracy')

In [174]:
grid_knn.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 4), 'p': [1, 2, 3],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [175]:
knn_cv_train_score = grid_knn.score(train, y_train)
knn_cv_test_score = accuracy_score(grid_knn.predict(test), y_test)

In [176]:
knn_cv_fold_test_mean = grid_knn.cv_results_['mean_test_score'].mean()
knn_cv_fold_test_std = grid_knn.cv_results_['std_test_score'].mean()

In [177]:
print(f'''Best parameters: {grid_knn.best_params_}
Mean cv test score: {grid_knn.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_knn.cv_results_['std_test_score'].mean()}
Train score: {knn_cv_train_score}
Test test score: {knn_cv_test_score}''')

Best parameters: {'n_neighbors': 1, 'p': 3, 'weights': 'uniform'}
Mean cv test score: 0.5121913580246914
Std cv test score: 0.05376097133307451
Train score: 1.0
Test test score: 0.5


Tfidfvectorizer

In [178]:
grid_knn.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 4), 'p': [1, 2, 3],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [179]:
knn_td_train_score = grid_knn.score(train_tfidf, y_train)
knn_td_test_score = accuracy_score(grid_knn.predict(test_tfidf), y_test)

In [180]:
knn_td_fold_test_mean = grid_knn.cv_results_['mean_test_score'].mean()
knn_td_fold_test_std = grid_knn.cv_results_['std_test_score'].mean()

In [181]:
print(f'''Best parameters: {grid_knn.best_params_}
Best cv score: {grid_knn.best_score_}
Train score: {knn_td_train_score}
Test test score: {knn_td_test_score}''')

Best parameters: {'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
Best cv score: 0.9055555555555556
Train score: 1.0
Test test score: 0.8571428571428571


### 5. CART

CountVectorizer

In [182]:
parameters ={
        "criterion" : ["gini", "entropy"],
        "min_samples_leaf": [5,10,15,20,2], 
        "max_depth" : [1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17],
        "min_samples_split": [2, 3, 4,None]
}

In [183]:
grid_dtc = GridSearchCV(DecisionTreeClassifier(), parameters, cv=folds, scoring='accuracy')

In [184]:
grid_dtc.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13,
                                       14, 15, 16, 17],
                         'min_samples_leaf': [5, 10, 15, 20, 2],
                         'min_samples_split': [2, 3, 4, None]},
             scoring='accuracy')

In [185]:
dtc_cv_train_score = grid_dtc.score(train, y_train)
dtc_cv_test_score = accuracy_score(grid_dtc .predict(test), y_test)

In [186]:
dtc_cv_fold_test_mean = grid_dtc.cv_results_['mean_test_score'].mean()
dtc_cv_fold_test_std = grid_dtc.cv_results_['std_test_score'].mean()

In [187]:
print(f'''Best parameters: {grid_dtc.best_params_}
Mean cv test score: {grid_dtc.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_dtc.cv_results_['std_test_score'].mean()}
Train score: {dtc_cv_train_score}
Test test score: {dtc_cv_test_score}''')

Best parameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 4}
Mean cv test score: nan
Std cv test score: nan
Train score: 1.0
Test test score: 0.9285714285714286


Tfidfvectorizer

In [188]:
grid_dtc.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13,
                                       14, 15, 16, 17],
                         'min_samples_leaf': [5, 10, 15, 20, 2],
                         'min_samples_split': [2, 3, 4, None]},
             scoring='accuracy')

In [189]:
dtc_td_train_score = grid_dtc.score(train_tfidf, y_train)
dtc_td_test_score = accuracy_score(grid_dtc.predict(test_tfidf), y_test)

In [190]:
dtc_td_fold_test_mean = grid_dtc.cv_results_['mean_test_score'].mean()
dtc_td_fold_test_std = grid_dtc.cv_results_['std_test_score'].mean()

In [191]:
print(f'''Best parameters: {grid_dtc.best_params_}
Mean cv test score: {grid_dtc.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_dtc.cv_results_['std_test_score'].mean()}
Train score: {knn_td_train_score}
Test test score: {knn_td_test_score}''')

Best parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2}
Mean cv test score: nan
Std cv test score: nan
Train score: 1.0
Test test score: 0.8571428571428571


### 6. AdaBoost

CountVectorizer

In [192]:
parameters = {
    'learning_rate': [.5]
}

In [193]:
grid_ada = GridSearchCV(AdaBoostClassifier(), parameters, cv=folds, scoring='accuracy')

In [194]:
grid_ada.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5]}, scoring='accuracy')

In [195]:
ada_cv_train_score = grid_ada.score(train, y_train)
ada_cv_test_score = accuracy_score(grid_ada .predict(test), y_test)

In [196]:
ada_cv_fold_test_mean = grid_ada.cv_results_['mean_test_score'].mean()
ada_cv_fold_test_std = grid_ada.cv_results_['std_test_score'].mean()

In [197]:
print(f'''Best parameters: {grid_ada.best_params_}
Mean cv test score: {grid_ada.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_ada.cv_results_['std_test_score'].mean()}
Train score: {ada_cv_train_score}
Test test score: {ada_cv_test_score}''')

Best parameters: {'learning_rate': 0.5}
Mean cv test score: 0.95
Std cv test score: 0.09999999999999999
Train score: 1.0
Test test score: 0.9285714285714286


Tfidfvectorizer

In [198]:
grid_ada.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5]}, scoring='accuracy')

In [199]:
ada_td_train_score = grid_ada.score(train_tfidf, y_train)
ada_td_test_score = accuracy_score(grid_ada.predict(test_tfidf), y_test)

In [200]:
ada_td_fold_test_mean = grid_ada.cv_results_['mean_test_score'].mean()
ada_td_fold_test_std = grid_ada.cv_results_['std_test_score'].mean()

In [201]:
print(f'''Best parameters: {grid_ada.best_params_}
Mean cv test score: {grid_ada.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_ada.cv_results_['std_test_score'].mean()}
Train score: {ada_td_train_score}
Test test score: {ada_td_test_score}''')

Best parameters: {'learning_rate': 0.5}
Mean cv test score: 0.9027777777777779
Std cv test score: 0.09296222517045283
Train score: 1.0
Test test score: 0.9285714285714286


### 7. GradientBoosting

CountVectorizer

In [202]:
parameters = {
    'loss':['deviance', 'exponential']
}

In [203]:
grid_grb = GridSearchCV(GradientBoostingClassifier(), parameters, cv=folds, scoring='accuracy')

In [204]:
grid_grb.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=GradientBoostingClassifier(),
             param_grid={'loss': ['deviance', 'exponential']},
             scoring='accuracy')

In [205]:
grb_cv_train_score = grid_grb.score(train, y_train)
grb_cv_test_score = accuracy_score(grid_grb.predict(test), y_test)

In [206]:
grb_cv_fold_test_mean = grid_grb.cv_results_['mean_test_score'].mean()
grb_cv_fold_test_std = grid_grb.cv_results_['std_test_score'].mean()

In [207]:
print(f'''Best parameters: {grid_grb.best_params_}
Mean cv test score: {grid_grb.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_grb.cv_results_['std_test_score'].mean()}
Train score: {grb_cv_train_score}
Test test score: {grb_cv_test_score}''')

Best parameters: {'loss': 'deviance'}
Mean cv test score: 0.95
Std cv test score: 0.09999999999999999
Train score: 1.0
Test test score: 0.9285714285714286


Tfidfvectorizer

In [208]:
grid_grb.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=GradientBoostingClassifier(),
             param_grid={'loss': ['deviance', 'exponential']},
             scoring='accuracy')

In [209]:
grb_td_train_score = grid_grb.score(train_tfidf, y_train)
grb_td_test_score = accuracy_score(grid_grb.predict(test_tfidf), y_test)

In [210]:
grb_td_fold_test_mean = grid_grb.cv_results_['mean_test_score'].mean()
grb_td_fold_test_std = grid_grb.cv_results_['std_test_score'].mean()

In [212]:
print(f'''Best parameters: {grid_grb.best_params_}
Mean cv test score: {grid_grb.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_grb.cv_results_['std_test_score'].mean()}
Train score: {grb_td_train_score}
Test test score: {grb_td_test_score}''')

Best parameters: {'loss': 'deviance'}
Mean cv test score: 0.9277777777777778
Std cv test score: 0.098757715747951
Train score: 1.0
Test test score: 0.8571428571428571


### 8. XGBoost

CountVectorizer

In [213]:
grid_xgb = GridSearchCV(XGBClassifier(), param_grid={}, cv=folds, scoring='accuracy')

In [214]:
grid_xgb.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=Non

In [215]:
xgb_cv_train_score = grid_xgb.score(train, y_train)
xgb_cv_test_score = accuracy_score(grid_xgb.predict(test), y_test)

In [216]:
xgb_cv_fold_test_mean = grid_xgb.cv_results_['mean_test_score'].mean()
xgb_cv_fold_test_std = grid_xgb.cv_results_['std_test_score'].mean()

In [217]:
print(f'''Best parameters: {grid_xgb.best_params_}
Mean cv test score: {grid_xgb.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_xgb.cv_results_['std_test_score'].mean()}
Train score: {xgb_cv_train_score}
Test test score: {xgb_cv_test_score}''')

Best parameters: {}
Mean cv test score: 0.8583333333333332
Std cv test score: 0.13333333333333333
Train score: 1.0
Test test score: 0.7857142857142857


Tfidfvectorizer

In [218]:
grid_xgb.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=Non

In [219]:
xgb_td_train_score = grid_xgb.score(train_tfidf, y_train)
xgb_td_test_score = accuracy_score(grid_xgb.predict(test_tfidf), y_test)

In [220]:
xgb_td_fold_test_mean = grid_xgb.cv_results_['mean_test_score'].mean()
xgb_td_fold_test_std = grid_xgb.cv_results_['std_test_score'].mean()

In [221]:
print(f'''Best parameters: {grid_xgb.best_params_}
Mean cv test score: {grid_xgb.cv_results_['mean_test_score'].mean()}
Std cv test score: {grid_xgb.cv_results_['std_test_score'].mean()}
Train score: {xgb_td_train_score}
Test test score: {xgb_td_test_score}''')

Best parameters: {}
Mean cv test score: 0.8555555555555555
Std cv test score: 0.08810417515085392
Train score: 1.0
Test test score: 0.9285714285714286


## Results

In [222]:
import seaborn as sns
import matplotlib.pyplot as plt

In [223]:
from sklearn.metrics import classification_report, confusion_matrix

In [235]:
models = [
    'MultinomialNB_cv',
    'LogisticRegression_cv', 
    'SGD_cv',
    'KNeighbors_cv', 
    'DecisionTree_cv',
    'AdaBoost_cv',
    'GradientBoosting_cv',
    'XGB_cv',
    'MultinomialNB_td',
    'LogisticRegression_td',
    'SGD_td',
    'KNeighbors_td',
    'DecisionTree_td',
    'AdaBoost_td',
    'GradientBoosting_td',
    'XGB_td',
]

In [236]:
train_score_cv = [mnb_train_score_cv, log_cv_train_score, sgd_cv_train_score, knn_cv_train_score, dtc_cv_train_score, ada_cv_train_score, grb_cv_train_score, xgb_cv_train_score]
train_score_td = [mnb_train_score_td, log_td_train_score, sgd_td_train_score, knn_td_train_score, dtc_td_train_score, ada_td_train_score, grb_td_train_score, xgb_td_train_score]

In [237]:
test_score_cv = [mnb_test_score_cv, log_cv_test_score, sgd_cv_test_score, knn_cv_test_score, dtc_cv_test_score, ada_cv_test_score, grb_cv_test_score, xgb_cv_test_score]
test_score_td = [mnb_test_score_td, log_td_test_score, sgd_td_test_score, knn_td_test_score, dtc_td_test_score, ada_td_test_score, grb_td_test_score, xgb_td_test_score]

In [238]:
mean_fold_cv = [mnb_cv_fold_test_mean,log_cv_fold_test_mean,sgd_cv_fold_test_mean, knn_cv_fold_test_mean, dtc_cv_fold_test_mean, ada_cv_fold_test_mean, grb_cv_fold_test_mean, xgb_cv_fold_test_mean]
std_fold_cv = [mnb_cv_fold_test_std,log_cv_fold_test_std, sgd_cv_fold_test_std,knn_cv_fold_test_std,dtc_cv_fold_test_std, ada_cv_fold_test_std, grb_cv_fold_test_std, xgb_cv_fold_test_std] 

In [239]:
mean_fold_td = [mnb_td_fold_test_mean, log_td_fold_test_mean, sgd_td_fold_test_mean,  knn_td_fold_test_mean,  dtc_td_fold_test_mean,  ada_td_fold_test_mean,  grb_td_fold_test_mean, xgb_td_fold_test_mean]
std_fold_td = [mnb_td_fold_test_std,  log_td_fold_test_std, sgd_td_fold_test_std, knn_td_fold_test_std, dtc_td_fold_test_std, ada_td_fold_test_std, grb_td_fold_test_std, xgb_td_fold_test_std] 

In [240]:
pd.DataFrame({'model':models, 'mean fold test score': mean_fold_cv+mean_fold_td,'std fold test score':std_fold_cv+st,'train score': train_score_cv + train_score_td, 'test score': test_score_cv + test_score_td})

Unnamed: 0,model,mean fold test score,std fold test score,train score,test score
0,MultinomialNB_cv,0.944444,0.081766,1.0,1.0
1,LogisticRegression_cv,0.863542,0.170486,1.0,1.0
2,SGD_cv,0.900154,0.119708,1.0,0.928571
3,KNeighbors_cv,0.512191,0.053761,1.0,0.5
4,DecisionTree_cv,,,1.0,0.928571
5,AdaBoost_cv,0.95,0.1,1.0,0.928571
6,GradientBoosting_cv,0.95,0.1,1.0,0.928571
7,XGB_cv,0.858333,0.133333,1.0,0.785714
8,MultinomialNB_td,0.951852,0.072001,1.0,1.0
9,LogisticRegression_td,0.860417,0.101512,1.0,1.0
