## Vectorizers + Models
#### Inflation topic

### Load

Tokens

In [44]:
import warnings
warnings.filterwarnings('ignore')

In [45]:
import pandas as pd
import numpy as np

In [46]:
tokens_izq = pd.read_pickle('tokens/topic_tokens_izq_new.pkl')
tokens_der = pd.read_pickle('tokens/topic_tokens_der_new.pkl')

In [47]:
tokens_izq_inf = tokens_izq[tokens_izq.topics == 'inflation'].drop('topics', axis=1)
tokens_izq_inf.insert(loc=0, column='publication', value='left')

In [48]:
tokens_der_inf = tokens_der[tokens_der.topics == 'inflation'].drop('topics', axis=1)
tokens_der_inf.insert(loc=0, column='publication', value='right')

In [49]:
tokens = tokens_izq_inf.append(tokens_der_inf)
tokens.reset_index(inplace=True)
tokens.drop('index', axis=1, inplace=True)
tokens.fillna(0, inplace=True)

In [50]:
tokens.sample(5)

Unnamed: 0,publication,abajo,abarca,abastecimiento,abc,abierta,abiertamente,abierto,abiertos,abre,...,vuelva,vuelvan,vulgarmente,woyecheszen,xx,yaguarete,york,yuanes,zamora,zonas
15,left,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,left,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36,left,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,left,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,left,0,0.0,0,0.0,0,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Train - Test Split

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
# data balance
tokens.publication.value_counts(normalize=True)

right    0.518519
left     0.481481
Name: publication, dtype: float64

In [53]:
X = tokens.drop('publication', axis=1)

In [54]:
y = tokens['publication']

In [55]:
train, test, y_train, y_test = train_test_split(X, y, train_size=0.75, stratify=tokens.publication)

Shapes and balance

In [56]:
# shape train
display(train.shape[0],
        train.shape[0] / X.shape[0])

60

0.7407407407407407

In [57]:
# shape test
display(test.shape[0],
        test.shape[0] / X.shape[0])

21

0.25925925925925924

In [58]:
# data balance - test
display(y_train.value_counts(normalize=True), 
        y_test.value_counts(normalize=True))

right    0.516667
left     0.483333
Name: publication, dtype: float64

right    0.52381
left     0.47619
Name: publication, dtype: float64

Vectorización con TF-IDF

In [59]:
from sklearn.feature_extraction.text import TfidfTransformer

In [60]:
tfidf_vector = TfidfTransformer().fit(train)
train_tfidf = tfidf_vector.transform(train)

In [61]:
test_tfidf = tfidf_vector.transform(test)

#### Models calling

In [62]:
from sklearn.naive_bayes import MultinomialNB

In [63]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [64]:
from sklearn.neighbors import KNeighborsClassifier

In [65]:
from sklearn.tree import DecisionTreeClassifier

In [66]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

In [67]:
from xgboost import XGBClassifier

### GridSearchCV (Vectorization + Models)

In [68]:
from sklearn.metrics import accuracy_score

In [69]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [70]:
folds = StratifiedKFold(n_splits=5, random_state=19, shuffle=True)

### 1. Naive Bayes

CountVectorizer

In [71]:
parameters = {
        'alpha': (1e-2, 1e-3, 1e-1),
    }

In [72]:
grid_mnb = GridSearchCV(MultinomialNB(), parameters, cv=folds, scoring='accuracy')

In [73]:
grid_mnb.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=MultinomialNB(),
             param_grid={'alpha': (0.01, 0.001, 0.1)}, scoring='accuracy')

In [74]:
mnb_train_score_cv = grid_mnb.score(train, y_train)
mnb_test_score_cv = accuracy_score(grid_mnb.predict(test), y_test)

In [75]:
print(f'''Best parameters: {grid_mnb.best_params_}
Best cv score: {grid_mnb.best_score_}
Train score: {mnb_train_score_cv}
Test test score: {mnb_test_score_cv}''')

Best parameters: {'alpha': 0.01}
Best cv score: 0.9833333333333332
Train score: 1.0
Test test score: 0.9047619047619048


Tfidfvectorizer

In [76]:
grid_mnb.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=MultinomialNB(),
             param_grid={'alpha': (0.01, 0.001, 0.1)}, scoring='accuracy')

In [77]:
mnb_train_score_td = grid_mnb.score(train_tfidf, y_train)
mnb_test_score_td = accuracy_score(grid_mnb.predict(test_tfidf), y_test)

In [78]:
print(f'''Best parameters: {grid_mnb.best_params_}
Best cv score: {grid_mnb.best_score_}
Train score: {mnb_train_score_td}
Test test score: {mnb_test_score_td}''')

Best parameters: {'alpha': 0.01}
Best cv score: 0.9833333333333332
Train score: 1.0
Test test score: 0.9047619047619048


### 2. LogisticRegression

CountVectorizer

In [79]:
parameters = {
        'C': [1, 10, 100, 1000],
        'penalty': ['l1', 'l2',],
        'solver': ['saga']
    }

In [80]:
grid_log = GridSearchCV(LogisticRegression(), parameters, cv=folds, scoring='accuracy')

In [81]:
grid_log.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 100, 1000], 'penalty': ['l1', 'l2'],
                         'solver': ['saga']},
             scoring='accuracy')

In [84]:
log_cv_train_score = grid_log.score(train, y_train)
log_cv_test_score = accuracy_score(grid_log.predict(test), y_test)

In [85]:
print(f'''Best parameters: {grid_log.best_params_}
Best cv score: {grid_log.best_score_}
Train score: {log_cv_train_score}
Test test score: {log_cv_test_score}''')

Best parameters: {'C': 1, 'penalty': 'l2', 'solver': 'saga'}
Best cv score: 0.9666666666666666
Train score: 1.0
Test test score: 0.8571428571428571


Tfidfvectorizer

In [86]:
grid_log.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': [1, 10, 100, 1000], 'penalty': ['l1', 'l2'],
                         'solver': ['saga']},
             scoring='accuracy')

In [87]:
log_td_train_score = grid_log.score(train_tfidf, y_train)
log_td_test_score = accuracy_score(grid_log.predict(test_tfidf), y_test)

In [88]:
print(f'''Best parameters: {grid_log.best_params_}
Best cv score: {grid_log.best_score_}
Train score: {log_td_train_score}
Test test score: {log_td_test_score}''')

Best parameters: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
Best cv score: 0.9833333333333332
Train score: 1.0
Test test score: 0.9047619047619048


### 3. SGDClassifier

CountVectorizer

In [89]:
parameters ={
    'penalty': ('l2', 'elasticnet', 'l1'),
    'max_iter': [50, 80],
    'tol': [1e-4],
    'loss': ['hinge', 'log', 'modified_huber'],
}

In [90]:
grid_sgd = GridSearchCV(SGDClassifier(), parameters, cv=folds, scoring='accuracy')

In [91]:
grid_sgd.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=SGDClassifier(),
             param_grid={'loss': ['hinge', 'log', 'modified_huber'],
                         'max_iter': [50, 80],
                         'penalty': ('l2', 'elasticnet', 'l1'),
                         'tol': [0.0001]},
             scoring='accuracy')

In [92]:
sgd_cv_train_score = grid_sgd.score(train, y_train)
sgd_cv_test_score = accuracy_score(grid_sgd.predict(test), y_test)

In [93]:
print(f'''Best parameters: {grid_sgd.best_params_}
Best cv score: {grid_sgd.best_score_}
Train score: {sgd_cv_train_score}
Test test score: {sgd_cv_test_score}''')

Best parameters: {'loss': 'hinge', 'max_iter': 50, 'penalty': 'elasticnet', 'tol': 0.0001}
Best cv score: 0.9666666666666666
Train score: 1.0
Test test score: 0.8095238095238095


Tfidfvectorizer

In [94]:
grid_sgd.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=SGDClassifier(),
             param_grid={'loss': ['hinge', 'log', 'modified_huber'],
                         'max_iter': [50, 80],
                         'penalty': ('l2', 'elasticnet', 'l1'),
                         'tol': [0.0001]},
             scoring='accuracy')

In [95]:
sgd_td_train_score = grid_sgd.score(train_tfidf, y_train)
sgd_td_test_score = accuracy_score(grid_sgd.predict(test_tfidf), y_test)

In [96]:
print(f'''Best parameters: {grid_sgd.best_params_}
Best cv score: {grid_sgd.best_score_}
Train score: {log_td_train_score}
Test test score: {log_td_test_score}''')

Best parameters: {'loss': 'hinge', 'max_iter': 50, 'penalty': 'elasticnet', 'tol': 0.0001}
Best cv score: 0.9833333333333332
Train score: 1.0
Test test score: 0.9047619047619048


### 4. KNeighborsClassifier

CountVectorizer

In [97]:
parameters ={
        'n_neighbors': range(1,4),
        'weights' : ['uniform', 'distance'],
        'p' : [1, 2, 3]
}

In [98]:
grid_knn = GridSearchCV(KNeighborsClassifier(), parameters, cv=folds, scoring='accuracy')

In [99]:
grid_knn.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 4), 'p': [1, 2, 3],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [100]:
knn_cv_train_score = grid_knn.score(train, y_train)
knn_cv_test_score = accuracy_score(grid_knn.predict(test), y_test)

In [101]:
print(f'''Best parameters: {grid_knn.best_params_}
Best cv score: {grid_knn.best_score_}
Train score: {knn_cv_train_score}
Test test score: {knn_cv_test_score}''')

Best parameters: {'n_neighbors': 1, 'p': 3, 'weights': 'uniform'}
Best cv score: 0.75
Train score: 1.0
Test test score: 0.8095238095238095


Tfidfvectorizer

In [102]:
grid_knn.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 4), 'p': [1, 2, 3],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [103]:
knn_td_train_score = grid_knn.score(train_tfidf, y_train)
knn_td_test_score = accuracy_score(grid_knn.predict(test_tfidf), y_test)

In [104]:
print(f'''Best parameters: {grid_knn.best_params_}
Best cv score: {grid_knn.best_score_}
Train score: {knn_td_train_score}
Test test score: {knn_td_test_score}''')

Best parameters: {'n_neighbors': 3, 'p': 2, 'weights': 'uniform'}
Best cv score: 0.9166666666666666
Train score: 0.95
Test test score: 0.8571428571428571


### 5. CART

CountVectorizer

In [105]:
parameters ={
        "criterion" : ["gini", "entropy"],
        "min_samples_leaf": [5,10,15,20,2], 
        "max_depth" : [1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17],
        "min_samples_split": [2, 3, 4,None]
}

In [106]:
grid_dtc = GridSearchCV(DecisionTreeClassifier(), parameters, cv=folds, scoring='accuracy')

In [107]:
grid_dtc.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13,
                                       14, 15, 16, 17],
                         'min_samples_leaf': [5, 10, 15, 20, 2],
                         'min_samples_split': [2, 3, 4, None]},
             scoring='accuracy')

In [108]:
dtc_cv_train_score = grid_dtc.score(train, y_train)
dtc_cv_test_score = accuracy_score(grid_dtc .predict(test), y_test)

In [109]:
print(f'''Best parameters: {grid_dtc.best_params_}
Best cv score: {grid_dtc.best_score_}
Train score: {dtc_cv_train_score}
Test test score: {dtc_cv_test_score}''')

Best parameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
Best cv score: 0.9
Train score: 0.95
Test test score: 0.8571428571428571


Tfidfvectorizer

In [110]:
grid_dtc.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13,
                                       14, 15, 16, 17],
                         'min_samples_leaf': [5, 10, 15, 20, 2],
                         'min_samples_split': [2, 3, 4, None]},
             scoring='accuracy')

In [111]:
dtc_td_train_score = grid_dtc.score(train_tfidf, y_train)
dtc_td_test_score = accuracy_score(grid_dtc.predict(test_tfidf), y_test)

In [112]:
print(f'''Best parameters: {grid_dtc.best_params_}
Best cv score: {grid_dtc.best_score_}
Train score: {knn_td_train_score}
Test test score: {knn_td_test_score}''')

Best parameters: {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best cv score: 0.9166666666666666
Train score: 0.95
Test test score: 0.8571428571428571


### 6. AdaBoost

CountVectorizer

In [113]:
parameters = {
    'learning_rate': [.5]
}

In [114]:
grid_ada = GridSearchCV(AdaBoostClassifier(), parameters, cv=folds, scoring='accuracy')

In [115]:
grid_ada.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5]}, scoring='accuracy')

In [116]:
ada_cv_train_score = grid_ada.score(train, y_train)
ada_cv_test_score = accuracy_score(grid_ada .predict(test), y_test)

In [117]:
print(f'''Best parameters: {grid_ada.best_params_}
Best cv score: {grid_ada.best_score_}
Train score: {ada_cv_train_score}
Test test score: {ada_cv_test_score}''')

Best parameters: {'learning_rate': 0.5}
Best cv score: 0.85
Train score: 1.0
Test test score: 0.8571428571428571


Tfidfvectorizer

In [118]:
grid_ada.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=AdaBoostClassifier(),
             param_grid={'learning_rate': [0.5]}, scoring='accuracy')

In [119]:
ada_td_train_score = grid_ada.score(train_tfidf, y_train)
ada_td_test_score = accuracy_score(grid_ada.predict(test_tfidf), y_test)

In [120]:
print(f'''Best parameters: {grid_ada.best_params_}
Best cv score: {grid_ada.best_score_}
Train score: {ada_td_train_score}
Test test score: {ada_td_test_score}''')

Best parameters: {'learning_rate': 0.5}
Best cv score: 0.95
Train score: 1.0
Test test score: 0.9047619047619048


### 7. GradientBoosting

CountVectorizer

In [121]:
parameters = {
    'loss':['deviance', 'exponential']
}

In [122]:
grid_grb = GridSearchCV(GradientBoostingClassifier(), parameters, cv=folds, scoring='accuracy')

In [123]:
grid_grb.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=GradientBoostingClassifier(),
             param_grid={'loss': ['deviance', 'exponential']},
             scoring='accuracy')

In [124]:
grb_cv_train_score = grid_grb.score(train, y_train)
grb_cv_test_score = accuracy_score(grid_grb.predict(test), y_test)

In [125]:
print(f'''Best parameters: {grid_grb.best_params_}
Best cv score: {grid_grb.best_score_}
Train score: {grb_cv_train_score}
Test test score: {grb_cv_test_score}''')

Best parameters: {'loss': 'deviance'}
Best cv score: 0.8333333333333333
Train score: 1.0
Test test score: 0.8571428571428571


Tfidfvectorizer

In [126]:
grid_grb.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=GradientBoostingClassifier(),
             param_grid={'loss': ['deviance', 'exponential']},
             scoring='accuracy')

In [127]:
grb_td_train_score = grid_grb.score(train_tfidf, y_train)
grb_td_test_score = accuracy_score(grid_grb.predict(test_tfidf), y_test)

In [128]:
print(f'''Best parameters: {grid_grb.best_params_}
Best cv score: {grid_grb.best_score_}
Train score: {grb_td_train_score}
Test test score: {grb_td_test_score}''')

Best parameters: {'loss': 'deviance'}
Best cv score: 0.9166666666666666
Train score: 1.0
Test test score: 0.9047619047619048


### 8. XGBoost

CountVectorizer

In [129]:
grid_xgb = GridSearchCV(XGBClassifier(), param_grid={}, cv=folds, scoring='accuracy')

In [130]:
grid_xgb.fit(train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=Non

In [131]:
xgb_cv_train_score = grid_xgb.score(train, y_train)
xgb_cv_test_score = accuracy_score(grid_xgb.predict(test), y_test)

In [132]:
print(f'''Best parameters: {grid_xgb.best_params_}
Best cv score: {grid_xgb.best_score_}
Train score: {xgb_cv_train_score}
Test test score: {xgb_cv_test_score}''')

Best parameters: {}
Best cv score: 0.95
Train score: 1.0
Test test score: 0.9523809523809523


Tfidfvectorizer

In [133]:
grid_xgb.fit(train_tfidf, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=19, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=Non

In [134]:
xgb_td_train_score = grid_xgb.score(train_tfidf, y_train)
xgb_td_test_score = accuracy_score(grid_xgb.predict(test_tfidf), y_test)

In [135]:
print(f'''Best parameters: {grid_xgb.best_params_}
Best cv score: {grid_xgb.best_score_}
Train score: {xgb_td_train_score}
Test test score: {xgb_td_test_score}''')

Best parameters: {}
Best cv score: 0.9166666666666666
Train score: 1.0
Test test score: 0.8571428571428571


## Results

In [136]:
import seaborn as sns
import matplotlib.pyplot as plt

In [137]:
from sklearn.metrics import classification_report, confusion_matrix

In [189]:
models = ['MultinomialNB_cv', 
          'MultinomialNB_td', 
          'LogisticRegression_cv', 
          'LogisticRegression_td',
          'SGD_cv',
          'SGD_td',
          'KNeighbors_cv', 
          'KNeighbors_td',
          'DecisionTree_cv',
          'DecisionTree_td',
          'AdaBoost_cv',
          'AdaBoost_td',
          'GradientBoosting_cv',
          'GradientBoosting_td',
          'XGB_cv',
          'XGB_td']

In [190]:
train_score_cv = [mnb_train_score_cv, log_cv_train_score, sgd_cv_train_score, knn_cv_train_score, dtc_cv_train_score, ada_cv_train_score, grb_cv_train_score, xgb_cv_train_score]
train_score_td = [mnb_train_score_td, log_td_train_score, sgd_td_train_score, knn_td_train_score, dtc_td_train_score, ada_td_train_score, grb_td_train_score, xgb_td_train_score]

In [191]:
test_score_cv = [mnb_test_score_cv, log_cv_test_score, sgd_cv_test_score, knn_cv_test_score, dtc_cv_test_score, ada_cv_test_score, grb_cv_test_score, xgb_cv_test_score]
test_score_td = [mnb_test_score_td, log_td_test_score, sgd_td_test_score, knn_td_test_score, dtc_td_test_score, ada_td_test_score, grb_td_test_score, xgb_td_test_score]

In [192]:
pd.DataFrame({'model':models, 'train score': train_score_cv + train_score_td, 'test score': test_score_cv + test_score_td})

Unnamed: 0,model,train score,test score
0,MultinomialNB_cv,1.0,0.904762
1,MultinomialNB_td,1.0,0.857143
2,LogisticRegression_cv,1.0,0.809524
3,LogisticRegression_td,1.0,0.809524
4,SGD_cv,0.95,0.857143
5,SGD_td,1.0,0.857143
6,KNeighbors_cv,1.0,0.857143
7,KNeighbors_td,1.0,0.952381
8,DecisionTree_cv,1.0,0.904762
9,DecisionTree_td,1.0,0.904762
