## Pipeline - Best Vectorization and Model
### Topic (**Inflation**) according to each news portal

In [1]:
import warnings
warnings.filterwarnings('ignore')

### Load

In [2]:
import pickle

In [3]:
import pandas as pd
import numpy as np

In [4]:
izq = pickle.load(open('data/df_clean_izq.pkl','rb'))
der = pickle.load(open('data/df_clean_der.pkl','rb'))

### Preparte Dataframe

In [5]:
izq = izq[izq.topics == 'inflation']
der = der[der.topics == 'inflation']

In [6]:
izq['bias'] = 'left'
der['bias'] = 'right'

In [7]:
df = izq.loc[:,['bias', 'body']].append(der.loc[:,['bias', 'body']])

### Split Train - Test

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X = df.body

In [10]:
y = df.bias

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, stratify=y)

In [12]:
print(f"Train shape: {X_train.shape[0] / X.shape[0]}\nTest shape: {X_test.shape[0] / X.shape[0]}")

Train shape: 0.7435897435897436
Test shape: 0.2564102564102564


In [13]:
print(f"Target balance\nTrain:\n{y_train.value_counts(normalize=True)}\nTest:\n{y_test.value_counts(normalize=True)}")

Target balance
Train:
right    0.534483
left     0.465517
Name: bias, dtype: float64
Test:
right    0.55
left     0.45
Name: bias, dtype: float64


### Looking for the model

#### Preparing the input

In [14]:
import re
import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
stopwords_sp = stopwords.words('spanish')

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [17]:
def removal(text):
    text = re.sub(r'(\d|\$|\%|\+)', '', text.lower())
    return re.sub(r'\d+', '', text)

#### Folds config

In [18]:
from sklearn.model_selection import StratifiedKFold


In [19]:
folds = StratifiedKFold(n_splits=5, random_state=19, shuffle=True)

#### Calling the models

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [22]:
from sklearn.neighbors import KNeighborsClassifier

In [23]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [24]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

### Pipeline

#### Vectorizers + Model switcher ad-hoc class + Models 

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator

In [26]:
class ClfSwitcher(BaseEstimator):

    def __init__(self, estimator = None):
        self.estimator = estimator

    def fit(self, X, y=None, **kwargs):
        self.estimator.fit(X, y)
        return self

    def predict(self, X, y=None):
        return self.estimator.predict(X)

    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

    def score(self, X, y):
        return self.estimator.score(X, y)

In [27]:
pipeline = Pipeline([
    ('cv', CountVectorizer(stop_words=stopwords_sp)),
    ('tfidf', TfidfTransformer()),
    ('clf', ClfSwitcher()),
])

parameters = [
    {
        # vectorizer params
        'cv__max_df': (0.5, 0.75, 1.0),
        'cv__lowercase':  [True],
        'cv__strip_accents':['unicode'], 
        'cv__preprocessor':[removal],
        'cv__ngram_range': [(1,1),(1,2),(2,2)],
        'cv__min_df':[1,2,3],
        # model params
        'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
        'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
        'clf__estimator__max_iter': [50, 80],
        'clf__estimator__tol': [1e-4],
        'clf__estimator__loss': ['hinge', 'log', 'modified_huber'],
    },
    {
        # vectorizer params
        'cv__max_df': (0.5, 0.75, 1.0),
        'cv__lowercase':  [True],
        'cv__strip_accents':['unicode'], 
        'cv__preprocessor':[removal],
        'cv__ngram_range': [(1,1),(1,2),(2,2)],
        'cv__min_df':[1,2,3],
        # model params
        'clf__estimator': [MultinomialNB()],
        'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    },
    {
        # vectorizer params
        'cv__max_df': (0.5, 0.75, 1.0),
        'cv__lowercase':  [True],
        'cv__strip_accents':['unicode'], 
        'cv__preprocessor':[removal],
        'cv__ngram_range': [(1,1),(1,2),(2,2)],
        'cv__min_df':[1,2,3],
        # model params
        'clf__estimator': [KNeighborsClassifier()],
        'clf__estimator__n_neighbors': range(1,4),
        'clf__estimator__weights' : ['uniform', 'distance'],
        'clf__estimator__p' : [1, 2, 3]
    },
    {
        # vectorizer params
        'cv__max_df': (0.5, 0.75, 1.0),
        'cv__lowercase':  [True],
        'cv__strip_accents':['unicode'], 
        'cv__preprocessor':[removal],
        'cv__ngram_range': [(1,1),(1,2),(2,2)],
        'cv__min_df':[1,2,3],
        # model params
        'clf__estimator': [LogisticRegression()],
        'clf__estimator__C': [1, 10, 100, 1000],
        'clf__estimator__penalty': ['l1', 'l2',],
        'clf__estimator__solver': ['saga']
    },
    {
        # vectorizer params
        'cv__max_df': (0.5, 0.75, 1.0),
        'cv__lowercase':  [True],
        'cv__strip_accents':['unicode'], 
        'cv__preprocessor':[removal],
        'cv__ngram_range': [(1,1),(1,2),(2,2)],
        'cv__min_df':[1,2,3],
        # model params
        'clf__estimator': [DecisionTreeClassifier()],
        "clf__estimator__criterion" : ["gini", "entropy"],
        "clf__estimator__min_samples_leaf": [5,10,15,20,2], 
        "clf__estimator__max_depth" : [1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17],
        "clf__estimator__min_samples_split": [2, 3, 4,None]
    },    
]

#### Tuning model

In [28]:
from sklearn.model_selection import GridSearchCV

In [None]:
grid = GridSearchCV(pipeline, parameters, cv=folds, n_jobs=12, return_train_score=True, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 18549 candidates, totalling 92745 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   8 tasks      | elapsed:    8.1s
[Parallel(n_jobs=12)]: Done 104 tasks      | elapsed:   17.6s
[Parallel(n_jobs=12)]: Done 264 tasks      | elapsed:   30.5s
[Parallel(n_jobs=12)]: Done 488 tasks      | elapsed:   49.4s
[Parallel(n_jobs=12)]: Done 776 tasks      | elapsed:  1.2min
[Parallel(n_jobs=12)]: Done 1128 tasks      | elapsed:  1.7min
[Parallel(n_jobs=12)]: Done 1544 tasks      | elapsed:  2.3min
[Parallel(n_jobs=12)]: Done 2024 tasks      | elapsed:  3.0min
[Parallel(n_jobs=12)]: Done 2568 tasks      | elapsed:  3.7min
[Parallel(n_jobs=12)]: Done 3176 tasks      | elapsed:  4.6min
[Parallel(n_jobs=12)]: Done 3848 tasks      | elapsed:  5.4min
[Parallel(n_jobs=12)]: Done 4584 tasks      | elapsed:  6.3min
[Parallel(n_jobs=12)]: Done 5384 tasks      | elapsed:  7.3min
[Parallel(n_jobs=12)]: Done 6248 tasks      | elapsed:  8.8min
[Parallel(n_jobs=12)]: Done 7176 tasks      | 

#### Report

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
grid.best_estimator_, grid.best_score_, grid.best_params_

In [None]:
grid_pred = grid.predict(X_test)

In [None]:
accuracy_score(y_test, grid_pred)

In [None]:
print(classification_report(y_test, grid_pred))

In [None]:
confusion_matrix(y_test, grid_pred)