# Logistic Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import roc_auc_score,confusion_matrix, accuracy_score, make_scorer, f1_score,precision_score,recall_score, plot_confusion_matrix
from componetes_preprocessamento import RemoveStopWords, Cleaner, Tokenizador, Stemmer, Joiner, pega_resultados, salvando_em_arquivo
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold


[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Coleta de Dados

In [2]:
dataset = pd.read_csv("datasets/reviews.csv")

In [3]:
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,order_id,review_id,review_score,review_comment_message
0,3,658677c97b385a9be170737859d3511b,e64fb393e7b32834bb789ff8bb30750e,1,Recebi bem antes do prazo estipulado.
1,4,8e6bfb81e283fa7e4f11123a3fb894f1,f7c4243c7fe1938f181bec41a392bdeb,1,Parabéns lojas lannister adorei comprar pela I...
2,9,b9bf720beb4ab3728760088589c62129,8670d52e15e00043ae7de4c01cc2fe06,0,aparelho eficiente. no site a marca do aparelh...
3,12,9d6f15f95d01e79bd1349cc208361f09,4b49719c8a200003f700d3d986ea1a19,0,"Mas um pouco ,travando...pelo valor ta Boa.\r\n"
4,15,e51478e7e277a83743b6f9991dbfa3fb,3948b09f7c818e2d86c9a546758b2335,1,"Vendedor confiável, produto ok e entrega antes..."


In [4]:
X = dataset["review_comment_message"].copy()
y = dataset["review_score"].copy()
y = np.array(y)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 199)

## Parâmetros Padrão

In [6]:
pipeline = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("LR", LogisticRegression(random_state=199)),
                    ])

pipeline.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('LR', LogisticRegression(random_state=199))])

In [7]:
print(classification_report(y_test, pipeline.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.74      0.85      0.79      3208
           0       0.57      0.25      0.35      2918
           1       0.77      0.92      0.84      6137

    accuracy                           0.74     12263
   macro avg       0.69      0.67      0.66     12263
weighted avg       0.72      0.74      0.71     12263



In [8]:
resultados = []

## Grid Search F1-Score

In [9]:
parameters = dict()
parameters["penalty"] = ["l1","l2","elasticnet","none"]
parameters["solver"] = ["newton-cg", "lbfgs","liblinear", "sag","saga"]
parameters["C"] = np.logspace(-4, 4, 20)
lrgs = LogisticRegression(random_state=199)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [10]:
pipeline_gsf1 = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("GS", GridSearchCV(lrgs, parameters, scoring="f1_macro", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_gsf1.fit(X_train, y_train)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


1800 fits failed out of a total of 4000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penal

Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('GS',
                 GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                              estimator=LogisticRegression(random_state=199),
                              n_jobs=-1,...
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                                          'penalty': ['l1', 'l2', 'elasticnet',
                                                      'none'],
                                          '

In [11]:
cvres = pipeline_gsf1["GS"].cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

nan {'C': 0.0001, 'penalty': 'l1', 'solver': 'newton-cg'}
nan {'C': 1.623776739188721, 'penalty': 'l1', 'solver': 'sag'}
nan {'C': 11.288378916846883, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
nan {'C': 11.288378916846883, 'penalty': 'elasticnet', 'solver': 'newton-cg'}
nan {'C': 11.288378916846883, 'penalty': 'l1', 'solver': 'sag'}


In [12]:
predicted_y_gsf1 = pipeline_gsf1.predict(X_test)
print(classification_report(y_test, predicted_y_gsf1))
resultado = pega_resultados("lr", "Grid Search", y_test, predicted_y_gsf1, "f1 score", pipeline_gsf1["GS"].best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

          -1       0.74      0.84      0.79      3208
           0       0.55      0.26      0.35      2918
           1       0.77      0.92      0.84      6137

    accuracy                           0.74     12263
   macro avg       0.69      0.67      0.66     12263
weighted avg       0.71      0.74      0.71     12263

['lr', 'Grid Search', 0.7397863491804616, 0.6595564999430278, 0.68880994411755, 0.6713863077298748, 'f1 score', {'C': 1.623776739188721, 'penalty': 'l2', 'solver': 'newton-cg'}]


In [13]:
print(pipeline_gsf1["GS"].best_estimator_)
print(pipeline_gsf1["GS"].best_params_)

LogisticRegression(C=1.623776739188721, random_state=199, solver='newton-cg')
{'C': 1.623776739188721, 'penalty': 'l2', 'solver': 'newton-cg'}


## Grid Search Accuracy Score

In [14]:
pipeline_gsac = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("GS", GridSearchCV(lrgs, parameters, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_gsac.fit(X_train, y_train)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


1800 fits failed out of a total of 4000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
200 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penal

Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('GS',
                 GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                              estimator=LogisticRegression(random_state=199),
                              n_jobs=-1,...
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                                          'penalty': ['l1', 'l2', 'elasticnet',
                                                      'none'],
                                          '

In [15]:
cvres = pipeline_gsac["GS"].cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

nan {'C': 0.0001, 'penalty': 'l1', 'solver': 'newton-cg'}
nan {'C': 1.623776739188721, 'penalty': 'l1', 'solver': 'newton-cg'}
nan {'C': 11.288378916846883, 'penalty': 'l1', 'solver': 'sag'}
nan {'C': 11.288378916846883, 'penalty': 'l1', 'solver': 'lbfgs'}
nan {'C': 11.288378916846883, 'penalty': 'l1', 'solver': 'newton-cg'}


In [16]:
predicted_y_gsac = pipeline_gsac.predict(X_test)
print(classification_report(y_test, predicted_y_gsac))
resultado = pega_resultados("lr", "Grid Search", y_test, predicted_y_gsac, "acuracia", pipeline_gsac["GS"].best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

          -1       0.74      0.86      0.79      3208
           0       0.58      0.24      0.34      2918
           1       0.78      0.92      0.84      6137

    accuracy                           0.74     12263
   macro avg       0.70      0.68      0.66     12263
weighted avg       0.72      0.74      0.71     12263

['lr', 'Grid Search', 0.744842208268776, 0.6603232654349752, 0.6978910822800519, 0.6754524631491075, 'acuracia', {'C': 0.615848211066026, 'penalty': 'l2', 'solver': 'saga'}]


In [17]:
print(pipeline_gsac["GS"].best_estimator_)
print(pipeline_gsac["GS"].best_params_)

LogisticRegression(C=0.615848211066026, random_state=199, solver='saga')
{'C': 0.615848211066026, 'penalty': 'l2', 'solver': 'saga'}


## Randomized Search F1-Score

In [18]:
parameters_rs = dict()
parameters_rs["penalty"] = ["l1","l2","elasticnet","none"]
parameters_rs["solver"] = ["newton-cg", "lbfgs","liblinear", "sag","saga"]
parameters_rs["C"] = np.logspace(-8, 8, 40)
lrrs = LogisticRegression(random_state=199)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [19]:
pipeline_rsf1 = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("RS", RandomizedSearchCV(lrrs, parameters_rs, scoring="f1_macro", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_rsf1.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\linear_model\_logistic.py", line 457, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.

--

Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('RS',
                 RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                                    estimator=LogisticRegression(random_state=199),
                                    n_jo...
       7.01703829e+01, 1.80472177e+02, 4.64158883e+02, 1.19377664e+03,
       3.07029063e+03, 7.89652287e+03, 2.03091762e+04, 5.22334507e+04,
       1.34339933e+05, 3.45510729e+05, 8.88623816e+05, 2.28546386e+06,
       5.87801607e+06, 1.51177507e+07, 3.88815518e+07, 1.00000000e+08]),
                                                         'penalty': ['l1', 'l2',
                                                                     'elasticnet',
          

In [20]:
cvres = pipeline_rsf1["RS"].cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

nan {'solver': 'sag', 'penalty': 'elasticnet', 'C': 6.614740641230146e-08}
nan {'solver': 'newton-cg', 'penalty': 'elasticnet', 'C': 6.614740641230146e-08}
nan {'solver': 'lbfgs', 'penalty': 'elasticnet', 'C': 10.608183551394482}
nan {'solver': 'liblinear', 'penalty': 'elasticnet', 'C': 7896.522868499733}
0.6504205403705169 {'solver': 'saga', 'penalty': 'l2', 'C': 4.124626382901348}


In [21]:
predicted_y_rsf1 = pipeline_rsf1.predict(X_test)
print(classification_report(y_test, predicted_y_rsf1))
resultado = pega_resultados("lr", "Randomized Search", y_test, predicted_y_rsf1, "f1 score", pipeline_rsf1["RS"].best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

          -1       0.74      0.83      0.78      3208
           0       0.54      0.26      0.35      2918
           1       0.77      0.91      0.83      6137

    accuracy                           0.74     12263
   macro avg       0.68      0.67      0.66     12263
weighted avg       0.71      0.74      0.71     12263

['lr', 'Randomized Search', 0.7352197667781131, 0.6571509892778424, 0.6837729759095046, 0.6673300028807286, 'f1 score', {'solver': 'saga', 'penalty': 'l2', 'C': 4.124626382901348}]


In [22]:
print(pipeline_rsf1["RS"].best_estimator_)
print(pipeline_rsf1["RS"].best_params_)

LogisticRegression(C=4.124626382901348, random_state=199, solver='saga')
{'solver': 'saga', 'penalty': 'l2', 'C': 4.124626382901348}


## Randomized Search Accuracy

In [23]:
pipeline_rsac = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("RS", RandomizedSearchCV(lrrs, parameters_rs, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_rsac.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


40 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\cfpc2\anaconda3\envs\rv\lib\site-packages\sklearn\linear_model\_logistic.py", line 464, in _check_solver
    raise ValueError("penalty='none' is not supported for the liblinear solver")
ValueError: penalty='none'

Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('RS',
                 RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                                    estimator=LogisticRegression(random_state=199),
                                    n_jo...
       7.01703829e+01, 1.80472177e+02, 4.64158883e+02, 1.19377664e+03,
       3.07029063e+03, 7.89652287e+03, 2.03091762e+04, 5.22334507e+04,
       1.34339933e+05, 3.45510729e+05, 8.88623816e+05, 2.28546386e+06,
       5.87801607e+06, 1.51177507e+07, 3.88815518e+07, 1.00000000e+08]),
                                                         'penalty': ['l1', 'l2',
                                                                     'elasticnet',
          

In [24]:
cvres = pipeline_rsac["RS"].cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

nan {'solver': 'liblinear', 'penalty': 'elasticnet', 'C': 100000000.0}
nan {'solver': 'sag', 'penalty': 'l1', 'C': 100000000.0}
nan {'solver': 'sag', 'penalty': 'elasticnet', 'C': 1e-08}
nan {'solver': 'liblinear', 'penalty': 'none', 'C': 0.0003257020655659783}
0.7042399155270365 {'solver': 'saga', 'penalty': 'none', 'C': 7896.522868499733}


In [25]:
predicted_y_rsac = pipeline_rsac.predict(X_test)
print(classification_report(y_test, predicted_y_rsac))
resultado = pega_resultados("lr", "Randomized Search", y_test, predicted_y_rsac, "acuracia", pipeline_rsac["RS"].best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

          -1       0.73      0.75      0.74      3208
           0       0.46      0.26      0.33      2918
           1       0.75      0.89      0.82      6137

    accuracy                           0.71     12263
   macro avg       0.65      0.64      0.63     12263
weighted avg       0.68      0.71      0.68     12263

['lr', 'Randomized Search', 0.7065970806491071, 0.6299313535671683, 0.6467160392855865, 0.6361113020490129, 'acuracia', {'solver': 'saga', 'penalty': 'l2', 'C': 888623.8162743407}]


In [26]:
print(pipeline_rsac["RS"].best_estimator_)
print(pipeline_rsac["RS"].best_params_)

LogisticRegression(C=888623.8162743407, random_state=199, solver='saga')
{'solver': 'saga', 'penalty': 'l2', 'C': 888623.8162743407}


In [27]:
salvando_em_arquivo("resultados/LR_resultados.csv", resultados)