In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score,confusion_matrix, accuracy_score, make_scorer, f1_score,precision_score,recall_score, plot_confusion_matrix
from componetes_preprocessamento import RemoveStopWords, Cleaner, Tokenizador, Stemmer, Joiner
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
dataset = pd.read_csv("datasets/reviews.csv")

In [3]:
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,order_id,review_id,review_score,review_comment_message
0,3,658677c97b385a9be170737859d3511b,e64fb393e7b32834bb789ff8bb30750e,1,Recebi bem antes do prazo estipulado.
1,4,8e6bfb81e283fa7e4f11123a3fb894f1,f7c4243c7fe1938f181bec41a392bdeb,1,Parabéns lojas lannister adorei comprar pela I...
2,9,b9bf720beb4ab3728760088589c62129,8670d52e15e00043ae7de4c01cc2fe06,0,aparelho eficiente. no site a marca do aparelh...
3,12,9d6f15f95d01e79bd1349cc208361f09,4b49719c8a200003f700d3d986ea1a19,0,"Mas um pouco ,travando...pelo valor ta Boa.\r\n"
4,15,e51478e7e277a83743b6f9991dbfa3fb,3948b09f7c818e2d86c9a546758b2335,1,"Vendedor confiável, produto ok e entrega antes..."


In [4]:
X = dataset["review_comment_message"].copy()
y = dataset["review_score"].copy()
y = np.array(y)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 199)

## Parâmetros Padrão

In [6]:
pipeline = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("DT", DecisionTreeClassifier(random_state=199)),
                    ])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('DT', DecisionTreeClassifier(random_state=199))])

In [7]:
print(classification_report(y_test, pipeline.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.69      0.71      0.70      3208
           0       0.35      0.26      0.30      2918
           1       0.74      0.82      0.77      6137

    accuracy                           0.66     12263
   macro avg       0.59      0.60      0.59     12263
weighted avg       0.63      0.66      0.64     12263



## Grid Search F1-Score

In [8]:
parameters = dict()
parameters["max_leaf_nodes"] = list(range(2, 6))
parameters["min_samples_split"] = list(range(2, 10))
parameters["max_depth"] = list(range(2,10))
parameters["criterion"] = ["gini", "entropy"] 
parameters["min_samples_leaf"] = list(range(2,6))
dtgs = DecisionTreeClassifier(random_state=1)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [9]:
pipeline_gsf1 = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("GS", GridSearchCV(dtgs, parameters, scoring="f1_macro", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_gsf1.fit(X_train, y_train)

Fitting 10 folds for each of 2048 candidates, totalling 20480 fits


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('GS',
                 GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                              estimator=DecisionTreeClassifier(random_state=1),
                              n_jobs=-1,
                              param_grid={'criterion': ['gini', 'entropy'],
                                          'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                                          'max_leaf_nodes': [2, 3, 4, 5],
                                          'min_samples_leaf': [2, 3, 4, 5],
                                          'min_samples_split': [2, 3, 4, 5, 6,
                                                                7, 8, 9]},
            

In [10]:
predicted_y_gsf1 = pipeline_gsf1.predict(X_test)
print(classification_report(y_test, predicted_y_gsf1))

              precision    recall  f1-score   support

          -1       0.63      0.45      0.53      3208
           0       0.00      0.00      0.00      2918
           1       0.58      0.94      0.72      6137

    accuracy                           0.59     12263
   macro avg       0.40      0.46      0.41     12263
weighted avg       0.45      0.59      0.50     12263



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
print(pipeline_gsf1["GS"].best_estimator_)
print(pipeline_gsf1["GS"].best_params_)

DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5, min_samples_leaf=2,
                       random_state=1)
{'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}


## Grid Search Accuracy Score

In [12]:
pipeline_gsac = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("GS", GridSearchCV(dtgs, parameters, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_gsac.fit(X_train, y_train)

Fitting 10 folds for each of 2048 candidates, totalling 20480 fits


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('GS',
                 GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                              estimator=DecisionTreeClassifier(random_state=1),
                              n_jobs=-1,
                              param_grid={'criterion': ['gini', 'entropy'],
                                          'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                                          'max_leaf_nodes': [2, 3, 4, 5],
                                          'min_samples_leaf': [2, 3, 4, 5],
                                          'min_samples_split': [2, 3, 4, 5, 6,
                                                                7, 8, 9]},
            

In [23]:
predicted_y_gsac = pipeline_gsac.predict(X_test)
print(classification_report(y_test, predicted_y_gsac))

              precision    recall  f1-score   support

          -1       0.63      0.45      0.53      3208
           0       0.00      0.00      0.00      2918
           1       0.58      0.94      0.72      6137

    accuracy                           0.59     12263
   macro avg       0.40      0.46      0.41     12263
weighted avg       0.45      0.59      0.50     12263



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
print(pipeline_gsac["GS"].best_estimator_)
print(pipeline_gsac["GS"].best_params_)

DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5, min_samples_leaf=2,
                       random_state=1)
{'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}


## Randomized Search F1-Score

In [15]:
parameters_rs = dict()
parameters_rs["max_leaf_nodes"] = list(range(2,100))
parameters_rs["min_samples_split"] = list(range(2, 14))
parameters_rs["max_depth"] = list(range(2,14))
parameters_rs["criterion"] = ["gini", "entropy"] 
parameters_rs["min_samples_leaf"] = list(range(2,14))
dtrs = DecisionTreeClassifier(random_state=199)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [16]:
pipeline_rsf1 = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("RS", RandomizedSearchCV(dtrs, parameters_rs, scoring="f1_macro", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_rsf1.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('RS',
                 RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                                    estimator=DecisionTreeClassifier(random_state=199),
                                    n_jobs=-1,
                                    param_distributions={'criterion': ['gini',
                                                                       'entropy'],
                                                         'max_depth': [2, 3, 4,
                                                                       5, 6, 7,
                                                                       8, 9, 10,
                                                  

In [18]:
predicted_y_rsf1 = pipeline_rsf1.predict(X_test)
print(classification_report(y_test, predicted_y_rsf1))

              precision    recall  f1-score   support

          -1       0.56      0.79      0.66      3208
           0       0.34      0.01      0.02      2918
           1       0.69      0.86      0.76      6137

    accuracy                           0.64     12263
   macro avg       0.53      0.55      0.48     12263
weighted avg       0.57      0.64      0.56     12263



In [19]:
print(pipeline_rsf1["RS"].best_estimator_)
print(pipeline_rsf1["RS"].best_params_)

DecisionTreeClassifier(max_depth=12, max_leaf_nodes=40, min_samples_leaf=2,
                       min_samples_split=13, random_state=199)
{'min_samples_split': 13, 'min_samples_leaf': 2, 'max_leaf_nodes': 40, 'max_depth': 12, 'criterion': 'gini'}


## Randomized Search Accuracy

In [20]:
pipeline_rsac = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("RS", RandomizedSearchCV(dtrs, parameters_rs, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_rsac.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('RS',
                 RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                                    estimator=DecisionTreeClassifier(random_state=199),
                                    n_jobs=-1,
                                    param_distributions={'criterion': ['gini',
                                                                       'entropy'],
                                                         'max_depth': [2, 3, 4,
                                                                       5, 6, 7,
                                                                       8, 9, 10,
                                                  

In [21]:
predicted_y_rsac = pipeline_rsac.predict(X_test)
print(classification_report(y_test, predicted_y_rsac))

              precision    recall  f1-score   support

          -1       0.58      0.79      0.67      3208
           0       0.41      0.02      0.03      2918
           1       0.69      0.87      0.77      6137

    accuracy                           0.65     12263
   macro avg       0.56      0.56      0.49     12263
weighted avg       0.59      0.65      0.57     12263



In [22]:
print(pipeline_rsac["RS"].best_estimator_)
print(pipeline_rsac["RS"].best_params_)

DecisionTreeClassifier(max_depth=13, max_leaf_nodes=52, min_samples_leaf=7,
                       min_samples_split=11, random_state=199)
{'min_samples_split': 11, 'min_samples_leaf': 7, 'max_leaf_nodes': 52, 'max_depth': 13, 'criterion': 'gini'}
