# Decision Tree

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from componetes_preprocessamento import RemoveStopWords, Cleaner, Tokenizador, Stemmer, Joiner, pega_resultados, salvando_em_arquivo
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Coleta de Dados

In [2]:
dataset = pd.read_csv("datasets/reviews.csv")

In [3]:
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,order_id,review_id,review_score,review_comment_message
0,3,658677c97b385a9be170737859d3511b,e64fb393e7b32834bb789ff8bb30750e,1,Recebi bem antes do prazo estipulado.
1,4,8e6bfb81e283fa7e4f11123a3fb894f1,f7c4243c7fe1938f181bec41a392bdeb,1,Parabéns lojas lannister adorei comprar pela I...
2,9,b9bf720beb4ab3728760088589c62129,8670d52e15e00043ae7de4c01cc2fe06,0,aparelho eficiente. no site a marca do aparelh...
3,12,9d6f15f95d01e79bd1349cc208361f09,4b49719c8a200003f700d3d986ea1a19,0,"Mas um pouco ,travando...pelo valor ta Boa.\r\n"
4,15,e51478e7e277a83743b6f9991dbfa3fb,3948b09f7c818e2d86c9a546758b2335,1,"Vendedor confiável, produto ok e entrega antes..."


In [4]:
X = dataset["review_comment_message"].copy()
y = dataset["review_score"].copy()
y = np.array(y)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 199)

## Parâmetros Padrão

In [6]:
pipeline = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("DT", DecisionTreeClassifier(random_state=199)),
                    ])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('DT', DecisionTreeClassifier(random_state=199))])

In [7]:
print(classification_report(y_test, pipeline.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.69      0.71      0.70      3208
           0       0.35      0.26      0.30      2918
           1       0.74      0.82      0.77      6137

    accuracy                           0.66     12263
   macro avg       0.59      0.60      0.59     12263
weighted avg       0.63      0.66      0.64     12263



In [11]:
resultados = []

## Grid Search F1-Score

In [12]:
parameters = dict()
parameters["max_leaf_nodes"] = list(range(2, 6))
parameters["min_samples_split"] = list(range(2, 10))
parameters["max_depth"] = list(range(2,10))
parameters["criterion"] = ["gini", "entropy"] 
parameters["min_samples_leaf"] = list(range(2,6))
dtgs = DecisionTreeClassifier(random_state=199)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

In [13]:
pipeline_gsf1 = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("GS", GridSearchCV(dtgs, parameters, scoring="f1_macro", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_gsf1.fit(X_train, y_train)

Fitting 10 folds for each of 2048 candidates, totalling 20480 fits


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('GS',
                 GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                              estimator=DecisionTreeClassifier(random_state=199),
                              n_jobs=-1,
                              param_grid={'criterion': ['gini', 'entropy'],
                                          'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                                          'max_leaf_nodes': [2, 3, 4, 5],
                                          'min_samples_leaf': [2, 3, 4, 5],
                                          'min_samples_split': [2, 3, 4, 5, 6,
                                                                7, 8, 9]},
          

In [14]:
cvres = pipeline_gsf1["GS"].cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

0.4132345570159666 {'criterion': 'gini', 'max_depth': 9, 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 9}
0.4132345570159666 {'criterion': 'gini', 'max_depth': 7, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 3}
0.4132345570159666 {'criterion': 'gini', 'max_depth': 4, 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 9}
0.4132345570159666 {'criterion': 'gini', 'max_depth': 7, 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 8}
0.4132345570159666 {'criterion': 'gini', 'max_depth': 7, 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 7}


In [15]:
predicted_y_gsf1 = pipeline_gsf1.predict(X_test)
print(classification_report(y_test, predicted_y_gsf1))
resultado = pega_resultados("dt", "Grid Search", y_test, predicted_y_gsf1, "f1 score", pipeline_gsf1["GS"].best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

          -1       0.63      0.45      0.53      3208
           0       0.00      0.00      0.00      2918
           1       0.58      0.94      0.72      6137

    accuracy                           0.59     12263
   macro avg       0.40      0.46      0.41     12263
weighted avg       0.45      0.59      0.50     12263

['dt', 'Grid Search', 0.5884367609883389, 0.4144199128967814, 0.4019519488358177, 0.4642440731585842, 'f1 score', {'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(pipeline_gsf1["GS"].best_estimator_)
print(pipeline_gsf1["GS"].best_params_)

DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5, min_samples_leaf=2,
                       random_state=199)
{'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}


## Grid Search Accuracy Score

In [17]:
pipeline_gsac = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("GS", GridSearchCV(dtgs, parameters, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_gsac.fit(X_train, y_train)

Fitting 10 folds for each of 2048 candidates, totalling 20480 fits


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('GS',
                 GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=1, shuffle=True),
                              estimator=DecisionTreeClassifier(random_state=199),
                              n_jobs=-1,
                              param_grid={'criterion': ['gini', 'entropy'],
                                          'max_depth': [2, 3, 4, 5, 6, 7, 8, 9],
                                          'max_leaf_nodes': [2, 3, 4, 5],
                                          'min_samples_leaf': [2, 3, 4, 5],
                                          'min_samples_split': [2, 3, 4, 5, 6,
                                                                7, 8, 9]},
          

In [18]:
cvres = pipeline_gsac["GS"].cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

0.5886900779684672 {'criterion': 'gini', 'max_depth': 9, 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 9}
0.5886900779684672 {'criterion': 'gini', 'max_depth': 7, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 3}
0.5886900779684672 {'criterion': 'gini', 'max_depth': 4, 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 9}
0.5886900779684672 {'criterion': 'gini', 'max_depth': 7, 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 8}
0.5886900779684672 {'criterion': 'gini', 'max_depth': 7, 'max_leaf_nodes': 5, 'min_samples_leaf': 5, 'min_samples_split': 7}


In [19]:
predicted_y_gsac = pipeline_gsac.predict(X_test)
print(classification_report(y_test, predicted_y_gsac))
resultado = pega_resultados("dt", "Grid Search", y_test, predicted_y_gsac, "acuracia", pipeline_gsac["GS"].best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

          -1       0.63      0.45      0.53      3208
           0       0.00      0.00      0.00      2918
           1       0.58      0.94      0.72      6137

    accuracy                           0.59     12263
   macro avg       0.40      0.46      0.41     12263
weighted avg       0.45      0.59      0.50     12263

['dt', 'Grid Search', 0.5884367609883389, 0.4144199128967814, 0.4019519488358177, 0.4642440731585842, 'acuracia', {'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
print(pipeline_gsac["GS"].best_estimator_)
print(pipeline_gsac["GS"].best_params_)

DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5, min_samples_leaf=2,
                       random_state=199)
{'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}


## Randomized Search F1-Score

In [21]:
parameters_rs = dict()
parameters_rs["max_leaf_nodes"] = list(range(2,100))
parameters_rs["min_samples_split"] = list(range(2, 14))
parameters_rs["max_depth"] = list(range(2,14))
parameters_rs["criterion"] = ["gini", "entropy"] 
parameters_rs["min_samples_leaf"] = list(range(2,14))
dtrs = DecisionTreeClassifier(random_state=1)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=199)

In [22]:
pipeline_rsf1 = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("RS", RandomizedSearchCV(dtrs, parameters_rs, scoring="f1_macro", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_rsf1.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('RS',
                 RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=199, shuffle=True),
                                    estimator=DecisionTreeClassifier(random_state=1),
                                    n_jobs=-1,
                                    param_distributions={'criterion': ['gini',
                                                                       'entropy'],
                                                         'max_depth': [2, 3, 4,
                                                                       5, 6, 7,
                                                                       8, 9, 10,
                                                  

In [23]:
cvres = pipeline_rsf1["RS"].cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

0.5079789973442697 {'min_samples_split': 8, 'min_samples_leaf': 10, 'max_leaf_nodes': 80, 'max_depth': 13, 'criterion': 'gini'}
0.49206406304755196 {'min_samples_split': 9, 'min_samples_leaf': 7, 'max_leaf_nodes': 73, 'max_depth': 11, 'criterion': 'gini'}
0.48017949066502175 {'min_samples_split': 11, 'min_samples_leaf': 7, 'max_leaf_nodes': 97, 'max_depth': 13, 'criterion': 'entropy'}
0.4571819556181705 {'min_samples_split': 11, 'min_samples_leaf': 9, 'max_leaf_nodes': 27, 'max_depth': 12, 'criterion': 'entropy'}
0.4505897558162119 {'min_samples_split': 13, 'min_samples_leaf': 9, 'max_leaf_nodes': 68, 'max_depth': 9, 'criterion': 'gini'}


In [24]:
predicted_y_rsf1 = pipeline_rsf1.predict(X_test)
print(classification_report(y_test, predicted_y_rsf1))
resultado = pega_resultados("dt", "Randomized Search", y_test, predicted_y_rsf1, "f1 score", pipeline_rsf1["RS"].best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

          -1       0.58      0.78      0.67      3208
           0       0.43      0.06      0.10      2918
           1       0.70      0.86      0.77      6137

    accuracy                           0.65     12263
   macro avg       0.57      0.57      0.51     12263
weighted avg       0.60      0.65      0.58     12263

['dt', 'Randomized Search', 0.648454701133491, 0.5129431467634921, 0.5715241533099452, 0.5664856647747397, 'f1 score', {'min_samples_split': 8, 'min_samples_leaf': 10, 'max_leaf_nodes': 80, 'max_depth': 13, 'criterion': 'gini'}]


In [25]:
print(pipeline_rsf1["RS"].best_estimator_)
print(pipeline_rsf1["RS"].best_params_)

DecisionTreeClassifier(max_depth=13, max_leaf_nodes=80, min_samples_leaf=10,
                       min_samples_split=8, random_state=1)
{'min_samples_split': 8, 'min_samples_leaf': 10, 'max_leaf_nodes': 80, 'max_depth': 13, 'criterion': 'gini'}


## Randomized Search Accuracy

In [26]:
pipeline_rsac = Pipeline([("Cleaner", Cleaner()), 
                    ("Tokenizador", Tokenizador("portuguese")), 
                    ("RemoveStopWords", RemoveStopWords("portuguese")), 
                    ("Stemmer", Stemmer()), 
                    ("Joiner", Joiner()),
                    ("Tfidf", TfidfVectorizer()),
                    ("RS", RandomizedSearchCV(dtrs, parameters_rs, scoring="accuracy", cv=kfold, verbose=1, refit=True,n_jobs=-1)),
                    ])

pipeline_rsac.fit(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Pipeline(steps=[('Cleaner', Cleaner()),
                ('Tokenizador', Tokenizador(lingua='portuguese')),
                ('RemoveStopWords', RemoveStopWords(lingua='portuguese')),
                ('Stemmer', Stemmer()), ('Joiner', Joiner()),
                ('Tfidf', TfidfVectorizer()),
                ('RS',
                 RandomizedSearchCV(cv=StratifiedKFold(n_splits=10, random_state=199, shuffle=True),
                                    estimator=DecisionTreeClassifier(random_state=1),
                                    n_jobs=-1,
                                    param_distributions={'criterion': ['gini',
                                                                       'entropy'],
                                                         'max_depth': [2, 3, 4,
                                                                       5, 6, 7,
                                                                       8, 9, 10,
                                                  

In [27]:
cvres = pipeline_rsac["RS"].cv_results_
idx_args = np.argsort(cvres["mean_test_score"])[::-1]
for i in idx_args[:5]:
    print(cvres["mean_test_score"][i], cvres["params"][i])

0.6355944457512057 {'min_samples_split': 5, 'min_samples_leaf': 6, 'max_leaf_nodes': 47, 'max_depth': 11, 'criterion': 'gini'}
0.6301071226799795 {'min_samples_split': 2, 'min_samples_leaf': 11, 'max_leaf_nodes': 24, 'max_depth': 11, 'criterion': 'gini'}
0.6239557327865942 {'min_samples_split': 7, 'min_samples_leaf': 8, 'max_leaf_nodes': 36, 'max_depth': 10, 'criterion': 'gini'}
0.6147636190793024 {'min_samples_split': 13, 'min_samples_leaf': 4, 'max_leaf_nodes': 98, 'max_depth': 7, 'criterion': 'gini'}
0.607074464148452 {'min_samples_split': 10, 'min_samples_leaf': 4, 'max_leaf_nodes': 70, 'max_depth': 4, 'criterion': 'gini'}


In [28]:
predicted_y_rsac = pipeline_rsac.predict(X_test)
print(classification_report(y_test, predicted_y_rsac))
resultado = pega_resultados("dt", "Randomized Search", y_test, predicted_y_rsac, "acuracia", pipeline_rsac["RS"].best_params_)
resultados.append(resultado)
print(resultado)

              precision    recall  f1-score   support

          -1       0.55      0.79      0.64      3208
           0       0.37      0.01      0.03      2918
           1       0.69      0.84      0.76      6137

    accuracy                           0.63     12263
   macro avg       0.53      0.55      0.48     12263
weighted avg       0.58      0.63      0.55     12263

['dt', 'Randomized Search', 0.6314115632390117, 0.4760656703997103, 0.5349764611418134, 0.5479704759249275, 'acuracia', {'min_samples_split': 5, 'min_samples_leaf': 6, 'max_leaf_nodes': 47, 'max_depth': 11, 'criterion': 'gini'}]


In [29]:
print(pipeline_rsac["RS"].best_estimator_)
print(pipeline_rsac["RS"].best_params_)

DecisionTreeClassifier(max_depth=11, max_leaf_nodes=47, min_samples_leaf=6,
                       min_samples_split=5, random_state=1)
{'min_samples_split': 5, 'min_samples_leaf': 6, 'max_leaf_nodes': 47, 'max_depth': 11, 'criterion': 'gini'}


In [30]:
salvando_em_arquivo("resultados/DT_resultados.csv", resultados)