In [68]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score,confusion_matrix, accuracy_score, make_scorer, f1_score,precision_score,recall_score, plot_confusion_matrix
nltk.download('rslp')
nltk.download('stopwords')

[nltk_data] Downloading package rslp to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cfpc2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [69]:
dataset = pd.read_csv("datasets/reviews.csv")

In [70]:
dataset.head(5)

Unnamed: 0.1,Unnamed: 0,order_id,review_id,review_score,review_comment_message
0,3,658677c97b385a9be170737859d3511b,e64fb393e7b32834bb789ff8bb30750e,1,Recebi bem antes do prazo estipulado.
1,4,8e6bfb81e283fa7e4f11123a3fb894f1,f7c4243c7fe1938f181bec41a392bdeb,1,Parabéns lojas lannister adorei comprar pela I...
2,9,b9bf720beb4ab3728760088589c62129,8670d52e15e00043ae7de4c01cc2fe06,0,aparelho eficiente. no site a marca do aparelh...
3,12,9d6f15f95d01e79bd1349cc208361f09,4b49719c8a200003f700d3d986ea1a19,0,"Mas um pouco ,travando...pelo valor ta Boa.\r\n"
4,15,e51478e7e277a83743b6f9991dbfa3fb,3948b09f7c818e2d86c9a546758b2335,1,"Vendedor confiável, produto ok e entrega antes..."


In [71]:
stopword = stopwords.words("portuguese")
stem = RSLPStemmer()
vectorizer = TfidfVectorizer()
def clear(review):
  review = review.lower()
  # remove pula de linha 
  review = re.sub('\n', ' ', review)        
  review = re.sub('\r', ' ', review)

  # remove numero 
  review = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' #numero ', review)

  # remove caracters especiais 
  review = re.sub(r'R\$', ' ', review)
  review = re.sub(r'\W', ' ', review)
  review = re.sub(r'\s+', ' ', review)

  # remove links 
  urls = re.findall('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', review)
  if len(urls) > 0:
      for url in urls:
          for link in url:
              review = review.replace(link, '')
      review = review.replace(':', '')
      review = review.replace('/', '')
  return review
reviews = dataset["review_comment_message"]
reviews = reviews.apply(lambda review: clear(review))
reviews = reviews.apply(lambda review: word_tokenize(review))
reviews = reviews.apply(lambda words_review: [word for word in words_review if word not in stopword])
reviews = reviews.apply(lambda words_review: [stem.stem(word) for word in words_review ])
reviews = reviews.apply(lambda words_review: " ".join(words_review))
reviews = vectorizer.fit_transform(reviews)
reviews


<40874x7920 sparse matrix of type '<class 'numpy.float64'>'
	with 277461 stored elements in Compressed Sparse Row format>

In [72]:
X = reviews.copy()
y = dataset["review_score"].copy()
y = np.array(y)

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 199)

In [74]:
dt = DecisionTreeClassifier(random_state=199)
dt.fit(X_train, y_train)
print(classification_report(y_test, dt.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.69      0.71      0.70      3208
           0       0.37      0.27      0.31      2918
           1       0.74      0.82      0.78      6137

    accuracy                           0.66     12263
   macro avg       0.60      0.60      0.60     12263
weighted avg       0.64      0.66      0.65     12263



In [75]:
parameters = dict()
parameters["max_leaf_nodes"] = list(range(2, 6))
parameters["min_samples_split"] = list(range(2, 10))
parameters["max_depth"] = list(range(2,10))
parameters["criterion"] = ["gini", "entropy"] 
parameters["min_samples_leaf"] = list(range(2,6))

In [76]:
dtgs = DecisionTreeClassifier(random_state=199)

In [77]:
search = GridSearchCV(dtgs, parameters, scoring="accuracy", cv=10, verbose=1, refit=True,n_jobs=-4)

In [78]:
result = search.fit(X_train, y_train)

Fitting 10 folds for each of 2048 candidates, totalling 20480 fits


In [79]:
best_model = result.best_estimator_
predicted_y = best_model.predict(X_test)
print(classification_report(y_test, predicted_y))

              precision    recall  f1-score   support

          -1       0.63      0.46      0.53      3208
           0       0.00      0.00      0.00      2918
           1       0.58      0.94      0.72      6137

    accuracy                           0.59     12263
   macro avg       0.40      0.46      0.41     12263
weighted avg       0.45      0.59      0.50     12263



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
print(result.best_estimator_)
print(search.best_params_)

DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5, min_samples_leaf=2,
                       random_state=199)
{'criterion': 'gini', 'max_depth': 3, 'max_leaf_nodes': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}
