# Coleta de dados

In [31]:
import re
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [32]:
# Load the reviews table from a CSV path relative to the current working directory.
# Later cells read its "review_comment_message" and "review_score" columns.
dataset = pd.read_csv("datasets/reviews.csv")

In [34]:

# Shared preprocessing resources for the cleaning / tokenizing steps in this cell.
# The stopwords are kept in a set so that each `word not in stopword` check during
# token filtering is a constant-time lookup instead of a linear scan of a list.
stopword = set(stopwords.words("portuguese"))
stem = RSLPStemmer()  # NLTK's RSLP stemmer for Portuguese
vectorizer = TfidfVectorizer()  # TF-IDF vectorizer with default settings
# Patterns used by clear(); raw strings avoid invalid-escape warnings for `\w`.
# The URL pattern is the original one with non-capturing groups, so a URL is
# matched (and removed) as a whole instead of fragment by fragment.
_URL_RE = re.compile(
  r'(?:https?|ftp)://[\w_-]+(?:\.[\w_-]+)+'
  r'(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
)
# The text is lowercased before matching, so the currency marker is "r$".
_CURRENCY_RE = re.compile(r'\br\$')
# Integer, optional decimal part, optional exponent ("10", "3.14", "1e5").
_NUMBER_RE = re.compile(r'\d+(?:\.\d+)?(?:[eE]\d+)?')


def clear(review):
  """Normalize a raw review into lowercase words separated by single spaces.

  Steps, in order:
    1. lowercase the text;
    2. remove URLs (http, https, ftp) while "://" is still intact;
    3. remove the "r$" currency marker;
    4. replace every number with the placeholder ' #numero ' (the '#' is
       stripped by the next step, leaving the token "numero");
    5. replace every non-word character (punctuation, line breaks) with a
       space and collapse runs of whitespace into one space.

  Args:
    review: the review text. Any non-string value (e.g. NaN for a missing
      comment) is treated as an empty review.

  Returns:
    The cleaned string. A single leading or trailing space may remain; the
    result is not stripped.
  """
  if not isinstance(review, str):
    return ''

  review = review.lower()

  # URLs must be removed before punctuation is stripped: once ':' and '/'
  # are gone, a URL can no longer be recognized.
  review = _URL_RE.sub(' ', review)
  review = _CURRENCY_RE.sub(' ', review)
  review = _NUMBER_RE.sub(' #numero ', review)

  # \W also covers '\n' and '\r', so line breaks need no separate handling.
  review = re.sub(r'\W', ' ', review)
  return re.sub(r'\s+', ' ', review)
def preprocess(review):
  """Turn one raw review into a space-joined string of stemmed tokens.

  The review is cleaned with clear(), tokenized, filtered against the
  Portuguese stopwords and stemmed, in that order.
  """
  tokens = word_tokenize(clear(review))
  return " ".join(stem.stem(token) for token in tokens if token not in stopword)


# One pass over the column instead of five chained .apply calls.
review_texts = dataset["review_comment_message"].apply(preprocess)

# NOTE(review): the vectorizer is fitted on every row here, while the train/test
# split only happens in a later cell, so the vocabulary and IDF weights are also
# learned from rows that end up in the test set. Consider fitting on the training
# rows only (e.g. via a Pipeline inside the search).
reviews = vectorizer.fit_transform(review_texts)
reviews

<40874x7920 sparse matrix of type '<class 'numpy.float64'>'
	with 277461 stored elements in Compressed Sparse Row format>

In [35]:
# Feature matrix: an independent copy of the vectorized reviews.
X = reviews.copy()
# Target vector: the review scores as a standalone NumPy array.
y = dataset["review_score"].to_numpy(copy=True)

In [36]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified, although the class supports printed in the
# classification reports further down are imbalanced — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.3,
  random_state=199,
)

In [37]:
# Hyperparameter search space passed to both the grid search and the random search.
model_params = dict(
  C=range(1, 11),  # integer values 1 through 10
  kernel=['linear', 'rbf', 'poly', 'sigmoid'],
)

In [38]:
# Grid search: evaluates every (C, kernel) combination in `model_params`
# with 5-fold cross-validation.
gs = GridSearchCV(
  estimator=SVC(),
  param_grid=model_params,
  # Explicit so that both searches state the metric they select on; accuracy is
  # also what the classifier's default score computes, so the ranking is unchanged.
  scoring='accuracy',
  cv=5,
  n_jobs=-1,  # run fits in parallel on all available cores
  verbose=3,
)

# fit() returns the fitted search object, so gs_svm refers to the same object as gs.
gs_svm = gs.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [39]:
# Best hyperparameters found by the grid search.
print(gs_svm.best_params_)

{'C': 1, 'kernel': 'rbf'}


In [40]:
# Test-set performance of the best hyperparameters found by the grid search.
print(classification_report(y_test, gs_svm.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.74      0.87      0.80      3208
           0       0.60      0.24      0.34      2918
           1       0.78      0.93      0.85      6137

    accuracy                           0.75     12263
   macro avg       0.71      0.68      0.66     12263
weighted avg       0.73      0.75      0.72     12263



In [41]:
# Random search: evaluates 10 combinations sampled from `model_params`
# with 5-fold cross-validation, selecting on accuracy.
rs = RandomizedSearchCV(
  estimator=SVC(),
  param_distributions=model_params,
  n_iter=10,
  scoring='accuracy',
  cv=5,
  n_jobs=-1,
  random_state=199,  # fixed seed so the sampled combinations are reproducible
)

rs_svm = rs.fit(X_train, y_train)

In [42]:
# Best hyperparameters found by the random search.
print(rs_svm.best_params_)

{'kernel': 'rbf', 'C': 1}


In [43]:
# Test-set performance of the best hyperparameters found by the random search.
print(classification_report(y_test, rs_svm.predict(X_test)))

              precision    recall  f1-score   support

          -1       0.74      0.87      0.80      3208
           0       0.60      0.24      0.34      2918
           1       0.78      0.93      0.85      6137

    accuracy                           0.75     12263
   macro avg       0.71      0.68      0.66     12263
weighted avg       0.73      0.75      0.72     12263

