# Coleta de dados

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Classes do modelo
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 
nltk.download('rslp')
nltk.download('stopwords')

In [None]:
dataset = pd.read_csv("datasets/reviews.csv")

# Pré-Processamento dos Dados

## Definindo Tipos

In [None]:
dataset.head(5)

In [None]:

stopword = stopwords.words("portuguese")
stem = RSLPStemmer()
vectorizer = TfidfVectorizer()
def clear(review):
  review = review.lower()
  # remove pula de linha 
  review = re.sub('\n', ' ', review)        
  review = re.sub('\r', ' ', review)

  # remove numero 
  review = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' #numero ', review)

  # remove caracters especiais 
  review = re.sub(r'R\$', ' ', review)
  review = re.sub(r'\W', ' ', review)
  review = re.sub(r'\s+', ' ', review)

  # remove links 
  urls = re.findall('(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', review)
  if len(urls) > 0:
      for url in urls:
          for link in url:
              review = review.replace(link, '')
      review = review.replace(':', '')
      review = review.replace('/', '')
  return review
reviews = dataset["review_comment_message"]
reviews = reviews.apply(lambda review: clear(review))
reviews = reviews.apply(lambda review: word_tokenize(review))
reviews = reviews.apply(lambda words_review: [word for word in words_review if word not in stopword])
reviews = reviews.apply(lambda words_review: [stem.stem(word) for word in words_review ])
reviews = reviews.apply(lambda words_review: " ".join(words_review))
reviews = vectorizer.fit_transform(reviews)
reviews


In [None]:
X = reviews.copy()
y = dataset["review_score"].copy()
y = np.array(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 199)


In [None]:
# vamos criar um classificador RandonForest:
model_RF= RandomForestClassifier(n_estimators=10, random_state=42)
model_RF.fit(X_train, y_train)

# e ver a sua performance no dataset de teste
print(classification_report(y_test, model_RF.predict(X_test)))


In [None]:
from sklearn import metrics

k_range=range(1,35)
scores=[]

for k in k_range:
    
    RF= RandomForestClassifier(n_estimators=k, random_state=42)
    RF.fit(X_train, y_train)
    y_pred=RF.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))    

In [None]:
max(scores)

In [None]:
import matplotlib.pyplot as plt

plt.plot(k_range, scores)
plt.xlabel('k value')
plt.ylabel('Accuracy')

plt.show()

# Otimizando com o Grid Search:

In [None]:
# Hiperprâmetros para o Random Forest:

param_grid = { 
    'n_estimators': [10, 15,20],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [None]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.model_selection import StratifiedKFold



model_RF= RandomForestClassifier( random_state=42)

skf = StratifiedKFold(n_splits=10,shuffle=True, random_state=1)

gs = GridSearchCV(model_RF, param_grid, refit= False, cv=skf)

gs.fit(X_train, y_train)

In [None]:
# Hiperparâmetros otimizados:
gs.best_params_

In [None]:
# Trinamento do modelo com o parâmetros otimizados:
model_RF1= RandomForestClassifier(n_estimators= 10, max_features= 'auto', criterion= 'gini', random_state=42)
model_RF1.fit(X_train, y_train)

# e ver a sua performance no dataset de teste
print(classification_report(y_test, model_RF1.predict(X_test)))



# Otimizando com Randon Search:

In [None]:
from sklearn.model_selection import RandomizedSearchCV

rand = RandomizedSearchCV(model_RF, param_grid, cv=skf, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X_train, y_train)

In [29]:
# Hiperparâmetros otimizados:
print("Os melhores hiparametros:", rand.best_params_)

Os melhores hiparametros: {'n_estimators': 15, 'max_features': 'sqrt', 'max_depth': 8, 'criterion': 'gini'}


In [30]:
# Trinamento do modelo com o parâmetros otimizados:
model_RF2= RandomForestClassifier(n_estimators= 15, max_features= 'sqrt', criterion= 'gini', random_state=42)
model_RF2.fit(X_train, y_train)

# e ver a sua performance no dataset de teste
print(classification_report(y_test, model_RF2.predict(X_test)))


              precision    recall  f1-score   support

          -1       0.70      0.84      0.76      3208
           0       0.52      0.21      0.30      2918
           1       0.76      0.90      0.82      6137

    accuracy                           0.72     12263
   macro avg       0.66      0.65      0.63     12263
weighted avg       0.69      0.72      0.68     12263

