## 1. Connect to the Dataset (Google Colab only)

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset is later read from a path
# under this mount point.
drive.mount('/content/drive')

## Check hardware and versions

In [None]:
import os, platform, subprocess, re

import sys
import sklearn
import pandas as pd
import nltk
import imblearn

def get_processor_name():
    """Return a human-readable CPU model name for the current machine.

    The lookup depends on ``platform.system()``:

    * Windows: ``platform.processor()``.
    * Darwin: output of ``sysctl -n machdep.cpu.brand_string``.
    * Linux: the first ``model name`` entry of ``/proc/cpuinfo``.

    Returns:
        The processor name as ``str`` (stripped of surrounding whitespace on
        Darwin and Linux), or ``""`` when the platform is not recognised or
        the name cannot be determined.
    """
    system = platform.system()

    if system == "Windows":
        return platform.processor()

    if system == "Darwin":
        # An argument list (no shell) is required here: a single command
        # string without shell=True is treated as the program name and fails.
        # The absolute path avoids having to extend PATH on every call.
        try:
            output = subprocess.check_output(
                ["/usr/sbin/sysctl", "-n", "machdep.cpu.brand_string"]
            )
        except (OSError, subprocess.CalledProcessError):
            return ""
        # check_output returns bytes; decode so every branch returns str.
        return output.decode().strip()

    if system == "Linux":
        # Read the file directly instead of spawning `cat` through a shell.
        try:
            with open("/proc/cpuinfo", encoding="utf-8", errors="replace") as cpuinfo:
                for line in cpuinfo:
                    if "model name" in line:
                        # Drop the "model name : " prefix and the padding
                        # that follows the colon.
                        return re.sub(r".*model name.*:", "", line, count=1).strip()
        except OSError:
            return ""

    return ""

# Report the interpreter, CPU and library versions used for this run,
# one "label: value" line per item.
print(
    f"Python version: {sys.version}",
    f"Processor: {get_processor_name()}",
    f"sklearn version: {sklearn.__version__}",
    f"pandas version: {pd.__version__}",
    f"nltk version: {nltk.__version__}",
    f"imbalanced-learn version: {imblearn.__version__}",
    sep="\n",
)

## 2. Loading the dataset

In [None]:
import pandas as pd

random_state = 42 # shared seed, reused by the sampling/splitting/model cells for reproducibility

# Global constants: names of the DataFrame columns referenced by the notebook.
SUBMISSION_DATE = 'submission_date'
OVERALL_RATING = 'overall_rating'
RECOMMEND_TO_A_FRIEND = 'recommend_to_a_friend'
REVIEW_TEXT = 'review_text'
REVIEWER_GENDER = 'reviewer_gender'
# Column that the pre-processing step adds to the sampled rows.
PROCESSED_TEXT = 'processed_text'

# Reference only: expected column -> dtype mapping. It is not passed to
# read_csv below.
# dtypes = {
#     'submission_date': str,
#     'reviewer_id': 'string',
#     'product_id': 'string',
#     'product_name': str,
#     'product_brand': str,
#     'site_category_lv1': str,
#     'site_category_lv2': str,
#     'review_title': str,
#     'overall_rating': int,
#     'recommend_to_a_friend': bool,
#     'review_text': str,
#     'reviewer_birth_year': float,
#     'reviewer_gender': str,
#     'reviewer_state': str
# }

# Read the raw reviews from the mounted drive, parsing the submission date
# column as datetimes.
df = pd.read_csv(
    '/content/drive/MyDrive/Datasets/B2W-Reviews01.csv',
    parse_dates=[SUBMISSION_DATE],
    low_memory=False,
)

# Column types, then the row count, then the structural summary, each
# separated by blank lines.
print('dtypes:', df.dtypes, sep='\n')
print('\n\n')
print(f'Data size: {len(df)}')
print('\n\n')
df.info()

## 3. Exploring the dataset

### Checking for columns with missing data:

In [None]:
def calculate_percentage(number_of_null_rows, total_rows=None):
  """Format a row count as a percentage of a total, with two decimals.

  Args:
    number_of_null_rows: The count to express as a percentage. Despite the
      name, callers also pass other counts (e.g. rows with a given rating).
    total_rows: Denominator. When omitted, the length of the global ``df``
      is used, which is the original behaviour.

  Returns:
    A string such as ``'27.02%'``. ``'0.00%'`` is returned when the total is
    zero, so an empty frame does not produce a division error or ``nan%``.
  """
  if total_rows is None:
    total_rows = len(df)
  if total_rows == 0:
    return '0.00%'
  percentage = (number_of_null_rows / total_rows) * 100
  return f'{percentage:.2f}%'

def check_for_null(column_name):
  """Print the number of null rows in one column of the global ``df``.

  The line also includes that count as a percentage, formatted by
  ``calculate_percentage``.
  """
  null_count = df[column_name].isna().sum()
  share = calculate_percentage(null_count)
  print(f'{ column_name }: { null_count }, which is { share }')


# One report line per column of the dataset.
for column_name in df.columns:
  check_for_null(column_name)

### Checking the distribution of ratings:

In [None]:
def total_of_ratings(dataframe, rating):
  """Print how many rows of *dataframe* have the given star rating.

  The printed share is relative to ``len(dataframe)``, so the function gives
  a correct percentage for whichever frame is passed in (previously the
  denominator always came from the global ``df``).

  Args:
    dataframe: Frame containing the overall-rating column.
    rating: Star value to count.
  """
  total = (dataframe[OVERALL_RATING] == rating).sum()
  row_count = len(dataframe)
  # An empty frame has no meaningful share; report 0 instead of dividing by 0.
  percentage = (total / row_count) * 100 if row_count else 0.0
  print(f'{ rating } stars: { total }, which is {percentage:.2f}%')


# Distribution over the five possible star values of the full dataset.
for rating in [1, 2, 3, 4, 5]:
  total_of_ratings(df, rating)

This means that, if we take 3 stars as neutral, unfavorable reviews would be 27.02% and favorable reviews would be 60.66%.

### Drawing a word cloud of the positive and negative reviews

In [None]:
import matplotlib.pyplot as plt
import random
from wordcloud import WordCloud

def blue_shades_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an HSL colour string in a random shade of blue.

    Hue (210) and saturation (100%) are fixed; only the lightness varies,
    between 30% and 70% inclusive.

    Args:
        word, font_size, position, orientation, **kwargs: Accepted so the
            function matches the word-cloud colour-function call signature;
            they are not used.
        random_state: Optional ``random.Random``-like object exposing
            ``randint``. When given, it is used to draw the lightness so the
            colours follow the caller's generator; when ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A string of the form ``"hsl(210, 100%, <lightness>%)"``.
    """
    rng = random if random_state is None else random_state

    hue = 210  # Hue for blue in HSL
    saturation = 100  # Full saturation
    lightness = rng.randint(30, 70)  # Only the lightness varies between words

    return f"hsl({hue}, {saturation}%, {lightness}%)"


def red_shades_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return an HSL colour string in a random shade of red.

    Saturation (100%) and lightness (50%) are fixed; only the hue varies,
    between 0 and 20 inclusive.

    Args:
        word, font_size, position, orientation, **kwargs: Accepted so the
            function matches the word-cloud colour-function call signature;
            they are not used.
        random_state: Optional ``random.Random``-like object exposing
            ``randint``. When given, it is used to draw the hue so the
            colours follow the caller's generator; when ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A string of the form ``"hsl(<hue>, 100%, 50%)"``.
    """
    rng = random if random_state is None else random_state

    hue = rng.randint(0, 20)  # Only the hue varies between words
    saturation = 100  # Full saturation
    lightness = 50  # Mid lightness

    return f"hsl({hue}, {saturation}%, {lightness}%)"


# Only rows that actually contain review text can feed a word cloud.
filtered_df = df.dropna(subset=[REVIEW_TEXT])

# 1-2 stars count as negative and 4-5 stars as positive; 3 stars is left out.
is_negative = filtered_df[OVERALL_RATING].isin([1, 2])
is_positive = filtered_df[OVERALL_RATING].isin([4, 5])
negative_reviews = filtered_df[is_negative]
positive_reviews = filtered_df[is_positive]

# Lower-case every review and concatenate each group into one long string.
negative_text = " ".join(negative_reviews[REVIEW_TEXT].str.lower())
positive_text = " ".join(positive_reviews[REVIEW_TEXT].str.lower())

# Blue cloud for positive reviews, red cloud for negative ones.
positive_wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    color_func=blue_shades_color_func,
).generate(positive_text)
negative_wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    color_func=red_shades_color_func,
).generate(negative_text)

# Show each cloud on its own figure, positive first, without axes.
for cloud in (positive_wordcloud, negative_wordcloud):
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()

## 4. Pre-Processing of Data

### Select the samples

In [None]:
# SAMPLE_SIZE = 15000
SAMPLE_SIZE = 3000

# Keep only rows that have review text.
samples = df.dropna(subset=[REVIEW_TEXT])

# Keep only clearly negative (1-2 stars) or positive (4-5 stars) reviews;
# 3 stars is treated as neutral. An explicit allow-list also removes rows with
# a missing or unexpected rating, which `!= 3` would keep and which the
# binary label mapping applied later cannot represent.
samples = samples[samples[OVERALL_RATING].isin([1, 2, 4, 5])]

# Draw a reproducible random subset of SAMPLE_SIZE rows.
samples = samples.sample(
    n=SAMPLE_SIZE,
    random_state=random_state
)

In [None]:
# Confirm how many rows were sampled.
print('samples size is', len(samples))

### Remove stop words and apply stemming

In [None]:
!pip install nltk

import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer
from nltk.tokenize import RegexpTokenizer

# Get the stopwords for Portuguese
# Download the NLTK resources requested by this notebook.
for resource in ('stopwords', 'punkt', 'rslp'):
  nltk.download(resource)

# When False, the raw review text is used without any pre-processing.
SHOULD_PREPROCESS_TEXT = True

# Portuguese stop words, held in a set for fast membership checks.
portuguese_stopwords = set(stopwords.words('portuguese'))

print(f'portuguese stopwords: {portuguese_stopwords}')


# Built once and reused by preprocess(): creating a tokenizer and a stemmer
# inside the function repeated the same setup work for every single review.
_TOKENIZER = RegexpTokenizer(r'\w+')
_STEMMER = RSLPStemmer()


def preprocess(text):
  """Tokenize a review, drop Portuguese stop words and stem what remains.

  Args:
    text: Raw review text.

  Returns:
    The stemmed, stop-word-free tokens joined by single spaces.
  """
  # \w+ tokenization keeps word characters only, which discards punctuation.
  words = _TOKENIZER.tokenize(text)
  # Stop words are matched case-insensitively.
  filtered_words = [word for word in words if word.lower() not in portuguese_stopwords]
  return ' '.join(_STEMMER.stem(word) for word in filtered_words)


# Build the text column used downstream: the stop-word-filtered, stemmed text
# when pre-processing is enabled, otherwise the raw review text.
samples[PROCESSED_TEXT] = (
  samples[REVIEW_TEXT].apply(preprocess)
  if SHOULD_PREPROCESS_TEXT
  else samples[REVIEW_TEXT]
)


print('\n\nSamples:', samples[PROCESSED_TEXT], sep='\n')

## 5. Prepare training and test

### Create X and Y values

In [None]:
# Features: the processed review texts as an array.
X = samples[PROCESSED_TEXT].values

# Labels: 1 and 2 stars become 0 (negative), 4 and 5 stars become 1 (positive).
Y = samples[OVERALL_RATING].map({1: 0, 2: 0, 4: 1, 5: 1}).values

print('X values:', X, sep='\n')

print('\nY values:', Y, sep='\n')

### Separate training and test

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the samples held out for testing.
TEST_PERCENTAGE = 0.2

# Hold out TEST_PERCENTAGE of the samples; the shared seed keeps the split
# repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_PERCENTAGE,
    random_state=random_state,
)

### Balance the training data

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.utils import shuffle


print(f'X_train length: {len(X_train)}')
print(X_train)

# fit_resample is given a 2-D array here: one row per review, with the text
# as its single column. The result is flattened back to 1-D afterwards.
reshaped_X_train = X_train.reshape(-1, 1)
print(reshaped_X_train)

print(f'\n\nY_train length: {len(Y_train)}')
print(Y_train)

# Not used further in this cell; kept so the name stays defined.
Y_train_series = pd.Series(Y_train)

# Use the notebook-wide seed instead of a separate literal so every random
# step is controlled from one place.
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=random_state)
X_train_resampled, Y_train_resampled = undersampler.fit_resample(reshaped_X_train, Y_train)

X_train_resampled = X_train_resampled.flatten()

print(f'\n\nX_train_resampled length: {len(X_train_resampled)}')
print(X_train_resampled)

print(f'\n\nY_train_resampled length: {len(Y_train_resampled)}')

# Shuffle texts and labels in a single call so the same permutation is
# applied to both, rather than relying on two separate calls happening to
# share a seed.
X_train_resampled, Y_train_resampled = shuffle(
    X_train_resampled, Y_train_resampled, random_state=random_state
)

print(f'\n\nX_test length: {len(X_test)}')
print(X_test)

print(f'\n\nY_test length: {len(Y_test)}')
print(Y_test)

## 6. Train and evaluate

### Vectorize, fit and transform

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer created with its default settings.
vectorizer = TfidfVectorizer()

# Fit on the balanced training texts only, then reuse the fitted vectorizer
# to transform the test texts.
X_train_vectorized = vectorizer.fit_transform(X_train_resampled)
X_test_vectorized = vectorizer.transform(X_test)

### Run model to evaluate

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, f1_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn
import time

# Confusion matrices collected by evaluate_model, in call order.
confusion_matrices = []


def _positive_class_scores(model_instance, features, fallback):
  """Return continuous scores for the positive class, or *fallback* if the model exposes none."""
  if hasattr(model_instance, 'decision_function'):
    return model_instance.decision_function(features)
  if hasattr(model_instance, 'predict_proba'):
    # Column 1 is the positive class for the 0/1 labels used here.
    return model_instance.predict_proba(features)[:, 1]
  return fallback


def evaluate_model(model_name, model_instance):
  """Fit a classifier, report its test metrics and record its confusion matrix.

  The model is trained on the vectorized, balanced training data and
  evaluated on the vectorized test data. Accuracy, macro recall, macro F1 and
  the elapsed time are printed, a ROC curve is plotted, and the confusion
  matrix is appended to the module-level ``confusion_matrices`` list.

  Args:
    model_name: Label used in the printed metric lines.
    model_instance: Unfitted estimator exposing ``fit`` and ``predict``.
  """
  t1 = time.time()
  # Train the model
  model_instance.fit(X_train_vectorized, Y_train_resampled)
  # Make predictions on the test data
  Y_pred = model_instance.predict(X_test_vectorized)

  accuracy = accuracy_score(Y_test, Y_pred)
  recall = recall_score(Y_test, Y_pred, average='macro')
  f1 = f1_score(Y_test, Y_pred, average='macro')
  print(f'Accuracy of {model_name}: {accuracy}')
  print(f'Recall of {model_name}: {recall}')
  print(f'F1 of {model_name}: {f1}')

  t2 = time.time()
  print(f'\nTook: {t2 - t1} seconds to run')

  # ROC analysis needs continuous scores: hard 0/1 predictions give a curve
  # with a single operating point and an AUC that does not reflect ranking
  # quality. Predictions are used only if the model offers no scores.
  Y_score = _positive_class_scores(model_instance, X_test_vectorized, Y_pred)
  fpr, tpr, _ = roc_curve(Y_test, Y_score)
  roc_auc = roc_auc_score(Y_test, Y_score)

  plot_roc_curve(fpr, tpr, roc_auc)

  # astype returns a new array, so the converted matrix is what gets stored
  # (the original discarded the conversion result).
  data = confusion_matrix(Y_test, Y_pred).astype(int)
  confusion_matrices.append(data)


def plot_roc_curve(fpr, tpr, roc_auc):
  """Draw one ROC curve on a new figure and display it.

  Args:
    fpr: False positive rates (x axis).
    tpr: True positive rates (y axis).
    roc_auc: Area under the curve, shown in the legend with two decimals.
  """
  _, axis = plt.subplots()
  axis.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
  axis.set_xlabel('False Positive Rate')
  axis.set_ylabel('True Positive Rate')
  axis.set_title('Receiver Operating Characteristic')
  axis.legend(loc="lower right")
  plt.show()


def plot_confusion_matrix(matrix, labels, ax):
  """Draw one confusion matrix as an annotated heatmap on the given axes.

  Args:
    matrix: Confusion matrix of integer counts.
    labels: Class names used as tick labels on both axes.
    ax: Matplotlib axes to draw on.
  """
  # Theme settings: colour codes enabled and a larger font scale.
  seaborn.set(color_codes=True, font_scale=1.4)
  seaborn.heatmap(
    matrix,
    annot=True,
    fmt='d',
    cmap="YlGnBu",
    cbar_kws={'label': 'Scale'},
    ax=ax,
  )
  ax.set_xticklabels(labels)
  ax.set_yticklabels(labels)
  ax.set_ylabel("True Label")
  ax.set_xlabel("Predicted Label")


def plot_confusion_matrices(matrices_list, labels, titles):
  """Plot confusion matrices as heatmaps in a two-column grid.

  The number of rows grows with the number of matrices, so any count can be
  plotted; four matrices still produce the original 2x2 grid on a 12x12 inch
  figure. Grid cells without a matrix are hidden.

  Args:
    matrices_list: Confusion matrices to plot, in display order.
    labels: Class names used as tick labels on every heatmap.
    titles: One title per matrix, in the same order as ``matrices_list``.
  """
  num_matrices = len(matrices_list)
  num_cols = 2  # Number of columns in the grid
  # Ceiling division, with at least one row so the grid is always valid.
  num_rows = max(1, (num_matrices + num_cols - 1) // num_cols)

  # squeeze=False keeps `axes` two-dimensional even when there is one row.
  fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 6 * num_rows), squeeze=False)
  # Row-major flattening: index i lands on row i // num_cols, col i % num_cols.
  flat_axes = axes.ravel()

  for i, matrix in enumerate(matrices_list):
    ax = flat_axes[i]
    plot_confusion_matrix(matrix, labels, ax)
    ax.set_title(titles[i])

  # Hide the leftover cells instead of showing empty axes.
  for unused_ax in flat_axes[num_matrices:]:
    unused_ax.set_visible(False)

  # Adjust layout and display
  plt.tight_layout()
  plt.show()

In [None]:
from sklearn.svm import SVC

# Train and evaluate a support vector classifier with a linear kernel.
evaluate_model('Support Vector Machines', SVC(kernel='linear'))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train and evaluate a multinomial Naive Bayes classifier with default settings.
evaluate_model('Naive Bayes', MultinomialNB())

In [None]:
from sklearn.linear_model import LogisticRegression

# Train and evaluate a logistic regression classifier (n_jobs=-1 is passed
# through to scikit-learn).
evaluate_model('Logistic Regression', LogisticRegression(n_jobs=-1))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train and evaluate a random forest, seeded with the shared random_state.
evaluate_model('Random Forest', RandomForestClassifier(n_jobs=-1, random_state=random_state))

In [None]:
# Titles are matched to matrices by position, so they must follow the order in
# which evaluate_model was called. Re-running an evaluation cell appends
# another matrix to confusion_matrices and shifts this pairing.
plot_confusion_matrices(confusion_matrices, labels=['negative', 'positive'], titles=['SVM', 'Naive Bayes', 'Regressão Logística', 'Random Forest'])