In [2]:
# Install the required packages.
%pip install -r https://raw.githubusercontent.com/gomesluiz/product-review-analytics/main/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
# Required packages.
import os
import re
import numpy as np
import logging
import string

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

import nltk
import matplotlib.pyplot as plt
import pandas as pd

from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Fetch the NLTK stopword corpus and keep the Portuguese stopword list used
# by the cleaning and vectorization cells.
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("portuguese")

# Seeded generator shared by the sampling and train/test split steps.
RANDOM_SEED = 19730115
rng = np.random.RandomState(RANDOM_SEED)

# Later cells call `logger.info(...)`; define the logger here so they do not
# fail with a NameError when the notebook is run on a fresh kernel.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("Required packages installed.")

Required packages installed.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
# Notebook-wide configuration.

# Data lives in "<parent of the current working directory>/data".
DATA_ROOT_FOLDER = os.path.join(os.path.dirname(os.getcwd()), "data")
DATA_PROCESSED_FOLDER = os.path.join(DATA_ROOT_FOLDER, "processed")
DATA_EMBEDDINGS_FOLDER = os.path.join(DATA_ROOT_FOLDER, "embeddings")

# Raw reviews dataset.
URL_SOURCE = "https://raw.githubusercontent.com/gomesluiz/product-review-analytics/main/data/raw/buscape.csv"

# Reproducibility: one seeded generator shared by every stochastic step.
RANDOM_SEED = 19730115
rng = np.random.RandomState(RANDOM_SEED)

# Vocabulary size (max_features) given to the count / tf-idf vectorizers.
NUMBER_OF_WORDS = 50

# [embedding family, architecture] pairs to download.
EMBEDDING_NAMES = [
    [family, architecture]
    for family, architecture in (
        ("word2vec", "cbow_s50"),
        ("word2vec", "skip_s50"),
        ("fasttext", "cbow_s50"),
        ("fasttext", "skip_s50"),
        ("glove", "glove_s50"),
        ("wang2vec", "cbow_s50"),
        ("wang2vec", "skip_s50"),
    )
]


In [21]:

# Common Functions.
def load_dataset(path, frac=None):
    """Read the reviews CSV and optionally draw a stratified sample.

    Args:
        path (str): Path or URL of the CSV file (anything `pd.read_csv`
            accepts).
        frac (float, optional): Fraction of rows to keep from each
            `polarity` group. When None (or 0) the full dataset is returned.

    Returns:
        dataframe: A pandas dataframe. When `frac` is given, the rows are the
        per-polarity samples with the index reset to 0..n-1; rows with a
        missing polarity are not part of any group (pandas' default
        `groupby` behaviour) and are therefore left out.
    """
    dataset = pd.read_csv(path)

    if frac:
        # Honour the caller's fraction (it used to be hard-coded to 0.4) and
        # sample inside each polarity group so class proportions are kept.
        # The module-level `rng` keeps the draw reproducible.
        dataset = (
            dataset.groupby("polarity", group_keys=False)
            .sample(frac=frac, random_state=rng)
            .reset_index(drop=True)
        )

    return dataset


def load_pickle_dataset(path):
    """Load a pickled dataframe and split it into features and target.

    The first column is skipped, the last column is the target and every
    column in between is part of the feature matrix.

    Args:
        path (str): Full path of the pickled dataset.

    Returns:
        features(array) and target(array): the feature values and the
        target values.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    frame = pd.read_pickle(path)

    feature_columns = frame.iloc[:, 1:-1]
    target_column = frame.iloc[:, -1]

    return feature_columns.values, target_column.values


def count_word(text):
    """Return the number of whitespace-separated tokens in `text`."""
    tokens = text.split()
    return len(tokens)


def clean_text(text):
    """Make text lowercase, remove text in square brackets, remove punctuation,
        remove words containing numbers, remove typographic quotes/ellipsis
        and replace newlines with spaces.

    Args:
        text(str): string text to be cleaned.

    Returns:
        A cleaned text. Removed pieces are deleted without collapsing the
        surrounding whitespace, so runs of spaces may remain.

    """
    text = text.lower()
    # Raw strings: "\[", "\w" and "\d" are invalid escapes in normal literals.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    # The previous character class only listed ASCII characters that the
    # punctuation step above had already removed, so it did nothing.
    # Presumably the typographic quotes and ellipsis were intended.
    text = re.sub("[\u2018\u2019\u201c\u201d\u2026]", "", text)
    text = re.sub("\n", " ", text)

    return text


def text_to_vector(model, dataset, fit=True):
    """Vectorize the cleaned review text and attach the id and label columns.

    Args:
        model: A vectorizer exposing `fit_transform`, `transform` and
            `get_feature_names_out` (e.g. CountVectorizer, TfidfVectorizer).
        dataset (dataframe): Must contain the columns `original_index`,
            `review_text_cleaned_no_stopwords` and `polarity`.
        fit (bool): When True (default) the vectorizer is fitted on `dataset`
            before transforming. Pass False to apply an already fitted
            vectorizer, e.g. to a held-out split.

    Returns:
        tuple: (dataframe, vocab) where the dataframe has `original_index`
        first, one column per vocabulary term, and `polarity` last, and
        `vocab` is the array returned by `get_feature_names_out()`.
    """
    texts = dataset["review_text_cleaned_no_stopwords"]
    matrix = model.fit_transform(texts) if fit else model.transform(texts)
    vocab = model.get_feature_names_out()

    # Reuse the dataset index so the column-wise concat aligns row by row.
    dtm = pd.DataFrame(matrix.toarray(), columns=vocab, index=dataset.index)

    return (
        pd.concat(
            [dataset[["original_index"]], dtm, dataset[["polarity"]]],
            axis=1,
        ),
        vocab,
    )


# def text_to_bert(text)
def text_to_embedding(text, model, vectorizer=None, vocab=None, size=50):
    """Represent each text as the sum of the embeddings of its vocabulary words.

    Args:
        text: Iterable of raw strings accepted by `vectorizer.transform`.
        model: Word-vector model exposing `get_index(word, default=...)` and
            `get_vector(word)` (e.g. gensim `KeyedVectors`).
        vectorizer: Fitted vectorizer used to find which vocabulary words
            occur in each text. Required.
        vocab: Feature names aligned with the vectorizer's columns. Defaults
            to `vectorizer.get_feature_names_out()`.
        size (int): Dimensionality of the embedding vectors.

    Returns:
        dataframe: One row per input text and `size` columns, indexed
        0..n-1. A text with no known word is an all-zero row.

    Raises:
        ValueError: If `vectorizer` is None.
    """
    if vectorizer is None:
        raise ValueError("The vectorizer parameter must not be None")

    if vocab is None:
        vocab = vectorizer.get_feature_names_out()
    vocab = np.asarray(vocab)

    counts = vectorizer.transform(text).toarray()

    # Collect the rows and build the frame once instead of growing it with
    # pd.concat inside the loop (quadratic).
    sentences = []
    for row in counts:
        sentence = np.zeros(size)
        # Each vocabulary word contributes once, however often it occurs.
        for word in vocab[row > 0]:
            if model.get_index(word, default=-1) != -1:
                sentence = sentence + model.get_vector(word)
            else:
                print("Out of Vocabulary")
        sentences.append(sentence)

    return pd.DataFrame(sentences, columns=range(size))

def download_extract(model, architecture):
    """Download and unpack one pre-trained embedding archive unless it is cached.

    The archive is fetched from the embeddings server and extracted into
    `DATA_EMBEDDINGS_FOLDER/<model>/`.

    Args:
        model (str): Embedding family, used as the sub-folder name
            (e.g. "word2vec").
        architecture (str): Archive name without extension (e.g. "cbow_s50").

    Returns:
        None
    """
    url = f"http://143.107.183.175:22980/download.php?file=embeddings/{model}/{architecture}.zip"
    out_folder_path = os.path.join(DATA_EMBEDDINGS_FOLDER, model)
    out_file_path = os.path.join(out_folder_path, architecture)

    # The notebook loads "<architecture>.txt" from this folder, so that file
    # is the real cache marker. The extension-less path alone never existed
    # after extraction and forced a download on every run; it is still
    # accepted for backward compatibility.
    cached_candidates = (out_file_path, out_file_path + ".txt")
    if any(os.path.exists(candidate) for candidate in cached_candidates):
        print(f"Already downloaded: {model}_{architecture}")
        return

    print(f"Downloading: {model}_{architecture}")
    with urlopen(url) as response:
        archive_bytes = BytesIO(response.read())
    with ZipFile(archive_bytes) as in_file_zip:
        in_file_zip.extractall(out_folder_path)

# Load and prepare dataset

In [25]:
# Load the raw reviews (sampled per polarity group) and report their size.
reviews = load_dataset(URL_SOURCE, frac=0.4)
row_count, col_count = reviews.shape
print(f"The reviews dataset loaded from {URL_SOURCE}.")
print(f"The reviews dataset has {row_count} rows and {col_count} cols.")

The reviews dataset loaded from https://raw.githubusercontent.com/gomesluiz/product-review-analytics/main/data/raw/buscape.csv.
The reviews dataset has 29451 rows and 8 cols.


In [26]:
# Clean dataset and collect text statistics.

# A set makes the per-word stopword test O(1); `stopwords` itself stays a
# list because it is also handed to the vectorizers.
stopword_set = set(stopwords)

# Plain column assignment on an explicit copy replaces the
# `.loc[:, [new_column]] = ...` form, which triggered pandas warnings.
reviews = reviews.dropna(subset=["review_text"]).copy()
reviews["review_text_cleaned"] = reviews["review_text"].apply(clean_text)
reviews["review_text_cleaned_len"] = reviews["review_text_cleaned"].apply(count_word)
reviews["review_text_cleaned_no_stopwords"] = reviews["review_text_cleaned"].apply(
    lambda text: " ".join(word for word in text.split() if word not in stopword_set)
)
reviews["review_text_cleaned_len_no_stopwords"] = reviews[
    "review_text_cleaned_no_stopwords"
].apply(count_word)

# Keep only the columns used downstream.
reviews_cleaned = reviews[
    [
        "original_index",
        "review_text",
        "review_text_cleaned",
        "review_text_cleaned_len",
        "review_text_cleaned_no_stopwords",
        "review_text_cleaned_len_no_stopwords",
        "polarity",
    ]
].copy()

# Recode polarity: 0 becomes -1 and a missing value becomes 0, stored as int.
reviews_cleaned["polarity"] = (
    reviews_cleaned["polarity"].replace({0: -1, np.nan: 0}).astype(int)
)

# NOTE(review): this column holds strings produced by " ".join, so dropna
# only removes genuinely missing values, not empty strings. Confirm whether
# empty reviews should be dropped too.
reviews_cleaned = reviews_cleaned.dropna(subset=["review_text_cleaned_no_stopwords"])
reviews_cleaned.head()

  reviews.loc[:, ["review_text_cleaned_len"]] = reviews["review_text_cleaned"].apply(
  reviews.loc[:, ["review_text_cleaned_len_no_stopwords"]] = reviews[


Unnamed: 0,original_index,review_text,review_text_cleaned,review_text_cleaned_len,review_text_cleaned_no_stopwords,review_text_cleaned_len_no_stopwords,polarity
0,0_430974,Dá muito problema no encanamento e faz muito b...,dá muito problema no encanamento e faz muito b...,9,dá problema encanamento faz barulho,5,-1
1,0_43825,poow galera aki naum da para upa direito por q...,poow galera aki naum da para upa direito por q...,28,poow galera aki naum upa direito muita gente f...,14,-1
2,0_401867,Não gostei deste aparelho. Não dá pra deixar o...,não gostei deste aparelho não dá pra deixar os...,24,gostei deste aparelho dá pra deixar aplicativo...,16,-1
3,9_159935,Muito ruim a adega acd 28 pois o compartimento...,muito ruim a adega acd pois o compartimento i...,79,ruim adega acd pois compartimento inferior alt...,46,-1
4,minus_1_393969,A tv LCD é muito boa e economiza espaço dentro...,a tv lcd é muito boa e economiza espaço dentro...,23,tv lcd boa economiza espaço dentro casa gostei...,13,-1


# Vectorize dataset

In [27]:
# Hold out 20% of the cleaned reviews for testing, stratified on polarity.
polarity_labels = reviews_cleaned["polarity"]
reviews_train_dataset, reviews_test_dataset = train_test_split(
    reviews_cleaned,
    test_size=0.20,
    random_state=rng,
    stratify=polarity_labels,
)

In [28]:
# Count ("cv") and TF-IDF ("tv") vectorizers, both limited to
# NUMBER_OF_WORDS features.
vectorizers = {
    "cv": CountVectorizer(stop_words=stopwords, max_features=NUMBER_OF_WORDS),
    "tv": TfidfVectorizer(stop_words=stopwords, max_features=NUMBER_OF_WORDS),
}

os.makedirs(DATA_PROCESSED_FOLDER, exist_ok=True)

for name, model in vectorizers.items():
    # Fit on the training split only.
    reviews_train_vectorized, vocab = text_to_vector(model, reviews_train_dataset)
    reviews_train_vectorized.to_pickle(os.path.join(DATA_PROCESSED_FOLDER, f"buscape_reviews_train_dataset_{name}_s{NUMBER_OF_WORDS}.pkl"))

    # Transform the test split with the vectorizer fitted on train. Fitting
    # again here would give the test matrix a different vocabulary and would
    # leave `vectorizers[name]` fitted on test data for the later cells.
    test_matrix = model.transform(
        reviews_test_dataset["review_text_cleaned_no_stopwords"]
    )
    test_dtm = pd.DataFrame(
        test_matrix.toarray(), columns=vocab, index=reviews_test_dataset.index
    )
    reviews_test_vectorized = pd.concat(
        [
            reviews_test_dataset[["original_index"]],
            test_dtm,
            reviews_test_dataset[["polarity"]],
        ],
        axis=1,
    )
    reviews_test_vectorized.to_pickle(os.path.join(DATA_PROCESSED_FOLDER, f"buscape_reviews_test_dataset_{name}_s{NUMBER_OF_WORDS}.pkl"))

    print(
        f"The {name} vectorizer train matrix has {reviews_train_vectorized.shape[0]} rows and {reviews_train_vectorized.shape[1]} columns"
    )
    print(
        f"The {name} vectorizer test matrix has {reviews_test_vectorized.shape[0]} rows and {reviews_test_vectorized.shape[1]} columns"
    )

The cv vectorizer train matrix has 23560 rows and 52 columns
The cv vectorizer test matrix has 5891 rows and 52 columns
The tv vectorizer train matrix has 23560 rows and 52 columns
The tv vectorizer test matrix has 5891 rows and 52 columns


In [29]:
# Fetch every pre-trained embedding archive listed in EMBEDDING_NAMES.
for embedding_spec in EMBEDDING_NAMES:
    model, architecture = embedding_spec
    download_extract(model, architecture)


Downloading: word2vec_cbow_s50
Downloading: word2vec_skip_s50
Downloading: fasttext_cbow_s50
Downloading: fasttext_skip_s50
Downloading: glove_glove_s50
Downloading: wang2vec_cbow_s50
Downloading: wang2vec_skip_s50


In [None]:
# Load the pre-trained fastText embeddings (CBOW and skip-gram files).
logger.info("Load fast text embeddings.")
DATA_EMBEDDING_FOLDER = os.path.join(DATA_EMBEDDINGS_FOLDER, "fasttext")

fasttext_cbow_path = os.path.join(DATA_EMBEDDING_FOLDER, "cbow_s50.txt")
fasttext_cbow_s50 = KeyedVectors.load_word2vec_format(fasttext_cbow_path)

fasttext_skip_path = os.path.join(DATA_EMBEDDING_FOLDER, "skip_s50.txt")
fasttext_skip_s50 = KeyedVectors.load_word2vec_format(fasttext_skip_path)

In [None]:
# Load the pre-trained GloVe embeddings.
logger.info("Load glove embeddings.")
DATA_EMBEDDING_FOLDER = os.path.join(DATA_EMBEDDINGS_FOLDER, "glove")

glove_path = os.path.join(DATA_EMBEDDING_FOLDER, "glove_s50.txt")
glove_s50 = KeyedVectors.load_word2vec_format(glove_path)

In [None]:
# Load the pre-trained wang2vec embeddings (CBOW and skip-gram files).
logger.info("Load wang2vec embeddings.")
DATA_EMBEDDING_FOLDER = os.path.join(DATA_EMBEDDINGS_FOLDER, "wang2vec")

wang2vec_cbow_path = os.path.join(DATA_EMBEDDING_FOLDER, "cbow_s50.txt")
wang2vec_cbow_s50 = KeyedVectors.load_word2vec_format(wang2vec_cbow_path)

wang2vec_skip_path = os.path.join(DATA_EMBEDDING_FOLDER, "skip_s50.txt")
wang2vec_skip_s50 = KeyedVectors.load_word2vec_format(wang2vec_skip_path)

In [None]:
# Load the pre-trained word2vec embeddings (CBOW and skip-gram files).
logger.info("Load word2vec embeddings.")
DATA_EMBEDDING_FOLDER = os.path.join(DATA_EMBEDDINGS_FOLDER, "word2vec")

word2vec_cbow_path = os.path.join(DATA_EMBEDDING_FOLDER, "cbow_s50.txt")
word2vec_cbow_s50 = KeyedVectors.load_word2vec_format(word2vec_cbow_path)

word2vec_skip_path = os.path.join(DATA_EMBEDDING_FOLDER, "skip_s50.txt")
word2vec_skip_s50 = KeyedVectors.load_word2vec_format(word2vec_skip_path)

In [None]:
# Pair every loaded embedding with its (family, architecture) name explicitly.
# Zipping EMBEDDING_NAMES against a list written in a different order saved
# each pickle under another embedding's name.
embedding_models_by_name = {
    ("fasttext", "cbow_s50"): fasttext_cbow_s50,
    ("fasttext", "skip_s50"): fasttext_skip_s50,
    ("glove", "glove_s50"): glove_s50,
    ("wang2vec", "cbow_s50"): wang2vec_cbow_s50,
    ("wang2vec", "skip_s50"): wang2vec_skip_s50,
    ("word2vec", "cbow_s50"): word2vec_cbow_s50,
    ("word2vec", "skip_s50"): word2vec_skip_s50,
}
# Same order as EMBEDDING_NAMES, so the zip below is aligned by construction.
embedding_models = [embedding_models_by_name[tuple(name)] for name in EMBEDDING_NAMES]

# Take the vocabulary from the vectorizer that is actually used, so the
# feature names always line up with its columns.
tfidf_vocab = vectorizers["tv"].get_feature_names_out()


def _embed_split(dataset, model):
    """Embed one split and return (embedding frame, frame with id and label).

    The second frame has positional column labels (ignore_index=True):
    original_index first, the embedding columns, polarity last.
    """
    dtm = text_to_embedding(
        dataset["review_text"], model, vectorizers["tv"], tfidf_vocab, 50
    )
    indexed = dataset.reset_index()
    processed = pd.concat(
        [
            indexed[["original_index"]],
            dtm.reset_index(drop=True),
            indexed[["polarity"]],
        ],
        axis=1,
        ignore_index=True,
    )
    return dtm, processed


for name, model in zip(EMBEDDING_NAMES, embedding_models):
    reviews_train_dtm, reviews_train_processed = _embed_split(
        reviews_train_dataset, model
    )
    reviews_train_processed.to_pickle(
        os.path.join(DATA_PROCESSED_FOLDER, f"buscape_reviews_train_dataset_{name[0]}_{name[1]}.pkl"))
    logger.info(
        f"The {name} vectorized train dataframe has {reviews_train_processed.shape[0]} rows and {reviews_train_processed.shape[1]} columns")

    reviews_test_dtm, reviews_test_processed = _embed_split(
        reviews_test_dataset, model
    )
    reviews_test_processed.to_pickle(
        os.path.join(DATA_PROCESSED_FOLDER, f"buscape_reviews_test_dataset_{name[0]}_{name[1]}.pkl"))
    logger.info(
        f"The {name} vectorized test dataframe has {reviews_test_processed.shape[0]} rows and {reviews_test_processed.shape[1]} columns")


In [None]:
import torch

from transformers import AutoTokenizer
from transformers import AutoModel

# Load the pre-trained Portuguese BERT encoder and its tokenizer.
model = AutoModel.from_pretrained("neuralmind/bert-base-portuguese-cased")
logging.info(f"Transformers model class model: {type(model)}")
# The checkpoint is cased, so the tokenizer must not lowercase its input.
tokenizer = AutoTokenizer.from_pretrained(
    "neuralmind/bert-base-portuguese-cased", do_lower_case=False
)
logging.info(f"Transformers tokenizer class: {type(tokenizer)}")


# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Tokenize texts into fixed-length id and mask tensors for pretrained BERT.

    Uses the module-level `tokenizer`.

    @param    data: Iterable of texts to be processed.
    @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    # One `encode_plus` call per sentence: adds the special tokens,
    # truncates/pads to 64 positions and also returns the attention mask.
    encodings = [
        tokenizer.encode_plus(
            text=sentence,
            add_special_tokens=True,
            max_length=64,
            padding='max_length',
            truncation='only_first',
            return_attention_mask=True,
        )
        for sentence in data
    ]

    # Stack the per-sentence lists into tensors.
    input_ids = torch.tensor([encoding.get("input_ids") for encoding in encodings])
    attention_masks = torch.tensor(
        [encoding.get("attention_mask") for encoding in encodings]
    )

    return input_ids, attention_masks


# Sequences per forward pass. Pushing a whole split through the model in one
# call needs far more memory than a typical notebook runtime has.
BERT_BATCH_SIZE = 32


def _bert_first_token_embeddings(input_ids, attention_masks, batch_size=BERT_BATCH_SIZE):
    """Run the module-level `model` in mini-batches, without gradients.

    Returns a tensor with one row per sequence, holding position 0 of the
    first model output (the special token added at the start of each text).
    """
    chunks = []
    with torch.no_grad():
        for start in range(0, input_ids.shape[0], batch_size):
            stop = start + batch_size
            outs = model(input_ids[start:stop], attention_masks[start:stop])
            chunks.append(outs[0][:, 0, :])
    return torch.cat(chunks)


review_train_inputs, review_train_masks = preprocessing_for_bert(
    list(reviews_train_dataset["review_text"])
)
review_train_bert_encoded = _bert_first_token_embeddings(
    review_train_inputs, review_train_masks
)

review_test_inputs, review_test_masks = preprocessing_for_bert(
    list(reviews_test_dataset["review_text"])
)
review_test_bert_encoded = _bert_first_token_embeddings(
    review_test_inputs, review_test_masks
)

In [None]:
def _with_bert_features(dataset, encoded):
    """Return original_index, the BERT feature columns and polarity, side by side.

    `pd.concat` only accepts pandas objects, so the tensor is first wrapped
    in a dataframe that reuses the dataset index to keep rows aligned.
    """
    features = pd.DataFrame(encoded.detach().cpu().numpy(), index=dataset.index)
    return pd.concat(
        [dataset[["original_index"]], features, dataset[["polarity"]]],
        axis=1,
    )


reviews_train_processed_bert = _with_bert_features(
    reviews_train_dataset, review_train_bert_encoded
)
reviews_train_processed_bert.to_pickle(os.path.join(DATA_PROCESSED_FOLDER, "buscape_reviews_train_dataset_bert.pkl"))

reviews_test_processed_bert = _with_bert_features(
    reviews_test_dataset, review_test_bert_encoded
)
# The test split gets its own file; it used to be written to the train
# filename, overwriting the train pickle.
reviews_test_processed_bert.to_pickle(os.path.join(DATA_PROCESSED_FOLDER, "buscape_reviews_test_dataset_bert.pkl"))