In [28]:
# Required packages.
#
# This cell must run inside the notebook kernel itself. A `%%python` cell magic
# would execute it in a separate Python subprocess, so none of the imports or
# names defined below would be visible to the following cells.
import os
import re
import numpy as np
import logging
import string


import nltk
import matplotlib.pyplot as plt
import pandas as pd

from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Portuguese stopword list used by the text-cleaning and vectorizing cells.
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("portuguese")

# Single seed shared by every stochastic step of the notebook.
RANDOM_SEED = 19730115
rng = np.random.RandomState(RANDOM_SEED)

# Timestamped INFO-level logging for progress messages.
logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
logger.info("Required packages installed.")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gomesluiz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2022-12-03 12:37:32,452 - Required packages installed.


In [29]:
# Script constants.

# The data folder lives in the parent of the current working directory.
DATA_ROOT_FOLDER = os.path.join(os.path.dirname(os.getcwd()), "data")
DATA_PROCESSED_FOLDER, DATA_EMBEDDINGS_FOLDER = (
    os.path.join(DATA_ROOT_FOLDER, subfolder)
    for subfolder in ("processed", "embeddings")
)

# Raw reviews file read by `load_dataset`.
URL_SOURCE = (
    "https://raw.githubusercontent.com/gomesluiz/product-review-analytics"
    "/main/data/raw/buscape.csv"
)

# Vocabulary size passed as `max_features` to the count / tf-idf vectorizers.
NUMBER_OF_WORDS = 50

# Seed and generator for the stochastic steps.
RANDOM_SEED = 19730115
rng = np.random.RandomState(RANDOM_SEED)


In [30]:
# Script functions.
def load_dataset(source) -> pd.DataFrame:
    """Read a CSV dataset into a dataframe.

    Args:
        source (str): location of the CSV file; it is handed unchanged to
            ``pandas.read_csv``, so anything that function accepts (local
            path, URL, file-like object) works.

    Returns:
        pandas.DataFrame: the parsed file content.
    """

    return pd.read_csv(source)


def load_stratify_dataset(path, stratify=False, frac=0.4, random_state=None):
    """Read a CSV file, optionally keeping a per-polarity random sample.

    Args:
        path (str): the complete path of the CSV file.
        stratify (bool): when True, keep only a random fraction of the rows of
            every ``polarity`` group, so the class proportions are preserved.
        frac (float): fraction of each ``polarity`` group to keep when
            ``stratify`` is True.
        random_state (int | numpy.random.RandomState | None): seed or
            generator for the sampling; pass a fixed value to get the same
            sample on every call.

    Returns:
        dataframe: A pandas dataframe. When ``stratify`` is True its index is
        renumbered from 0.
    """
    dataset = pd.read_csv(path)

    if stratify:
        # GroupBy.sample draws from each group while keeping every column,
        # including the grouping column itself.
        dataset = (
            dataset.groupby("polarity")
            .sample(frac=frac, random_state=random_state)
            .reset_index(drop=True)
        )

    return dataset


def load_pickle_dataset(path):
    """Load a pickled dataframe and split it into features and target.

    The first column is skipped, the last column is the target and every
    column in between is a feature.

    Args:
        path (str): full path of the pickle file.

    Returns:
        tuple: ``(features, target)`` as numpy arrays.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load files
    # this project produced itself.
    frame = pd.read_pickle(path)

    target = frame.iloc[:, -1].values
    features = frame.iloc[:, 1:-1].values
    return features, target


def word_counter(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)


def clean_text(text):
    """Make text lowercase, remove text in square brackets, remove punctuation
        (ASCII and typographic) and remove words containing numbers.

    Args:
        text(str): string text to be cleaned.

    Returns:
        A cleaned text. Removed pieces are deleted, not replaced, so the
        spaces that surrounded them are kept; line breaks become spaces.

    """
    text = text.lower()
    # Raw strings keep the regex escapes from being read as (invalid) string
    # escape sequences.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    # string.punctuation is ASCII only, so curly quotes and the one-character
    # ellipsis have to be removed separately.
    text = re.sub("[\u2018\u2019\u201c\u201d\u2026]", "", text)
    text = re.sub("\n", " ", text)

    return text


def text_to_vector(model, dataset, fit=True):
    """Turn the cleaned review text into a document-term dataframe.

    Args:
        model: vectorizer exposing ``fit_transform``, ``transform`` and
            ``get_feature_names_out`` (the notebook passes CountVectorizer
            and TfidfVectorizer instances).
        dataset (dataframe): must contain the columns ``original_index``,
            ``review_text_cleaned_no_stopwords`` and ``polarity``.
        fit (bool): when True (default) the vocabulary is learned from
            ``dataset``. Pass False to encode held-out data with a model that
            was already fitted, so its columns match the training matrix.

    Returns:
        tuple: ``(frame, vocab)`` where ``frame`` holds ``original_index``,
        one column per vocabulary term and ``polarity`` (in that order, on
        the index of ``dataset``), and ``vocab`` is the array of terms.
    """
    text = dataset["review_text_cleaned_no_stopwords"]
    matrix = model.fit_transform(text) if fit else model.transform(text)

    vocab = model.get_feature_names_out()
    # Built on the dataset index so the column-wise concat aligns row by row.
    dtm = pd.DataFrame(matrix.toarray(), columns=vocab, index=dataset.index)
    return (
        pd.concat(
            [dataset[["original_index"]], dtm, dataset[["polarity"]]],
            axis=1,
        ),
        vocab,
    )


# def text_to_bert(text)
def text_to_embedding(text, model, vectorizer=None, vocab=None, size=50):
    """Represent each text as the sum of the embeddings of its vocabulary words.

    Args:
        text: iterable of strings accepted by ``vectorizer.transform``.
        model: embedding lookup exposing ``get_index(word, default=...)`` and
            ``get_vector(word)``.
        vectorizer: fitted vectorizer; the words with a value above zero in a
            row of its output are the ones that get summed. Required.
        vocab: words aligned with the vectorizer's output columns. Defaults
            to ``vectorizer.get_feature_names_out()``.
        size (int): length of the embedding vectors.

    Returns:
        dataframe: one row per text and ``size`` columns, indexed 0..n-1.
        Words missing from ``model`` add nothing; a text with no known word
        stays a zero vector.

    Raises:
        ValueError: if ``vectorizer`` is None.
    """
    if vectorizer is None:
        raise ValueError("The vectorizer parameter must not be None")

    if vocab is None:
        vocab = vectorizer.get_feature_names_out()
    vocab = np.asarray(vocab)

    weights = vectorizer.transform(text).toarray()

    # Filled in place: growing a dataframe with concat inside the loop copies
    # every previous row on each iteration.
    sentences = np.zeros((weights.shape[0], size))
    for row, word_weights in enumerate(weights):
        for word in vocab[word_weights > 0]:
            if model.get_index(word, default=-1) != -1:
                sentences[row] += model.get_vector(word)
            else:
                print("Out of Vocabulary")

    return pd.DataFrame(sentences)


In [31]:
# Read the raw reviews CSV from URL_SOURCE into a dataframe.
reviews = load_dataset(source=URL_SOURCE)
logger.info(f"Dataset loaded from {URL_SOURCE}.")


2022-12-03 12:38:11,483 - Dataset loaded from https://raw.githubusercontent.com/gomesluiz/product-review-analytics/main/data/raw/buscape.csv.


In [None]:
# Clean the review text and derive the stopword-free text and word counts.

# A set makes the per-word membership test constant time.
STOPWORD_SET = set(stopwords)


def remove_stopwords(text):
    """Return ``text`` without the tokens listed in ``STOPWORD_SET``."""
    return " ".join(word for word in text.split() if word not in STOPWORD_SET)


# Rows without review text are dropped first. `assign` evaluates its arguments
# in order, so each derived column can read the ones defined before it, and it
# returns a new frame instead of enlarging `reviews` in place.
reviews = reviews.dropna(subset=["review_text"]).assign(
    review_text_cleaned=lambda frame: frame["review_text"].apply(clean_text),
    review_text_cleaned_len=lambda frame: frame["review_text_cleaned"].apply(
        word_counter
    ),
    review_text_cleaned_no_stopwords=lambda frame: frame[
        "review_text_cleaned"
    ].apply(remove_stopwords),
    review_text_cleaned_len_no_stopwords=lambda frame: frame[
        "review_text_cleaned_no_stopwords"
    ].apply(word_counter),
)


In [None]:
# Keep only the columns used downstream, recode polarity (0 becomes -1, a
# missing value becomes 0) and store it as int.
# NOTE(review): `review_text_cleaned_no_stopwords` is built with str.join, so
# an emptied review is presumably "" rather than NaN; confirm the dropna below
# filters what it is meant to.
reviews_cleaned = (
    reviews.loc[
        :,
        [
            "original_index",
            "review_text",
            "review_text_cleaned",
            "review_text_cleaned_len",
            "review_text_cleaned_no_stopwords",
            "review_text_cleaned_len_no_stopwords",
            "polarity",
        ],
    ]
    .assign(
        polarity=lambda frame: frame["polarity"]
        .replace({0: -1, np.nan: 0})
        .astype(int)
    )
    .dropna(subset=["review_text_cleaned_no_stopwords"])
)
reviews_cleaned.head()


In [None]:
# Stratified 80/20 split on polarity.
# The integer seed is passed instead of the shared `rng` instance: a generator
# instance advances every time it is used, so re-running this cell (or using
# `rng` anywhere before it) would silently produce a different split.
reviews_train_dataset, reviews_test_dataset = train_test_split(
    reviews_cleaned,
    stratify=reviews_cleaned["polarity"],
    test_size=0.20,
    random_state=RANDOM_SEED,
)

In [None]:
# Count and tf-idf vectorizers, both limited to the NUMBER_OF_WORDS most
# frequent terms.
vectorizers = {
    "cv": CountVectorizer(stop_words=stopwords, max_features=NUMBER_OF_WORDS),
    "tv": TfidfVectorizer(stop_words=stopwords, max_features=NUMBER_OF_WORDS),
}

os.makedirs(DATA_PROCESSED_FOLDER, exist_ok=True)

for name, model in vectorizers.items():
    # The vocabulary is learned from the training split only.
    reviews_train_vectorized, vocab = text_to_vector(model, reviews_train_dataset)
    reviews_train_vectorized.to_pickle(
        os.path.join(
            DATA_PROCESSED_FOLDER,
            f"buscape_reviews_train_dataset_{name}_s{NUMBER_OF_WORDS}.pkl",
        )
    )

    # The test split is only transformed with the train-fitted model.
    # Re-fitting here would leak test data into the features, give the test
    # matrix columns that mean different words than the train matrix, and
    # leave the vectorizer fitted on the test split for later cells.
    test_matrix = model.transform(
        reviews_test_dataset["review_text_cleaned_no_stopwords"]
    )
    reviews_test_vectorized = pd.concat(
        [
            reviews_test_dataset[["original_index"]],
            pd.DataFrame(
                test_matrix.toarray(),
                columns=vocab,
                index=reviews_test_dataset.index,
            ),
            reviews_test_dataset[["polarity"]],
        ],
        axis=1,
    )
    reviews_test_vectorized.to_pickle(
        os.path.join(
            DATA_PROCESSED_FOLDER,
            f"buscape_reviews_test_dataset_{name}_s{NUMBER_OF_WORDS}.pkl",
        )
    )

    logger.info(
        f"The {name} vectorizer train matrix has {reviews_train_vectorized.shape[0]} rows and {reviews_train_vectorized.shape[1]} columns"
    )
    logger.info(
        f"The {name} vectorizer test matrix has {reviews_test_vectorized.shape[0]} rows and {reviews_test_vectorized.shape[1]} columns"
    )

In [None]:
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile


# [embedding family, architecture file name] pairs to download, grouped by
# family. Each entry stays a two-item list.
EMBEDDING_NAMES = [
    [family, architecture]
    for family, architectures in (
        ("word2vec", ("cbow_s50", "skip_s50")),
        ("fasttext", ("cbow_s50", "skip_s50")),
        ("glove", ("glove_s50",)),
        ("wang2vec", ("cbow_s50", "skip_s50")),
    )
    for architecture in architectures
]


def download_extract(model, architecture):
    """Download one pre-trained embedding archive and unzip it.

    The archive is extracted into ``DATA_EMBEDDINGS_FOLDER/<model>``. Nothing
    is downloaded when the embedding file is already there.

    Args:
        model (str): embedding family; used in the URL and as the name of the
            output sub-folder.
        architecture (str): archive name without the ``.zip`` extension.

    Returns:
        None
    """
    url = f"http://143.107.183.175:22980/download.php?file=embeddings/{model}/{architecture}.zip"
    out_folder_path = os.path.join(DATA_EMBEDDINGS_FOLDER, model)
    # The loading cells read `<architecture>.txt` from this folder, so that is
    # the file whose presence marks a finished download.
    # NOTE(review): assumes every archive unpacks to exactly that file name.
    out_file_path = os.path.join(out_folder_path, f"{architecture}.txt")
    if os.path.exists(out_file_path):
        logger.info(f"Already downloaded: {model}_{architecture}")
        return

    logger.info(f"Downloading: {model}_{architecture}")
    with urlopen(url) as response:
        # The whole archive is held in memory because ZipFile needs a
        # seekable file object.
        with ZipFile(BytesIO(response.read())) as in_file_zip:
            in_file_zip.extractall(out_folder_path)


# Fetch every embedding listed in EMBEDDING_NAMES.
for model, architecture in EMBEDDING_NAMES:
    download_extract(model=model, architecture=architecture)


In [None]:
# Load the pre-trained fastText embeddings (cbow_s50 and skip_s50 files).
logger.info("Load fast text embeddings.")
DATA_EMBEDDING_FOLDER = os.path.join(DATA_EMBEDDINGS_FOLDER, "fasttext")
fasttext_cbow_s50, fasttext_skip_s50 = (
    KeyedVectors.load_word2vec_format(os.path.join(DATA_EMBEDDING_FOLDER, file_name))
    for file_name in ("cbow_s50.txt", "skip_s50.txt")
)

In [None]:
# Load the pre-trained GloVe embeddings (glove_s50 file).
logger.info("Load glove embeddings.")
DATA_EMBEDDING_FOLDER = os.path.join(DATA_EMBEDDINGS_FOLDER, "glove")
glove_s50 = KeyedVectors.load_word2vec_format(
    os.path.join(DATA_EMBEDDING_FOLDER, "glove_s50.txt")
)

In [None]:
# Load the pre-trained wang2vec embeddings (cbow_s50 and skip_s50 files).
logger.info("Load wang2vec embeddings.")
DATA_EMBEDDING_FOLDER = os.path.join(DATA_EMBEDDINGS_FOLDER, "wang2vec")
wang2vec_cbow_s50, wang2vec_skip_s50 = (
    KeyedVectors.load_word2vec_format(os.path.join(DATA_EMBEDDING_FOLDER, file_name))
    for file_name in ("cbow_s50.txt", "skip_s50.txt")
)

In [None]:
# Load the pre-trained word2vec embeddings (cbow_s50 and skip_s50 files).
logger.info("Load word2vec embeddings.")
DATA_EMBEDDING_FOLDER = os.path.join(DATA_EMBEDDINGS_FOLDER, "word2vec")
word2vec_cbow_s50, word2vec_skip_s50 = (
    KeyedVectors.load_word2vec_format(os.path.join(DATA_EMBEDDING_FOLDER, file_name))
    for file_name in ("cbow_s50.txt", "skip_s50.txt")
)

In [None]:
# Each loaded embedding is keyed by its (family, architecture) label, and the
# model list is then built in EMBEDDING_NAMES order. Zipping EMBEDDING_NAMES
# with a list written in any other order pairs models with the wrong names and
# writes mislabeled pickles.
embeddings_by_name = {
    ("word2vec", "cbow_s50"): word2vec_cbow_s50,
    ("word2vec", "skip_s50"): word2vec_skip_s50,
    ("fasttext", "cbow_s50"): fasttext_cbow_s50,
    ("fasttext", "skip_s50"): fasttext_skip_s50,
    ("glove", "glove_s50"): glove_s50,
    ("wang2vec", "cbow_s50"): wang2vec_cbow_s50,
    ("wang2vec", "skip_s50"): wang2vec_skip_s50,
}
embedding_models = [embeddings_by_name[tuple(name)] for name in EMBEDDING_NAMES]

# The word list is read from the vectorizer itself, so it always lines up with
# the columns that vectorizer produces.
tfidf_vectorizer = vectorizers["tv"]
tfidf_vocab = tfidf_vectorizer.get_feature_names_out()


def embed_and_save(split, dataset, name, model):
    """Embed the reviews of one split, pickle the result and log its shape.

    Args:
        split (str): split label used in the file name and log ("train"/"test").
        dataset (dataframe): split with ``original_index``, ``review_text``
            and ``polarity`` columns.
        name (list): ``[family, architecture]`` label of the embedding.
        model: embedding passed to ``text_to_embedding``.

    Returns:
        dataframe: ``original_index``, the embedding values and ``polarity``,
        in that order, with columns renumbered from 0.
    """
    embedded = text_to_embedding(
        dataset["review_text"], model, tfidf_vectorizer, tfidf_vocab, model.vector_size
    )
    keys = dataset.reset_index()
    processed = pd.concat(
        [
            keys[["original_index"]],
            embedded.reset_index(drop=True),
            keys[["polarity"]],
        ],
        axis=1,
        ignore_index=True,
    )
    processed.to_pickle(
        os.path.join(
            DATA_PROCESSED_FOLDER,
            f"buscape_reviews_{split}_dataset_{name[0]}_{name[1]}.pkl",
        )
    )
    logger.info(
        f"The {name} vectorized {split} dataframe has {processed.shape[0]} rows and {processed.shape[1]} columns"
    )
    return processed


for name, model in zip(EMBEDDING_NAMES, embedding_models):
    reviews_train_processed = embed_and_save(
        "train", reviews_train_dataset, name, model
    )
    reviews_test_processed = embed_and_save("test", reviews_test_dataset, name, model)
