In [19]:
# Required packages.
import os
import re
import numpy as np
import logging
import string


import nltk
import matplotlib.pyplot as plt
import pandas as pd

from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split 

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

%matplotlib inline
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
logging.info("Required packages installed.")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gomesluiz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2022-12-02 22:10:35,274 - Required packages installed.


In [21]:
# Script constants.
DATA_ROOT_FOLDER = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__name__))), "data")
DATA_PROCESSED_FOLDER = os.path.join(DATA_ROOT_FOLDER, "processed")
URL_SOURCE = "https://raw.githubusercontent.com/gomesluiz/product-review-analytics/main/data/raw/buscape.csv"
RANDOM_SEED = 19730115
rng = np.random.RandomState(RANDOM_SEED)

In [22]:
# Scripts Functions.
def load_dataset(source) -> None:
    """Download data from a url.
    
    Args:
        source (str): source data file
        
    Returns:
        None
    """
       
    return pd.read_csv(source)

def word_counter(text):
    """ Word counter.
    """
    return len(text.split())

def clean_text(text):
    """ Make text lowercase, remove text in square brackets, remove punctuation and 
        remove words containing numbers.
    
    Args:
        text(str): string text to be cleaned.

    Returns:
        A cleaned text

    """
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[``""...]', '', text)
    text = re.sub('\n', ' ', text)

    return text

In [23]:
reviews = load_dataset(URL_SOURCE)
logging.info(f"Dataset loaded from {URL_SOURCE}.")

2022-12-02 22:10:56,306 - Dataset loaded from https://raw.githubusercontent.com/gomesluiz/product-review-analytics/main/data/raw/buscape.csv.


In [24]:
reviews.dropna(subset=['review_text'], inplace=True)
reviews.loc[:, ['review_text_cleaned']] = reviews['review_text'].apply(lambda x: clean_text(x))
reviews.loc[:, ['review_text_cleaned_len']] = reviews['review_text_cleaned'].apply(word_counter)
reviews.loc[:, ['review_text_cleaned_no_stopwords']] = reviews['review_text_cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))
reviews.loc[:, ['review_text_cleaned_len_no_stopwords']] = reviews['review_text_cleaned_no_stopwords'].apply(word_counter)

  reviews.loc[:, ['review_text_cleaned_len']] = reviews['review_text_cleaned'].apply(word_counter)
  reviews.loc[:, ['review_text_cleaned_len_no_stopwords']] = reviews['review_text_cleaned_no_stopwords'].apply(word_counter)


In [25]:

if not os.path.exists(DATA_PROCESSED_FOLDER):
    os.makedirs(DATA_PROCESSED_FOLDER)

reviews[['original_index', 'review_text', 'review_text_cleaned', 'review_text_cleaned_len',
         'review_text_cleaned_no_stopwords', 'review_text_cleaned_len_no_stopwords', 'polarity']].to_csv(f"{DATA_PROCESSED_FOLDER}/buscape_reviews_full_dataset.csv", index=False)

In [26]:
reviews = load_dataset(os.path.join(DATA_PROCESSED_FOLDER,"buscape_reviews_full_dataset.csv"))

# Replace the original polarity to -1 from 0, nan to 0.  
reviews_cleaned = reviews.copy() 
reviews_cleaned['polarity'] = reviews_cleaned['polarity'].replace({0:-1, np.nan: 0})
reviews_cleaned['polarity'] = reviews_cleaned['polarity'].astype(int)
# 
reviews_cleaned.dropna(subset=['review_text_cleaned_no_stopwords'], inplace=True)
reviews_cleaned.head()

Unnamed: 0,original_index,review_text,review_text_cleaned,review_text_cleaned_len,review_text_cleaned_no_stopwords,review_text_cleaned_len_no_stopwords,polarity
0,4_55516,"Estou muito satisfeito, o visor é melhor do qu...",estou muito satisfeito o visor é melhor do que...,45,satisfeito visor melhor imaginava boas imagens...,25,1
1,minus_1_105339,"""muito boa\n\nO que gostei: preco\n\nO que não...",muito boa o que gostei preco o que não goste...,12,boa gostei preco gostei poderia,5,1
2,23_382139,"Rápida, ótima qualidade de impressão e fácil d...",rápida ótima qualidade de impressão e fácil de...,37,rápida ótima qualidade impressão fácil usar pr...,22,1
3,2_446456,Produto de ótima qualidade em todos os quesito!,produto de ótima qualidade em todos os quesito,8,produto ótima qualidade todos quesito,5,1
4,0_11324,Precisava comprar uma tv compatível com meu dv...,precisava comprar uma tv compatível com meu dv...,38,precisava comprar tv compatível dvd esra melho...,17,1


In [27]:
reviews_cleaned_train, reviews_cleaned_test = train_test_split(reviews_cleaned, stratify=reviews_cleaned['polarity'], test_size=.20, random_state=rng)

In [28]:
reviews_cleaned_train.to_csv(os.path.join(DATA_PROCESSED_FOLDER, "buscape_reviews_train_dataset.csv"), index=False)
reviews_cleaned_test.to_csv(os.path.join(DATA_PROCESSED_FOLDER,"buscape_reviews_test_dataset.csv"), index=False)