# Feature Engineering

In [2]:
import logging
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

RANDOM_SEED = 19730115
NUMBER_OF_WORDS = 50
rng = np.random.RandomState(RANDOM_SEED)

#logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
#1591632
logging.info("Required packages installed.")


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def load_dataset(path, stratify=False):
    """Get the data from csv file

    Args:
        path(str): the file complete path. 

    Returns:
        dataframe: A pandas dataframe.
    """
    dataset = pd.read_csv(path)

    if stratify:
        dataset = dataset.groupby('polarity', group_keys=False).apply(
            lambda x: x.sample(frac=0.4))
        dataset.reset_index(drop=True, inplace=True)

    return dataset


In [4]:
# Load the reviews datasets.
reviews_train_dataset = load_dataset(
    "../data/processed/buscape_reviews_train_dataset.csv", True)
reviews_test_dataset = load_dataset(
    "../data/processed/buscape_reviews_test_dataset.csv", True)


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/buscape_reviews_train_dataset.csv'

In [None]:
reviews_train_dataset.head()


In [None]:
plt.figure(figsize=(20, 10))
plt.title('Polarity Distribution in Train')
reviews_train_dataset['polarity'].value_counts().plot(kind='bar')


In [None]:
reviews_test_dataset.info()


In [None]:
plt.figure(figsize=(20, 10))
plt.title('Polarity Distribution in Test')
reviews_test_dataset['polarity'].value_counts().plot(kind='bar')


### Counter Vectorizer

In [None]:
cv = CountVectorizer(stop_words=stopwords, max_features=NUMBER_OF_WORDS)
reviews_train_cv = cv.fit_transform(
    reviews_train_dataset['review_text_cleaned_no_stopwords'])
reviews_train_dtm_cv = pd.DataFrame(
    reviews_train_cv.toarray(), columns=cv.get_feature_names_out())
reviews_train_dtm_cv.index = reviews_train_dataset.index
reviews_train_processed_cv = pd.concat([reviews_train_dataset[[
                                       'original_index']], reviews_train_dtm_cv, reviews_train_dataset[['polarity']]], axis=1)
print(
    f"The counter vectorizer train matrix has {reviews_train_processed_cv.shape[0]} rows and {reviews_train_processed_cv.shape[1]} columns")

reviews_test_cv = cv.transform(
    reviews_test_dataset['review_text_cleaned_no_stopwords'])
reviews_test_dtm_cv = pd.DataFrame(
    reviews_test_cv.toarray(), columns=cv.get_feature_names_out())
reviews_test_dtm_cv.index = reviews_test_dataset.index
reviews_test_processed_cv = pd.concat([reviews_test_dataset[[
                                      'original_index']], reviews_test_dtm_cv, reviews_test_dataset[['polarity']]], axis=1)
print(
    f"The counter vectorizer test matrix has {reviews_test_processed_cv.shape[0]} rows and {reviews_test_processed_cv.shape[1]} columns")


In [None]:
reviews_train_processed_cv.head(5)


In [None]:
reviews_test_processed_cv.head(5)


In [None]:
reviews_train_processed_cv.to_pickle(
    f'../data/processed/buscape_reviews_train_dataset_cv_s{NUMBER_OF_WORDS}.pkl')
reviews_test_processed_cv.to_pickle(
    f'../data/processed/buscape_reviews_test_dataset_cv_s{NUMBER_OF_WORDS}.pkl')


### TF-IDF Vectorizer

In [None]:

tv = TfidfVectorizer(stop_words=stopwords, max_features=50)
reviews_train_tv = tv.fit_transform(reviews_train_dataset['review_text'])
reviews_train_dtm_tv = pd.DataFrame(
    reviews_train_tv.toarray(), columns=tv.get_feature_names_out())
reviews_train_dtm_tv.index = reviews_train_dataset.index
reviews_train_processed_tv = pd.concat([reviews_train_dataset[[
                                       'original_index']], reviews_train_dtm_tv, reviews_train_dataset[['polarity']]], axis=1)
print(
    f"The tf-idf vectorizer train matrix has {reviews_train_processed_tv.shape[0]} rows and {reviews_train_processed_tv.shape[1]} columns")

reviews_test_tv = tv.transform(reviews_test_dataset['review_text'])
reviews_test_dtm_tv = pd.DataFrame(
    reviews_test_tv.toarray(), columns=tv.get_feature_names_out())
reviews_test_dtm_tv.index = reviews_test_dataset.index
reviews_test_processed_tv = pd.concat([reviews_test_dataset[[
                                      'original_index']], reviews_test_dtm_tv, reviews_test_dataset[['polarity']]], axis=1)
print(
    f"The tf-idf vectorizer test matrix has {reviews_test_processed_tv.shape[0]} rows and {reviews_test_processed_tv.shape[1]} columns")


In [None]:
reviews_train_processed_tv.head(5)


In [None]:
reviews_train_processed_tv.to_pickle(
    f'../data/processed/buscape_reviews_train_dataset_tv_s{NUMBER_OF_WORDS}.pkl')
reviews_test_processed_tv.to_pickle(
    f'../data/processed/buscape_reviews_test_dataset_tv_s{NUMBER_OF_WORDS}.pkl')


### Embedding Vectorizer

In [None]:
# Load the pre-trainned fast text embedding.
logging.info("Load fast text embeddings.")
fasttext_cbow_s50 = KeyedVectors.load_word2vec_format(
    '../data/embeedings/fasttext_cbow_s50/cbow_s50.txt')
fasttext_skip_s50 = KeyedVectors.load_word2vec_format(
    '../data/embeedings/fasttext_skip_s50/skip_s50.txt')


In [None]:
# Load the pre-trainned glove embedding.
logging.info("Load glove embeddings.")
glove_s50 = KeyedVectors.load_word2vec_format(
    '../data/embeedings/glove_s50/glove_s50.txt')


In [None]:
# Load the pre-trainned fast text embedding.
logging.info("Load wang2vec embeddings.")
wang2vec_cbow_s50 = KeyedVectors.load_word2vec_format(
    '../data/embeedings/wang2vec_cbow_s50/cbow_s50.txt')
wang2vec_skip_s50 = KeyedVectors.load_word2vec_format(
    '../data/embeedings/wang2vec_skip_s50/skip_s50.txt')


In [None]:
# Load the pre-trainned word2vec embedding.
logging.info("Load word2vec embeddings.")
word2vec_cbow_s50 = KeyedVectors.load_word2vec_format(
    '../data/embeedings/word2vec_cbow_s50/cbow_s50.txt')
word2vec_skip_s50 = KeyedVectors.load_word2vec_format(
    '../data/embeedings/word2vec_skip_s50/skip_s50.txt')


In [None]:
# def text_to_bert(text)
def text_to_embedding(text, model, vectorizer=None, vocab=None, size=50):
    if not vectorizer:
        raise Exception("The vectorizer parameter must not be None")

    transformed = vectorizer.transform(text)
    vectorized = pd.DataFrame(transformed.toarray(
    ), columns=vectorizer.get_feature_names_out())

    embeedings = pd.DataFrame()
    for i in range(vectorized.shape[0]):
        sentence = np.zeros(size)
        for word in vocab[vectorized.iloc[i, :] > 0]:
            if model.get_index(word, default=-1) != -1:
                sentence = sentence + model.get_vector(word)
            else:
                print("Out of Vocabulary")

        embeedings = pd.concat([embeedings, pd.DataFrame([sentence])])

    return embeedings


In [None]:
embedding_names = ["fasttext_cbow_s50", "fasttext_skip_s50", "glove_s50",
                   "wang2vec_cbow_s50", "wang2vec_skip_s50", "word2vec_cbow_s50", "word2vec_skip_s50"]
embedding_models = [fasttext_cbow_s50, fasttext_skip_s50, glove_s50,
                    wang2vec_cbow_s50, wang2vec_skip_s50, word2vec_cbow_s50, word2vec_skip_s50]

for name, model in zip(embedding_names, embedding_models):
    reviews_train_dtm = text_to_embedding(
        reviews_train_dataset['review_text'], model, tv, reviews_test_processed_tv.columns[1:-1], 50)
    reviews_train_processed = pd.concat([reviews_train_dataset.reset_index()[['original_index']], reviews_train_dtm.reset_index(
        drop=True), reviews_train_dataset.reset_index()[['polarity']]], axis=1, ignore_index=True)
    reviews_train_processed.to_pickle(
        f"../data/processed/buscape_reviews_train_dataset_{name}.pkl")
    print(
        f"The {name} vectorizer train dataframe has {reviews_train_processed.shape[0]} rows and {reviews_train_processed.shape[1]} columns")

    reviews_test_dtm = text_to_embedding(
        reviews_test_dataset['review_text'], model, tv, reviews_test_processed_tv.columns[1:-1], 50)
    reviews_test_processed = pd.concat([reviews_test_dataset.reset_index()[['original_index']], reviews_test_dtm.reset_index(
        drop=True), reviews_test_dataset.reset_index()[['polarity']]], axis=1, ignore_index=True)
    reviews_test_processed.to_pickle(
        f"../data/processed/buscape_reviews_test_dataset_{name}.pkl")
    print(
        f"The {name} vectorizer test dataframe has {reviews_test_processed.shape[0]} rows and {reviews_test_processed.shape[1]} columns")


In [None]:
import torch

from transformers import AutoTokenizer
from transformers import AutoModel

model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
print(f"Transformers model class model: {type(model)}")
tokenizer = AutoTokenizer.from_pretrained(
    'neuralmind/bert-base-portuguese-cased', do_lower_case=True)
print(f"Transformers tokenizer class: {type(tokenizer)}")


# `encode_plus` will:
#   (1) Tokenize the sentence.
#   (2) Prepend the `[CLS]` token to the start.
#   (3) Append the `[SEP]` token to the end.
#   (4) Map tokens to their IDs.
#   (5) Pad or truncate the sentence to `max_length`
#   (6) Create attention masks for [PAD] tokens.
text_train_encoded = reviews_train_dataset['review_text_cleaned'].apply(
    lambda sentence:
    tokenizer.encode_plus(
        text=sentence,
        add_special_tokens=True,
        max_length=10,
        padding='max_length',
        return_attention_mask=True,
        return_tensors="pt"
    )
)

input_ids = [s['input_ids'] for s in text_train_encoded]
attn_mask = [s['attention_mask'] for s in text_train_encoded]
with torch.no_grad():
    outputs = model(input_ids, attn_mask)


In [15]:
def build_tensors(descriptions, tokenizer, max_tokens=128):
    # tokenization.
    sentences = descriptions['review_text_cleaned'].apply(
        (lambda s: ' '.join(s.split()[:max_tokens])))
    tokenized = sentences.apply(
        (lambda s: tokenizer.encode(s, add_special_tokens=True, truncation=True)))

    # padding
    max_len = max_tokens
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

    # masking
    attention_mask = np.where(padded != 0, 1, 0)

    # model#1
    input_ids = torch.tensor(padded)
    attention_mask = torch.tensor(attention_mask)

    return (input_ids, attention_mask)


def extract_features(dataset, model, tokenizer):
    
    bug_ids = dataset['original_index']

    input_ids, attention_mask = build_tensors(dataset, tokenizer)
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)

    features = last_hidden_states[0][:, 0, :].numpy()
    
    labels  = dataset['polarity']
    
    return (features, labels, bug_ids)


In [16]:
extract_features(reviews_test_dataset, model, tokenizer)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
