In [None]:
!pip install scikit-learn==0.24.2
!pip install nltk==3.6.5
!pip install seaborn==0.11.2
!pip install gensim==4.1.2

In [None]:
import os

# Absolute location where the extracted corpus is expected to live.
base_dir = '/tf/sa-experiments/corpus'

# Unpack the archive only when the corpus directory is missing, so re-running
# the notebook does not extract it again.
# NOTE(review): tar extracts relative to the current working directory; this
# assumes corpus.tar.gz unpacks to base_dir -- confirm the archive layout.
if not os.path.exists(base_dir):
    !tar xvzf corpus.tar.gz

In [None]:
import nltk
import numpy as np
import tensorflow as tf
from tensorflow.keras import preprocessing

# Fetch the NLTK resources used later in the notebook: the stop-word lists
# (read via stopwords.words('portuguese')) and the data for RSLPStemmer.
nltk.download('stopwords')
nltk.download('rslp')

In [None]:
from nltk.stem import RSLPStemmer
# Shared stemmer instance; analyzer_stemming() calls stemmer.stem() on each token.
stemmer = RSLPStemmer()

In [None]:
# Split sizes: 745_307 + 82_811 = 828_118 samples, which matches a 0.1
# validation split of the full corpus.
# NOTE(review): neither count is referenced by any visible cell -- presumably
# kept for documentation; confirm before removing.
training_samples = 745_307
validation_samples = 82_811

# Number of reviews per batch yielded by the text datasets.
batch_size = 1_024

In [None]:
seed = 42

corpus_dir = '/tf/sa-experiments/corpus/reviews'

# Both subsets must be built with the same split fraction and seed, otherwise
# the training and validation partitions could overlap. Keeping the shared
# arguments in one place guarantees the two calls stay in sync.
split_options = dict(validation_split=0.1,
                     shuffle=True,
                     batch_size=batch_size,
                     seed=seed)

dataset_training = preprocessing.text_dataset_from_directory(
    corpus_dir, subset='training', **split_options)

# Class names reported by the loader; the integer labels index into this list.
class_names = dataset_training.class_names

dataset_validation = preprocessing.text_dataset_from_directory(
    corpus_dir, subset='validation', **split_options)

In [None]:
def to_np_dataset(dataset, vectorizer, fit_dict = True):
    """Materialise a batched (texts, labels) dataset as features and integer labels.

    Parameters
    ----------
    dataset
        Object exposing ``as_numpy_iterator()`` that yields ``(texts, labels)``
        batches as 1-D NumPy arrays.
    vectorizer
        Object exposing ``fit_transform(docs)`` and ``transform(docs)``.
    fit_dict : bool, default True
        When True the vectorizer is fitted on the texts (``fit_transform``);
        when False an already fitted vectorizer is applied (``transform``).

    Returns
    -------
    (X, Y)
        ``X`` is whatever the vectorizer returns for all texts in iteration
        order; ``Y`` is a 1-D integer array with the matching labels.

    Raises
    ------
    ValueError
        If the dataset yields no batches.
    """
    # Keep texts and labels in separate lists: packing both into one
    # object-dtype 2-D array just to slice them apart again is wasteful and
    # relies on NumPy's mixed-dtype promotion.
    text_batches = []
    label_batches = []
    for texts, labels in dataset.as_numpy_iterator():
        text_batches.append(texts)
        label_batches.append(labels)

    if not text_batches:
        raise ValueError('dataset yielded no batches')

    documents = np.concatenate(text_batches)
    Y = np.concatenate(label_batches).astype(int)

    X = vectorizer.fit_transform(documents) if fit_dict else vectorizer.transform(documents)

    return X, Y

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Set seaborn's global theme with font_scale=2; affects figures drawn after this call.
sns.set(font_scale=2)

def plot_results(clf, X_validation, Y_validation, labels=None,
                 output_dir='/tf/sa-experiments', name=None):
    """Plot validation confusion matrices for ``clf`` and save them as an SVG.

    Two heatmaps are drawn in one figure. In both, rows are the true classes
    and columns are the predicted classes. The top panel is normalised over
    the true classes; the bottom panel shows raw counts.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``predict``.
    X_validation
        Feature matrix passed to ``clf.predict``.
    Y_validation
        Integer class ids; each id indexes into ``labels``.
    labels : sequence of str, optional
        Display names for the classes. Defaults to the notebook-level
        ``class_names``.
    output_dir : str, default '/tf/sa-experiments'
        Directory the figure is written to.
    name : str, optional
        Used in the titles and as the file stem. Defaults to the estimator's
        class name; pass a distinct value when several experiments use the
        same estimator class, otherwise they write to the same file.
    """
    labels = list(class_names if labels is None else labels)
    name = clf.__class__.__name__ if name is None else name

    Y_true = np.asarray(Y_validation)
    Y_predict = clf.predict(X_validation)
    label_ids = np.arange(len(labels))

    fig, axs = plt.subplots(figsize=(15, 20), nrows=2, ncols=1)

    # (axis, normalisation mode, annotation format). Counts use 'd' because
    # seaborn's default '.2g' renders large counts in scientific notation.
    panels = ((axs[0], 'true', '.2g'),
              (axs[1], None, 'd'))

    for ax, normalize, fmt in panels:
        # confusion_matrix expects (y_true, y_pred). Passing them the other
        # way round transposes the matrix and makes normalize='true'
        # normalise over predictions instead of true classes.
        matrix = confusion_matrix(Y_true,
                                  Y_predict,
                                  labels=label_ids,
                                  normalize=normalize)
        sns.heatmap(pd.DataFrame(matrix, index=labels, columns=labels),
                    annot=True,
                    fmt=fmt,
                    cmap='Blues',
                    ax=ax)
        ax.set_xlabel('Predicted class')
        ax.set_ylabel('True class')

    accuracy = accuracy_score(Y_true, Y_predict)
    # normalize=False yields the number of correct predictions, not a ratio,
    # so the second title labels it as a count.
    n_correct = accuracy_score(Y_true, Y_predict, normalize=False)

    axs[0].set_title(f'{name} | Accuracy : {round(accuracy, 2)}')
    axs[1].set_title(f'{name} | Correct predictions : {n_correct} / {len(Y_true)}')

    fig.savefig(f'{output_dir}/{name}.svg', bbox_inches='tight')

# Classificador Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Analyzer of a default CountVectorizer, used as the base for the stemming
# analyzer below.
analyzer = CountVectorizer().build_analyzer()

def analyzer_stemming(doc):
    """Return a generator with the stem of every token the default analyzer extracts from ``doc``."""
    return (stemmer.stem(w) for w in analyzer(doc))

# Bag-of-words counts with NLTK's Portuguese stop words removed.
# NOTE(review): analyzer_stemming is defined above but is not passed to this
# (or any other visible) vectorizer, so the features are unstemmed -- confirm
# whether stemming was disabled on purpose.
vectorizer = CountVectorizer(stop_words=stopwords.words('portuguese'))

# fit_dict defaults to True: the vocabulary is learned from the training split.
X_training, Y_training = to_np_dataset(dataset_training, vectorizer)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, trained on the
# CountVectorizer features built in the previous cell.
clf = MultinomialNB()
clf.fit(X_training, Y_training)

In [None]:
# fit_dict=False: reuse the vocabulary fitted on the training split instead of
# refitting the vectorizer on validation data.
X_validation, Y_validation = to_np_dataset(dataset_validation, vectorizer, fit_dict=False)

In [None]:
# Draw the confusion matrices for the Naive Bayes model; plot_results also
# saves the figure under the estimator's class name.
plot_results(clf, X_validation, Y_validation)

# Classificador SVM

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams (ngram_range=(1,2)).
# Rebinds `vectorizer`, X_training and Y_training, replacing the values from
# the Naive Bayes section.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_training, Y_training = to_np_dataset(dataset_training, vectorizer)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the TF-IDF features.
# random_state is set from the notebook-wide `seed` so repeated runs give the
# same model: with the default dual=True the solver shuffles the data using a
# pseudo-random generator.
clf = LinearSVC(random_state=seed)
clf.fit(X_training, Y_training)

In [None]:
# fit_dict=False: apply the TF-IDF vocabulary and weights fitted on the
# training split rather than refitting on validation data.
X_validation, Y_validation = to_np_dataset(dataset_validation, vectorizer, fit_dict=False)

In [None]:
plot_results(clf, X_validation, Y_validation)

# plot_results names its SVG after the estimator class, and the embeddings
# experiment further down trains a LinearSVC as well, so it writes to the very
# same path. Move this figure to a dedicated name so the TF-IDF results are
# not overwritten by the later run.
tfidf_figure = f'/tf/sa-experiments/{clf.__class__.__name__}.svg'
os.replace(tfidf_figure, f'/tf/sa-experiments/{clf.__class__.__name__}_tfidf.svg')

# Classificador SVM (Embeddings)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Only the tokenizer is taken from this CountVectorizer; the vectorizer itself
# is not fitted in the visible cells. Rebinds `vectorizer`, replacing the
# TF-IDF vectorizer from the previous section.
# NOTE(review): build_tokenizer() returns only the token-splitting step;
# presumably no lowercasing is applied, unlike build_analyzer() -- confirm the
# casing matches the embedding vocabulary.
vectorizer = CountVectorizer()
tokenizer = vectorizer.build_tokenizer()

In [None]:
from gensim.models import KeyedVectors

# Dimensionality of the word vectors; also selects the file glove_s<DIM>.txt.
EMBEDDING_DIM = 300
# Number of worker processes handed to multiprocessing.Pool below.
POOL = 12

# Use the same absolute corpus location as the setup cell and `corpus_dir`.
# The previous cwd-relative 'sa-experiments/corpus' only resolved when the
# kernel happened to be started from /tf.
base_dir = '/tf/sa-experiments/corpus'
embeddings_index = KeyedVectors.load_word2vec_format(f'{base_dir}/embeddings/glove_s{EMBEDDING_DIM}.txt')

In [None]:
from multiprocessing import Pool

def to_embedding(tokens):
    """Average the word vectors of the tokens known to ``embeddings_index``.

    Tokens without an entry in ``embeddings_index`` are skipped. Returns a
    column array of shape ``(EMBEDDING_DIM, 1)``; when no token is known the
    result is all zeros.
    """
    total = np.zeros((EMBEDDING_DIM, 1))
    found = 0

    for tok in tokens:
        if not embeddings_index.has_index_for(tok):
            continue
        total = total + embeddings_index[tok].reshape(EMBEDDING_DIM, 1)
        found += 1

    # Guard against division by zero when every token is out of vocabulary.
    if found > 0:
        return total / found
    return total

def to_embedding_batch(batch):
    """Turn one ``(reviews, classes)`` batch into a ``[embedding | class]`` matrix.

    Each review (bytes) is decoded, tokenized and averaged by
    ``to_embedding``. The result has one row per review: the first
    ``EMBEDDING_DIM`` columns hold the mean embedding and the last column
    holds the class id.
    """
    texts, labels = batch

    # Each to_embedding() result is a column; stack side by side, then
    # transpose so that reviews become rows.
    columns = [to_embedding(tokenizer(text.decode())) for text in texts]
    features = np.concatenate(columns, axis=1).T

    label_column = labels.reshape(labels.shape[0], 1)
    return np.concatenate([features, label_column], axis=1)

def split_dimensions(dataset_numpy):
    """Stack per-batch ``[embedding | class]`` matrices and split them apart.

    Returns ``(features, labels)``: ``features`` holds the first
    ``EMBEDDING_DIM`` columns of the stacked array and ``labels`` is column
    ``EMBEDDING_DIM`` cast to int.
    """
    stacked = np.concatenate(dataset_numpy)

    features = stacked[:, :EMBEDDING_DIM]
    labels = stacked[:, EMBEDDING_DIM].astype(int)
    return features, labels


In [None]:
# Embed the training batches in parallel: every worker converts one
# (reviews, classes) batch into an [embedding | class] matrix.
with Pool(POOL) as workers:
    dataset_training_numpy = workers.map(
        to_embedding_batch,
        list(dataset_training.as_numpy_iterator()),
    )

# Stack the per-batch matrices and separate features from labels.
X_training, Y_training = split_dimensions(dataset_training_numpy)

In [None]:
# Embed the validation batches in parallel: every worker converts one
# (reviews, classes) batch into an [embedding | class] matrix.
with Pool(POOL) as workers:
    dataset_validation_numpy = workers.map(
        to_embedding_batch,
        list(dataset_validation.as_numpy_iterator()),
    )

# Stack the per-batch matrices and separate features from labels.
X_validation, Y_validation = split_dimensions(dataset_validation_numpy)

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the averaged word-embedding features.
# random_state is set from the notebook-wide `seed` so repeated runs give the
# same model: with the default dual=True the solver shuffles the data using a
# pseudo-random generator.
# NOTE(review): the feature matrix has far more rows than columns here;
# consider dual=False -- confirm against the LinearSVC documentation.
clf = LinearSVC(random_state=seed)
clf.fit(X_training, Y_training)

In [None]:
# Draw the confusion matrices for the embeddings-based SVM.
# NOTE(review): plot_results saves under the estimator's class name, and this
# clf is a LinearSVC like the one in the TF-IDF section, so this call writes to
# the same LinearSVC.svg path.
plot_results(clf, X_validation, Y_validation)