In [1]:
import re
import csv
import time
import nltk
import numpy as np
from gensim import utils
from gensim.models import KeyedVectors
import gensim.models
from tqdm import tqdm
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

csv.field_size_limit(100000000)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


131072

In [2]:
class TCUCorpus(object):
    """An interator that yields sentences (lists of str)."""
    
    @staticmethod
    def clean_text(text):
        #source: https://medium.com/ml2vec/using-word2vec-to-analyze-reddit-comments-28945d8cee57
        
        #Normalize tabs and remove newlines
        no_tabs = str(text).replace('\t', ' ').replace('\n', '');
        
        #Remove all characters except A-Z
        alphas_only = re.sub("[^a-zA-Z]", " ", no_tabs);
        
        #Normalize spaces to 1
        multi_spaces = re.sub(" +", " ", alphas_only);
        
        #Strip trailing and leading spaces
        no_spaces = multi_spaces.strip();
        
        #Remove stopwords 
        stopwords = nltk.corpus.stopwords.words('portuguese')
        clean_text = [w for w in no_spaces.split() if not w in stopwords] 

        return ' '.join(clean_text)
    
    @staticmethod
    def preprocess(text):
        return utils.simple_preprocess(TCUCorpus.clean_text(text))
                

    def __iter__(self):
        filename = 'datasets/acordaos_relator_5k.csv'
        with open(filename, "r") as csvfile:
            corpus = csv.reader(csvfile)
            next(corpus) #ignores header
            for doc in tqdm(corpus):
                yield TCUCorpus.preprocess(doc[1])

In [3]:
def train_w2v(corpus, sg=0):
#     sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
# In the CBOW model, the distributed representations of context (or surrounding words) 
# are combined to predict the word in the middle. 
# While in the Skip-gram model, the distributed representation of the input word is used 
# to predict the context.
# https://towardsdatascience.com/nlp-101-word2vec-skip-gram-and-cbow-93512ee24314
    model = gensim.models.Word2Vec(sentences=corpus, min_count=50, size=100, workers=4, sg=sg)
    return model

def load_w2v(path='model.bin'):
    model = gensim.models.Word2Vec.load(path)
    return model

def load_pretrained_w2v(path):
    model = KeyedVectors.load_word2vec_format(path)
    return model

In [4]:
# source: https://github.com/v1shwa/document-similarity/blob/master/DocSim.py
class DocW2V:
    
    def __init__(self, w2v_model):
        self.w2v_model = w2v_model
        
    def transform(self, docs):
        return np.array([self.vectorize(doc) for doc in docs])

    def vectorize(self, doc: str) -> np.ndarray:
        """
        Identify the vector values for each word in the given document
        :param doc:
        :return:
        """
        
        words = TCUCorpus.preprocess(doc)
        word_vecs = []
        for word in words:
            try:
                vec = self.w2v_model.wv[word]
                word_vecs.append(vec)
            except KeyError:
                # Ignore, if the word doesn't exist in the vocabulary
                pass

        # Assuming that document vector is the mean of all the word vectors
        # PS: There are other & better ways to do it.
        vector = np.mean(word_vecs, axis=0)
        return vector

In [5]:
docs = pd.read_csv('datasets/acordaos_relator_5k.csv')
# corpus = TCUCorpus()
# model = train_w2v(corpus)
# model.save('model.bin')
model_trained = load_w2v('model.bin')
# http://www.nilc.icmc.usp.br/embeddings - CBOW 100
model_pretrained = load_pretrained_w2v('cbow_s100.txt')

In [6]:
tfidf = TfidfVectorizer()
tfidf.fit([TCUCorpus.clean_text(doc) for doc in docs])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [7]:
X = np.array(docs.acordao)
le = preprocessing.LabelEncoder()
y = le.fit_transform(np.array(docs.relator.tolist()))

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

embedders = [
    tfidf, 
    DocW2V(model_pretrained), 
    DocW2V(model_trained)
]
embedders_map = { 0:'TFIDF', 1:'PRETRAINED', 2:'TRAINED'}
data_embedded = []
for embedder in tqdm(embedders):
    X_train_enc = embedder.transform(X_train)
    X_test_enc = embedder.transform(X_test)
    data_embedded.append((X_train_enc, X_test_enc))

100%|██████████| 3/3 [15:36<00:00, 312.18s/it]


In [9]:
results = []
clfs = [KNeighborsClassifier(n_neighbors=7),
        SVC(gamma='auto', random_state=42), 
        MLPClassifier(random_state=42, max_iter=300)]
clfs_map = { 0:'KNN', 1:'SVM', 2:'MLP'}
kf = KFold(n_splits=10)
for train_index, test_index in tqdm(kf.split(X_train)):
    for ci, clf in enumerate(clfs):
        for ei, data in enumerate(data_embedded):
            X_enc_train, X_enc_test = data
            X_, X_validation = X_enc_train[train_index], X_enc_train[test_index]
            y_, y_validation = y_train[train_index], y_train[test_index]
            startTime = time.perf_counter()
            
            clf.fit(X_, y_)
            y_pred = clf.predict(X_validation)
            
            elapsed_time = time.perf_counter() - startTime
            
            p, r, f, s = precision_recall_fscore_support(y_validation, y_pred, average='micro')
            
            result = {"CLASSIFIER": clfs_map[ci],
                     "EMBEDDER": embedders_map[ei],
                     "PRECISION": p,
                     "RECALL":r,
                     "F1SCORE":f,
                     "TIME": elapsed_time}
            print(result)
            results.append(result)
df = pd.DataFrame(results) 
df.to_csv('results.csv')

0it [00:00, ?it/s]

{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.11129185305679816, 'RECALL': 0.11129185305679816, 'F1SCORE': 0.11129185305679816, 'TIME': 71.49899262399776}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.714111427409516, 'RECALL': 0.714111427409516, 'F1SCORE': 0.714111427409516, 'TIME': 507.5953508349994}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7246396457457864, 'RECALL': 0.7246396457457864, 'F1SCORE': 0.7246396457457864, 'TIME': 192.03062428599878}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.1661470335728164, 'RECALL': 0.1661470335728164, 'F1SCORE': 0.1661470335728164, 'TIME': 1814.2494784329974}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.34697935023270526, 'RECALL': 0.34697935023270526, 'F1SCORE': 0.34697935023270526, 'TIME': 10349.277052106001}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7308300573855677, 'RECALL': 0.7308300573855677, 'F1SCORE': 0.7308300573855677, 'TIME': 4906.7708052480



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.704667660747368, 'RECALL': 0.704667660747368, 'F1SCORE': 0.704667660747368, 'TIME': 1746.1100166589968}


1it [5:32:21, 19941.80s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7684695675748949, 'RECALL': 0.7684695675748949, 'F1SCORE': 0.7684695675748949, 'TIME': 337.22980569100037}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16470109800732005, 'RECALL': 0.16470109800732005, 'F1SCORE': 0.16470109800732005, 'TIME': 71.31086349999532}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7149247661651078, 'RECALL': 0.7149247661651078, 'F1SCORE': 0.7149247661651078, 'TIME': 507.43941625799926}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7223351859382766, 'RECALL': 0.7223351859382766, 'F1SCORE': 0.7223351859382766, 'TIME': 191.16286838200176}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16470109800732005, 'RECALL': 0.16470109800732005, 'F1SCORE': 0.16470109800732005, 'TIME': 1827.385535239002}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.34521711626225654, 'RECALL': 0.34521711626225654, 'F1SCORE': 0.34521711626225654, 'TIME': 10338.410



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7047580317202115, 'RECALL': 0.7047580317202115, 'F1SCORE': 0.7047580317202115, 'TIME': 1765.0889573999957}


2it [11:03:18, 19916.41s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7675658578464597, 'RECALL': 0.7675658578464597, 'F1SCORE': 0.7675658578464597, 'TIME': 340.40355318500224}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.11020740138267589, 'RECALL': 0.11020740138267589, 'F1SCORE': 0.11020740138267589, 'TIME': 70.71676280799875}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7126654918440197, 'RECALL': 0.7126654918440197, 'F1SCORE': 0.7126654918440197, 'TIME': 507.70736871100235}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7268989200668745, 'RECALL': 0.7268989200668745, 'F1SCORE': 0.7268989200668744, 'TIME': 192.1323286600018}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.17057521124214903, 'RECALL': 0.17057521124214903, 'F1SCORE': 0.17057521124214903, 'TIME': 1825.6051497189983}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.35023270525507205, 'RECALL': 0.35023270525507205, 'F1SCORE': 0.35023270525507205, 'TIME': 10328.605



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7028150558040758, 'RECALL': 0.7028150558040758, 'F1SCORE': 0.7028150558040758, 'TIME': 1698.0389175180026}


3it [16:32:49, 19872.70s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7702318015453437, 'RECALL': 0.7702318015453437, 'F1SCORE': 0.7702318015453435, 'TIME': 338.6869627269916}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16818038046179568, 'RECALL': 0.16818038046179568, 'F1SCORE': 0.16818038046179568, 'TIME': 69.4876587089966}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7159188468663865, 'RECALL': 0.7159188468663865, 'F1SCORE': 0.7159188468663865, 'TIME': 507.5534023539949}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.726989291039718, 'RECALL': 0.726989291039718, 'F1SCORE': 0.726989291039718, 'TIME': 193.0247450099996}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16818038046179568, 'RECALL': 0.16818038046179568, 'F1SCORE': 0.16818038046179568, 'TIME': 1828.2594947250036}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.3519949392255208, 'RECALL': 0.3519949392255208, 'F1SCORE': 0.3519949392255208, 'TIME': 10531.117637360992



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7094121368216529, 'RECALL': 0.7094121368216529, 'F1SCORE': 0.7094121368216529, 'TIME': 1764.7053078129975}


4it [22:08:04, 19945.31s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7705932854367177, 'RECALL': 0.7705932854367177, 'F1SCORE': 0.7705932854367177, 'TIME': 339.14477849399555}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.06416339071890109, 'RECALL': 0.06416339071890109, 'F1SCORE': 0.06416339071890109, 'TIME': 72.12239768900326}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7127106773304415, 'RECALL': 0.7127106773304415, 'F1SCORE': 0.7127106773304415, 'TIME': 510.23869624800864}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7204825809949844, 'RECALL': 0.7204825809949844, 'F1SCORE': 0.7204825809949844, 'TIME': 191.74545180200948}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16931001762233971, 'RECALL': 0.16931001762233971, 'F1SCORE': 0.16931001762233971, 'TIME': 1880.163234268999}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.34779268898829696, 'RECALL': 0.34779268898829696, 'F1SCORE': 0.34779268898829696, 'TIME': 10699.718



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.6989291039718043, 'RECALL': 0.6989291039718043, 'F1SCORE': 0.6989291039718043, 'TIME': 2000.8276386180078}


5it [27:49:11, 20101.82s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7614206316931001, 'RECALL': 0.7614206316931001, 'F1SCORE': 0.7614206316931001, 'TIME': 292.4932198119932}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.12028376485472866, 'RECALL': 0.12028376485472866, 'F1SCORE': 0.12028376485472866, 'TIME': 70.12115343900223}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7128462337897068, 'RECALL': 0.7128462337897068, 'F1SCORE': 0.7128462337897068, 'TIME': 503.9682261160051}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.723600379558086, 'RECALL': 0.723600379558086, 'F1SCORE': 0.723600379558086, 'TIME': 193.8865762130008}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16962631602729203, 'RECALL': 0.16962631602729203, 'F1SCORE': 0.16962631602729203, 'TIME': 1832.4249324850098}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.351000858524242, 'RECALL': 0.351000858524242, 'F1SCORE': 0.351000858524242, 'TIME': 10322.415950027993}




{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7099091771722923, 'RECALL': 0.7099091771722923, 'F1SCORE': 0.7099091771722922, 'TIME': 1838.9841060699837}


6it [33:20:53, 20041.88s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.768650309520582, 'RECALL': 0.768650309520582, 'F1SCORE': 0.768650309520582, 'TIME': 335.22778361698147}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.116623740454566, 'RECALL': 0.116623740454566, 'F1SCORE': 0.116623740454566, 'TIME': 72.89276041902485}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7135240160860332, 'RECALL': 0.7135240160860332, 'F1SCORE': 0.7135240160860331, 'TIME': 509.3047375330061}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7232388956667118, 'RECALL': 0.7232388956667118, 'F1SCORE': 0.7232388956667117, 'TIME': 192.40234002200305}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.1687226062988568, 'RECALL': 0.1687226062988568, 'F1SCORE': 0.1687226062988568, 'TIME': 1815.0189905550214}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.3515882698477249, 'RECALL': 0.3515882698477249, 'F1SCORE': 0.3515882698477249, 'TIME': 10334.057076180005}
{'CL



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7102706610636663, 'RECALL': 0.7102706610636663, 'F1SCORE': 0.7102706610636663, 'TIME': 1920.0992628270178}


7it [38:53:49, 20022.15s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.765984365821698, 'RECALL': 0.765984365821698, 'F1SCORE': 0.765984365821698, 'TIME': 337.4668362659868}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.06244634223487416, 'RECALL': 0.06244634223487416, 'F1SCORE': 0.06244634223487416, 'TIME': 69.56056229397655}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7160544033256518, 'RECALL': 0.7160544033256518, 'F1SCORE': 0.7160544033256518, 'TIME': 513.555589088006}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7203018390492973, 'RECALL': 0.7203018390492973, 'F1SCORE': 0.7203018390492973, 'TIME': 194.1175866620033}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16316479146898016, 'RECALL': 0.16316479146898016, 'F1SCORE': 0.16316479146898016, 'TIME': 1954.8033098489977}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.3471600921783923, 'RECALL': 0.3471600921783923, 'F1SCORE': 0.3471600921783923, 'TIME': 10659.874153122015



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.6975735393791515, 'RECALL': 0.6975735393791515, 'F1SCORE': 0.6975735393791515, 'TIME': 2043.6329534680117}


8it [44:40:36, 20257.60s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7698251321675478, 'RECALL': 0.7698251321675478, 'F1SCORE': 0.7698251321675477, 'TIME': 356.1703970489907}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.08793095657674754, 'RECALL': 0.08793095657674754, 'F1SCORE': 0.08793095657674754, 'TIME': 74.78385540898307}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7198951696715015, 'RECALL': 0.7198951696715015, 'F1SCORE': 0.7198951696715015, 'TIME': 516.5914464880188}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7252270570692694, 'RECALL': 0.7252270570692694, 'F1SCORE': 0.7252270570692694, 'TIME': 195.9045896870084}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.1660566625999729, 'RECALL': 0.1660566625999729, 'F1SCORE': 0.1660566625999729, 'TIME': 2136.429832281021}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.3497356649044327, 'RECALL': 0.3497356649044327, 'F1SCORE': 0.3497356649044327, 'TIME': 10643.756315900973



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7062039672857078, 'RECALL': 0.7062039672857078, 'F1SCORE': 0.7062039672857078, 'TIME': 1990.2304524469946}


9it [50:23:50, 20358.43s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7660295513081198, 'RECALL': 0.7660295513081198, 'F1SCORE': 0.7660295513081199, 'TIME': 333.33021656799247}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16456554154805478, 'RECALL': 0.16456554154805478, 'F1SCORE': 0.16456554154805478, 'TIME': 69.70296594599495}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.7212959197505762, 'RECALL': 0.7212959197505762, 'F1SCORE': 0.7212959197505762, 'TIME': 503.90887529900647}
{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7270344765261398, 'RECALL': 0.7270344765261398, 'F1SCORE': 0.72703447652614, 'TIME': 188.77443715601112}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.16461072703447652, 'RECALL': 0.16461072703447652, 'F1SCORE': 0.16461072703447652, 'TIME': 1725.4063511080167}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.34982603587727623, 'RECALL': 0.34982603587727623, 'F1SCORE': 0.34982603587727623, 'TIME': 10121.5923



{'CLASSIFIER': 'MLP', 'EMBEDDER': 'PRETRAINED', 'PRECISION': 0.705978039853599, 'RECALL': 0.705978039853599, 'F1SCORE': 0.705978039853599, 'TIME': 2001.261262097978}


10it [55:51:41, 20110.15s/it]

{'CLASSIFIER': 'MLP', 'EMBEDDER': 'TRAINED', 'PRECISION': 0.7671140029822421, 'RECALL': 0.7671140029822421, 'F1SCORE': 0.7671140029822421, 'TIME': 329.1742454560008}





* A escolha pela representação CBOW se dá pela grande quantidade de dados usados no experimento, pois usar o skip-gram afetaria muito o tempo de treinamento.
* O experimento que gerou os resultados foi completado em 55 horas, 51 minutos e 41 segundos.
* A escolha da dimensão dos _word vectors_ foi devido a limitações do hardware do experimento.

In [14]:
tfidf_ = TfidfVectorizer()
tfidf_.fit([TCUCorpus.clean_text(doc) for doc in docs.acordao])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [25]:
new_df = df[df.EMBEDDER != 'TFIDF']
new_df.reset_index(drop=True)

Unnamed: 0,CLASSIFIER,EMBEDDER,PRECISION,RECALL,F1SCORE,TIME
0,KNN,PRETRAINED,0.714111,0.714111,0.714111,507.595351
1,KNN,TRAINED,0.72464,0.72464,0.72464,192.030624
2,SVM,PRETRAINED,0.346979,0.346979,0.346979,10349.277052
3,SVM,TRAINED,0.73083,0.73083,0.73083,4906.770805
4,MLP,PRETRAINED,0.704668,0.704668,0.704668,1746.110017
5,MLP,TRAINED,0.76847,0.76847,0.76847,337.229806
6,KNN,PRETRAINED,0.714925,0.714925,0.714925,507.439416
7,KNN,TRAINED,0.722335,0.722335,0.722335,191.162868
8,SVM,PRETRAINED,0.345217,0.345217,0.345217,10338.410488
9,SVM,TRAINED,0.730966,0.730966,0.730966,4790.256861


In [26]:
for embedder in tqdm(embedders):
    new_X_train_enc = tfidf_.transform(X_train)
    new_X_test_enc = tfidf_.transform(X_test)

100%|██████████| 3/3 [02:40<00:00, 53.54s/it]


In [None]:
new_results = []

for train_index, test_index in tqdm(kf.split(X_train)):
    for ci, clf in enumerate(clfs):
        X_, X_validation = new_X_train_enc[train_index], new_X_test_enc[test_index]
        y_, y_validation = y_train[train_index], y_train[test_index]
        startTime = time.perf_counter()

        clf.fit(X_, y_)
        y_pred = clf.predict(X_validation)

        elapsed_time = time.perf_counter() - startTime

        p, r, f, s = precision_recall_fscore_support(y_validation, y_pred, average='micro')

        result = {"CLASSIFIER": clfs_map[ci],
                 "EMBEDDER": 'TFIDF',
                 "PRECISION": p,
                 "RECALL":r,
                 "F1SCORE":f,
                 "TIME": elapsed_time}
        print(result)
        new_results.append(result)

0it [00:00, ?it/s]

{'CLASSIFIER': 'KNN', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.06506710044733631, 'RECALL': 0.06506710044733631, 'F1SCORE': 0.06506710044733631, 'TIME': 640.0019952070434}
{'CLASSIFIER': 'SVM', 'EMBEDDER': 'TFIDF', 'PRECISION': 0.11124666757037639, 'RECALL': 0.11124666757037639, 'F1SCORE': 0.11124666757037639, 'TIME': 55685.40869214601}
