In [1]:
import matplotlib as mpl

import pandas as pd
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 300)

from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics import confusion_matrix
from datetime import date
import matplotlib.pyplot as plt
import numpy as np
import unicodedata
import string
import time

from wordcloud import WordCloud
from nltk import FreqDist
import seaborn as sns
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

from sklearn.metrics import accuracy_score #(TP+TN)/(TP+FP+FN+TN) >> Proporção de TP e TN sobre o total.
from sklearn.metrics import f1_score #2*(pr*rec)/(pr+rec) >> Média harmônica entre Precision e Recall.
from sklearn.metrics import roc_auc_score #Area Under the Receiver Operating Characteristic Curve.
from sklearn.metrics import recall_score #(TP)/(TP+FN) >> Proporção de TP verdadeiramente classificados. (Gargalo)
from sklearn.metrics import precision_score #(TP)/(TP+FP) >> Proporção de TP nas predições positivas.

class Model:
    def __init__(self):
        """Load both corpora, preprocess them and pre-compute the word-vector features.

        Steps performed, in order:

        1. Read ``datasets/binary_classification.csv`` (Fortuna dataset) and
           ``datasets/OffComBR2.txt`` (Pelle dataset) and bring both to a
           ``text`` / ``label`` layout with lower-cased text.
        2. Create the empty result tables that ``experiment`` fills in
           (``bow_outcomes``, ``tfidf_outcomes``, ``w2v_outcomes``, ``rs_outcomes``)
           and the experiment counter ``exp_counter``.
        3. Add to each dataset a ``preprocessing`` column (cleaned sentence) and a
           ``pp_tokens`` column (whitespace tokens of that sentence).
        4. Load the word vectors from ``datasets/cbow_s100_USP.txt``, strip accents
           from their vocabulary keys and store one averaged 100-dimensional vector
           per sentence in ``X_fortuna_w2v`` (all Fortuna rows) and
           ``X_offPelle_w2v`` (Pelle rows whose label is 1).
        """

        # ------------------------------------------------------------------
        # Addressing datasets
        # ------------------------------------------------------------------
        self.ds_fortuna = pd.read_csv("datasets/binary_classification.csv")
        self.ds_fortuna = self.ds_fortuna.drop(columns=["hatespeech_G1", "annotator_G1", "hatespeech_G2",
                                                        "annotator_G2", "hatespeech_G3", "annotator_G3"])

        self.ds_pelle = pd.read_csv("datasets/OffComBR2.txt", sep=",", names=["label", "text"])

        # Standardizing Paula Fortuna (2019) dataset
        self.ds_fortuna = self.ds_fortuna.rename(columns={"hatespeech_comb": "label"})
        fortuna_cleanup = [
            (r"@[A-Za-z0-9_]*", ""),  # remove the @user handles of the tweet
            (r"\n", " "),             # remove line breaks from the text
            ("'", ""),                # remove single quotes from the words
            (r"http[^ ]+", ""),       # remove links
            # The dot is escaped so that only real "www." prefixes are removed;
            # the unescaped form also swallowed tokens such as "awwwww".
            (r"www\.[^ ]+", ""),
        ]
        for pattern, replacement in fortuna_cleanup:
            self.ds_fortuna["text"] = self.ds_fortuna["text"].replace(to_replace=pattern, value=replacement,
                                                                      regex=True)
        self.ds_fortuna["text"] = self.ds_fortuna["text"].str.lower()  # case normalisation

        # Standardizing Rogers de Pelle (2017) dataset
        self.ds_pelle["text"] = self.ds_pelle["text"].replace(to_replace="'", value="", regex=True)
        self.ds_pelle["label"] = self.ds_pelle["label"].replace(to_replace="yes", value=1, regex=True)
        self.ds_pelle["label"] = self.ds_pelle["label"].replace(to_replace="no", value=0, regex=True)
        # .copy() so the assignments below write to an independent frame and do
        # not trigger pandas' SettingWithCopyWarning.
        self.ds_pelle = self.ds_pelle[["text", "label"]].copy()
        self.ds_pelle["text"] = self.ds_pelle["text"].str.lower()

        # ------------------------------------------------------------------
        # Experiment outcomes
        # ------------------------------------------------------------------
        self.exp_counter = 1

        # acc_lr, rec_lr, pr_lr, f1_lr, roc_auc_lr, acc_nb, ... (one group per classifier)
        metric_columns = [f"{metric}_{clf}"
                          for clf in ("lr", "nb", "svm", "rf")
                          for metric in ("acc", "rec", "pr", "f1", "roc_auc")]
        outcome_columns = (["id", "tags", "feature", "ngram", "pelle", "split"]
                           + metric_columns
                           + ["created", "elapsed"])

        self.bow_outcomes = pd.DataFrame(columns=outcome_columns)
        self.tfidf_outcomes = pd.DataFrame(columns=outcome_columns)
        self.w2v_outcomes = pd.DataFrame(columns=outcome_columns)

        self.rs_outcomes = pd.DataFrame(columns=["tags", "split", "n_sample", "feature", "ngram", "pelle",
                                                 "f1_mean_test_score",
                                                 "n_estimators", "max_features", "max_depth", "min_samples_split",
                                                 "min_samples_leaf", "bootstrap", "class_weight", "created"])

        # ------------------------------------------------------------------
        # Preprocessing functions
        # ------------------------------------------------------------------
        # Stateless helpers are built once instead of once per sentence/word.
        space_tnkzr = tokenize.WhitespaceTokenizer()
        tt_tknzr = tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
        punctuation_table = str.maketrans("", "", string.punctuation)
        digits_table = str.maketrans("", "", "0123456789")

        #https://virtuati.com.br/cliente/knowledgebase/25/Lista-de-StopWords.html
        #https://gist.github.com/alopes/5358189
        stopWords1 = ['a', 'agora', 'ainda', 'alguem', 'algum', 'alguma', 'algumas', 'alguns', 'ampla', 'amplas',
                      'amplo', 'amplos', 'ante', 'antes', 'ao', 'aos', 'apos', 'aquela', 'aquelas', 'aquele',
                      'aqueles', 'aquilo', 'as', 'ate', 'atraves', 'cada', 'coisa', 'coisas', 'com', 'como',
                      'contra', 'contudo', 'da', 'daquele', 'daqueles', 'das', 'de', 'dela', 'delas', 'dele',
                      'deles', 'depois', 'dessa', 'dessas', 'desse', 'desses', 'desta', 'destas', 'deste', 'deste',
                      'destes', 'deve', 'devem', 'devendo', 'dever', 'devera', 'deverao', 'deveria', 'deveriam',
                      'devia', 'deviam', 'disse', 'disso', 'disto', 'dito', 'diz', 'dizem', 'do', 'dos', 'e', 'e',
                      'ela', 'elas', 'ele', 'eles', 'em', 'enquanto', 'entre', 'era', 'essa', 'essas', 'esse',
                      'esses', 'esta', 'esta', 'estamos', 'estao', 'estas', 'estava', 'estavam', 'estavamos',
                      'este', 'estes', 'estou', 'eu', 'fazendo', 'fazer', 'feita', 'feitas', 'feito', 'feitos',
                      'foi', 'for', 'foram', 'fosse', 'fossem', 'grande', 'grandes', 'ha', 'isso', 'isto', 'ja',
                      'la', 'la', 'lhe', 'lhes', 'lo', 'mas', 'me', 'mesma', 'mesmas', 'mesmo', 'mesmos', 'meu',
                      'meus', 'minha', 'minhas', 'muita', 'muitas', 'muito', 'muitos', 'na', 'nao', 'nas', 'nem',
                      'nenhum', 'nessa', 'nessas', 'nesta', 'nestas', 'ninguem', 'no', 'nos', 'nos', 'nossa',
                      'nossas', 'nosso', 'nossos', 'num', 'numa', 'nunca', 'o', 'os', 'ou', 'outra', 'outras',
                      'outro', 'outros', 'para', 'pela', 'pelas', 'pelo', 'pelos', 'pequena', 'pequenas',
                      'pequeno', 'pequenos', 'per', 'perante', 'pode', 'pude', 'podendo', 'poder', 'poderia',
                      'poderiam', 'podia', 'podiam', 'pois', 'por', 'porem', 'porque', 'posso', 'pouca', 'poucas',
                      'pouco', 'poucos', 'primeiro', 'primeiros', 'propria', 'proprias', 'proprio', 'proprios',
                      'quais', 'qual', 'quando', 'quanto', 'quantos', 'que', 'quem', 'sao', 'se', 'seja', 'sejam',
                      'sem', 'sempre', 'sendo', 'sera', 'serao', 'seu', 'seus', 'si', 'sido', 'so', 'sob', 'sobre',
                      'sua', 'suas', 'talvez', 'tambem', 'tampouco', 'te', 'tem', 'tendo', 'tenha',
                      'ter', 'teu', 'teus', 'ti', 'tido', 'tinha', 'tinham', 'toda', 'todas', 'todavia', 'todo',
                      'todos', 'tu', 'tua', 'tuas', 'tudo', 'ultima', 'ultimas', 'ultimo', 'ultimos', 'um', 'uma',
                      'umas', 'uns', 'vendo', 'ver', 'vez', 'vindo', 'vir', 'vos']
        # Extra stop words picked from a frequency analysis of the corpora.
        stopWords2 = ["ca", "terra", "europa", "pros", "sr", "ninguem", "algo", "msm", "to", "vcs", "vi", "ir", "ficar",
                      "dois", "onde", "sera", "ate", "entao", "ue", "pa", "ia", "va", "qdo", "via", "qq",
                      "acho", "nada", "quero", "sim", "da", "dia", "pras",
                      "aqui", "faz", "ta", "dar", "vao", "quer", "vem", "voces", "sabe", "so", "fica", "ser", "ne",
                      "ne", "estao", "sao", "voce", "nao", "tambem", "tb",
                      "volta", "ja", "rt", "vc", "ai", "la", "pro", "pra", "ve", "hj", "disso", "outro", "uns", "eh",
                      "sobre", "tao", "assim", "disse", "estar", "vou", "pode", "vez", "vai", "estar",
                      "pq", "mil", "mt", "mim"]
        # One set for all three sources: the NLTK list is read a single time and
        # membership tests no longer scan three lists per word.
        all_stop_words = frozenset(stopWords1) | frozenset(stopWords2) | frozenset(stopwords.words("portuguese"))

        def tknze(sentence):
            """Split a sentence on whitespace and return the list of tokens."""
            return space_tnkzr.tokenize(sentence)

        def joinTokens(wordlist):
            """Join tokens back into a single space-separated sentence."""
            return " ".join(wordlist)

        def stripAccents(text):
            """Return ``text`` without combining marks (NFKD-decomposed, marks dropped)."""
            return ''.join(ch for ch in unicodedata.normalize('NFKD', text) if not unicodedata.combining(ch))

        def tweetTknze(sentence):
            """Tokenize with NLTK's TweetTokenizer (handles stripped, repeated chars reduced) and re-join."""
            return joinTokens(tt_tknzr.tokenize(sentence))

        def removeStopWords(sentence):
            """Drop stop words, single-character tokens and tokens containing "kkkk"."""
            kept = [w for w in tknze(sentence)
                    if w not in all_stop_words and len(w) > 1 and "kkkk" not in w]
            return joinTokens(kept)

        def removePunctuation(sentence):
            """Delete every ``string.punctuation`` character from each token."""
            return joinTokens([w.translate(punctuation_table) for w in tknze(sentence)])

        def removeNumbers(sentence):
            """Delete the digits 0-9 from each token."""
            return joinTokens([w.translate(digits_table) for w in tknze(sentence)])

        def removeAccents(sentence):
            """Strip accents from each token."""
            return joinTokens([stripAccents(w) for w in tknze(sentence)])

        def stemming(sentence):
            """Stem each token with the Portuguese Snowball stemmer (currently not applied)."""
            stemmer = SnowballStemmer("portuguese")
            return joinTokens([stemmer.stem(w) for w in tknze(sentence)])

        def lemmatization(sentence):
            """Placeholder: not implemented, returns None."""
            pass

        # ------------------------------------------------------------------
        # Preprocessing
        # ------------------------------------------------------------------
        # Stop words are removed last because the stop-word lists above are
        # written without accents, punctuation or digits.
        # `stemming` is defined above but deliberately left out of both pipelines.
        fortuna_clean = self.ds_fortuna["text"]
        for step in (tweetTknze, removePunctuation, removeNumbers, removeAccents, removeStopWords):
            fortuna_clean = fortuna_clean.apply(step)
        self.ds_fortuna["preprocessing"] = fortuna_clean
        self.ds_fortuna["pp_tokens"] = self.ds_fortuna["preprocessing"].apply(tknze)

        # The Pelle pipeline does not run the tweet tokenizer.
        pelle_clean = self.ds_pelle["text"]
        for step in (removePunctuation, removeNumbers, removeAccents, removeStopWords):
            pelle_clean = pelle_clean.apply(step)
        self.ds_pelle["preprocessing"] = pelle_clean
        self.ds_pelle["pp_tokens"] = self.ds_pelle["preprocessing"].apply(tknze)

        # ------------------------------------------------------------------
        # Word-vector features
        # ------------------------------------------------------------------
        def averageWordVectors(words, model, vocabulary, num_features):
            """Average the vectors of the words found in ``vocabulary``.

            Returns a float64 vector of length ``num_features``; it is all zeros
            when none of the words is in the vocabulary.
            """
            feature_vector = np.zeros((num_features,), dtype="float64")
            nwords = 0.

            for word in words:
                if word in vocabulary:
                    nwords = nwords + 1.
                    feature_vector = np.add(feature_vector, model[word])

            if nwords:
                feature_vector = np.divide(feature_vector, nwords)

            return feature_vector

        def averagedWordVectorizer(corpus, model, num_features):
            """Return an array with one averaged word vector per tokenized sentence."""
            # Use the keys of ``model.vocab`` (the mapping that removeAccentsW2V
            # rewrites and that ``model[word]`` is looked up through here) rather than
            # ``model.index2word``, which still holds the original accented
            # spellings and made the accent stripping ineffective.
            vocabulary = set(model.vocab)
            features = [averageWordVectors(tokenized_sentence, model, vocabulary, num_features)
                        for tokenized_sentence in corpus]
            return np.array(features)

        def removeAccentsW2V(model):
            """Rename the keys of ``model.vocab`` to their accent-free form (in place).

            When several keys collapse to the same accent-free spelling, the
            entry that is renamed last replaces the earlier one.
            """
            for word in list(model.vocab):
                plain = stripAccents(word)
                if plain != word:
                    model.vocab[plain] = model.vocab.pop(word)
            return model

        self.w2v_model = KeyedVectors.load_word2vec_format("datasets/cbow_s100_USP.txt")
        self.w2v_model = removeAccentsW2V(self.w2v_model)  # Removing accents

        # Fortuna dataset embeddings
        self.X_fortuna_w2v = averagedWordVectorizer(corpus=self.ds_fortuna["pp_tokens"].tolist(),
                                                    model=self.w2v_model, num_features=100)
        # Embeddings of the Pelle rows labelled 1 only (the rows `experiment`
        # can add to the training split).
        self.X_offPelle_w2v = averagedWordVectorizer(
            corpus=self.ds_pelle[self.ds_pelle["label"] == 1]["pp_tokens"].tolist(),
            model=self.w2v_model, num_features=100)
        
        
        
    def fortunaWordCloud(self, col, label):
        """Show a word cloud of column ``col`` for the Fortuna rows with the given label.

        Parameters
        ----------
        col : str
            Name of a text column of ``self.ds_fortuna`` (one sentence per row).
        label : int or str
            Label value to select. It is matched both by value and by its
            string form, so ``1`` and ``"1"`` select the same rows.

        Raises
        ------
        ValueError
            If no row has the requested label.
        """
        dataFrame = self.ds_fortuna

        # The previous `query("label == '<label>'")` always compared against a
        # quoted string and therefore never matched numeric labels.
        labels = dataFrame["label"]
        df_filtrado = dataFrame[(labels == label) | (labels.astype(str) == str(label))]
        if df_filtrado.empty:
            raise ValueError(f"no rows with label {label!r} in the Fortuna dataset")

        allWords = ' '.join(df_filtrado[col])

        cloud = WordCloud(width=900, height=600,
                          max_font_size=120,
                          collocations=False).generate(allWords)
        plt.figure(figsize=(16, 8))
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
        
    def pelleWordCloud(self, col, label):
        """Show a word cloud of column ``col`` for the Pelle rows with the given label.

        Parameters
        ----------
        col : str
            Name of a text column of ``self.ds_pelle`` (one sentence per row).
        label : int or str
            Label value to select. It is matched both by value and by its
            string form, so ``1`` and ``"1"`` select the same rows.

        Raises
        ------
        ValueError
            If no row has the requested label.
        """
        dataFrame = self.ds_pelle

        # Pelle labels are stored as the numbers 1/0, so the previous
        # `query("label == '<label>'")` (a string comparison) selected nothing.
        labels = dataFrame["label"]
        df_filtrado = dataFrame[(labels == label) | (labels.astype(str) == str(label))]
        if df_filtrado.empty:
            raise ValueError(f"no rows with label {label!r} in the Pelle dataset")

        allWords = ' '.join(df_filtrado[col])

        cloud = WordCloud(width=900, height=600,
                          max_font_size=120,
                          collocations=False).generate(allWords)
        plt.figure(figsize=(16, 8))
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()
    
    def fortunaFrequency(self, col, label, n=50):
        """Return the most frequent tokens of ``col`` among Fortuna rows with ``label``.

        Parameters
        ----------
        col : str
            Name of a text column of ``self.ds_fortuna`` (one sentence per row).
        label : int or str
            Label value to select; matched by value and by string form, so
            ``1`` and ``"1"`` select the same rows.
        n : int, default 50
            Number of tokens to return.

        Returns
        -------
        pandas.DataFrame
            Columns ``Word`` and ``Frequency``, the ``n`` rows with the largest
            frequency. Empty (same columns) when nothing matches.
        """
        dataFrame = self.ds_fortuna

        # A quoted `query("label == '<label>'")` never matches numeric labels.
        labels = dataFrame["label"]
        df_filtrado = dataFrame[(labels == label) | (labels.astype(str) == str(label))]
        allWords = ' '.join(df_filtrado[col])

        spaceTknzr = tokenize.WhitespaceTokenizer()
        frequency = FreqDist(spaceTknzr.tokenize(allWords))

        if not frequency:
            # nlargest() rejects the object-dtype columns an empty frame would get.
            return pd.DataFrame({"Word": pd.Series(dtype="object"),
                                 "Frequency": pd.Series(dtype="int64")})

        df_frequency = pd.DataFrame({"Word": list(frequency.keys()), "Frequency": list(frequency.values())})
        return df_frequency.nlargest(columns="Frequency", n=n)
    
    def pellefrequency(self, col, label, n=50):
        """Return the most frequent tokens of ``col`` among Pelle rows with ``label``.

        Parameters
        ----------
        col : str
            Name of a text column of ``self.ds_pelle`` (one sentence per row).
        label : int or str
            Label value to select; matched by value and by string form, so
            ``1`` and ``"1"`` select the same rows.
        n : int, default 50
            Number of tokens to return.

        Returns
        -------
        pandas.DataFrame
            Columns ``Word`` and ``Frequency``, the ``n`` rows with the largest
            frequency. Empty (same columns) when nothing matches.
        """
        dataFrame = self.ds_pelle

        # Pelle labels are stored as the numbers 1/0, so the previous
        # `query("label == '<label>'")` (a string comparison) selected nothing.
        labels = dataFrame["label"]
        df_filtrado = dataFrame[(labels == label) | (labels.astype(str) == str(label))]
        allWords = ' '.join(df_filtrado[col])

        spaceTknzr = tokenize.WhitespaceTokenizer()
        frequency = FreqDist(spaceTknzr.tokenize(allWords))

        if not frequency:
            # nlargest() rejects the object-dtype columns an empty frame would get.
            return pd.DataFrame({"Word": pd.Series(dtype="object"),
                                 "Frequency": pd.Series(dtype="int64")})

        df_frequency = pd.DataFrame({"Word": list(frequency.keys()), "Frequency": list(frequency.values())})
        return df_frequency.nlargest(columns="Frequency", n=n)
    
    def fortunaPareto(self, col, n, df="fortuna"):
        """Plot a bar chart with the ``n`` most frequent tokens of a Fortuna column.

        ``col`` names a text column of ``self.ds_fortuna``; every row of the
        dataset is used. ``df`` is accepted but not used.
        """
        # Count whitespace-separated tokens over the whole column.
        corpus_text = " ".join(self.ds_fortuna[col])
        token_counts = FreqDist(tokenize.WhitespaceTokenizer().tokenize(corpus_text))

        top_tokens = pd.DataFrame(
            {"Palavra": list(token_counts.keys()),
             "Frequência": list(token_counts.values())}
        ).nlargest(n, "Frequência")

        plt.figure(figsize=(14, 12))
        axis = sns.barplot(data=top_tokens, x="Palavra", y="Frequência")
        axis.set(ylabel="Contagem")
        plt.show()
        
    def pellePareto(self, col, n, df="fortuna"):
        """Plot a bar chart with the ``n`` most frequent tokens of a Pelle column.

        ``col`` names a text column of ``self.ds_pelle``; every row of the
        dataset is used. ``df`` is accepted but not used.
        """
        # Count whitespace-separated tokens over the whole column.
        corpus_text = " ".join(self.ds_pelle[col])
        token_counts = FreqDist(tokenize.WhitespaceTokenizer().tokenize(corpus_text))

        top_tokens = pd.DataFrame(
            {"Palavra": list(token_counts.keys()),
             "Frequência": list(token_counts.values())}
        ).nlargest(n, "Frequência")

        plt.figure(figsize=(14, 12))
        axis = sns.barplot(data=top_tokens, x="Palavra", y="Frequência")
        axis.set(ylabel="Contagem")
        plt.show()
    
    @staticmethod
    def confusionMatrix(y_true, y_pred, img_name, cmap=plt.cm.Blues, labels=(0, 1)):
        """Draw an annotated confusion matrix and return it with its figure.

        Each cell shows the raw count followed, in parentheses, by the count
        divided by the total of its row (its true class).

        Parameters
        ----------
        y_true, y_pred : array-like
            True and predicted labels.
        img_name : str
            Used as the plot title.
        cmap : matplotlib colormap, default ``plt.cm.Blues``
        labels : sequence, default ``(0, 1)``
            Label values, in row/column order. Fixing them keeps the matrix
            square and aligned with the tick labels even when one of the
            classes is absent from both ``y_true`` and ``y_pred``.

        Returns
        -------
        (cm, fig)
            The count matrix and the matplotlib figure. The figure is left
            open; closing it is up to the caller.
        """
        labels = list(labels)

        # Compute confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=labels)

        # Tick names: the binary 0/1 case keeps the 'no'/'yes' names.
        classes = ['no', 'yes'] if labels == [0, 1] else [str(value) for value in labels]

        # Row-normalised matrix; a row with no samples stays at 0 instead of
        # producing NaN and a division warning.
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm_n = np.divide(cm.astype('float'), row_totals,
                         out=np.zeros(cm.shape, dtype='float'),
                         where=row_totals != 0)

        fig, ax = plt.subplots()
        im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.figure.colorbar(im, ax=ax)
        # We want to show all ticks...
        ax.set(xticks=np.arange(cm.shape[1]),
               yticks=np.arange(cm.shape[0]),
               # ... and label them with the respective list entries
               xticklabels=classes, yticklabels=classes,
               title=img_name,
               ylabel='True label',
               xlabel='Predicted label')

        # Rotate the tick labels and set their alignment.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

        # Loop over data dimensions and create text annotations; text is white
        # on the darker (above half of the maximum) cells.
        fmt_n = '.2f'
        fmt = 'd'
        thresh = cm.max() / 2.
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                ax.text(j, i, format(cm[i, j], fmt) + ' (' + format(cm_n[i, j], fmt_n) + ')',
                        ha="center", va="center",
                        color="white" if cm[i, j] > thresh else "black")

        return cm, fig
    
    @staticmethod
    def rf_random_search(X_train, y_train, n_iter, random_state, cv=2):
        """Run a randomized hyper-parameter search for a random forest.

        Parameters
        ----------
        X_train, y_train
            Training features and labels passed to ``RandomizedSearchCV.fit``.
        n_iter : int
            Number of parameter settings to sample.
        random_state : int
            Seed used for both the forest and the sampling of settings.
        cv : int, default 2
            Number of cross-validation folds per setting.

        Returns
        -------
        RandomizedSearchCV
            The fitted search object (scored with F1, all cores used).
        """
        rf_model = RandomForestClassifier(random_state=random_state)

        # Maximum number of levels in a tree: 10, 20, ..., 110, plus unlimited.
        max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
        max_depth.append(None)

        random_grid = {
            # Number of trees in the forest: 200, 400, ..., 2000.
            'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
            # Number of features to consider at every split. "auto" was only an
            # alias of "sqrt" for classifiers (so the old grid held the same
            # option twice) and is rejected by recent scikit-learn releases.
            'max_features': ["sqrt", "log2"],
            'max_depth': max_depth,
            # Minimum number of samples required to split a node.
            'min_samples_split': [2, 5, 10],
            # Minimum number of samples required at each leaf node.
            'min_samples_leaf': [1, 2, 4],
            # Whether each tree is trained on a bootstrap sample.
            'bootstrap': [True, False],
            # Class weight: balanced, or an explicit weight for class 1.
            'class_weight': ["balanced", {1: 1}, {1: 2}, {1: 4}, {1: 6}, {1: 8}],
        }

        rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=random_grid,
                                       n_iter=n_iter, cv=cv, verbose=2, random_state=random_state,
                                       n_jobs=-1, scoring='f1')

        rf_random.fit(X_train, y_train)

        return rf_random
    
    def experiment(self, feature, add_Pelle=False, rf_rs_n_iter = 100,
                   ngram_range=(1,1), splitting="STRAT", rs = 42):
        """Evaluate four classifiers on the Fortuna dataset over 10 random splits.

        For every split, logistic regression, a linear SVM, a random forest and
        Gaussian naive Bayes are trained on 80% of the data and scored on the
        remaining 20%. On the first split only, the random forest
        hyper-parameters are chosen with ``Model.rf_random_search``; the
        selected estimator is then re-fitted on every split.

        Side effects:

        * one row per split is appended to ``self.bow_outcomes``,
          ``self.tfidf_outcomes`` or ``self.w2v_outcomes`` (according to
          ``feature``); all rows of one call share the same ``id``;
        * one row per sampled random-search setting is appended to
          ``self.rs_outcomes``;
        * one confusion-matrix PNG per split and classifier is written to
          ``experiments/img/`` (created if missing);
        * ``self.exp_counter`` is increased by one.

        Parameters
        ----------
        feature : {"BOW", "TFIDF", "W2V"}
            Bag of words, TF-IDF, or averaged word vectors.
        add_Pelle : bool, default False
            When True, the Pelle rows labelled 1 are added to every training
            split (never to the test split).
        rf_rs_n_iter : int, default 100
            Number of settings sampled by the random search.
        ngram_range : tuple, default (1, 1)
            N-gram range of the BOW / TF-IDF vectorizer (ignored for "W2V").
        splitting : {"STRAT", "SHUFFLE"}
            Stratified or plain shuffle split.
        rs : int, default 42
            Seed of the splitter and of the random search.

        Raises
        ------
        ValueError
            If ``feature`` or ``splitting`` is not one of the listed values.

        Notes
        -----
        ``roc_auc_*`` is computed from decision scores / class-1 probabilities,
        not from the hard predictions. ``elapsed`` is the time since the start
        of the call, so it grows from split to split.
        """
        import os  # local import: only needed to create the image directory

        if feature not in ("BOW", "TFIDF", "W2V"):
            raise ValueError(f"feature must be 'BOW', 'TFIDF' or 'W2V', got {feature!r}")
        if splitting not in ("STRAT", "SHUFFLE"):
            raise ValueError(f"splitting must be 'STRAT' or 'SHUFFLE', got {splitting!r}")

        start_time = time.time()

        # ---------------------------------------------------------------
        # Small local helpers
        # ---------------------------------------------------------------
        def take(data, positions):
            """Select rows by position from a pandas object or a numpy array."""
            return data.iloc[positions] if hasattr(data, "iloc") else data[positions]

        def append_rows(frame, rows):
            """Return ``frame`` followed by ``rows`` (list of dicts), keeping its column order."""
            new_rows = pd.DataFrame(rows)
            extra = [c for c in new_rows.columns if c not in frame.columns]
            new_rows = new_rows.reindex(columns=list(frame.columns) + extra)
            if frame.empty:
                return new_rows
            return pd.concat([frame, new_rows], ignore_index=True)

        def positive_scores(estimator, X_eval):
            """Continuous score of class 1 for ROC AUC."""
            if hasattr(estimator, "decision_function"):
                return estimator.decision_function(X_eval)
            # Column 1 is the second class in sorted label order, i.e. label 1
            # for the 0/1 labels used here.
            return estimator.predict_proba(X_eval)[:, 1]

        # ---------------------------------------------------------------
        # Data split
        # ---------------------------------------------------------------
        splitter_cls = StratifiedShuffleSplit if splitting == "STRAT" else ShuffleSplit
        splitter = splitter_cls(n_splits=10, test_size=0.2, random_state=rs)

        # ---------------------------------------------------------------
        # Classifiers, keyed by the suffix used in the outcome columns.
        # Evaluation order: lr, svm, rf, nb.
        # ---------------------------------------------------------------
        classifiers = {
            "lr": LogisticRegression(solver="lbfgs", class_weight="balanced"),
            "svm": svm.SVC(kernel="linear", class_weight="balanced"),
            # Placeholder: replaced by the random-search winner on the first split.
            "rf": RandomForestClassifier(random_state=42),
            "nb": GaussianNB(),
        }

        # ---------------------------------------------------------------
        # Features
        # ---------------------------------------------------------------
        vectorizer = None
        if feature == "W2V":
            X = self.X_fortuna_w2v  # features
            y = np.array(self.ds_fortuna["label"].tolist())  # target
        else:
            vectorizer_cls = CountVectorizer if feature == "BOW" else TfidfVectorizer
            vectorizer = vectorizer_cls(lowercase=False, ngram_range=ngram_range, analyzer="word")
            X = self.ds_fortuna["preprocessing"]  # features
            y = self.ds_fortuna["label"]  # target

        off_pelle = self.ds_pelle[self.ds_pelle["label"] == 1] if add_Pelle else None

        # ---------------------------------------------------------------
        # Experiment logs
        # ---------------------------------------------------------------
        # The counter advances once per experiment, so ids run EXP1, EXP2, ...
        exp_id = f"EXP{self.exp_counter}"
        self.exp_counter += 1

        tags = f"[{feature}]"
        if feature != "W2V": tags += f"[NG{ngram_range}]"
        if add_Pelle: tags += "[PELLE]"

        # Fields shared by the outcome rows and the random-search rows.
        common = {
            "tags": tags,
            "feature": f"{feature}",
            "ngram": f"{ngram_range}",
            "pelle": 1 if add_Pelle else 0,
            "created": f"{date.today()}",
        }
        outcome_attr = {"BOW": "bow_outcomes", "TFIDF": "tfidf_outcomes", "W2V": "w2v_outcomes"}[feature]

        img_dir = "experiments/img"
        os.makedirs(img_dir, exist_ok=True)

        rs_ok = False  # becomes True once the random search has run

        for split_counter, (train_index, test_index) in enumerate(splitter.split(X, y.tolist()), start=1):

            X_train, X_test = take(X, train_index), take(X, test_index)
            y_train, y_test = take(y, train_index), take(y, test_index)

            if add_Pelle:
                # Adding the Pelle rows labelled 1 to the train split only.
                if feature == "W2V":
                    X_train = np.append(X_train, self.X_offPelle_w2v, axis=0)
                    y_train = np.append(y_train, np.array(off_pelle["label"].tolist()), axis=0)
                else:
                    X_train = pd.concat([X_train, off_pelle["preprocessing"]], ignore_index=True)
                    # Cast so the combined target keeps the dtype of the Fortuna labels.
                    y_train = pd.concat([y_train, off_pelle["label"].astype(y.dtype)], ignore_index=True)

            # Vectorize once per split; the vocabulary is learned from the
            # training part only and is the same for every classifier.
            if vectorizer is not None:
                X_train = vectorizer.fit_transform(X_train)
                X_test = vectorizer.transform(X_test)

            exp_outcomes = dict(common)
            exp_outcomes["id"] = exp_id
            exp_outcomes["split"] = split_counter

            for key in list(classifiers):
                clf = classifiers[key]

                # Only naive Bayes is handed dense arrays; the other
                # classifiers receive the vectorizer's sparse output unchanged.
                if key == "nb" and vectorizer is not None:
                    X_fit, X_eval = X_train.toarray(), X_test.toarray()
                else:
                    X_fit, X_eval = X_train, X_test

                # Random Forest - Random Search (first split only)
                if key == "rf" and not rs_ok:
                    rf_random = Model.rf_random_search(X_fit, y_train, rf_rs_n_iter, rs)

                    # Keep the best estimator for this and all later splits.
                    clf = classifiers["rf"] = rf_random.best_estimator_

                    # Log every sampled setting; iterate over what was actually
                    # evaluated rather than trusting rf_rs_n_iter.
                    results = rf_random.cv_results_
                    search_rows = []
                    for i, params in enumerate(results["params"]):
                        row = dict(common)
                        row["split"] = split_counter
                        row["n_sample"] = i
                        for name in ("n_estimators", "max_features", "max_depth", "min_samples_split",
                                     "min_samples_leaf", "bootstrap", "class_weight"):
                            row[name] = params[name]
                        row["f1_mean_test_score"] = round(results["mean_test_score"][i], 2)
                        search_rows.append(row)
                    self.rs_outcomes = append_rows(self.rs_outcomes, search_rows)

                    rs_ok = True

                # training
                clf.fit(X_fit, y_train)

                # predicting
                pred = clf.predict(X_eval)

                # metrics (rounded to two decimals in the logs)
                metrics = {
                    "acc": accuracy_score(y_test, pred),
                    "rec": recall_score(y_test, pred),
                    "pr": precision_score(y_test, pred),
                    "f1": f1_score(y_test, pred),
                    # ROC AUC needs a ranking score; hard 0/1 predictions
                    # collapse the curve to a single operating point.
                    "roc_auc": roc_auc_score(y_test, positive_scores(clf, X_eval)),
                }
                for metric, value in metrics.items():
                    exp_outcomes[f"{metric}_{key}"] = round(value, 2)

                # Confusion matrix
                clf_name = f"{type(clf).__name__}"
                img_name = f"{feature}_{clf_name}_split({split_counter})"
                if feature != "W2V": img_name += f"_NG{ngram_range[1]}"
                if add_Pelle: img_name += f"_PELLE"

                cm, fig = Model.confusionMatrix(y_test, pred,
                                                img_name=img_name)

                fig.savefig(f"{img_dir}/{img_name}.png")
                plt.close(fig)

            # One outcome row per split, stored right away so finished splits
            # survive an interruption.
            exp_outcomes["elapsed"] = time.time() - start_time
            setattr(self, outcome_attr, append_rows(getattr(self, outcome_attr), [exp_outcomes]))


In [2]:
# Build the Model object: its constructor loads both datasets, preprocesses
# them and computes the averaged word-vector features.
model = Model()

In [3]:
# Experiment - n-grams (1,1); without the Pelle rows
for feature in ("BOW", "TFIDF"):
    model.experiment(feature, add_Pelle=False, rf_rs_n_iter=200,
                     ngram_range=(1,1), splitting="STRAT", rs=42)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 51.8min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 56.4min finished


In [4]:
# Experiment - n-grams (1,2); without the Pelle rows
for feature in ("BOW", "TFIDF"):
    model.experiment(feature, add_Pelle=False, rf_rs_n_iter=200,
                     ngram_range=(1,2), splitting="STRAT", rs=42)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 32.6min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 78.3min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 84.4min finished


In [5]:
# Experiment - n-grams (1,3); without the Pelle rows
for feature in ("BOW", "TFIDF"):
    model.experiment(feature, add_Pelle=False, rf_rs_n_iter=200,
                     ngram_range=(1,3), splitting="STRAT", rs=42)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 36.0min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 90.6min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 96.9min finished


In [6]:
# Experiment - averaged word vectors; without the Pelle rows
model.experiment(
    feature="W2V",
    add_Pelle=False,
    rf_rs_n_iter=200,
    splitting="STRAT",
    rs=42,
)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 22.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 52.1min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 58.6min finished


In [6]:
# Experiment - n-grams (1,1); with the Pelle rows added to training
for feature in ("BOW", "TFIDF"):
    model.experiment(feature, add_Pelle=True, rf_rs_n_iter=200,
                     ngram_range=(1,1), splitting="STRAT", rs=42)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 23.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 59.0min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 63.7min finished


In [7]:
# Experiment - n-grams (1,2); with the Pelle rows added to training
for feature in ("BOW", "TFIDF"):
    model.experiment(feature, add_Pelle=True, rf_rs_n_iter=200,
                     ngram_range=(1,2), splitting="STRAT", rs=42)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 91.9min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 101.3min finished


In [8]:
# Experiment - n-grams (1,3); with the Pelle rows added to training
for feature in ("BOW", "TFIDF"):
    model.experiment(feature, add_Pelle=True, rf_rs_n_iter=200,
                     ngram_range=(1,3), splitting="STRAT", rs=42)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 55.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 158.2min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 169.6min finished


In [10]:
# Experiment - averaged word vectors; with the Pelle rows added to training
model.experiment(
    feature="W2V",
    add_Pelle=True,
    rf_rs_n_iter=200,
    splitting="STRAT",
    rs=42,
)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 24.7min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 57.9min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 65.3min finished


In [9]:
# Saving the result tables, one CSV file per table
outcome_files = [
    ("experiments/BOW_Experiments.csv", model.bow_outcomes),
    ("experiments/TFIDF_Experiments.csv", model.tfidf_outcomes),
    ("experiments/W2V_Experiments.csv", model.w2v_outcomes),
    ("experiments/RS_Experiments.csv", model.rs_outcomes),
]
for csv_path, outcome_frame in outcome_files:
    outcome_frame.to_csv(csv_path, index=False, encoding="utf-8")