In [5]:
import numpy as np
import pandas as pd
from skmultilearn.model_selection import IterativeStratification
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.dummy import DummyClassifier
from auxiliar_functions import process_folds, build_report, load_dataset, predict, load_embedding
from nltk.corpus import stopwords  
from sklearn.naive_bayes import GaussianNB
from IPython.display import clear_output
import nltk
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords  
from joblib import dump as dump_model_joblib

# Make sure the NLTK stop-word corpus is available locally; whatever the
# download prints is wiped by the clear_output() call at the end of the cell.
nltk.download('stopwords')

# Spanish stop words plus the literal "UNK" token, so that the placeholder
# seen as "[UNK]" in the preprocessed text is presumably filtered out too.
spanish_stopwords = [*stopwords.words('spanish'), "UNK"]

clear_output()

In [6]:
# Load the preprocessed corpus through the project helper.
df = load_dataset("preprocess_dataset.npy")

# Stack the per-row `frames` label lists into a single label array
# (one row per document; the preview shows four binary labels per row).
Y_true = np.array(list(map(np.array, df.frames)))

df.head()

Unnamed: 0,original_text,preprocess_text,encoded,frames,conflicto,economico,humanidad,moral
0,Japón registró un nuevo déficit comercial réco...,japón registró un nuevo déficit comercial réco...,"[8759, 8914, 9989, 9898, 6584, 8773, 8428, 999...","[0, 1, 0, 0]",0,1,0,0
1,"UDI acusa ""mala memoria"" de la Nueva Mayoría f...",udi acusa mala memoria de la nueva mayoría fre...,"[9610, 8486, 8448, 7205, 10001, 9999, 9927, 97...","[1, 0, 0, 1]",1,0,0,1
2,La misteriosa oferta por Esteban Paredes que i...,la misteriosa oferta por esteban paredes que [...,"[9999, 1121, 8346, 9990, 8487, 8596, 9996, 1, ...","[1, 0, 0, 0]",1,0,0,0
3,La familia maratón que causó revuelo en Holand...,la familia maratón que causó revuelo en holand...,"[9999, 9668, 5417, 9996, 7388, 2016, 9997, 887...","[0, 0, 1, 0]",0,0,1,0
4,Crean sitio web que recopila mangas descontin...,crean sitio web que [UNK] [UNK] [UNK] para [UN...,"[2420, 9319, 9360, 9996, 1, 1, 1, 9985, 1, 998...","[0, 1, 0, 0]",0,1,0,0


In [9]:
def cross_validation(mode):
    """Cross-validate a bag-of-words multi-label baseline and persist the results.

    Reads the notebook-level ``df`` (column ``preprocess_text``) and ``Y_true``.
    For each of the 10 iteratively-stratified folds it fits a
    CountVectorizer + TF (``use_idf=False``) pipeline on the training texts,
    trains one classifier per label through ``MultiOutputClassifier``, saves
    the fitted classifier under ``Models/`` and prints a per-fold report.
    After the last fold it prints the aggregated report and pickles the list
    of per-fold DataFrames under ``Results/``.

    Parameters
    ----------
    mode : str
        ``"random"`` selects ``DummyClassifier(strategy="stratified")``;
        any other value selects ``GaussianNB``. The value is also embedded in
        the output file names and forwarded to ``build_report``.

    Returns
    -------
    None
    """
    embedding_train = df.preprocess_text.values

    def run_model(classifier, args):
        """Fit and evaluate ``classifier(**args)`` on every fold.

        Returns a list with one DataFrame per fold holding the columns
        ``y_pred``, ``y_prob`` and ``y_true``.
        """
        datasets = []
        k_fold = IterativeStratification(n_splits=10, order=1)
        # Seed NumPy's global RNG before the splitter is consumed so that the
        # folds (and the stratified dummy predictions) are repeatable.
        np.random.seed(4444)
        for fold_index, (train, test) in enumerate(k_fold.split(embedding_train, Y_true)):
            Y_train_fold, Y_test_fold = Y_true[train], Y_true[test]

            # The vectoriser is fitted on the training texts only, so no
            # vocabulary leaks from the held-out fold.
            vectorizer = Pipeline([
                ('vect', CountVectorizer(lowercase=False, stop_words=spanish_stopwords)),
                ('tfidf', TfidfTransformer(use_idf=False)),
            ])
            # Dense arrays are used because the sparse output is not passed on as-is.
            X_train_fold = vectorizer.fit_transform(embedding_train[train]).toarray()
            X_test_fold = vectorizer.transform(embedding_train[test]).toarray()

            model = MultiOutputClassifier(classifier(**args))
            model.fit(X_train_fold, Y_train_fold)
            # NOTE: only the classifier is saved; the fitted vectoriser is not.
            dump_model_joblib(model, f'Models/Fold_{fold_index + 1}_baseline_{mode}.joblib')

            try:
                # predict_proba yields one (n_samples, n_classes) array per
                # label; keep the positive-class column and regroup by sample.
                per_label_proba = model.predict_proba(X_test_fold)
                frames_probability = [[a[1] for a in x] for x in zip(*per_label_proba)]
                y_pred = [[int(a > 0.5) for a in x] for x in frames_probability]
            except Exception:
                # Best-effort fallback when probabilities cannot be obtained:
                # use the hard predictions as scores. The original code called
                # `.tolist()` on the already-converted list here and crashed.
                y_pred = model.predict(X_test_fold).tolist()
                frames_probability = [list(row) for row in y_pred]

            # Named `fold_results` so it does not shadow the notebook-level `df`.
            fold_results = pd.DataFrame()
            fold_results["y_pred"] = y_pred
            fold_results["y_prob"] = frames_probability
            fold_results["y_true"] = Y_test_fold.tolist()
            print(f'Folds {fold_index + 1}')
            data = process_folds([fold_results])
            # A single fold has no spread, so an all-zero frame is passed as the deviation.
            build_report(pd.DataFrame(data.mean()).T, data.applymap(lambda x:0), mode)

            datasets.append(fold_results.copy())

        return datasets

    @ignore_warnings(category=ConvergenceWarning)
    def cross_validation_Naive(mode):
        """Pick the baseline for ``mode``, run all folds, report and pickle."""
        if mode == "random":
            classifier, args = DummyClassifier, {"strategy": "stratified"}
        else:
            classifier, args = GaussianNB, {}

        datasets = run_model(classifier, args)
        print("Finals results")
        data = process_folds(datasets)
        build_report(pd.DataFrame(data.mean()).T, pd.DataFrame(data.std()).T, mode)

        with open(f"Results/cross_validation#{mode}#baseline.pickle", "wb") as file:
            pickle.dump(datasets, file)

    cross_validation_Naive(mode)

In [10]:
# Baseline 1: mode "random" selects DummyClassifier(strategy="stratified")
# inside cross_validation; per-fold reports are printed below.
cross_validation("random")

Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.62(±0.00)       0.62(±0.00)       0.62(±0.00)
               Macro       0.52(±0.00)       0.52(±0.00)       0.52(±0.00)       0.41(±0.00)       0.52(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.61(±0.00)       0.60(±0.00)       0.61(±0.00)
               Macro       0.50(±0.00)       0.50(±0.00)       0.50(±0.00)       0.40(±0.00)       0.50(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.61(±0.00)       0.62(±0.00)       0.62(±0.00)
               Macro       0.50(±0.00)       0.50(±0.00)       0.50(±0.00)  

In [11]:
# Baseline 2: any mode other than "random" selects GaussianNB inside
# cross_validation; "naive_bayes" is also used in the output file names.
cross_validation("naive_bayes")

Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.73(±0.00)       0.74(±0.00)       0.73(±0.00)
               Macro       0.65(±0.00)       0.63(±0.00)       0.64(±0.00)       0.56(±0.00)       0.64(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.72(±0.00)       0.74(±0.00)       0.72(±0.00)
               Macro       0.66(±0.00)       0.62(±0.00)       0.62(±0.00)       0.54(±0.00)       0.62(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.72(±0.00)       0.73(±0.00)       0.72(±0.00)
               Macro       0.64(±0.00)       0.62(±0.00)       0.62(±0.00)  