In [11]:
import numpy as np
import pandas as pd
from skmultilearn.model_selection import IterativeStratification
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from auxiliar_functions import process_folds, build_report, load_dataset, load_embedding
from IPython.display import clear_output
import nltk
import pickle
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords  
from joblib import dump as dump_model_joblib
from joblib import load as load_model_joblib

# Fetch the NLTK stop-word corpus, then build the Spanish stop-word list with
# the extra "UNK" token appended.
nltk.download('stopwords')
spanish_stopwords = [*stopwords.words('spanish'), "UNK"]

# Wipe this cell's output, including anything printed by the download above.
clear_output()

In [12]:
# Load the preprocessed dataset and turn the per-row `frames` label lists into
# one NumPy array, used below as the multi-label target.
df = load_dataset("preprocess_dataset.npy")
Y_true = np.array(list(map(np.array, df.frames)))

df.head()

Unnamed: 0,original_text,preprocess_text,encoded,frames,conflicto,economico,humanidad,moral
0,Japón registró un nuevo déficit comercial réco...,japón registró un nuevo déficit comercial réco...,"[8759, 8914, 9989, 9898, 6584, 8773, 8428, 999...","[0, 1, 0, 0]",0,1,0,0
1,"UDI acusa ""mala memoria"" de la Nueva Mayoría f...",udi acusa mala memoria de la nueva mayoría fre...,"[9610, 8486, 8448, 7205, 10001, 9999, 9927, 97...","[1, 0, 0, 1]",1,0,0,1
2,La misteriosa oferta por Esteban Paredes que i...,la misteriosa oferta por esteban paredes que [...,"[9999, 1121, 8346, 9990, 8487, 8596, 9996, 1, ...","[1, 0, 0, 0]",1,0,0,0
3,La familia maratón que causó revuelo en Holand...,la familia maratón que causó revuelo en holand...,"[9999, 9668, 5417, 9996, 7388, 2016, 9997, 887...","[0, 0, 1, 0]",0,0,1,0
4,Crean sitio web que recopila mangas descontin...,crean sitio web que [UNK] [UNK] [UNK] para [UN...,"[2420, 9319, 9360, 9996, 1, 1, 1, 9985, 1, 998...","[0, 1, 0, 0]",0,1,0,0


In [15]:
def cross_validation(mode, model_name, args):
    """Cross-validate a multi-label SVM over 10 iteratively stratified folds.

    For every fold a ``MultiOutputClassifier`` wrapping ``SVC(**args)`` is
    fitted on the training split, dumped with joblib under ``Models/`` and
    evaluated on the held-out split. A report is printed per fold and once
    more over all folds, and the list of per-fold prediction frames is
    pickled under ``Results/``.

    Parameters
    ----------
    mode : str
        Input representation. When it contains ``"tf-idf"`` the strings in
        ``df.preprocess_text`` are vectorised inside each fold; otherwise the
        features come from ``load_embedding(mode, "dataset")``.
    model_name : str
        Tag embedded in the model and result file names.
    args : dict
        Keyword arguments forwarded to ``SVC``.

    Returns
    -------
    None

    Notes
    -----
    Reads the module-level ``df``, ``Y_true`` and ``spanish_stopwords``.
    The ``Models/`` and ``Results/`` directories are not created here.
    """
    print(mode)
    use_tfidf = "tf-idf" in mode
    if use_tfidf:
        embedding_train = df.preprocess_text.values
    else:
        embedding_train = load_embedding(mode, "dataset")

    @ignore_warnings(category=ConvergenceWarning)
    def run_model(classifier, args):
        """Fit/evaluate ``classifier(**args)`` per fold; return one frame per fold."""
        datasets = []
        k_fold = IterativeStratification(n_splits=10, order=1)
        # Seed NumPy's global RNG before the folds are generated.
        # NOTE(review): presumably this is what makes the split repeatable;
        # confirm against the installed scikit-multilearn version.
        np.random.seed(4444)
        for fold_index, (train, test) in enumerate(k_fold.split(embedding_train, Y_true)):
            Y_train_fold, Y_test_fold = Y_true[train], Y_true[test]
            X_train_fold, X_test_fold = embedding_train[train], embedding_train[test]

            if use_tfidf:
                # Fit the vectoriser on the training split only, so the
                # vocabulary and IDF weights never see the held-out texts.
                vectorizer = Pipeline([
                      ('vect', CountVectorizer(lowercase=False, stop_words = spanish_stopwords)),
                      ('tfidf', TfidfTransformer(use_idf = True))])
                X_train_fold = vectorizer.fit_transform(X_train_fold)
                X_test_fold = vectorizer.transform(X_test_fold)

            model = MultiOutputClassifier(classifier(**args))
            model.fit(X_train_fold, Y_train_fold)

            # Save with joblib
            # NOTE(review): in tf-idf mode only the classifier is saved; the
            # fitted vectoriser is discarded, so the dumped model alone cannot
            # score raw text.
            dump_model_joblib(model, f'Models/Fold_{fold_index + 1}_{model_name}_{mode}.joblib')

            try:
                # predict_proba yields one (n_samples, n_classes) array per
                # label; regroup per sample and keep the column at index 1.
                per_label_proba = model.predict_proba(X_test_fold)
                frames_probability = [[a[1] for a in x] for x in zip(*per_label_proba)]
                y_pred = [[int(a > 0.5) for a in x] for x in frames_probability]
            except (AttributeError, ValueError):
                # The estimator exposes no probabilities: fall back to hard
                # predictions and reuse them as the "probability" column.
                # The original called .tolist() on an already-converted list
                # here and therefore always failed on this path.
                y_pred = model.predict(X_test_fold).tolist()
                frames_probability = [list(row) for row in y_pred]

            # Named fold_df (not df) to avoid shadowing the module-level dataset.
            fold_df = pd.DataFrame()
            fold_df["y_pred"] = y_pred
            fold_df["y_prob"] = frames_probability
            fold_df["y_true"] = Y_test_fold.tolist()
            print(f'Folds {fold_index + 1}')
            data = process_folds([fold_df])
            # A single fold has no spread, so an all-zero frame is passed
            # where the final report passes the standard deviation.
            build_report(pd.DataFrame(data.mean()).T, data.applymap(lambda x:0), mode)

            datasets.append(fold_df.copy())

        return datasets

    datasets = run_model(SVC, args = args)

    print("Finals results")

    # Aggregate over all folds: mean and standard deviation of each metric.
    data = process_folds(datasets)
    build_report(pd.DataFrame(data.mean()).T, pd.DataFrame(data.std()).T, mode)

    with open(f"Results/cross_validation#{model_name}#{mode}.pickle", "wb") as file:
        pickle.dump(datasets, file)

In [16]:
# SVC with its default kernel and probability estimates enabled, on TF-IDF features.
cross_validation(mode="tf-idf", model_name="SVM", args={"probability": True})

tf-idf
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.78(±0.00)       0.79(±0.00)       0.77(±0.00)
               Macro       0.76(±0.00)       0.69(±0.00)       0.71(±0.00)       0.73(±0.00)       0.84(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.77(±0.00)       0.71(±0.00)       0.72(±0.00)       0.69(±0.00)       0.84(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.82(±0.00)       0.83(±0.00)       0.82(±0.00)
               Macro       0.80(±0.00)       0.74(±0.00)       0.76(±

In [17]:
# SVC with its default kernel and probability estimates enabled, on the "elmo" embedding.
cross_validation(mode="elmo", model_name="SVM", args={"probability": True})

elmo
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.75(±0.00)       0.76(±0.00)       0.75(±0.00)
               Macro       0.71(±0.00)       0.66(±0.00)       0.68(±0.00)       0.64(±0.00)       0.79(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.78(±0.00)       0.77(±0.00)
               Macro       0.74(±0.00)       0.68(±0.00)       0.69(±0.00)       0.66(±0.00)       0.81(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.77(±0.00)       0.69(±0.00)       0.71(±0.

In [18]:
# SVC with its default kernel and probability estimates enabled, on the "beto_embedding_cls" embedding.
cross_validation(mode="beto_embedding_cls", model_name="SVM", args={"probability": True})

beto_embedding_cls
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.78(±0.00)       0.76(±0.00)
               Macro       0.74(±0.00)       0.67(±0.00)       0.69(±0.00)       0.66(±0.00)       0.82(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.75(±0.00)       0.71(±0.00)       0.72(±0.00)       0.69(±0.00)       0.82(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.77(±0.00)       0.69(±0.00) 

In [19]:
# SVC with its default kernel and probability estimates enabled, on the "beto_embedding_mean" embedding.
cross_validation(mode="beto_embedding_mean", model_name="SVM", args={"probability": True})

beto_embedding_mean
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.78(±0.00)       0.79(±0.00)       0.78(±0.00)
               Macro       0.75(±0.00)       0.70(±0.00)       0.71(±0.00)       0.68(±0.00)       0.82(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.78(±0.00)       0.79(±0.00)       0.78(±0.00)
               Macro       0.74(±0.00)       0.71(±0.00)       0.72(±0.00)       0.69(±0.00)       0.83(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.80(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.77(±0.00)       0.71(±0.00)

In [20]:
# SVC with its default kernel and probability estimates enabled, on the "fasttext" embedding.
cross_validation(mode="fasttext", model_name="SVM", args={"probability": True})

fasttext
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.78(±0.00)       0.79(±0.00)       0.78(±0.00)
               Macro       0.76(±0.00)       0.70(±0.00)       0.72(±0.00)       0.70(±0.00)       0.83(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.78(±0.00)       0.79(±0.00)       0.79(±0.00)
               Macro       0.75(±0.00)       0.71(±0.00)       0.72(±0.00)       0.69(±0.00)       0.83(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.77(±0.00)       0.71(±0.00)       0.73

##  SVM Linear

In [21]:
# Linear-kernel SVC with probability estimates enabled, on TF-IDF features.
cross_validation(
    mode="tf-idf",
    model_name="SVM_Linear",
    args={"probability": True, "kernel": "linear"},
)

tf-idf
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.78(±0.00)       0.71(±0.00)       0.73(±0.00)       0.71(±0.00)       0.83(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.76(±0.00)       0.70(±0.00)       0.72(±0.00)       0.69(±0.00)       0.82(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.81(±0.00)       0.82(±0.00)       0.81(±0.00)
               Macro       0.79(±0.00)       0.73(±0.00)       0.75(±

In [22]:
# Linear-kernel SVC with probability estimates enabled, on the "elmo" embedding.
cross_validation(
    mode="elmo",
    model_name="SVM_Linear",
    args={"probability": True, "kernel": "linear"},
)

elmo
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.78(±0.00)       0.76(±0.00)
               Macro       0.74(±0.00)       0.66(±0.00)       0.68(±0.00)       0.64(±0.00)       0.79(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.78(±0.00)       0.79(±0.00)       0.77(±0.00)
               Macro       0.74(±0.00)       0.68(±0.00)       0.69(±0.00)       0.65(±0.00)       0.81(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.80(±0.00)       0.80(±0.00)       0.79(±0.00)
               Macro       0.78(±0.00)       0.69(±0.00)       0.71(±0.

In [23]:
# Linear-kernel SVC with probability estimates enabled, on the "beto_embedding_cls" embedding.
cross_validation(
    mode="beto_embedding_cls",
    model_name="SVM_Linear",
    args={"probability": True, "kernel": "linear"},
)

beto_embedding_cls
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.74(±0.00)       0.76(±0.00)       0.73(±0.00)
               Macro       0.71(±0.00)       0.62(±0.00)       0.62(±0.00)       0.57(±0.00)       0.75(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.76(±0.00)       0.78(±0.00)       0.75(±0.00)
               Macro       0.73(±0.00)       0.64(±0.00)       0.65(±0.00)       0.62(±0.00)       0.78(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.73(±0.00)       0.76(±0.00)       0.73(±0.00)
               Macro       0.69(±0.00)       0.61(±0.00) 

In [24]:
# Linear-kernel SVC with probability estimates enabled, on the "beto_embedding_mean" embedding.
cross_validation(
    mode="beto_embedding_mean",
    model_name="SVM_Linear",
    args={"probability": True, "kernel": "linear"},
)

beto_embedding_mean
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.78(±0.00)       0.76(±0.00)
               Macro       0.75(±0.00)       0.67(±0.00)       0.69(±0.00)       0.65(±0.00)       0.80(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.78(±0.00)       0.77(±0.00)
               Macro       0.73(±0.00)       0.68(±0.00)       0.70(±0.00)       0.66(±0.00)       0.80(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.78(±0.00)       0.79(±0.00)       0.77(±0.00)
               Macro       0.76(±0.00)       0.68(±0.00)

In [25]:
# Linear-kernel SVC with probability estimates enabled, on the "fasttext" embedding.
cross_validation(
    mode="fasttext",
    model_name="SVM_Linear",
    args={"probability": True, "kernel": "linear"},
)

fasttext
Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.78(±0.00)       0.76(±0.00)
               Macro       0.74(±0.00)       0.68(±0.00)       0.69(±0.00)       0.67(±0.00)       0.82(±0.00)
        
Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.79(±0.00)       0.77(±0.00)
               Macro       0.73(±0.00)       0.69(±0.00)       0.70(±0.00)       0.67(±0.00)       0.82(±0.00)
        
Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.78(±0.00)       0.79(±0.00)       0.78(±0.00)
               Macro       0.75(±0.00)       0.68(±0.00)       0.70