In [1]:
import os
import pickle

import nltk
import numpy as np
import pandas as pd
import torch
from IPython.display import clear_output
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from skmultilearn.model_selection import IterativeStratification
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from auxiliar_functions import (
    save_data,
    process_folds,
    build_report,
    load_dataset,
    predict_deep,
    load_embedding,
)

# Make sure the NLTK stop-word corpus is present locally before reading it.
nltk.download('stopwords')

# Spanish stop words plus the literal token "UNK"; this list is later handed to
# CountVectorizer(stop_words=...) in the tf-idf branch of cross_validation.
spanish_stopwords = [*nltk.corpus.stopwords.words('spanish'), "UNK"]

# Wipe the cell output (download log) so the notebook stays readable.
clear_output()

In [2]:
# Load the preprocessed corpus through the project helper (its implementation
# is not visible in this notebook).
df = load_dataset("preprocess_dataset.npy")

# Turn the per-document label lists stored in `frames` into one array with a
# row per document; this is the multi-label target used by cross_validation.
Y_true = np.array(list(map(np.array, df.frames)))

df.head()

Unnamed: 0,original_text,preprocess_text,encoded,frames,conflicto,economico,humanidad,moral
0,Japón registró un nuevo déficit comercial réco...,japón registró un nuevo déficit comercial réco...,"[8759, 8914, 9989, 9898, 6584, 8773, 8428, 999...","[0, 1, 0, 0]",0,1,0,0
1,"UDI acusa ""mala memoria"" de la Nueva Mayoría f...",udi acusa mala memoria de la nueva mayoría fre...,"[9610, 8486, 8448, 7205, 10001, 9999, 9927, 97...","[1, 0, 0, 1]",1,0,0,1
2,La misteriosa oferta por Esteban Paredes que i...,la misteriosa oferta por esteban paredes que [...,"[9999, 1121, 8346, 9990, 8487, 8596, 9996, 1, ...","[1, 0, 0, 0]",1,0,0,0
3,La familia maratón que causó revuelo en Holand...,la familia maratón que causó revuelo en holand...,"[9999, 9668, 5417, 9996, 7388, 2016, 9997, 887...","[0, 0, 1, 0]",0,0,1,0
4,Crean sitio web que recopila mangas descontin...,crean sitio web que [UNK] [UNK] [UNK] para [UN...,"[2420, 9319, 9360, 9996, 1, 1, 1, 9985, 1, 998...","[0, 1, 0, 0]",0,1,0,0


In [8]:
class Layer(torch.nn.Module):
    """Single hidden block of the MLP: Linear -> ReLU -> Dropout(p=0.5).

    Args:
        layer_input_size: number of input features of the linear map.
        hidden_size: number of output features of the linear map.
    """

    def __init__(self, layer_input_size, hidden_size):
        super().__init__()
        # Sub-module attribute names determine the state_dict keys
        # ("fc.weight", "fc.bias"), so they are kept as-is.
        self.fc = torch.nn.Linear(layer_input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, x):
        """Apply the three stages in order and return the result."""
        activations = x
        for stage in (self.fc, self.relu, self.dropout):
            activations = stage(activations)
        return activations
    
    
class MLP(torch.nn.Module):
    """Feed-forward multi-label classifier with a scikit-learn-like fit/predict API.

    The network is a stack of ``Layer`` blocks (one per entry of
    ``hidden_sizes``) followed by a linear layer that emits one logit per
    label. With ``hidden_sizes=[]`` it reduces to a single linear layer.

    Args:
        input_size: number of input features.
        hidden_sizes: output width of each hidden ``Layer``, in order.
        n_epochs: number of passes over the training data in ``fit``.
        batch_size: mini-batch size used by ``fit`` and ``predict_proba``.
        n_labels: number of output labels (logits). Defaults to 4.
        device: anything accepted by ``torch.device``. When omitted, GPU
            index 1 is used if at least two GPUs are visible (the setting
            this notebook was originally pinned to), otherwise GPU 0,
            otherwise the CPU.
    """

    def __init__(self, input_size, hidden_sizes=(50, 150), n_epochs=30, batch_size=32,
                 n_labels=4, device=None):
        super().__init__()
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.n_labels = n_labels
        self.device = self._resolve_device(device)

        self.layers = torch.nn.ModuleList()
        layer_input_size = input_size
        for hidden_size in hidden_sizes:
            self.layers.append(Layer(layer_input_size, hidden_size))
            layer_input_size = hidden_size

        self.final_fc = torch.nn.Linear(layer_input_size, n_labels)
        self.sigmoid = torch.nn.Sigmoid()

    @staticmethod
    def _resolve_device(device=None):
        """Return the requested device, or pick a default that exists on this host."""
        if device is not None:
            return torch.device(device)
        if torch.cuda.is_available():
            return torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
        return torch.device("cpu")

    @staticmethod
    def _as_float_tensor(values):
        """Convert array-like input to a float32 CPU tensor."""
        if torch.is_tensor(values):
            return values.detach().to(device="cpu", dtype=torch.float32)
        return torch.as_tensor(np.asarray(values), dtype=torch.float32)

    def forward(self, x):
        """Return the raw logits (no sigmoid) for a batch ``x``."""
        output = x
        for layer in self.layers:
            output = layer(output)
        return self.final_fc(output)

    def get_proba(self, x):
        """Return per-label probabilities, i.e. the sigmoid of the logits."""
        return self.sigmoid(self(x))

    def fit(self, X, Y, X_val=None, Y_val=None, writer=None, fold_index=None):
        """Train with AdamW (lr=0.001) on a binary cross-entropy-with-logits loss.

        After every epoch, and separately for the "train" and (if given) "val"
        split, the thresholded predictions, targets and accumulated losses are
        passed to the project helper ``save_data`` together with ``writer``,
        ``fold_index``, the epoch and the split name. What ``save_data`` does
        with them is not visible in this notebook.

        Args:
            X, Y: training features and 0/1 label matrix (one column per label).
            X_val, Y_val: optional validation split, evaluated each epoch.
            writer: forwarded untouched to ``save_data``.
            fold_index: forwarded untouched to ``save_data``.
        """
        model = self.to(self.device)

        train_ds = TensorDataset(self._as_float_tensor(X), self._as_float_tensor(Y))
        dataloaders = {
            "train": DataLoader(train_ds, batch_size=self.batch_size, shuffle=True),
        }
        if X_val is not None:
            val_ds = TensorDataset(self._as_float_tensor(X_val), self._as_float_tensor(Y_val))
            dataloaders["val"] = DataLoader(val_ds, batch_size=self.batch_size)

        loss_object = torch.nn.BCEWithLogitsLoss()
        optimizer = AdamW(model.parameters(), lr=0.001)

        for epoch in range(self.n_epochs):
            for step, dataloader in dataloaders.items():
                is_train = step == "train"
                model.train(is_train)

                total_data = 0.0
                total_corrects = 0.0
                total_loss = 0
                loss_class = [0] * self.n_labels
                pred_batches = []
                target_batches = []

                for x, target in dataloader:
                    x = x.to(self.device)
                    target = target.to(self.device)
                    batch_len = target.shape[0]

                    # Only build an autograd graph when we are going to backprop.
                    with torch.set_grad_enabled(is_train):
                        logits = model(x)
                        loss = loss_object(logits, target)

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                    # Bookkeeping must not keep the graph alive: accumulate
                    # detached tensors (same values, same tensor type as before).
                    with torch.no_grad():
                        # The mean loss is re-weighted by the element count so
                        # the running sum is independent of the batch size.
                        total_loss += loss.detach() * target.numel()
                        for i in range(self.n_labels):
                            loss_class[i] += loss_object(logits[:, i], target[:, i]) * batch_len

                    # A logit >= 0 is equivalent to a probability >= 0.5.
                    preds = logits >= 0.0
                    pred_batches.append(preds.detach().cpu().numpy())
                    target_batches.append(target.detach().cpu().numpy())

                    total_data += target.numel()

                    if is_train:
                        total_corrects += (preds == target).sum().item()
                        accuracy = total_corrects / total_data
                        print("\rEpoch {}: Loss: {:.4f} Accuracy: {:.2f}%".format(
                            epoch, loss.item(), 100 * accuracy), end="")

                # One concatenation per epoch instead of re-copying on every batch.
                all_logits = np.concatenate(pred_batches, axis=0) if pred_batches else None
                all_target = np.concatenate(target_batches, axis=0) if target_batches else None

                save_data(writer, all_logits, all_target, total_loss, loss_class,
                          total_data, fold_index, epoch, step)

    def predict_proba(self, X):
        """Return an array of shape (n_samples, n_labels) with per-label probabilities.

        The model is switched to eval mode (dropout disabled) and inference runs
        without gradient tracking. Empty input yields an empty (0, n_labels) array.
        """
        model = self.to(self.device)
        model.eval()

        test_dl = DataLoader(TensorDataset(self._as_float_tensor(X)), batch_size=self.batch_size)
        prob_batches = []
        with torch.no_grad():
            for (x,) in test_dl:
                probs = model.get_proba(x.to(self.device))
                prob_batches.append(probs.cpu().numpy())

        if not prob_batches:
            return np.empty((0, self.n_labels), dtype=np.float32)
        return np.concatenate(prob_batches, axis=0)
    

In [11]:
def cross_validation(mode, model_name, args):
    """Run 10-fold iteratively-stratified cross-validation of an MLP.

    For every fold an ``MLP`` is trained on the training split, its weights
    are written to ``Models/``, and the test-split predictions are collected.
    After the last fold the per-fold results are summarised through the
    project helpers ``process_folds`` / ``build_report`` and the raw per-fold
    DataFrames are pickled to ``Results/``.

    Args:
        mode: name of the input representation. If it contains ``"tf-idf"``,
            a CountVectorizer + TfidfTransformer pipeline is fitted on each
            training fold of ``df.preprocess_text``; otherwise the features
            come from ``load_embedding(mode, "dataset")``.
        model_name: label used in the report and in the output file names.
        args: keyword arguments for ``MLP`` (``input_size`` is filled in per
            fold). The dict passed by the caller is not modified.

    Relies on the notebook globals ``df``, ``Y_true`` and ``spanish_stopwords``.
    """
    print(f"{'#'*50}\n\n{mode}\n\n{'#'*50}")

    # Create the output folders up front so a missing directory cannot make
    # the run fail only after a whole fold has been trained.
    os.makedirs("Models", exist_ok=True)
    os.makedirs("Results", exist_ok=True)

    use_tf_idf = "tf-idf" in mode
    if use_tf_idf:
        embedding_train = df.preprocess_text.values
    else:
        embedding_train = load_embedding(mode, "dataset")

    # TensorBoard logging is switched off; to enable it use
    # SummaryWriter(f"runs/{model_name}_{mode}")
    writer = None

    # Seed NumPy and torch so repeated runs start from the same RNG state.
    np.random.seed(4444)
    torch.manual_seed(4444)

    datasets = []
    k_fold = IterativeStratification(n_splits=10, order=1)

    for fold_index, (train, test) in enumerate(k_fold.split(embedding_train, Y_true)):
        print(f'\nFolds {fold_index + 1}')

        Y_train_fold, Y_test_fold = Y_true[train], Y_true[test]
        X_train_fold, X_test_fold = embedding_train[train], embedding_train[test]

        if use_tf_idf:
            # The vocabulary and IDF weights are fitted on the training fold
            # only, so nothing from the test fold leaks into the features.
            tf_idf_pipeline = Pipeline([
                ('vect', CountVectorizer(lowercase=False, stop_words=spanish_stopwords)),
                ('tfidf', TfidfTransformer(use_idf=True)),
            ])

            tf_idf_pipeline.fit(X_train_fold)
            X_train_fold = tf_idf_pipeline.transform(X_train_fold).toarray()
            X_test_fold = tf_idf_pipeline.transform(X_test_fold).toarray()

        # Build a fresh kwargs dict instead of writing into the caller's `args`.
        model_args = {**args, "input_size": X_train_fold.shape[1]}

        model = MLP(**model_args)
        model.fit(X_train_fold, Y_train_fold, X_test_fold, Y_test_fold, writer, fold_index)
        torch.save(model.state_dict(), f'Models/Fold_{fold_index+1}_{model_name}_{mode}.model')

        probabilities = model.predict_proba(X_test_fold)
        frames_probability = probabilities.tolist()
        # A label is predicted when its probability reaches 0.5.
        y_pred = (probabilities >= 0.5).astype(int).tolist()

        df_r = pd.DataFrame({
            "y_pred": y_pred,
            "y_prob": frames_probability,
            "y_true": Y_test_fold.tolist(),
        })

        # NOTE(review): the result of this per-fold call was never used. The
        # call is kept because process_folds is not visible here and may have
        # side effects -- TODO confirm and drop it if it is pure.
        process_folds([df_r])
        datasets.append(df_r.copy())

    if writer is not None:
        writer.close()

    data = process_folds(datasets)
    build_report(pd.DataFrame(data.mean()).T, pd.DataFrame(data.std()).T, model_name)

    with open(f"Results/cross_validation#{model_name}#{mode}.pickle", "wb") as file:
        pickle.dump(datasets, file)
        

In [12]:
# Baseline: with no hidden layers the MLP is a single linear layer, trained
# for 70 epochs on each input representation in turn.
for embedding in ("tf-idf", "fasttext", "elmo", "beto_embedding_mean", "beto_embedding_cls"):
    cross_validation(embedding, "MLP-LR", dict(hidden_sizes=[], n_epochs=70))

##################################################

tf-idf

##################################################

Folds 1
Epoch 69: Loss: 0.2261 Accuracy: 96.40%
Folds 2
Epoch 69: Loss: 0.1678 Accuracy: 96.39%
Folds 3
Epoch 69: Loss: 0.1981 Accuracy: 96.52%
Folds 4
Epoch 69: Loss: 0.1972 Accuracy: 96.46%
Folds 5
Epoch 69: Loss: 0.2190 Accuracy: 96.42%
Folds 6
Epoch 69: Loss: 0.1675 Accuracy: 96.50%
Folds 7
Epoch 69: Loss: 0.1918 Accuracy: 96.38%
Folds 8
Epoch 69: Loss: 0.1699 Accuracy: 96.38%
Folds 9
Epoch 69: Loss: 0.2160 Accuracy: 96.39%
Folds 10
Epoch 69: Loss: 0.1701 Accuracy: 96.50%
        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.02)       0.80(±0.02)       0.79(±0.02)
               Macro       0.78(±0.03)       0.69(±0.03)       0.72(±0.03)       0.69(±0.04)       0.83(±0.03)
        
##################################################

fasttext

######

In [13]:
# One hidden layer of 150 units, trained for 5 epochs on each input
# representation in turn.
for embedding in ("tf-idf", "fasttext", "elmo", "beto_embedding_mean", "beto_embedding_cls"):
    cross_validation(embedding, "MLP-1", dict(hidden_sizes=[150], n_epochs=5))

##################################################

tf-idf

##################################################

Folds 1
Epoch 4: Loss: 0.3688 Accuracy: 88.73%
Folds 2
Epoch 4: Loss: 0.2439 Accuracy: 89.26%
Folds 3
Epoch 4: Loss: 0.3040 Accuracy: 88.49%
Folds 4
Epoch 4: Loss: 0.3074 Accuracy: 88.91%
Folds 5
Epoch 4: Loss: 0.2632 Accuracy: 89.07%
Folds 6
Epoch 4: Loss: 0.2446 Accuracy: 88.62%
Folds 7
Epoch 4: Loss: 0.2019 Accuracy: 88.56%
Folds 8
Epoch 4: Loss: 0.3013 Accuracy: 88.61%
Folds 9
Epoch 4: Loss: 0.2361 Accuracy: 88.86%
Folds 10
Epoch 4: Loss: 0.3095 Accuracy: 88.38%
        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0.02)       0.80(±0.02)       0.79(±0.02)
               Macro       0.77(±0.03)       0.69(±0.03)       0.71(±0.03)       0.69(±0.04)       0.83(±0.02)
        
##################################################

fasttext

################