In [1]:
import numpy as np
import pandas as pd
from skmultilearn.model_selection import IterativeStratification
from auxiliar_functions import process_folds, build_report, load_dataset, predict_deep, load_embedding
from IPython.display import clear_output
import pickle
import torch 
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import BCEWithLogitsLoss
from torch.utils.tensorboard import SummaryWriter
from AsymmetricLoss import AsymmetricLossOptimized
from torch.utils.data import Dataset
from sklearn.metrics import precision_score, recall_score, accuracy_score
clear_output()

In [2]:
df = load_dataset("preprocess_dataset.npy")
Y_true = np.array([np.array(x) for x in df.frames])

df.head()

Unnamed: 0,original_text,preprocess_text,encoded,frames,conflicto,economico,humanidad,moral
0,Japón registró un nuevo déficit comercial réco...,japón registró un nuevo déficit comercial réco...,"[8759, 8914, 9989, 9898, 6584, 8773, 8428, 999...","[0, 1, 0, 0]",0,1,0,0
1,"UDI acusa ""mala memoria"" de la Nueva Mayoría f...",udi acusa mala memoria de la nueva mayoría fre...,"[9610, 8486, 8448, 7205, 10001, 9999, 9927, 97...","[1, 0, 0, 1]",1,0,0,1
2,La misteriosa oferta por Esteban Paredes que i...,la misteriosa oferta por esteban paredes que [...,"[9999, 1121, 8346, 9990, 8487, 8596, 9996, 1, ...","[1, 0, 0, 0]",1,0,0,0
3,La familia maratón que causó revuelo en Holand...,la familia maratón que causó revuelo en holand...,"[9999, 9668, 5417, 9996, 7388, 2016, 9997, 887...","[0, 0, 1, 0]",0,0,1,0
4,Crean sitio web que recopila mangas descontin...,crean sitio web que [UNK] [UNK] [UNK] para [UN...,"[2420, 9319, 9360, 9996, 1, 1, 1, 9985, 1, 998...","[0, 1, 0, 0]",0,1,0,0


In [3]:
vocab = [x.strip().split(": ")[0] for x in open("vocabulary_corpus_counter.txt", encoding="UTF-8").readlines()]

print(vocab[:5], vocab[-5:], len(vocab))

['plantaciones', 'lamentando', 'cerrados', 'alessandro', 'consejero'] ['en', 'el', 'la', '[NUM]', 'de'] 10000


In [4]:
#creating vocabulary
vocab_to_idx = {"[PAD]":0, "[UNK]":1}
WORDS = ["[PAD]", "[UNK]"]
for word in vocab:
    vocab_to_idx[word] = len(WORDS)
    WORDS.append(word)

In [5]:
ORDER = ['conflicto', 'economico', 'humanidad', 'moral']

class MyTextDataset(Dataset):
    def __init__(self, X, Y=None):
        self.texts = [torch.LongTensor(x) for x in X]
        self.len = len(X)
        if Y is not None:
            self.frames = Y
        else:
            self.frames = [np.array([0]*4) for _ in range(self.len)]
    
    def __len__(self):
        return self.len
    
    def __getitem__(self, item):
        x = self.texts[item]
        y = self.frames[item]
        return x, y
    
    
def my_collate(data_list):
    data_list.sort(key=lambda x: len(x[0]), reverse=True)
    X_seq, Y = zip(*data_list)
    lengths = [len(x) for x in X_seq]
    X = torch.nn.utils.rnn.pad_sequence(X_seq, batch_first=True)
    return ((X, lengths), torch.Tensor(Y))


def calculate_pres_recall(preds, Y):
    pres_class = [0] * Y.shape[1]
    recall_class = [0] * Y.shape[1]
    acc_class = [0] * Y.shape[1]

    all_y_pred = []
    all_y_true = []
    for i in range(Y.shape[1]):
        y_pred = [int(pred[i]) for pred in preds]
        y_true = [int(target[i]) for target in Y]

        all_y_pred.extend(y_pred)
        all_y_true.extend(y_true)

        pres_class[i] = precision_score(y_true, y_pred, zero_division=0)
        recall_class[i] = recall_score(y_true, y_pred, zero_division=0)
        acc_class[i] = accuracy_score(y_true, y_pred)

    mean_pres = precision_score(all_y_true, all_y_pred, zero_division=0)
    mean_recall = recall_score(all_y_true, all_y_pred, zero_division=0)
    mean_acc = accuracy_score(all_y_true, all_y_pred)

    return mean_pres, mean_recall, mean_acc, pres_class, recall_class, acc_class 


def save_data(writer, all_logits, Y, total_loss, loss_class,
              total_muestras, fold_index, epoch, step):
    if writer is None:
        return

    loss = (total_loss/total_muestras).item()
    pres, recall, acc, pres_class, recall_class, acc_class = calculate_pres_recall(all_logits, Y)

    writer.add_scalar(f'Fold_{fold_index}/loss_{step}', loss, epoch)
    writer.add_scalar(f'Fold_{fold_index}/recall_{step}', recall, epoch)
    writer.add_scalar(f'Fold_{fold_index}/presicion_{step}', pres, epoch)
    writer.add_scalar(f'Fold_{fold_index}/acc_{step}', acc, epoch)

    for i in range(len(loss_class)):
        loss_class_train = loss_class[i]/(total_muestras/Y.shape[1])
        writer.add_scalar(f'Fold_{fold_index}/loss_class_{ORDER[i]}_{step}', loss_class_train, epoch)
        writer.add_scalar(f'Fold_{fold_index}/presicion_{ORDER[i]}_{step}', pres_class[i], epoch)
        writer.add_scalar(f'Fold_{fold_index}/recall_{ORDER[i]}_{step}', recall_class[i], epoch)
        writer.add_scalar(f'Fold_{fold_index}/acc_{ORDER[i]}_{step}', acc_class[i], epoch)

In [12]:
class BiLSTM(torch.nn.Module):
    def __init__(self, embedding_size, hidden_dim_lstm=50, hidden_size=50,
                 n_epochs=30, process_output="max", batch_size=1, 
                 loss_function="cross-entropy"):
        super().__init__()
        
        self.n_epochs = n_epochs
        self.hidden_dim_lstm = hidden_dim_lstm
        self.process_output = process_output
        self.batch_size = batch_size
        self.loss_object = AsymmetricLossOptimized()
        if loss_function == "cross-entropy":
            self.loss_object = torch.nn.BCEWithLogitsLoss()
    
    
        self.embedding = torch.nn.Embedding(len(WORDS), embedding_size, padding_idx=0)
        self.lstm = torch.nn.LSTM(embedding_size, hidden_dim_lstm, 
                                  num_layers=2, bidirectional=True,
                                  batch_first=True)
        
        self.fc1 = torch.nn.Linear(hidden_dim_lstm*2, hidden_size)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.5)
        self.fc2 = torch.nn.Linear(hidden_size, 4)
        self.sigmoid = torch.nn.Sigmoid()    
        
    def forward(self, x):
        output = self.get_embedding(x)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.dropout(output)
        output = self.fc2(output)
        return output
    
    def get_embedding(self, x):
        x, lengths = x
        output = self.embedding(x)
        
        output = torch.nn.utils.rnn.pack_padded_sequence(output, lengths, batch_first=True)
        lstm_out, (hidden, _) = self.lstm(output) # shape = (batch_size, seq_len, hidden_dim*2)
        lstm_out, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True, padding_value=0.0)
        output = lstm_out
        
        if self.process_output == "max":    
            output, _ = torch.max(output, 1) # shape = (batch_size, hidden_dim*2)
        elif self.process_output == "mean":
            mask = output != 0.0
            output = (output*mask).sum(dim=1)/mask.sum(dim=1)
        else: # last_state       # layers # direction (2) # batch size, # hidden_dim
            hidden = hidden.view(2, 2, len(lengths), self.hidden_dim_lstm)[-1]
            output = torch.cat([hidden[0], hidden[1]], 1)
            
        return output
    
    def get_proba(self, x):
        return self.sigmoid(self(x))
        
        
    def fit(self, X, Y, X_val=None, Y_val=None, writer=None, fold_index=None):
        model = self.cuda()
        
        ds_train = MyTextDataset(X, Y)
        train_dl = DataLoader(ds_train, batch_size=self.batch_size, shuffle=True, collate_fn=my_collate)
        dataloaders = {"train": train_dl}
        
        if X_val is not None:
            ds_val = MyTextDataset(X_val, Y_val)    
            dataloaders["val"] = DataLoader(ds_val, batch_size=self.batch_size, collate_fn=my_collate)
        
        loss_object = self.loss_object
        optimizer = AdamW(model.parameters(), lr=0.001)  
        
        for epoch in range(self.n_epochs):
            for step in dataloaders:
                if step == "train":
                    model.train()
                else:
                    model.eval()
                    
                total_muestras = 0.0
                total_correctas = 0.0
                total_loss = 0
                loss_class = [0, 0, 0, 0]
                all_logits = None
                all_target = None

                for (x, length), target in dataloaders[step]:
                    optimizer.zero_grad()                    
                    
                    x = x.cuda()
                    target = target.cuda()                   
                    logits = model((x, length))          

                    loss = loss_object(logits, target)
                    total_loss += loss * (target.shape[0] * target.shape[1])

                    for i in range(target.shape[1]):
                        loss_c = loss_object(logits[:, i], target[:, i])
                        loss_class[i] += loss_c * target.shape[0]

                    preds = logits >= 0.0
                    if all_logits is None:
                        all_logits = preds.detach().cpu().numpy()
                        all_target = target.detach().cpu().numpy()
                    else:
                        all_logits = np.append(all_logits, preds.detach().cpu().numpy(), axis=0)
                        all_target = np.append(all_target, target.detach().cpu().numpy(), axis=0)

                    total_muestras += (target.shape[0]*target.shape[1])
                    
                    if step == "train":
                        loss.backward()                             
                        optimizer.step()                            
                        correctas = (preds == target).sum().item() 
                        total_correctas += correctas               
                        accuracy = total_correctas/total_muestras 
                        
                        print("\rEpoca {}: Loss: {:.4f} Accuracy: {:.2f}%".format(epoch, loss, 100*accuracy),
                              end="")

                save_data(writer, all_logits, all_target, total_loss, loss_class,
                          total_muestras, fold_index, epoch, step)
                 
                    
    def predict_proba(self, X):
        model = self.cuda()
        model.eval()
        
        ds_test = MyTextDataset(X)    
        test_dl = DataLoader(ds_test, batch_size=1, collate_fn=my_collate)
        all_probs = None
        for (x, length), _ in test_dl:
            x = x.cuda()                
            probs = self.get_proba((x, length))

            if all_probs is None:
                all_probs = probs.detach().cpu().numpy()
            else:
                all_probs = np.append(all_probs, probs.detach().cpu().numpy(), axis=0)

        return all_probs 

In [15]:
def cross_validation(args, name):
    writer = SummaryWriter('runs/' + name)
    
    embedding_train = np.array([np.array(x) for x in df.encoded.values], dtype=object)
    
    np.random.seed(4444)
    torch.manual_seed(4444)

    datasets = []
    k_fold = IterativeStratification(n_splits=10, order=1)

    for fold_index, (train, test) in enumerate(k_fold.split(embedding_train, Y_true)):

        Y_train_fold, Y_test_fold = Y_true[train], Y_true[test]
        X_train_fold, X_test_fold = embedding_train[train], embedding_train[test]

        model = BiLSTM(**args)
        model.fit(X_train_fold, Y_train_fold, X_test_fold, Y_test_fold, writer, fold_index)
        torch.save(model.state_dict(), f'Models/Fold_{fold_index+1}_{name}.model')

        frames_probability = model.predict_proba(X_test_fold).tolist()
        y_pred = [[int(pred >= 0.5) for pred in frames] for frames in frames_probability]

        df_result = pd.DataFrame()
        df_result["y_pred"] = y_pred
        df_result["y_prob"] = frames_probability
        df_result["y_true"] = Y_test_fold.tolist()
        print(f'Folds {fold_index + 1}')
        data = process_folds([df_result])
        build_report(pd.DataFrame(data.mean()).T, data.applymap(lambda x:0), "BiLSTM")

        datasets.append(df_result.copy())
    
    if writer:
        writer.close()
    
    print("Finals results")
    data = process_folds(datasets)
    build_report(pd.DataFrame(data.mean()).T, pd.DataFrame(data.std()).T, "BiLSTM")
    
    with open(f"Results/cross_validation_{name}.pickle", "wb") as file:
        pickle.dump(datasets, file)


In [16]:
args = {'embedding_size': 200, 'hidden_size': 200, 'hidden_dim_lstm': 300,
        'n_epochs': 6, 'process_output': 'max', "batch_size": 32,
        "loss_function": "cross-entropy"}
    
cross_validation(args, f'Bi-LSTM')

Epoca 5: Loss: 0.2877 Accuracy: 83.63%Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.76(±0.00)       0.74(±0.00)
               Macro       0.72(±0.00)       0.68(±0.00)       0.67(±0.00)       0.63(±0.00)       0.78(±0.00)
        
Epoca 5: Loss: 0.3467 Accuracy: 84.15%Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.77(±0.00)       0.76(±0.00)
               Macro       0.71(±0.00)       0.70(±0.00)       0.69(±0.00)       0.61(±0.00)       0.79(±0.00)
        
Epoca 5: Loss: 0.3823 Accuracy: 84.68%Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.76(±0.00

In [17]:
args = {'embedding_size': 200, 'hidden_size': 200, 'hidden_dim_lstm': 300,
        'n_epochs': 6, 'process_output': 'max', "batch_size": 32,
        "loss_function": "AsymetricLoss"}
    
cross_validation(args, f'Bi-LSTM_AsymetricLoss')

Epoca 5: Loss: 0.8863 Accuracy: 79.71%%Folds 1

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.76(±0.00)       0.70(±0.00)       0.71(±0.00)
               Macro       0.68(±0.00)       0.71(±0.00)       0.67(±0.00)       0.63(±0.00)       0.78(±0.00)
        
Epoca 5: Loss: 6.6499 Accuracy: 76.02%%Folds 2

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.77(±0.00)       0.70(±0.00)       0.69(±0.00)
               Macro       0.69(±0.00)       0.70(±0.00)       0.65(±0.00)       0.59(±0.00)       0.78(±0.00)
        
Epoca 5: Loss: 7.0524 Accuracy: 80.07%%Folds 3

        Mean                
                            precision           recall         f1-score              AUC          ROC AUC
        
               Micro       0.79(±0