# Multilayer Perceptron (MLP)

In [1]:
import sys
import os

# Add the root directory (two levels above the current working directory)
# to the module search path so that project packages can be imported.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    # Only register the path once, so re-running this cell does not add duplicates.
    sys.path.append(module_path)

In [None]:
from src.trainers.utils import build_datasets, save_metrics, set_seed
from constants.constants_tiktok import TIKTOK_DATASET_SENTENCES

# Seed before splitting (set_seed is a project helper called with its defaults).
set_seed()

# Split settings. Per the original note, val_size is a fraction of the
# held-out portion: 0.5 of the 0.3 selected by test_size.
split_options = dict(test_size=0.3, val_size=0.5, random_state=42)

# NOTE(review): the unpacking order (train, test, val) is assumed to match the
# order returned by build_datasets -- confirm against the helper.
dataset_train, dataset_test, dataset_val = build_datasets(
    TIKTOK_DATASET_SENTENCES,
    **split_options
)

# One shape per line: train, test, val.
print(dataset_train.shape, dataset_test.shape, dataset_val.shape, sep="\n")

Train: 20.32%
Test: 4.36%
Val: 4.36%
(13727, 2)
(2942, 2)
(2942, 2)


In [None]:
from src.preprocesamiento.nlp_spacy import Tokenizer

tokenizer = Tokenizer()

# Each split becomes a plain dict with two entries:
#   'tokens'   -> result of tokenizer.tokenize on the split's 'text' column
#   'polarity' -> the split's 'polarity' column as a NumPy array
# The second positional argument to tokenize (True) is a flag defined by the
# project Tokenizer; its meaning is not visible from this notebook.
dataset_train_tokenized = {
    'tokens': tokenizer.tokenize(dataset_train['text'], True),
    'polarity': dataset_train['polarity'].to_numpy(),
}
dataset_val_tokenized = {
    'tokens': tokenizer.tokenize(dataset_val['text'], True),
    'polarity': dataset_val['polarity'].to_numpy(),
}
dataset_test_tokenized = {
    'tokens': tokenizer.tokenize(dataset_test['text'], True),
    'polarity': dataset_test['polarity'].to_numpy(),
}

Modelo cargado: es_core_news_sm


100%|██████████| 13727/13727 [00:12<00:00, 1066.98it/s]
100%|██████████| 2942/2942 [00:02<00:00, 1086.47it/s]
100%|██████████| 2942/2942 [00:02<00:00, 1136.68it/s]


## Entrenar MLP

Hiperparámetros

In [None]:
import numpy as np
from src.trainers.utils import ModelArgs, EarlyStopping 
from constants.constants_tiktok import EMBEDDING_W2V_TIKTOK_SENTENCES_PATH

# --- Search space explored by the training cell ---
list_pooling = ["aver", "max"]
list_optim = ["adam", "sgd"]
# Five learning rates, evenly spaced on a log10 scale between 10**-3 and 10**-0.3.
list_lr = np.logspace(-3, -0.3, 5)
print(list_lr)

# --- Settings shared by every run ---
batch_size, epochs = 64, 50

# --- Early-stopping settings (passed to EarlyStopping in the training cell) ---
patience, min_delta = 15, 1e-4

# --- Network architecture ---
model_args = ModelArgs()
# Original author's note on this line: "128 bad".
model_args.hidden_layers = [128, 64]
model_args.output_size = 3
model_args.dropout = 0.20

[0.001      0.00473151 0.02238721 0.10592537 0.50118723]


In [None]:
import os
from src.trainers.train_mlp import train_mlp
from src.trainers.utils import show_loss_val_curves, save_model_torch
from constants.constants_tiktok import MLP_SWEM_LOSS_CURVES_DIR, MLP_SWEM_MODEL_PATH, TIKTOK_MLP_SWEM_METRICS_PATH

# Exhaustive grid: every pooling x optimizer x learning-rate combination,
# enumerated in the same order as three nested loops (pooling outermost).
search_grid = [
    (pooling_name, optim_name, lr_value)
    for pooling_name in list_pooling
    for optim_name in list_optim
    for lr_value in list_lr
]

# Highest accuracy seen so far in this session; -1 guarantees the first run is saved.
best_accuracy = -1
for pooling, optim, lr in search_grid:
    # A fresh EarlyStopping instance is created for every run.
    model, metrics, train_losses, val_losses = train_mlp(
        dataset_train=dataset_train_tokenized,
        dataset_val=dataset_val_tokenized,
        embeddings_path=EMBEDDING_W2V_TIKTOK_SENTENCES_PATH,
        model_args=model_args,
        early_stopping=EarlyStopping(patience, min_delta),
        batch_size=batch_size,
        lr=lr,
        epochs=epochs,
        optim=optim,
        pooling=pooling,
        use_class_weights=True
    )
    print(f"[MLP {pooling} {optim} {lr} ({model_args.hidden_layers}) {model_args.dropout}] acc: {metrics['accuracy']:.4f}")

    # Record this run's metrics at the shared metrics path.
    save_metrics(metrics, TIKTOK_MLP_SWEM_METRICS_PATH)

    # Loss-curve file name: dots in the learning rate are replaced by dashes.
    lr_tag = str(lr).replace('.', '-')
    title = f"MLP_SWEM_{metrics['pooling']}_{metrics['optim']} {lr_tag}"
    path = os.path.join(MLP_SWEM_LOSS_CURVES_DIR, f"{title}.png")
    show_loss_val_curves(train_losses, val_losses, len(train_losses), path)

    # Keep only the checkpoint of the best run seen in this session.
    if metrics['accuracy'] > best_accuracy:
        best_accuracy = metrics['accuracy']
        save_model_torch(model.get_model(), MLP_SWEM_MODEL_PATH)

## Modelo con mayor accuracy

In [4]:
import pandas as pd

# Imported in this cell as well: the only other import of this constant lives in
# the (long-running) training cell, so without it this cell fails with a
# NameError on a fresh kernel unless the whole grid search is re-run first.
from constants.constants_tiktok import TIKTOK_MLP_SWEM_METRICS_PATH

# Select the hyperparameters that produce the highest accuracy.
df_metrics = pd.read_csv(TIKTOK_MLP_SWEM_METRICS_PATH)

# Despite its name, best_acc is the full metrics row (a pandas Series) of the
# run with the highest 'accuracy', not just the accuracy value.
best_acc = df_metrics.loc[df_metrics['accuracy'].idxmax()]
print(best_acc)

accuracy               0.35588
recall                0.355883
precision             0.355943
f1_score              0.355874
model                 MLP SWEM
pooling                   aver
optim                     adam
lr                    0.001292
patience                   NaN
min_delta               0.0001
hidden_layers    [128, 64, 32]
output_size                  3
dropout                    0.2
epochs                     100
batch_size                 128
embedding_dim              100
train_time           30.543442
Name: 2, dtype: object


In [None]:
import re
from src.trainers.utils import load_model_torch, ModelArgs
from src.trainers.train_mlp import evaluate_model, MLPModelCustom
from constants.constants_tiktok import MLP_SWEM_MODEL_PATH, EMBEDDING_W2V_TIKTOK_SENTENCES_PATH

# Rebuild the architecture described by the selected metrics row.
# The values come from a CSV read with pandas, so they may be NumPy scalars, or
# floats when the column contains missing values; coerce them to built-in types
# before handing them to the model constructor.
model_args = ModelArgs()
model_args.input_size = int(best_acc['embedding_dim'])
# 'hidden_layers' is stored in the CSV as text such as "[128, 64]"; pull the integers out.
model_args.hidden_layers = [int(layer) for layer in re.findall(r"\d+", best_acc['hidden_layers'])]
model_args.output_size = 3
model_args.dropout = float(best_acc['dropout'])

# NOTE(review): the checkpoint at MLP_SWEM_MODEL_PATH is written by the training
# cell for the best run of that session, whereas best_acc is the best row of the
# whole metrics CSV. If the CSV also holds rows from other sessions, for example
# with a different hidden_layers value, the rebuilt architecture may not match
# the saved weights -- confirm that both refer to the same run.
model = MLPModelCustom(model_args)
model = load_model_torch(model, MLP_SWEM_MODEL_PATH)

# Batch size used for evaluation on the test split.
eval_batch_size = 64

metrics = evaluate_model(
    model,
    dataset_test_tokenized,
    "MLP",
    EMBEDDING_W2V_TIKTOK_SENTENCES_PATH,
    best_acc['pooling'],
    eval_batch_size
)
display(metrics)

In [None]:
from src.trainers.train_mlp import SentimentAnalysis

# Build the inference wrapper on CPU around the reloaded model.
# The last argument is taken from the selected metrics row instead of a
# hard-coded "aver", so inference uses the same pooling value that was passed
# to evaluate_model for this model.
# NOTE(review): assumes the fifth positional parameter of SentimentAnalysis is
# the pooling mode (the original passed "aver" there) -- confirm.
cls = SentimentAnalysis(model, EMBEDDING_W2V_TIKTOK_SENTENCES_PATH, tokenizer, "cpu", best_acc['pooling'])

In [8]:

from src.preprocesamiento.clean import clean_text
from src.preprocesamiento.nlp_spacy import preprocesamiento

# Hand-written sample sentences for a quick sanity check of the classifier.
textos_test = [
    "Hola! cómo estás???😀",
    "el dia de hoy estoy feliz y contento",
    "estoy muy triste",
    "Me encuentro estudiando para un exámen",
    "A veces me encuentro triste, pero la mayoria del tiempo estoy muy feliz",
]

# Clean each sentence, then run the batch through the project's preprocessing helper.
textos_test = [clean_text(texto_crudo) for texto_crudo in textos_test]
textos_test = preprocesamiento(textos_test)

# Print the classifier's prediction for every preprocessed sentence.
for texto in textos_test:
    print(cls.predict(texto))

Modelo cargado: es_core_news_sm


100%|██████████| 5/5 [00:00<00:00, 1245.12it/s]
5it [00:00, 455.17it/s]

Aplicando stemming...
Total de documentos preprocesados: 5
[('NEU', 0.8706167936325073), ('POS', 0.12933428585529327), ('NEG', 4.89312042191159e-05)]
[('POS', 0.6948035359382629), ('NEU', 0.3051930069923401), ('NEG', 3.491584266157588e-06)]
[('NEG', 0.9999744892120361), ('NEU', 2.5472378183621913e-05), ('POS', 1.0228039160369317e-19)]
[('NEU', 0.9985527396202087), ('NEG', 0.0009148080134764314), ('POS', 0.0005324697121977806)]
[('POS', 0.7565785050392151), ('NEG', 0.23506490886211395), ('NEU', 0.008356570266187191)]



