# SWEM (Simple Word Embedding-based Model)

Toma los embeddings de las palabras de un documento y aplica operaciones de pooling para obtener una representación de tamaño fijo del documento.
Se pueden emplear diferentes técnicas para obtener los embeddings y diferentes modelos de clasificación.

* SWEM-aver:
Calcula el promedio de los embeddings de las palabras del documento.

* SWEM-max:
Selecciona el valor máximo en cada dimensión de los embeddings de las palabras del documento.

In [None]:
import sys
import os
import pandas as pd

# Put the directory two levels above the current working directory (the
# project root, per the original note) on the import path so that the
# project packages imported by later cells can be resolved.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    # Appended only once, so re-running the cell does not add duplicates.
    sys.path.append(module_path)

In [None]:
from src.trainers.train_swem import train_swem
from src.trainers.utils import build_datasets, save_metrics
from constants.constants_twitter import TWITTER_DATASET_TRAIN_PATH

# Build the train / test / validation partitions from the Twitter training
# file. Per the original note, val_size=0.5 is taken out of the 0.3 hold-out
# (presumably yielding a 70/15/15 split -- confirm against build_datasets).
dataset_train, dataset_test, dataset_val = build_datasets(
    TWITTER_DATASET_TRAIN_PATH,
    random_state=42,
    test_size=0.3,
    val_size=0.5,
)

# Report the shape of each partition in the order train, test, validation.
for _partition in (dataset_train, dataset_test, dataset_val):
    print(_partition.shape)

In [None]:
from src.preprocesamiento.nlp_spacy import Tokenizer

# Tokenizer constructed with the "en" language code.
tokenizer = Tokenizer("en")

# Each tokenized split is a plain dict with two entries:
#   'tokens'   -> result of tokenizing the split's 'text' column
#   'polarity' -> the split's 'polarity' column, passed through unchanged
# The splits are processed in the order train, validation, test.
dataset_train_tokenized = {
    'tokens': tokenizer.tokenize(dataset_train['text']),
    'polarity': dataset_train['polarity'],
}
dataset_val_tokenized = {
    'tokens': tokenizer.tokenize(dataset_val['text']),
    'polarity': dataset_val['polarity'],
}
dataset_test_tokenized = {
    'tokens': tokenizer.tokenize(dataset_test['text']),
    'polarity': dataset_test['polarity'],
}

## Entrenar con modelo SVM

In [None]:
from src.trainers.utils import SVMModelArgs
from constants.constants_twitter import EMBEDDING_W2V_TWITTER_PATH

# SVM argument container built with no arguments; the grid-search cell
# overwrites its `kernel` and `C` attributes before every training run.
svm_model_args = SVMModelArgs()

### Hiperparámetros

In [None]:
import numpy as np

# Search space for the SVM grid search.
list_pooling = ["aver", "max"]  # SWEM pooling variants to try
list_kernel = ["linear", "rbf", "poly"]  # SVM kernels to try
# 20 logarithmically spaced values of C, from 1e-4 up to 1e-1 inclusive.
list_C = np.logspace(start=-4, stop=-1, num=20)
print(list_C)

In [None]:
# Running best validation accuracy, shared by the SVM and LR searches.
# Starts at -1 so that the first trained model always replaces it.
best_accuracy = -1

In [None]:
from src.trainers.utils import save_model
from constants.constants_twitter import SWEM_MODEL_PATH, TWITTER_SWEM_METRICS_PATH

# Grid search over pooling strategy x SVM kernel x C.
# Each combination is trained on the tokenized training split and scored on
# the tokenized validation split.
for pooling in list_pooling:
    for kernel in list_kernel:
        for C in list_C:
            # The same args object is reused; only these two fields change.
            svm_model_args.C = C
            svm_model_args.kernel = kernel

            model, metrics = train_swem(
                dataset_train_tokenized,
                dataset_val_tokenized,
                model_args=svm_model_args,
                classifier="svm",
                pooling=pooling,
                embeddings_path=EMBEDDING_W2V_TWITTER_PATH,
            )
            print(f"SWEM SVM {pooling} {kernel} {C}: {metrics['accuracy']}")

            # Record the metrics of every run, not only the best one.
            save_metrics(metrics, TWITTER_SWEM_METRICS_PATH)

            # Persist the model only when it strictly beats the best so far.
            if not (metrics['accuracy'] > best_accuracy):
                continue
            best_accuracy = metrics['accuracy']
            save_model(model, SWEM_MODEL_PATH)

## Entrenar con modelo LR

In [None]:
from src.trainers.utils import LRModelArgs

# Logistic-regression argument container built with no arguments; the L2 and
# L1 search cells set `penalty`, `C`, `max_iter` and `solver` before each run.
lr_model_args = LRModelArgs()

### Hiperparámetros

In [None]:
import numpy as np

# Search space for the logistic-regression grid searches.
list_pooling = ["aver", "max"]  # SWEM pooling variants to try
list_solver_l1 = ["saga"]  # solvers tried with the L1 penalty
list_solver_l2 = ["lbfgs", "saga"]  # solvers tried with the L2 penalty
# 20 logarithmically spaced values of C, from 1e-4 up to 1e-1 inclusive.
list_C = np.logspace(start=-4, stop=-1, num=20)
print(list_C)

### Penalty L2

In [None]:
from constants.constants_twitter import SWEM_MODEL_PATH, TWITTER_SWEM_METRICS_PATH
from src.trainers.utils import save_model

# Grid search for logistic regression with the L2 penalty:
# pooling strategy x solver x C, scored on the validation split.
for pooling in list_pooling:
    for solver in list_solver_l2:
        for C in list_C:
            # Reconfigure the shared args object for this run.
            lr_model_args.solver = solver
            lr_model_args.C = C
            lr_model_args.penalty = "l2"
            lr_model_args.max_iter = 300

            model, metrics = train_swem(
                dataset_train_tokenized,
                dataset_val_tokenized,
                model_args=lr_model_args,
                classifier="lr",
                pooling=pooling,
                embeddings_path=EMBEDDING_W2V_TWITTER_PATH,
            )
            print(f"SWEM LR l2 {pooling} {solver} {C}: {metrics['accuracy']}")

            # Record the metrics of every run, not only the best one.
            save_metrics(metrics, TWITTER_SWEM_METRICS_PATH)

            # Persist the model only when it strictly beats the best so far
            # (best_accuracy carries over from earlier searches).
            if not (metrics['accuracy'] > best_accuracy):
                continue
            best_accuracy = metrics['accuracy']
            save_model(model, SWEM_MODEL_PATH)

### Penalty L1

In [None]:
from constants.constants_twitter import SWEM_MODEL_PATH, TWITTER_SWEM_METRICS_PATH
from src.trainers.utils import save_model

# Grid search for logistic regression with the L1 penalty:
# pooling strategy x solver x C, scored on the validation split.
for pooling in list_pooling:
    for solver in list_solver_l1:
        for C in list_C:
            # Reconfigure the shared args object for this run.
            lr_model_args.solver = solver
            lr_model_args.C = C
            lr_model_args.penalty = "l1"
            lr_model_args.max_iter = 300

            model, metrics = train_swem(
                dataset_train_tokenized,
                dataset_val_tokenized,
                model_args=lr_model_args,
                classifier="lr",
                pooling=pooling,
                embeddings_path=EMBEDDING_W2V_TWITTER_PATH,
            )
            print(f"SWEM LR l1 {pooling} {solver} {C}: {metrics['accuracy']}")

            # Record the metrics of every run, not only the best one.
            save_metrics(metrics, TWITTER_SWEM_METRICS_PATH)

            # Persist the model only when it strictly beats the best so far
            # (best_accuracy carries over from earlier searches).
            if not (metrics['accuracy'] > best_accuracy):
                continue
            best_accuracy = metrics['accuracy']
            save_model(model, SWEM_MODEL_PATH)

## Modelo con mayor accuracy

In [None]:
# Load the metrics recorded for every run and keep the row whose
# 'accuracy' value is the highest (first occurrence on ties).
df_metrics = pd.read_csv(TWITTER_SWEM_METRICS_PATH)

best_row_label = df_metrics['accuracy'].idxmax()
best_acc = df_metrics.loc[best_row_label]
print(best_acc)

In [None]:
from src.trainers.train_swem import evaluate_model
from src.trainers.utils import load_model

# Reload the model persisted at SWEM_MODEL_PATH and score it on the
# tokenized test split, using the pooling strategy of the best metrics row.
model = load_model(SWEM_MODEL_PATH)
best_pooling = best_acc['pooling']

metrics = evaluate_model(
    model,
    dataset_test_tokenized,
    "Simple Word Embedding-based Model",
    EMBEDDING_W2V_TWITTER_PATH,
    best_pooling,
)
print(metrics)