# SVM - TikTok

In [None]:
import sys
import os
import pandas as pd

# Make the directory two levels above the current working directory
# importable by appending its absolute path to sys.path (only once, so
# re-running the cell does not add duplicates).
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

Importar el entrenador de SVM y construir los conjuntos de entrenamiento, prueba y validación

In [None]:
from src.trainers.train_svm import train_svm
from src.trainers.utils import build_datasets, save_metrics
from constants.constants_twitter import TWITTER_DATASET_TRAIN_PATH

# Split the dataset at TWITTER_DATASET_TRAIN_PATH into three frames.
# test_size=0.3 and val_size=0.5 are forwarded to build_datasets; per the
# original note, val_size is a fraction of the 0.3 hold-out (0.5 of 0.3).
# random_state is fixed so the split is reproducible.
dataset_train, dataset_test, dataset_val = build_datasets(
    TWITTER_DATASET_TRAIN_PATH,
    test_size=0.3,
    val_size=0.5,
    random_state=42,
)

# Report the shape of each split, one per line: train, test, validation.
print(
    dataset_train.shape,
    dataset_test.shape,
    dataset_val.shape,
    sep="\n",
)

In [None]:
import numpy as np

# Hyperparameter grid explored by the training loop.
# Kernels and vectorizers are referenced by name; C takes 20 values spaced
# evenly on a log scale between 10**-2 and 10**1 (both ends included).
list_kernel = [
    "linear",
    "rbf",
    "poly",
]
list_vectorizers = [
    "tfidf",
    "bow",
]
list_C = np.logspace(start=-2, stop=1, num=20)
print(list_C)

## Entrenar modelos

In [None]:
from constants.constants_twitter import SVM_PIPELINE_PATH, TWITTER_SVM_METRICS_PATH
from src.trainers.utils import save_model

# Exhaustive search over every (vectorizer, kernel, C) combination.
# Each candidate is produced by train_svm from the train and validation
# splits; its metrics are handed to save_metrics together with
# TWITTER_SVM_METRICS_PATH, and the pipeline is written to SVM_PIPELINE_PATH
# only when its accuracy beats the best one seen so far.
best_accuracy = -1  # sentinel below any accuracy, so the first candidate is saved
for vectorizer in list_vectorizers:
    for kernel in list_kernel:
        for C in list_C:
            pipeline, metrics = train_svm(
                dataset_train,
                dataset_val,
                C=C,
                kernel=kernel,
                vec=vectorizer,
            )
            accuracy = metrics['accuracy']
            print(f"SVM {vectorizer} {kernel} {C}: {accuracy}")
            save_metrics(metrics, TWITTER_SVM_METRICS_PATH)
            # Persist the pipeline only on a strict improvement. The running
            # best must be updated here; otherwise every candidate passes the
            # comparison and the file ends up holding the last model trained
            # rather than the most accurate one.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                save_model(pipeline, SVM_PIPELINE_PATH)
                

## Modelo con mayor accuracy

In [None]:
# Load the metrics CSV at TWITTER_SVM_METRICS_PATH and pick the row whose
# 'accuracy' value is the highest (first occurrence on ties).
df_metrics = pd.read_csv(TWITTER_SVM_METRICS_PATH)

best_row_label = df_metrics['accuracy'].idxmax()
best_acc = df_metrics.loc[best_row_label]
print(best_acc)

In [None]:
from src.trainers.utils import evaluate_model
from src.trainers.utils import load_model
from constants.constants_twitter import SVM_PIPELINE_PATH

# Load the pipeline stored at SVM_PIPELINE_PATH and evaluate it on the
# test split; the returned metrics are printed as-is.
pipeline = load_model(SVM_PIPELINE_PATH)
metrics = evaluate_model(
    pipeline,
    dataset_test,
    title="Support Vector Machine",
)
print(metrics)

## Test

In [None]:
from constants.constants_nlp import INDEX_TO_POLARITY
from src.preprocesamiento.clean import clean_text
from src.preprocesamiento.nlp_spacy import preprocesamiento

# Manual check: run two hand-written English sentences through clean_text
# and preprocesamiento, then predict with the loaded pipeline.
textos_test = ["i'm study hard", "i'm happy"]
textos_test = [clean_text(texto, "en") for texto in textos_test]
textos_test = preprocesamiento(textos_test, stemming=True, lang="en")

# Each prediction is mapped to its label through INDEX_TO_POLARITY. Note that
# the text printed is the preprocessed form, not the raw sentence.
# The loop variable is deliberately not called `input`: at notebook global
# scope that would shadow the builtin for the rest of the kernel session.
preds = pipeline.predict(textos_test)
for texto, pred in zip(textos_test, preds):
    print(f"{texto}: {INDEX_TO_POLARITY[pred]}")