In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn

from model2vec import StaticModel
from langchain.embeddings.base import Embeddings
from typing import List, Tuple, Dict

In [24]:
# --- Sentence Embedding ---
class Model2VecEmbeddings(Embeddings):
        """Wrapper para o Model2Vec como Embeddings do LangChain"""
        def __init__(self, model_name: str):
            self.model = StaticModel.from_pretrained(model_name)

        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            return self.model.encode(texts).tolist()
        
        def embed_query(self, text: str) -> List[float]:
            return self.model.encode([text]).tolist()[0]
        
print("Gerando embeddings com Model2Vec...")
model_name = "minishlab/potion-base-2M"
emb_model_name = Model2VecEmbeddings(model_name)

INFO:model2vec.hf_utils:Folder does not exist locally, attempting to use huggingface hub.


Gerando embeddings com Model2Vec...


In [25]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")  # Ajuste para seu servidor MLflow
# Experimento MLflow
mlflow.set_experiment("Valid_Emb_Trimmed_Fix-Negative_Sentiment_Analysis_Restaurant")

<Experiment: artifact_location='mlflow-artifacts:/164704175141499300', creation_time=1747257156553, experiment_id='164704175141499300', last_update_time=1747257156553, lifecycle_stage='active', name='Valid_Emb_Trimmed_Fix-Negative_Sentiment_Analysis_Restaurant', tags={}>

In [26]:
import os
import joblib
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class SentimentValidatorEmbeddings:
    def __init__(self, parquet_path, embedding_model_name = "minishlab/potion-base-2M"):
        self.embedding_model = Model2VecEmbeddings(embedding_model_name)
        self.parquet_path = parquet_path
        self.label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
        self.inverse_mapping = {v: k for k, v in self.label_mapping.items()}
        
    def load_model_and_components(self, model_name):
        """Carrega o modelo e extrai o vetorizador corretamente"""
        try:
            model_uri = f"models:/sentiment_{model_name}/latest"
            
            sklearn_model = mlflow.sklearn.load_model(model_uri)
            return sklearn_model
        
        except Exception as e:
            model_uri = f"models:/{model_name}/latest"
            
            sklearn_model = mlflow.sklearn.load_model(model_uri)
            return sklearn_model

    def load_data(self):
        try:
            df = pd.read_parquet(self.parquet_path)
            df = df[df['sentiment'].isin(self.label_mapping.keys())]
            df = df.dropna(subset=['comment_cleaned', 'sentiment'])
            if len(df) == 0:
                raise ValueError("Nenhum dado válido após filtragem")
            return df['comment_cleaned'].values, df['sentiment'].values
        except Exception as e:
            logger.error(f"Erro ao carregar dados: {str(e)}")
            raise

    def validate(self):
        """Valida usando embeddings + KMeans + modelo supervisionado"""
        try:
            with mlflow.start_run(run_name="Valid_Emb_KMeans"):
                X_val_raw, y_val = self.load_data()
                logger.info(f"Validação com {len(X_val_raw)} amostras")

                # 1. Embedding
                X_embed = self.embedding_model.embed_documents(X_val_raw)

                # 2. Baixar artifacts: kmeans e modelo
                model = self.load_model_and_components(model_name="LogReg_Emb_KMeans")
                
                # 3. Clustering
                cluster_features = model.predict(X_embed).reshape(-1, 1)
                X_val_aug = np.hstack([X_embed, cluster_features])

                # 4. Inferência
                y_pred = model.predict(X_val_aug)
                y_pred_text = [self.inverse_mapping.get(int(y), "Neutral") for y in y_pred]

                # 5. Métricas
                self._log_metrics(y_val, y_pred_text)

        except Exception as e:
            logger.error(f"Erro na validação: {str(e)}")

    def _log_metrics(self, y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')
        report = classification_report(y_true, y_pred, output_dict=True)

        mlflow.log_metrics({
            "val_accuracy": acc,
            "val_f1_weighted": f1
        })

        for cls in ['Negative', 'Neutral', 'Positive']:
            if cls in report:
                mlflow.log_metrics({
                    f"val_precision_{cls.lower()}": report[cls]['precision'],
                    f"val_recall_{cls.lower()}": report[cls]['recall'],
                    f"val_f1_{cls.lower()}": report[cls]['f1-score'],
                    f"val_support_{cls.lower()}": report[cls]['support']
                })

        self._plot_confusion_matrix(y_true, y_pred)

    def _plot_confusion_matrix(self, y_true, y_pred):
        cm = confusion_matrix(y_true, y_pred, labels=['Negative', 'Neutral', 'Positive'])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Negative', 'Neutral', 'Positive'],
                    yticklabels=['Negative', 'Neutral', 'Positive'])
        plt.title('Matriz de Confusão - Validação')
        plt.ylabel('Verdadeiro')
        plt.xlabel('Previsto')
        cm_path = "confusion_matrix_val.png"
        plt.savefig(cm_path)
        mlflow.log_artifact(cm_path)
        plt.close()


In [27]:
validator = SentimentValidatorEmbeddings(
    parquet_path="../data\dataset_valid_with_sentiment_fix_negative_trimmed_similarity.parquet",
)

validator.validate()

INFO:model2vec.hf_utils:Folder does not exist locally, attempting to use huggingface hub.
INFO:__main__:Validação com 195 amostras


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

ERROR:__main__:Erro na validação: X has 64 features, but LogisticRegression is expecting 65 features as input.


🏃 View run Valid_Emb_KMeans at: http://127.0.0.1:5000/#/experiments/164704175141499300/runs/8194748c4c6f4e12ae7611ce9214be99
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/164704175141499300
