In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, accuracy_score
import mlflow
from model2vec import StaticModel
from langchain.embeddings.base import Embeddings
from typing import List, Tuple, Dict

2025/05/14 17:59:17 INFO mlflow.tracking.fluent: Autologging successfully enabled for google.generativeai.

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel
"
2025/05/14 17:59:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for openai.
2025/05/14 17:59:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for langchain.


In [11]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")  # Ajuste para seu servidor MLflow
mlflow.set_experiment("Train_Emb_Trimmed_Fix-Negative_Sentiment_Analysis_Restaurant")

<Experiment: artifact_location='mlflow-artifacts:/855487855781956226', creation_time=1747256508705, experiment_id='855487855781956226', last_update_time=1747256508705, lifecycle_stage='active', name='Train_Emb_Trimmed_Fix-Negative_Sentiment_Analysis_Restaurant', tags={}>

In [14]:
# --- Carregar Dados ---
df_train = pd.read_parquet('../data/dataset_train_with_sentiment_fix_negative_trimmed_similarity.parquet')

if 'target' not in df_train.columns:
    df_train['target'] = df_train['sentiment']

X_text = df_train['comment_cleaned']
y = df_train['target']

# --- Split de Treino e Teste ---
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# --- Sentence Embedding ---
class Model2VecEmbeddings(Embeddings):
        """Wrapper para o Model2Vec como Embeddings do LangChain"""
        def __init__(self, model_name: str):
            self.model = StaticModel.from_pretrained(model_name)

        def embed_documents(self, texts: List[str]) -> List[List[float]]:
            return self.model.encode(texts).tolist()
        
        def embed_query(self, text: str) -> List[float]:
            return self.model.encode([text]).tolist()[0]
        
print("Gerando embeddings com Model2Vec...")
model_name = "minishlab/potion-base-2M"
model = Model2VecEmbeddings(model_name)

X_train_vec = model.embed_documents(X_train_text.tolist())
X_test_vec = model.embed_documents(X_test_text.tolist())

# --- Clustering (KMeans) ---
print("Executando KMeans...")
kmeans = KMeans(n_clusters=3, random_state=42)
train_clusters = kmeans.fit_predict(X_train_vec)
test_clusters = kmeans.predict(X_test_vec)

# Adicionar clusters como feature extra
X_train_augmented = np.hstack([X_train_vec, train_clusters.reshape(-1, 1)])
X_test_augmented = np.hstack([X_test_vec, test_clusters.reshape(-1, 1)])

# --- Modelo Supervisionado (Logistic Regression) ---
model_name = "LogReg_with_Embeddings+KMeans"

with mlflow.start_run(run_name=model_name):
    clf = LogisticRegression(C=10, penalty='l2', solver='lbfgs', max_iter=1000, random_state=42)
    clf.fit(X_train_augmented, y_train)

    y_pred = clf.predict(X_test_augmented)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

    # --- Log MLflow ---
    mlflow.autolog()
    mlflow.log_param("embedding_model", "potion-base-2M")
    mlflow.log_param("classifier", "LogisticRegression")
    mlflow.log_param("C", 10)
    mlflow.log_param("penalty", "l2")
    mlflow.log_param("solver", "lbfgs")
    mlflow.log_param("kmeans_clusters", 3)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_dict(report, "classification_report.json")


Gerando embeddings com Model2Vec...


2025/05/14 18:04:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f73a2f0b075242c491a20d1999ea030b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Executando KMeans...




🏃 View run chill-skink-562 at: http://127.0.0.1:5000/#/experiments/855487855781956226/runs/f73a2f0b075242c491a20d1999ea030b
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/855487855781956226


2025/05/14 18:04:50 INFO mlflow.bedrock: Enabled auto-tracing for Bedrock. Note that MLflow can only trace boto3 service clients that are created after this call. If you have already created one, please recreate the client by calling `boto3.client`.
2025/05/14 18:04:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for boto3.
2025/05/14 18:04:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for google.genai.
2025/05/14 18:04:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for google.generativeai.


Accuracy: 0.7222
              precision    recall  f1-score   support

    Negative       0.63      0.55      0.59        31
     Neutral       0.33      0.07      0.11        15
    Positive       0.76      0.91      0.83        80

    accuracy                           0.72       126
   macro avg       0.57      0.51      0.51       126
weighted avg       0.68      0.72      0.68       126



2025/05/14 18:04:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for langchain.
2025/05/14 18:04:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for openai.
2025/05/14 18:04:51 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


🏃 View run LogReg_with_Embeddings+KMeans at: http://127.0.0.1:5000/#/experiments/855487855781956226/runs/c732ef0c4ccc48139aeb8b5961c6ccc9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/855487855781956226
