In [6]:
import pandas as pd
import numpy as np
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Importe a classe de clustering hierárquico
from sklearn.cluster import AgglomerativeClustering
# Importe NearestNeighbors para atribuir pontos de teste
from sklearn.neighbors import NearestNeighbors

import mlflow
# Certifique-se de que a biblioteca static_models está instalada ou disponível
from model2vec import StaticModel # Assumindo que StaticModel é como no seu código
# Assumindo que Embeddings é uma classe base que você definiu ou importou
from langchain.embeddings.base import Embeddings # Assumindo que é de langchain ou similar

In [7]:
mlflow.set_tracking_uri("http://127.0.0.1:5000/")  # Ajuste para seu servidor MLflow
mlflow.set_experiment("Train_Emb_Balanced_Synth-Negative_Sentiment_Analysis_Restaurant")

<Experiment: artifact_location='mlflow-artifacts:/816380000824485392', creation_time=1747411527299, experiment_id='816380000824485392', last_update_time=1747411527299, lifecycle_stage='active', name='Train_Emb_Balanced_Synth-Negative_Sentiment_Analysis_Restaurant', tags={}>

In [8]:
# --- Sentence Embedding ---
class Model2VecEmbeddings(Embeddings):
    """Wrapper para o Model2Vec como Embeddings do LangChain"""
    def __init__(self, model_name: str):
        # Certifique-se de ter a biblioteca static_models instalada (pip install static-models)
        self.model = StaticModel.from_pretrained(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        # Certifique-se de que texts é uma lista de strings
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        # Certifique-se de que text é uma string
        return self.model.encode([text]).tolist()[0]

# inputs

In [9]:
# Usando o caminho do seu código
df_train = pd.read_parquet('../data\dataset_train_trim_synthetic_balanced.parquet')

if 'target' not in df_train.columns:
    df_train['target'] = df_train['sentiment']

X_text = df_train['comment_cleaned']
y = df_train['target']

# --- Split de Treino e Teste ---
print("Realizando split de treino/teste...")
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

Realizando split de treino/teste...


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score

ALL_LABELS = ['Negative', 'Neutral', 'Positive']

# --- Função Scorer Customizada para Recall da Classe 'Negative' ---
def recall_negative_scorer_func(y_true, y_pred):
    # Calcula o recall para *todas* as classes presentes, retornando um array.
    # Usamos `labels=ALL_LABELS` para garantir a ordem consistente
    # e `zero_division=0` para que folds sem a classe 'Negative' em y_true resultem em 0.
    per_class_recall = recall_score(y_true, y_pred, average=None, labels=ALL_LABELS, zero_division=0)

    # Encontra o índice da classe 'Negative' na lista ALL_LABELS
    try:
        neg_index = ALL_LABELS.index('Negative')
        # Retorna o recall correspondente a esse índice
        return per_class_recall[neg_index]
    except ValueError:
        # Isso só deve acontecer se 'Negative' não estiver em ALL_LABELS - verifique sua lista!
        print(f"Erro: 'Negative' não encontrado na lista ALL_LABELS: {ALL_LABELS}")
        return np.nan # Retorna NaN para indicar um problema de configuração

# --- Defina nosso scorer usando a função customizada ---
# make_scorer agora usa a função que retorna um único valor (o recall da classe 'Negative')
scorer = make_scorer(recall_negative_scorer_func)

# --- Embedding Models Grid ---
embedding_models = {
    "minishlab/potion-base-8M": Model2VecEmbeddings("minishlab/potion-base-8M")
}

# --- Logistic Regression Hyperparameters Grid ---
lr_params = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],  # These solvers support both L1 and L2
    'max_iter': [500, 1000]
}

# --- Clustering Parameters Grid ---
clustering_params = {
    'n_clusters': [3],
    'linkage': ['ward', 'complete', 'average']
}

best_recall = 0
best_params = {}
best_model = None

# Main grid search loop
for emb_name, emb_model in embedding_models.items():
    print(f"\nTesting embedding model: {emb_name}")
    
    # Generate embeddings
    X_train_vec = np.array(emb_model.embed_documents(X_train_text.tolist()))
    X_test_vec = np.array(emb_model.embed_documents(X_test_text.tolist()))
    
    for n_clusters in clustering_params['n_clusters']:
        for linkage in clustering_params['linkage']:
            print(f"\nTesting clustering with {n_clusters} clusters and {linkage} linkage...")
            
            # Hierarchical clustering
            hc = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
            train_clusters = hc.fit_predict(X_train_vec)
            
            # Assign test clusters using nearest neighbor
            nn = NearestNeighbors(n_neighbors=1, metric='euclidean')
            nn.fit(X_train_vec)
            _, indices = nn.kneighbors(X_test_vec)
            test_clusters = train_clusters[indices.flatten()]
            
            # Augment features
            X_train_aug = np.hstack([X_train_vec, train_clusters.reshape(-1, 1)])
            X_test_aug = np.hstack([X_test_vec, test_clusters.reshape(-1, 1)])
            
            # Logistic Regression grid search
            lr = LogisticRegression(random_state=42)
            grid = GridSearchCV(lr, lr_params, scoring=scorer, cv=3, n_jobs=-1)
            grid.fit(X_train_aug, y_train)
            
            # Get best model from this iteration
            current_recall = grid.best_score_
            print(f"Best recall for this config: {current_recall}")
            
            # Track best overall model
            if current_recall > best_recall:
                best_recall = current_recall
                best_params = {
                    'embedding_model': emb_name,
                    'n_clusters': n_clusters,
                    'linkage': linkage,
                    'lr_params': grid.best_params_
                }
                best_model = grid.best_estimator_
                
                # Evaluate on test set (using the full y_test)
                # Evaluate on test set (using the full y_test)
                y_pred = best_model.predict(X_test_aug)

                # Calcule o recall para cada classe e selecione o da classe 'Negative'
                # Certifique-se de que ALL_LABELS está definido no topo do seu script
                per_class_recall_test = recall_score(y_test, y_pred, average=None, labels=ALL_LABELS)
                neg_index = ALL_LABELS.index('Negative')
                test_recall = per_class_recall_test[neg_index] # Seleciona o recall da classe 'Negative'

                print(f"New best model! Test recall: {test_recall:.4f}")

# --- Log best model to MLflow ---
with mlflow.start_run(run_name="Best_LogReg_Embeddings_Cluster"):
    print("\nLogging best model to MLflow...")
    print("Best parameters:")
    print(best_params)
    
    # Log parameters
    mlflow.log_params({
        'embedding_model': best_params['embedding_model'],
        'n_clusters': best_params['n_clusters'],
        'linkage': best_params['linkage'],
        'lr_C': best_params['lr_params']['C'],
        'lr_penalty': best_params['lr_params']['penalty'],
        'lr_solver': best_params['lr_params']['solver'],
        'lr_max_iter': best_params['lr_params']['max_iter']
    })
    
    # Log metrics
    y_pred = best_model.predict(X_test_aug)
    acc = accuracy_score(y_test, y_pred)

    # Calcule o recall para cada classe e selecione o da classe 'Negative'
    # Certifique-se de que ALL_LABELS está definido no topo do seu script
    per_class_recall_mlflow = recall_score(y_test, y_pred, average=None, labels=ALL_LABELS)
    neg_index = ALL_LABELS.index('Negative')
    neg_recall = per_class_recall_mlflow[neg_index] # Seleciona o recall da classe 'Negative'

    report = classification_report(y_test, y_pred, output_dict=True)
    mlflow.log_metrics({
        'accuracy': acc,
        'recall_negative': neg_recall
    })
    mlflow.log_dict(report, "classification_report.json")
    
    # Log model
    mlflow.sklearn.log_model(best_model, "model")
    
    print(f"Final test recall for negative class: {neg_recall:.4f}")
    print("Best model logged to MLflow")

print("Grid search complete!")


Testing embedding model: minishlab/potion-base-8M

Testing clustering with 3 clusters and ward linkage...
Best recall for this config: 0.7415458937198066
New best model! Test recall: 0.7308

Testing clustering with 3 clusters and complete linkage...
Best recall for this config: 0.707936507936508

Testing clustering with 3 clusters and average linkage...
Best recall for this config: 0.7222222222222222

Logging best model to MLflow...
Best parameters:
{'embedding_model': 'minishlab/potion-base-8M', 'n_clusters': 3, 'linkage': 'ward', 'lr_params': {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Final test recall for negative class: 0.0000
Best model logged to MLflow
🏃 View run Best_LogReg_Embeddings_Cluster at: http://127.0.0.1:5000/#/experiments/816380000824485392/runs/cee92191f80b484e9a902b26330affc7
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/816380000824485392
Grid search complete!


In [18]:
# Dentro do loop de clustering, APÓS gerar X_train_aug e y_train
if emb_name == "minishlab/potion-base-8M":
    print(f"--- Debugging data for {emb_name}, n_clusters={n_clusters}, linkage={linkage} ---")
    print("Shape of X_train_aug:", X_train_aug.shape)
    print("Are there NaNs in X_train_aug?", np.isnan(X_train_aug).any())
    print("Are there Infs in X_train_aug?", np.isinf(X_train_aug).any())
    print("Min value in X_train_aug:", np.nanmin(X_train_aug) if np.isnan(X_train_aug).any() else X_train_aug.min())
    print("Max value in X_train_aug:", np.nanmax(X_train_aug) if np.isnan(X_train_aug).any() else X_train_aug.max())
    print("Mean value in X_train_aug:", np.nanmean(X_train_aug) if np.isnan(X_train_aug).any() else X_train_aug.mean())
    print("Shape of y_train:", y_train.shape)
    
    #print("Are there NaNs in y_train?", np.isnan(y_train)) # Deve ser False para labels
    print("y_train value counts:", np.unique(y_train, return_counts=True))
    print("-----------------------------------------------------------------")

--- Debugging data for minishlab/potion-base-8M, n_clusters=3, linkage=average ---
Shape of X_train_aug: (586, 257)
Are there NaNs in X_train_aug? False
Are there Infs in X_train_aug? False
Min value in X_train_aug: -0.4777367413043976
Max value in X_train_aug: 2.0
Mean value in X_train_aug: -0.00026017618687688494
Shape of y_train: (586,)
y_train value counts: (array(['Negative', 'Neutral', 'Positive'], dtype=object), array([209, 139, 238], dtype=int64))
-----------------------------------------------------------------
