In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import string
import re

# MLflow configuration: where runs are stored and which experiment groups them.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000/"  # Adjust to point at your MLflow server
MLFLOW_EXPERIMENT_NAME = "Sentiment Analysis Restaurant Reviews"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment object.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

2025/05/13 22:10:05 INFO mlflow.tracking.fluent: Experiment with name 'Restaurant_Reviews_Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/105878113738556386', creation_time=1747185005345, experiment_id='105878113738556386', last_update_time=1747185005345, lifecycle_stage='active', name='Restaurant_Reviews_Classification', tags={}>

In [None]:
# Assume df_train and df_valid with 'comment_cleaned' and 'target' columns are ready
# For demonstration, let's create dummy data if they don't exist
try:
    df_train = pd.read_parquet('data/dataset_train_with_sentiment.parquet') # Or load from CSV
    df_valid = pd.read_parquet('data/dataset_valid_with_sentiment.parquet') # Or load from CSV
    # Ensure 'target' column exists in df_train (added by LLM)
    if 'target' not in df_train.columns:
        # Duplicando 'coluna_original' para 'nova_coluna'
        df_train['target'] = df_train['sentiment']
        
except FileNotFoundError:
    print("Arquivos limpos não encontrados. Criando DataFrames de exemplo para demonstração.")
    # Create dummy dataframes for demonstration if files aren't found
    data_train = {'comment_cleaned': ['ótimo lugar recomendo', 'comida fria e atendimento ruim', 'ambiente agradável preço justo', 'demorou muito para chegar', 'tudo perfeito voltarei'],
                  'target': ['Positivo', 'Negativo', 'Positivo', 'Negativo', 'Positivo']}
    df_train = pd.DataFrame(data_train)

    data_valid = {'comment_cleaned': ['serviço excelente', 'conta veio errada', 'local barulhento', 'pizza saborosa']}
    df_valid = pd.DataFrame(data_valid)
    print("DataFrames de exemplo criados.")

# Features (cleaned comment text) and labels (sentiment target).
X, y = df_train['comment_cleaned'], df_train['target']

# --- Internal hold-out split for MLflow tracking ---
# The training base is split once more so every run is scored on rows the
# model did not see while being fitted in that run.
# `stratify=y` keeps the class proportions the same in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# Candidate classifiers to compare; the dict key becomes the MLflow run name.
models = {
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

# The reviews are written in Portuguese, so scikit-learn's built-in 'english'
# stop-word list does not apply to them. This is a compact list of Portuguese
# function words. Negations and intensifiers ('não', 'nem', 'sem', 'nunca',
# 'mas', 'muito', ...) are deliberately NOT listed because they carry sentiment.
# Single-character words are omitted: the default token pattern already drops them.
PORTUGUESE_STOP_WORDS = [
    'as', 'os', 'um', 'uma', 'uns', 'umas',
    'de', 'do', 'da', 'dos', 'das', 'em', 'no', 'na', 'nos', 'nas',
    'por', 'para', 'pra', 'com', 'ao', 'aos', 'às',
    'ou', 'que', 'se', 'como',
    'eu', 'ele', 'ela', 'eles', 'elas', 'nós', 'me', 'te', 'lhe',
    'meu', 'minha', 'seu', 'sua', 'seus', 'suas',
    'este', 'esta', 'esse', 'essa', 'isso', 'isto', 'aquele', 'aquela',
    'foi', 'era', 'ser', 'são', 'sao', 'está', 'estava', 'tem', 'tinha',
    'já', 'ja', 'também', 'tambem',
]

# TF-IDF vectorizer limited to the 5000 most frequent terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words=PORTUGUESE_STOP_WORDS)
    
# Enable autologging BEFORE any fit call. It only instruments fits that happen
# after it is switched on; calling it at the end of the loop body meant the
# first model's run recorded nothing while later models were autologged.
mlflow.autolog()

for model_name, model in models.items():
    # One MLflow run per candidate model, named after the dict key.
    with mlflow.start_run(run_name=model_name):
        # Give each pipeline its own vectorizer (same hyper-parameters as the
        # shared template) so fitting one pipeline never refits a step that
        # another pipeline object also references.
        pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(**vectorizer.get_params())),
                ('clf', model)
            ])
        # Train on the internal training fold.
        pipeline.fit(X_train, y_train)

        # Evaluate on the held-out fold.
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0 keeps the score at 0 for classes that were never
        # predicted, without emitting an undefined-metric warning per class.
        report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

        # Record the hold-out metrics explicitly so runs can be compared on them.
        weighted_avg = report['weighted avg']
        mlflow.log_metrics({
            "eval_accuracy": accuracy,
            "eval_precision": weighted_avg['precision'],
            "eval_recall": weighted_avg['recall'],
            "eval_f1_score": weighted_avg['f1-score'],
        })

        print(f"{model_name} - Accuracy: {accuracy:.4f}")


2025/05/13 22:57:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
Registered model 'sentiment_randomforest' already exists. Creating a new version of this model...
2025/05/13 22:57:13 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sentiment_randomforest, version 2
Created version '2' of model 'sentiment_randomforest'.


RandomForest - Accuracy: 0.7099
🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/744206330945824458/runs/a54e15eda2da4aa59cd24f86d9084598
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/744206330945824458


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2025/05/13 22:57:21 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
Registered model 'sentiment_logisticregression' already exists. Creating a new version of this model...
2025/05/13 22:57:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: sentiment_logisticregression, version 2


LogisticRegression - Accuracy: 0.6947
🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/744206330945824458/runs/be478411bc5f41b0b07fb0a35285914a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/744206330945824458


Created version '2' of model 'sentiment_logisticregression'.


In [None]:
# Score the pipeline left by the training loop (the last entry of `models`)
# on the held-out fold. Predictions are recomputed here so this cell does not
# depend on names that are never defined in the notebook.
y_pred_eval = pipeline.predict(X_test)

# Metrics compare true labels with predictions. Weighted averaging accounts for
# class imbalance; use macro/micro instead depending on the class distribution.
accuracy = accuracy_score(y_test, y_pred_eval)
precision = precision_score(y_test, y_pred_eval, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred_eval, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred_eval, average='weighted', zero_division=0)

# Log inside an explicit run: calling mlflow.log_metric with no active run
# silently opens a new unnamed run and leaves it open.
with mlflow.start_run(run_name=f"{model_name}_eval"):
    mlflow.log_metric("eval_accuracy", accuracy)
    mlflow.log_metric("eval_precision", precision)
    mlflow.log_metric("eval_recall", recall)
    mlflow.log_metric("eval_f1_score", f1)

print(f"\nMétricas no conjunto de avaliação interno:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_eval, zero_division=0))

    # TfidfVectorizer.
    # # --- Logar o Modelo (a Pipeline completa) ---
    # # Isso salva o vetorizador e o classificador juntos
    # mlflow.sklearn.log_model(pipeline, "sentiment_model_pipeline")

    # print("\nModelo (Pipeline) logado no MLflow.")
    # print(f"MLflow Run ID: {mlflow.active_run().info.run_id}")
    # print("Run finalizado.")

# --- Next steps ---
# After running this notebook several times with different parameters,
# you can compare the results in the MLflow UI.
# In your terminal, go to the directory where you ran this notebook
# and run: mlflow ui
# This opens a web interface where you can compare runs by metrics and parameters.

# Once the best set of parameters is chosen, train the FINAL pipeline
# on the COMPLETE training set (X, y) and log/register
# that final model separately, if needed.
# Example (outside the experimentation run; requires `import joblib`):
# final_pipeline = Pipeline([...with_the_best_parameters...])
# final_pipeline.fit(X, y)
# joblib.dump(final_pipeline, 'final_sentiment_pipeline.pkl')  # Save for use when predicting on the validation set