In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import string
import re

# MLflow configuration: tracking server address and the experiment that runs are filed under.
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000/"  # adjust to point at your MLflow server
MLFLOW_EXPERIMENT_NAME = "Train_Trimmed_Fix-Negative_Sentiment_Analysis_Restaurant"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/901935482127334374', creation_time=1747243622588, experiment_id='901935482127334374', last_update_time=1747243622588, lifecycle_stage='active', name='Train_Trimmed_Fix-Negative_Sentiment_Analysis_Restaurant', tags={}>

# INPUTS

In [3]:
# Read the training set from parquet.
TRAIN_DATA_PATH = '../data/dataset_train_with_sentiment_fix_negative_trimmed_similarity.parquet'
df_train = pd.read_parquet(TRAIN_DATA_PATH)

# When the frame has no `target` column, use the `sentiment` column as the label.
if 'target' not in df_train:
    df_train['target'] = df_train['sentiment']

# Features are the `comment_cleaned` column; labels are the `target` column.
X, y = df_train['comment_cleaned'], df_train['target']

# --- Hold-out split used for MLflow tracking ---
# Carve a validation subset (20%) out of the training data so that each MLflow
# run is scored on examples the model did not see while it was being fitted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # keep the class proportions the same in both subsets
)


# Candidate classifiers, keyed by the name used for each MLflow run.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
)

# TF-IDF vectorizer: vocabulary capped at 5000 terms, English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    
# Enable autologging BEFORE any fit() call. MLflow only instruments training
# that starts after autolog() has run; calling it inside the loop after fit()
# left the first run without autologged params, metrics or model.
mlflow.autolog()

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Build the pipeline with its own unfitted TF-IDF step (same settings
        # as `vectorizer`) so pipelines do not share one mutable vectorizer
        # that every model refits.
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(**vectorizer.get_params())),
            ('clf', model),
        ])

        # Train on the training split.
        pipeline.fit(X_train, y_train)

        # Evaluate on the held-out validation split.
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # zero_division=0 yields the same 0.0 scores sklearn reports by default
        # for classes that are never predicted, but without the warnings.
        report = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )

        # Log the validation metrics explicitly so they are stored with the
        # run instead of only being printed (the report was previously unused).
        mlflow.log_metrics({
            'val_accuracy': accuracy,
            'val_precision_macro': report['macro avg']['precision'],
            'val_recall_macro': report['macro avg']['recall'],
            'val_f1_macro': report['macro avg']['f1-score'],
            'val_f1_weighted': report['weighted avg']['f1-score'],
        })

        print(f"{model_name} - Accuracy: {accuracy:.4f}")


2025/05/14 14:31:20 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


RandomForest - Accuracy: 0.6667
🏃 View run RandomForest at: http://127.0.0.1:5000/#/experiments/901935482127334374/runs/b2326155ff93476cbd8c9ab1ac81bad8
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/901935482127334374


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2025/05/14 14:31:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


LogisticRegression - Accuracy: 0.6667
🏃 View run LogisticRegression at: http://127.0.0.1:5000/#/experiments/901935482127334374/runs/0941f552e3e2466f81fef2ae7c5c79f9
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/901935482127334374
