In [None]:
import warnings
warnings.filterwarnings('ignore')

import json
import mlflow
import logging
import kagglehub
import pandas as pd

from pathlib import Path
from dotenv import load_dotenv
from mlflow import MlflowClient
from sklearn.pipeline import Pipeline
from mlflow.models import infer_signature
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

In [2]:
# Configure the root logger once for the whole notebook: INFO and above,
# with timestamp, level and logger name prepended to every message.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.INFO,
)

# Logger named after the current module, shared by the cells below.
logger = logging.getLogger(__name__)

In [6]:
# Load the Kaggle dataset, clean it, and obtain the train/test partitions.
# `data_preparation` performs the stratified split itself and returns four
# Series, so its result is unpacked directly; splitting its output a second
# time would be redundant and does not match its return shape.
# NOTE(review): `data_ingestion`, `data_cleaning` and `data_preparation` must
# be defined before this cell runs on a fresh kernel — confirm cell order.
df = data_ingestion("oliviervha/crypto-news")
df = data_cleaning(df)
X_train, X_test, y_train, y_test = data_preparation(df)

2026-01-30 01:41:38 [INFO] __main__: Class distribution (count):
class
positive    13964
neutral     10555
negative     6518
2026-01-30 01:41:38 [INFO] __main__: Class distribution (ratio):
class
positive    0.449915
neutral     0.340078
negative    0.210007


In [None]:
def data_preparation(df: pd.DataFrame, test_size: float=0.25) -> tuple[pd.Series, pd.Series, pd.Series, pd.Series]:
    """Split the text and label columns into stratified train/test partitions.

    Args:
        df: Frame providing a "title_text" column (model input) and a
            "class" column (target label).
        test_size: Forwarded to ``train_test_split`` as its ``test_size``
            keyword argument.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` in the order produced by
        ``train_test_split``.
    """
    X, y = df["title_text"], df["class"]

    # test_size has to be passed by keyword: train_test_split accepts
    # *arrays, so a positional value would be taken as a third array to
    # split rather than as the split ratio.
    # No random_state is passed, so the partition can differ between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y
    )
    return X_train, X_test, y_train, y_test

pandas.core.series.Series

In [7]:
# Two-step model: TF-IDF features feeding a logistic regression that uses
# balanced class weights.
pipe = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(n_jobs=-1, class_weight="balanced")),
])

# Candidate hyper-parameters, keyed as "<step name>__<parameter>".
# 2 x 2 x 2 x 2 = 16 combinations in total.
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2)],
    "tfidf__max_df": [0.8, 0.9],
    "tfidf__min_df": [5, 10],
    "clf__max_iter": [100, 500],
}

# Exhaustive search over the grid, scored by macro-averaged F1 on
# 5 stratified folds, using all available cores.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=StratifiedKFold(n_splits=5),
    scoring="f1_macro",
    n_jobs=-1,
)

In [8]:
# Point MLflow at the tracking server on localhost:5000, then select the
# experiment that the runs below are recorded under.
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment(experiment_name="Sentiment_Logistic_Regression")

<Experiment: artifact_location='/Users/wennanshi/VScodeProjects/Text_Sentiment_Classification/mlruns/1', creation_time=1769681839450, experiment_id='1', last_update_time=1769681839450, lifecycle_stage='active', name='Sentiment_Logistic_Regression', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [9]:
# Initialize mlflow client
client = MlflowClient()

mlflow.sklearn.autolog(log_models=False, silent=False)

with mlflow.start_run(run_name=f"logreg_gridsearch {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}"):
    grid.fit(X_train, y_train)

    best_estimator = grid.best_estimator_
    y_pred = best_estimator.predict(X_test)
    test_f1_score = f1_score(y_test, y_pred, average="macro")
    
    with open("classification_report.txt", "w", encoding="utf-8") as f:
        f.write(classification_report(y_test, y_pred))
    
    # Log the best model info
    mlflow.log_metric("test_f1_score", test_f1_score)
    mlflow.log_artifact("classification_report.txt")

    # Log the best model
    model_info = mlflow.sklearn.log_model(
        sk_model=best_estimator,
        name="best_estimator",
        registered_model_name = "TF-IDF Logistic Regression",
        signature = infer_signature(X_test, y_pred)
    )
    
    # Update model registry
    client.set_registered_model_alias(
        name="TF-IDF Logistic Regression",
        alias="production",
        version=model_info.registered_model_version
    )

2026/01/30 01:42:17 INFO mlflow.sklearn.utils: Logging the 5 best runs, 11 runs will be omitted.
Registered model 'TF-IDF Logistic Regression' already exists. Creating a new version of this model...
2026/01/30 01:42:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: TF-IDF Logistic Regression, version 6


🏃 View run logreg_gridsearch 2026-01-30 01:41:39 at: http://localhost:5000/#/experiments/1/runs/a577728bec0441e3b85c62a3bf74e8a3
🧪 View experiment at: http://localhost:5000/#/experiments/1


Created version '6' of model 'TF-IDF Logistic Regression'.
