In [20]:
## VARIABLES
# Notebook-wide configuration: edit values here rather than in the cells below.

import os

# MLflow tracking/registry server. The standard MLFLOW_TRACKING_URI environment
# variable takes precedence so the notebook can target another server without
# code edits; an unset or empty variable falls back to the local server.
MLFLOW_URI = os.getenv("MLFLOW_TRACKING_URI") or "http://localhost:8080"
# NLTK package identifier: passed to nltk.download() and logged as a run parameter.
DATASET = "movie_reviews"
# MLflow experiment under which training runs are recorded.
EXPERIMENT_NAME = "movie-sentiment-analysis-v2"
# Name the trained model is registered under in the MLflow model registry.
MODEL_REGISTERY_NAME = "movie-sentiment-v2"

In [21]:
import mlflow

# Direct every subsequent mlflow.* call in this notebook to the configured server.
mlflow.set_tracking_uri(uri=MLFLOW_URI)

In [29]:
import nltk
import random
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# Download dataset
nltk.download(DATASET)

# Load dataset: one (tokens, category) pair per review file, where the category
# is the corpus category the file is listed under.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator. An unseeded shuffle changes the
# document order on every run, which makes the fixed random_state of the later
# train/test split meaningless and the reported metrics non-reproducible.
# A private Random instance is used so the global `random` state is untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(documents)
len(documents)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/fatih/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


2000

In [30]:
# Turn each (tokens, sentiment) pair into a whitespace-joined review string
# and its sentiment label, keeping both lists in the same order.
texts = [" ".join(tokens) for tokens, _ in documents]
labels = [sentiment for _, sentiment in documents]

# Hold out 20% of the reviews for evaluation; the fixed random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [31]:
from pathlib import Path

from matplotlib import pyplot as plt
from mlflow.models import infer_signature
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

# Local staging folder for evaluation artifacts before they are uploaded to MLflow.
ARTIFACT_DIR = Path("artifacts")
# Create it up front so the report/plot writes below do not fail on a fresh checkout.
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# it will create a new experiment if it doesn't exist
experiment = mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

with mlflow.start_run(experiment_id=experiment.experiment_id):
    tfidf_params = {
        "ngram_range": (1,3),
        # NOTE(review): an integer max_df is an absolute document count, so this
        # drops every n-gram that occurs in more than 3 documents. Confirm that
        # this is intended rather than min_df=3 or a fractional max_df.
        "max_df": 3,
        "use_idf": True
    }
    clf_params = {
        "C": 1.0,
        "max_iter": 150,
    }
    # Create a pipeline with TF-IDF and Logistic Regression
    model = make_pipeline(
        TfidfVectorizer(**tfidf_params),
        LogisticRegression(**clf_params)
    )

    # log model related params
    mlflow.log_params(
        params={**tfidf_params, **clf_params}
    )

    # Log extra params.
    mlflow.log_param("dataset_name", DATASET)
    mlflow.log_param("custom_key", "custom_value")

    # Train the model
    model.fit(X_train, y_train)

    # test the model on test set to get the metrics
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # log metrics to mlflow
    mlflow.log_metric("accuracy", accuracy)

    # Generate classification report
    report = classification_report(y_test, y_pred)
    with open(ARTIFACT_DIR / "classification_report.txt", "w", encoding="utf-8") as f:
        f.write(report)

    # Take the class order from the fitted pipeline and pass it to
    # confusion_matrix explicitly, so the tick labels always match the
    # matrix rows/columns instead of relying on a hard-coded list.
    class_labels = list(model.classes_)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Explicit figure/axes: the plot does not depend on (or leak into) pyplot's
    # implicit "current figure" state.
    fig, ax = plt.subplots()
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.savefig(ARTIFACT_DIR / "confusion_matrix.png")
    plt.close(fig)

    # Create a signature for the model from real inputs and the model's own
    # predictions for them (not the ground-truth labels).
    signature = infer_signature(
        model_input=X_test[:5],
        model_output=y_pred[:5]
    )

    # save artifacts to mlflow
    mlflow.log_artifacts(str(ARTIFACT_DIR))

    # log the model as a run artifact
    model_info = mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        signature=signature
    )




In [32]:
# Register the model logged by the training run under the registry name.
registered_model_info = mlflow.register_model(
    model_uri=model_info.model_uri,
    name=MODEL_REGISTERY_NAME,
)

Registered model 'movie-sentiment-v2' already exists. Creating a new version of this model...
2025/02/26 20:13:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: movie-sentiment-v2, version 2
Created version '2' of model 'movie-sentiment-v2'.


## Notes on Registered Models

- Models get versioned
- URI for any model: `models:/{model_name}/{model_version}`

### Models logged by a run can also be found at:
- `runs:/{run_id}/model`


## Notes on Aliases

Aliases can be used to identify the `champion` and `challenger` models. It is possible to load a model by its alias, e.g. `models:/{model_name}@champion`.

In [33]:
# Point the "champion" alias at the model version registered above.
# NOTE(review): the notes in this notebook describe a challenger -> champion
# promotion flow, but this cell assigns "champion" directly — confirm intent.
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_URI, registry_uri=MLFLOW_URI)

client.set_registered_model_alias(
    MODEL_REGISTERY_NAME,
    "champion",
    registered_model_info.version,
)

**Model Registry Example**

In [27]:
# Load whichever registered version currently carries the "champion" alias.
model_uri = f"models:/{MODEL_REGISTERY_NAME}@champion"
model = mlflow.sklearn.load_model(model_uri)
# Both names refer to the same loaded object; note that `model` is rebound here.
loaded_model = model
loaded_model

Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00,  1.94it/s]


In [28]:
# Quick check of the registry-loaded model on one hand-written review.
text = "I loved this movie"
loaded_model.predict(X=[text])

array(['pos'], dtype='<U3')

### Validation

- Load the `challenger` model from the registry
- Run the model on the validation dataset
- Calculate metrics
- Compare with the `champion` (if one exists)
- Make it the `champion` if it is better (or if it is the first model)

## Next Steps

- Run a validation script with the original test set.
    - Compare the results of the `challenger` model with the `champion`
    - Make the `challenger` the new `champion`
- Restart all inference services to load the new `champion` model from the registry