In [4]:
%run ../course_helpers/init_working_environment.ipynb

Loaded 1565 rows from /workspaces/mlops-course/data/comments.csv
First ten rows of the dataframe `df`:


Unnamed: 0,content,is_spam
0,Best Music Ever!!!,0
1,please look up DHG SONGS this is my playlist with a bunch of amazing songs,1
2,just :( superr!!!,0
3,Check out this playlist on YouTube:,1
4,subscribed :) btw you have a good style keep it up brother :)),1
5,she is horrible at acting. cringe-worhty.,0
6,https://m.facebook.com/story.php?story_fbid=764484966942313&amp;id=754989901225153&amp;ref=stream gf,1
7,WOw,0
8,I loved this song when I was in my teenage years!,0
9,Where did she find all that make up in a freakin jungle?!,0


In [5]:
my_experiment = mlflow_connect()

OK - mlflow server is up and running. Setting Tracking URI to http://localhost:5000. Setting Experiment to 'Spam Detection'


In [6]:
import mlflow

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

# Initialize Random Forest model with best parameters
best_params = {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}

X = df[["content"]]
y = df["is_spam"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

count_vectorizer = CountVectorizer()
rf = RandomForestClassifier(random_state=42, **best_params)

col_trans = ColumnTransformer(
        transformers=[
            ('text to features', count_vectorizer, 'content')  # Apply CountVectorizer directly to the 'text' column
        ],
        remainder='drop'
    )

pipe = Pipeline([('Preprocess', col_trans), ('RandomForest', rf)])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

accuracy = float(accuracy_score(y_test, y_pred))

mlflow.sklearn.autolog(log_models=False)
with mlflow.start_run(run_name="Basic Spam Classification Model") as run:
    model_info = mlflow.sklearn.log_model(pipe, "spam_pipeline_model")

print(f"Accuracy of the Model: {accuracy}")

2023/10/25 08:05:08 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '34c3bad22f3d4c0b9a27770bb6b6739f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Accuracy of the Model: 0.9744408945686901


In [8]:

artifact_path = f"runs:/{run.info.run_id}/{model_info.artifact_path}"
print(f"Reloading model from {artifact_path}")
print()

content_df = pd.DataFrame({
    "content": [
        "Click this link and subscribe to my channel!",
        "This is the best ever video about MLflow!"]})



Reloading model from runs:/18c426152b2344059273f3e3901425e1/spam_pipeline_model



Using the df won't work

In [26]:
reloaded_model = mlflow.sklearn.load_model(artifact_path)
predictions = pipe.predict(content_df)

pred_df = content_df.assign(prediction=predictions)
pred_df.head()

Unnamed: 0,content,prediction
0,Click this link and subscribe to my channel!,1
1,This is the best ever video about MLflow!,0


Logging with extra parameters

In [31]:
import datetime
mlflow.sklearn.autolog(disable=True)
with mlflow.start_run(run_name="Custom Spam Classification Model") as run:

    pip_requirements = [
        "scikit-learn==1.2.2",
        "mlflow==2.7",
    ]

    from mlflow.models import ModelSignature, infer_signature
    from mlflow.types.schema import Schema, ColSpec

    # Option 1: Manually construct the signature object
    input_schema = Schema(
        [
            ColSpec("string", "The content to be classified as spam or not spam"),
        ]
    )
    output_schema = Schema([ColSpec("double","Content is spam (1) or not spam (0)")])
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)

    mlflow.sklearn.log_model(
        pipe, 
        artifact_path="custom_spam_pipeline_model",
        input_example=content_df,
        pip_requirements=pip_requirements,
        signature=signature,
        metadata={
            "model_description": "Spam classifier trained on YouTube comments",
            "trained_at": datetime.datetime.utcnow().isoformat(),
            "accuracy": accuracy,
        },
    )
