In [2]:
# Make sure that mlflow is running

import mlflow
from sklearn.ensemble import RandomForestClassifier
from pathlib import Path
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Set up MLflow experiment
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Spam Detection")

# Load training and test data
data_files_dir = Path.cwd().parent / "data"
df = pd.read_csv(data_files_dir / "comments.csv")

# Create a CountrVectorizer object 
vectorizer = CountVectorizer()

# Initialize Random Forest model with best parameters
best_params = {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
rf = RandomForestClassifier(random_state=42, **best_params)

In [3]:
from sklearn.model_selection import train_test_split

mlflow.start_run(run_name="Random Forest Pipeline")

X = df["content"]
y = df["is_spam"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_test


527                           prehistoric song..has been﻿
271     ********OMG Facebook is OLD! Check out  ------...
1402    VOTE FOR KATY FOR THE EMAs! #KATYCATS  http://...
1201                                          Waka waka ﻿
59      Put famous people in the jungle for an hour an...
                              ...                        
208                     Check out this video on YouTube:﻿
970     You guys should check out this EXTRAORDINARY w...
1113                    Check out this video on YouTube:﻿
483     ******* Facebook is LAME and so 2004! Check ou...
58      THIS HAS MORE VIEWS THAN QUEEN AND MICHAEL JAC...
Name: content, Length: 313, dtype: object

In [4]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([('CountVectorizer', vectorizer), ('RandomForest', rf)])
pipe.fit(X_train, y_train)

In [5]:
pipe.predict(X_test)[:5]

array([0, 1, 1, 0, 0])

In [5]:

import mlflow

with mlflow.start_run(run_name="Random Forest Pipeline") as run:
    model_info = mlflow.sklearn.log_model(pipe, "spam_model")
    print(model_info.artifact_path)
    model_info = mlflow.sklearn.log_model(pipe, "spam_model")


Exception: Run with UUID 7d8aab277bfd4bf983177d91c1dfbfca is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [None]:
# %%

artifact_path = f"runs:/{run.info.run_id}/{model_info.artifact_path}"
print(artifact_path)

reloaded_model = mlflow.pyfunc.load_model(artifact_path)

# %%
message = "Come subscribe to my channel"
is_spam = reloaded_model.predict([message])
print(message + " spam? " + str(bool(is_spam)))

# %%
import shutil

shutil.rmtree(os.path.join(THIS_FOLDER, "model"), ignore_errors=True)

model_dir = os.path.join(THIS_FOLDER, "model")
pip_requirements = [
    "numpy==1.24.4",
    "pandas==1.5.3",
    "scikit-learn==1.2.2",
    "mlflow==2.5",
]


class ModelWithPreprocess(mlflow.pyfunc.PythonModel):
    def __init__(self, pipe_model):
        self.model = pipe_model

    def preprocess_input(self, payload):
        if (
            not isinstance(payload, dict)
            or "data" not in payload
            or not isinstance(payload["data"], str)
        ):
            raise TypeError(
                "Request payload must be a dict in " + '{"data": "message"} format',
            )
        return payload["data"]

    def predict(self, context, model_input):
        processed_model_input = self.preprocess_input(model_input.copy())
        # proba = self.model.predict_proba([processed_model_input][0])
        prediction = self.model.predict([processed_model_input])[0]
        # return {
        #    "prediction": prediction,
        #    "probability": float(proba[1]),
        # }
        return prediction


model_w_preprocess = ModelWithPreprocess(pipe)

mlflow.pyfunc.save_model(
    path=model_dir,
    python_model=model_w_preprocess,
    code_path=[os.path.join(THIS_FOLDER, "model_code")],
    pip_requirements=pip_requirements,
    metadata={
        "model_type": "spam_classifier",
        "model_name": "spam_model",
        "model_version": "v1",
        "model_description": "Spam classifier trained on YouTube comments",
        "trained_at": datetime.datetime.utcnow().isoformat(),
        "accuracy": accuracy,
    },
)

m2 = mlflow.pyfunc.load_model(model_dir)
print("Prediction: " + str(m2.predict({"data": "Come subscribe to my channel"})))
print(m2.metadata.metadata)
