In [1]:
"""Create training dataset"""

import csv
import datetime
import glob
import os
import shutil
from pathlib import Path

import mlflow
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# The labelled comments live in a "data" directory that sits next to the
# current working directory (i.e. one level up from where the kernel runs).
data_files_dir = Path.cwd().parent.joinpath("data")
df = pd.read_csv(data_files_dir.joinpath("comments.csv"))

# Quick sanity check: row count plus a peek at the first two records.
print(f"Read total {len(df)} rows")
df.head(n=2)

Read total 1565 rows


Unnamed: 0,content,is_spam
0,Best Music Ever!!!﻿,0
1,please look up DHG SONGS this is my playlist w...,1


In [2]:
# Bag-of-words features: learn the vocabulary from the comment text, then
# encode every comment as a sparse token-count row.
# NOTE(review): the vocabulary is fitted on all rows, including the ones that
# are later held out for testing.
vectorizer = CountVectorizer()
v_model = vectorizer.fit(df["content"])
X, y = v_model.transform(df["content"]), df["is_spam"]


In [3]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)



In [9]:
# Point MLflow at the tracking server. The MLFLOW_TRACKING_URI environment
# variable wins when it is set, so the notebook can run against another server
# without edits; otherwise fall back to the local development server.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")
)
# Kept as the last expression so the cell displays the selected experiment.
mlflow.set_experiment("spam_detection")

2023/10/15 18:14:54 INFO mlflow.tracking.fluent: Experiment with name 'spam_detection' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/184009001614160443', creation_time=1697393694688, experiment_id='184009001614160443', last_update_time=1697393694688, lifecycle_stage='active', name='spam_detection', tags={}>

In [19]:
# Baseline: multinomial Naive Bayes, tracked as its own MLflow run.
with mlflow.start_run(run_name="Naive Bayes"):
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    # Score on the held-out rows; cast to a plain float before logging.
    y_pred = classifier.predict(X_test)
    accuracy = float(accuracy_score(y_true=y_test, y_pred=y_pred))

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_param("model", "Naive Bayes")

    print(f"Naive Bayes - Accuracy: {accuracy:.1%}")


NB Accuracy: 93.3%


In [20]:

from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, tracked as its own MLflow run.
with mlflow.start_run(run_name="Logistic Regression"):
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # Score on the held-out rows; cast to a plain float before logging.
    y_pred = classifier.predict(X_test)
    accuracy = float(accuracy_score(y_true=y_test, y_pred=y_pred))

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_param("model", "Logistic Regression")

print(f"Logistic Regression - Accuracy: {accuracy:.1%}")

LogReg Accuracy: 96.8%


In [23]:

# Random forest with default hyperparameters, tracked as its own MLflow run.
with mlflow.start_run(run_name="Random Forest"):
    classifier = RandomForestClassifier(random_state=42)
    classifier.fit(X_train, y_train)

    # Score on the held-out rows; cast to a plain float before logging.
    y_pred = classifier.predict(X_test)
    accuracy = float(accuracy_score(y_test, y_pred))

    # The "model" param must name the estimator trained in this run so that
    # runs can be filtered and compared by model type in the MLflow UI.
    mlflow.log_param("model", "Random Forest")
    mlflow.log_metric("accuracy", accuracy)

print(f"Random Forest - Accuracy: {accuracy:.1%}")





Random Forest - Accuracy: 97.8%


In [25]:

# Let MLflow record sklearn parameters and metrics automatically, without
# storing a model artifact for every fit.
mlflow.autolog(log_models=False)

# Full hyperparameter search space for the random forest.
FULL_PARAM_GRID = {
    "n_estimators": [10, 50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
}

# Reduced grid that keeps development iterations fast.
DEV_PARAM_GRID = {
    "n_estimators": [200],
    "max_depth": [None, 10],
    "min_samples_split": [2],
}

# Set to False to run the full (much slower) search.
USE_DEV_GRID = True
param_grid = DEV_PARAM_GRID if USE_DEV_GRID else FULL_PARAM_GRID

# Base estimator; the fixed seed keeps the forests reproducible across runs.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over param_grid with 3-fold cross-validation on all cores.
classifier = GridSearchCV(
    estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2
)
classifier.fit(X_train, y_train)


2023/10/15 18:26:30 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/10/15 18:26:30 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'fa13ed31e15046faa4ec2702d3b8f171', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   1.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   1.6s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=   0.4s


2023/10/15 18:26:36 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy of best model: 97.764%


In [None]:
# Evaluate the grid-search winner on the held-out rows and record it as a
# dedicated MLflow run.
with mlflow.start_run(run_name="Random Forest - Hyperparameter Tuning"):
    best_model = classifier.best_estimator_
    best_params = classifier.best_params_

    y_pred = best_model.predict(X_test)
    accuracy = float(accuracy_score(y_true=y_test, y_pred=y_pred))

    # Log the model label first, then the winning hyperparameters and the score.
    mlflow.log_param("model", "Random Forest CV")
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", accuracy)

print(f"Best parameters: {best_params}")
print(f"Accuracy of best model: {accuracy:.3%}")

In [None]:
# The tuned forest was trained on token counts, so on its own it cannot score
# raw text. Chain the already-fitted vectorizer in front of it so the logged
# model maps a list of strings straight to labels.
spam_pipeline = make_pipeline(v_model, best_model)

# Log inside an explicit run so the artifact has a well-defined owner and no
# implicitly created run is left active afterwards.
with mlflow.start_run(run_name="Random Forest - Model Artifact"):
    model_info = mlflow.sklearn.log_model(spam_pipeline, "spam_model")

print(model_info.artifact_path)
# %%

# Use the URI MLflow reports for the logged model instead of assembling it
# by hand from a run id.
artifact_path = model_info.model_uri
print(artifact_path)

reloaded_model = mlflow.pyfunc.load_model(artifact_path)

# %%
message = "Come subscribe to my channel"
is_spam = reloaded_model.predict([message])
# predict returns one label per input; take the single label explicitly.
print(message + " spam? " + str(bool(is_spam[0])))

# %%
# A notebook kernel has no __file__, so anchor the export paths on the
# current working directory.
THIS_FOLDER = Path.cwd()
model_dir = os.path.join(THIS_FOLDER, "model")

# Remove any previous export first: mlflow.pyfunc.save_model fails when the
# target directory already exists and is not empty.
shutil.rmtree(model_dir, ignore_errors=True)

# Pinned runtime dependencies recorded with the saved model.
pip_requirements = [
    "numpy==1.24.4",
    "pandas==1.5.3",
    "scikit-learn==1.2.2",
    "mlflow==2.5",
]


class ModelWithPreprocess(mlflow.pyfunc.PythonModel):
    """Pyfunc wrapper that validates a ``{"data": "<message>"}`` payload
    before delegating to the wrapped model."""

    def __init__(self, pipe_model):
        """Store the wrapped model.

        Args:
            pipe_model: Object exposing ``predict(list_of_str)``; it is called
                with a single-element list holding the message text.
        """
        self.model = pipe_model

    def preprocess_input(self, payload):
        """Validate the request payload and return the message text.

        Args:
            payload: Expected to be a dict with a string under the ``"data"`` key.

        Returns:
            The string stored in ``payload["data"]``.

        Raises:
            TypeError: If ``payload`` is not a dict, lacks ``"data"``, or
                ``payload["data"]`` is not a string.
        """
        is_valid = (
            isinstance(payload, dict)
            and "data" in payload
            and isinstance(payload["data"], str)
        )
        if not is_valid:
            raise TypeError(
                "Request payload must be a dict in " + '{"data": "message"} format',
            )
        return payload["data"]

    def predict(self, context, model_input):
        """Return the wrapped model's prediction for one message.

        Args:
            context: Pyfunc context supplied by MLflow; not used here.
            model_input: Request payload, see ``preprocess_input``.

        Returns:
            The first (and only) element of the wrapped model's prediction.

        Raises:
            TypeError: If ``model_input`` is not a valid payload.
        """
        # Validate the payload as received. Validation never mutates it, so no
        # defensive copy is needed, and calling .copy() first would raise
        # AttributeError on non-dict input instead of the intended TypeError.
        message = self.preprocess_input(model_input)
        return self.model.predict([message])[0]


# The wrapper hands raw text to the wrapped model, so bundle the fitted
# vectorizer with the tuned forest into one text -> label pipeline.
pipe = make_pipeline(v_model, best_model)
model_w_preprocess = ModelWithPreprocess(pipe)

mlflow.pyfunc.save_model(
    path=model_dir,
    python_model=model_w_preprocess,
    code_path=[os.path.join(THIS_FOLDER, "model_code")],
    pip_requirements=pip_requirements,
    metadata={
        "model_type": "spam_classifier",
        "model_name": "spam_model",
        "model_version": "v1",
        "model_description": "Spam classifier trained on YouTube comments",
        # Timezone-aware UTC timestamp: datetime.utcnow() is deprecated and
        # returns a naive value that does not say which zone it is in.
        "trained_at": datetime.datetime.now(datetime.timezone.utc).isoformat(),
        "accuracy": accuracy,
    },
)

# Round-trip check: reload the saved model and score a sample payload.
m2 = mlflow.pyfunc.load_model(model_dir)
print("Prediction: " + str(m2.predict({"data": "Come subscribe to my channel"})))
print(m2.metadata.metadata)
