In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import mlflow
from mlflow.models import infer_signature
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_recall_curve, auc
from sklearn.pipeline import Pipeline

In [2]:
# Loading Train, Validation, and Test Data
msg_train = pd.read_csv('./data/train.csv')
msg_val = pd.read_csv('./data/validation.csv')
msg_test = pd.read_csv('./data/test.csv')

In [3]:
# Preprocessing Pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer())
])

In [4]:
# Fit the pipeline on training data
pipeline.fit(msg_train['message'])

In [5]:
# Prepare train, validation, and test data
X_train = pipeline.transform(msg_train['message'])
X_val = pipeline.transform(msg_val['message'])
X_test = pipeline.transform(msg_test['message'])
y_train = msg_train['label']
y_val = msg_val['label']
y_test = msg_test['label']

In [6]:
# Function to print AUCPR and plot Precision-Recall curve
def print_aucpr(y_true, y_scores):
    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    auc_pr = auc(recall, precision)
    print(f"The AUCPR is = {auc_pr}")
    plt.figure(figsize=(5, 5))
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.show()

In [7]:
def pr_auc(y_true, y_pred):
    p, r, _ = precision_recall_curve(y_true, y_pred, pos_label=1)
    return auc(r, p)

In [8]:
# Model Selection
df = pd.DataFrame(np.zeros((3, 1)), index=["Naive Bayes", "Logistic Regression", "Support Vector Machines"], columns=["AUC-PR"])

In [9]:
# Naive Bayes Classifier
with mlflow.start_run():
    model = MultinomialNB(alpha=0.05)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    signature = infer_signature(X_val, y_val)
    
    mlflow.log_params({"alpha": 0.05})
    mlflow.log_param("model_name", "Multinomial NB Model")
    mlflow.log_metrics({"AUC-PR": pr_auc(y_val, y_pred), "Accuracy": accuracy_score(y_val, y_pred), "F1-Score": f1_score(y_val, y_pred)})
    mlflow.sklearn.log_model(model, artifact_path="sklearn-model", signature=signature, registered_model_name="Multinomial NB Model")

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



MlflowException: Could not find experiment with ID 0