In [1]:
import pickle
import prepare_functions
import train_functions

In [2]:
# loading the saved count vectorizer
with open('bow_transformer.pickle', 'rb') as f:
    bow_transformer = pickle.load(f)


In [3]:
# loading the datasets
train_data = prepare_functions.load_data("data/train.csv", separator=',')
validation_data = prepare_functions.load_data("data/validation.csv", separator=',')
test_data = prepare_functions.load_data("data/test.csv", separator=',')

print(train_data.shape)
print(validation_data.shape)
print(test_data.shape)


(3900, 2)
(836, 2)
(836, 2)


In [4]:
# preprocessing the datasets, converting into sparse matrices
train_matrix, train_labels = prepare_functions.preprocess(train_data, bow_transformer)
validation_matrix, validation_labels = prepare_functions.preprocess(validation_data, bow_transformer)
test_matrix, test_labels = prepare_functions.preprocess(test_data, bow_transformer)

print(train_matrix.shape)
print(validation_matrix.shape)
print(test_matrix.shape)

(3900, 8731)
(836, 8731)
(836, 8731)


In [5]:
# importing modules for the three classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [6]:
# using mlflow to track and register the models
import mlflow
from urllib.parse import urlparse
from sklearn.metrics import accuracy_score, precision_score, recall_score

def eval_metrics(model, modelname):
    test_predictions = model.predict(test_matrix)

    train_score = model.score(train_matrix, train_labels)
    validation_score = model.score(validation_matrix, validation_labels)
    test_score = model.score(test_matrix, test_labels)
    
    test_accuracy = accuracy_score(test_labels, test_predictions)
    test_precision = precision_score(test_labels, test_predictions)
    test_recall = recall_score(test_labels, test_predictions)

    print(modelname)
    print("Test accuracy:", test_accuracy)
    print("Test precision: ", test_precision)
    print("Test recall: ", test_recall)

    return [
        train_score,
        validation_score,
        test_score,
        test_accuracy,
        test_precision,
        test_recall
    ]

def mlflow_run(modelclass, modelname):
    with mlflow.start_run():
        model = modelclass()
        train_functions.train_model(model, train_matrix, train_labels)
        [
            train_score,
            validation_score,
            test_score,
            test_accuracy,
            test_precision,
            test_recall
        ] = eval_metrics(model, modelname)
        
        # logging metrics
        mlflow.log_metric("train_score", train_score)
        mlflow.log_metric("validation_score", validation_score)
        mlflow.log_metric("test_score", test_score)
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.log_metric("test_precision", test_precision)
        mlflow.log_metric("test_recall", test_recall)

        # the following will store the model in mlruns
        mlflow.sklearn.log_model(model, "model", registered_model_name=modelname)
        

In [7]:
mlflow_run(MultinomialNB, "NaiveBayes")

NaiveBayes
Test accuracy: 0.9366028708133971
Test precision:  1.0
Test recall:  0.5350877192982456


Registered model 'NaiveBayes' already exists. Creating a new version of this model...
2023/02/27 22:40:47 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: NaiveBayes, version 2
Created version '2' of model 'NaiveBayes'.


In [8]:
mlflow_run(LogisticRegression, "LogisticRegression")

LogisticRegression
Test accuracy: 0.9677033492822966
Test precision:  1.0
Test recall:  0.7631578947368421


Registered model 'LogisticRegression' already exists. Creating a new version of this model...
2023/02/27 22:40:49 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: LogisticRegression, version 2
Created version '2' of model 'LogisticRegression'.


In [9]:
mlflow_run(RandomForestClassifier, "RandomForestClassifier")

RandomForestClassifier
Test accuracy: 0.9629186602870813
Test precision:  1.0
Test recall:  0.7280701754385965


Registered model 'RandomForestClassifier' already exists. Creating a new version of this model...
2023/02/27 22:40:55 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: RandomForestClassifier, version 2
Created version '2' of model 'RandomForestClassifier'.


In [10]:
# loading the saved models and printing AUCPR
# the model paths were found using: mlflow ui
from sklearn.metrics import precision_recall_curve, auc
def get_auc(model):
    test_predictions = model.predict(test_matrix)
    precision, recall, _ = precision_recall_curve(test_labels, test_predictions)
    return auc(precision, recall)

NaiveBayes = mlflow.pyfunc.load_model("runs:/11341006aabf46cd94f65572712d590a/model")
LogisticRegression = mlflow.pyfunc.load_model("runs:/a211eafb22c94b63b49575f2a2f25a28/model")
RandomForestClassifier = mlflow.pyfunc.load_model("runs:/3b873f98c1594a78b4f44a06b71224f4/model")

print("AUC for NaiveBayes: ", get_auc(NaiveBayes))
print("AUC for LogisticRegression: ", get_auc(LogisticRegression))
print("AUC for RandomForestClassifier: ", get_auc(RandomForestClassifier))

AUC for NaiveBayes:  0.662878787878788
AUC for LogisticRegression:  0.7613636363636364
AUC for RandomForestClassifier:  0.7348484848484849
