In [1]:
"""Create training dataset"""

import csv
import datetime
import glob
import os
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Input data lives in a "data" folder next to the current working directory
# (i.e. inside the parent of wherever the notebook kernel was started).
data_files_dir = Path.cwd().parent.joinpath("data")
df = pd.read_csv(data_files_dir.joinpath("comments.csv"))

# Report how many rows were loaded, then preview the first two of them.
print(f"Read total {len(df)} rows")
df.head(2)

Read total 1565 rows


Unnamed: 0,content,is_spam
0,Best Music Ever!!!﻿,0
1,please look up DHG SONGS this is my playlist w...,1


In [15]:
# Bag-of-words features: learn the vocabulary from the comment text and build
# the document-term count matrix in a single pass with fit_transform(), rather
# than tokenising the whole corpus twice via fit() followed by transform().
# NOTE(review): the vocabulary is learned from every row of df, including rows
# that the later train/test split assigns to the test set -- confirm this is
# acceptable for the evaluation being reported.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["content"])
# fit() returns the vectorizer itself, so v_model was always an alias of
# `vectorizer`; the name is kept for any code that still refers to it.
v_model = vectorizer
y = df["is_spam"]

# Dense view purely for eyeballing the counts; the sparse X is what the
# train/test split consumes.
X.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [3]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [4]:
import os

import mlflow

# Let the tracking server be overridden through the MLFLOW_TRACKING_URI
# environment variable; when it is unset, fall back to the local server this
# notebook was originally written against.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))
# Every run started below is recorded under this experiment.
mlflow.set_experiment("Spam Detection")

<Experiment: artifact_location='mlflow-artifacts:/596702777571537385', creation_time=1697432839623, experiment_id='596702777571537385', last_update_time=1697432839623, lifecycle_stage='active', name='Spam Detection', tags={}>

https://www.youtube.com/watch?v=O2L2Uv9pdDA

In [5]:
"""
https://google.com
"""
# Baseline model: multinomial Naive Bayes on the count features, tracked as a
# single MLflow run.
with mlflow.start_run(run_name="Naive Bayes"):
    classifier = MultinomialNB().fit(X_train, y_train)

    # Score the fitted model on the held-out split.
    y_pred = classifier.predict(X_test)
    accuracy = float(accuracy_score(y_true=y_test, y_pred=y_pred))

    # Record which model this run belongs to and how well it did.
    mlflow.log_param(key="model", value="Naive Bayes")
    mlflow.log_metric(key="accuracy", value=accuracy)

    print(f"Naive Bayes - Accuracy: {accuracy:.1%}")


Naive Bayes - Accuracy: 93.3%


In [6]:

from sklearn.linear_model import LogisticRegression

# Second candidate: logistic regression with scikit-learn's default settings,
# tracked as its own MLflow run.
with mlflow.start_run(run_name="Logistic Regression"):
    classifier = LogisticRegression().fit(X_train, y_train)

    # Score the fitted model on the held-out split.
    y_pred = classifier.predict(X_test)
    accuracy = float(accuracy_score(y_true=y_test, y_pred=y_pred))

    mlflow.log_param(key="model", value="Logistic Regression")
    mlflow.log_metric(key="accuracy", value=accuracy)

# Reported once the MLflow run has been closed.
print(f"Logistic Regression - Accuracy: {accuracy:.1%}")

Logistic Regression - Accuracy: 96.8%


In [8]:

from sklearn.ensemble import RandomForestClassifier

# Third candidate: a random forest with default hyperparameters (seeded so the
# result is repeatable), tracked as its own MLflow run.
with mlflow.start_run(run_name="Random Forest"):
    classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

    # Score the fitted model on the held-out split.
    y_pred = classifier.predict(X_test)
    accuracy = float(accuracy_score(y_true=y_test, y_pred=y_pred))

    mlflow.log_param(key="model", value="Random Forest")
    mlflow.log_metric(key="accuracy", value=accuracy)

# Reported once the MLflow run has been closed.
print(f"Random Forest - Accuracy: {accuracy:.1%}")



Random Forest - Accuracy: 97.8%


In [9]:
from sklearn.model_selection import GridSearchCV

# Turn on MLflow autologging for the search below; log_models=False is passed
# so that fitted models are not logged.
mlflow.autolog(log_models=False)

# Full hyperparameter search space for the random forest.
# (Trailing comments are kept from the original cell; presumably they record
# the best value found by an earlier search -- verify before relying on them.)
FULL_PARAM_GRID = {
    "n_estimators": [10, 50, 100, 200],  # 200
    "max_depth": [None, 10, 20, 30],  # None
    "min_samples_split": [2, 5, 10],  # 2
}

# Reduced grid used during development to keep the search fast.
DEV_PARAM_GRID = {
    "n_estimators": [10, 200],  # 200
    "max_depth": [None, 10],  # None
    "min_samples_split": [2],  # 2
}

# Flip to True to search the full grid. The default keeps the reduced grid,
# which is what the original cell always used: it assigned param_grid twice
# and the second (reduced) assignment silently overwrote the first.
USE_FULL_GRID = False
param_grid = FULL_PARAM_GRID if USE_FULL_GRID else DEV_PARAM_GRID

# Base estimator; seeded so repeated searches give the same forests.
rf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated grid search using all available cores (n_jobs=-1),
# with per-fit progress output (verbose=2).
classifier = GridSearchCV(
    estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2
)
classifier.fit(X_train, y_train)




2023/10/16 10:20:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2023/10/16 10:20:14 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b3a60d16521b4870b48b1b39244b74e9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] END max_depth=None, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   1.2s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   1.2s
[CV] END .max_depth=10, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END .max_depth=10, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END .max_depth=10, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   1.2s
[CV] END max_depth=10, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=10, min_samples_split

2023/10/16 10:20:23 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


In [11]:
# Evaluate the best estimator found by the grid search on the held-out split
# and record it as a separate MLflow run.
with mlflow.start_run(run_name="Best RF Model - Hyperparameter Tuning"):
    best_model = classifier.best_estimator_
    best_params = classifier.best_params_

    y_pred = best_model.predict(X_test)
    accuracy = float(accuracy_score(y_true=y_test, y_pred=y_pred))

    # Log the model label, the winning hyperparameters and the test accuracy.
    mlflow.log_param(key="model", value="Random Forest CV")
    mlflow.log_params(params=best_params)
    mlflow.log_metric(key="accuracy", value=accuracy)

# Reported once the MLflow run has been closed.
print(f"Best parameters: {best_params}")
print(f"Accuracy of best model: {accuracy:.1%}")

Best parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy of best model: 97.8%
