#### Import Required Libraries

In [1]:
import re
import pandas as pd
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
from sklearn.model_selection import ParameterGrid
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# NLTK resources needed by this notebook:
#   - 'stopwords' and 'wordnet' back the stop-word filter and the lemmatizer.
#   - nltk.word_tokenize (called in preprocess_text) additionally needs the
#     Punkt tokenizer data; without it a fresh environment raises LookupError.
#     Newer NLTK releases look for 'punkt_tab', older ones for 'punkt', so both
#     are requested (an id unknown to the installed version is just reported
#     by the downloader, it does not raise).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

import warnings
# Silence FutureWarning noise from the ML libraries in the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saisa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Module-level preprocessing helpers, built once and reused for every message.
# English stop words are held in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))
# Lemmatizer instance shared by all calls to preprocess_text.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Preprocess a single text message:
    1. Convert to lowercase
    2. Strip every character that is not a letter, digit or whitespace
    3. Tokenize the text
    4. Remove stopwords
    5. Lemmatize the tokens
    6. Join tokens back into a string

    Parameters
    ----------
    text : str
        Raw message. Missing values (``None`` or a float NaN, as pandas
        produces for empty CSV cells) are treated as an empty message;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        Space-separated, lemmatized tokens; ``''`` when nothing is left
        after cleaning.
    """
    # Missing values would otherwise crash on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Remove special characters (letters, digits and whitespace are kept)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Nothing left to tokenize: the result is the empty string either way,
    # so skip the tokenizer.
    if not text.strip():
        return ''

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into a string
    return ' '.join(tokens)

#### Load and Prepare Data

In [3]:
# Read the three pre-split CSV files (train / validation / test).
train_data = pd.read_csv("datasets_processed/train.csv")
val_data = pd.read_csv("datasets_processed/validation.csv")
test_data = pd.read_csv("datasets_processed/test.csv")

# For each split: features are the cleaned "message" column, targets are "label".
X_train, y_train = train_data["message"].apply(preprocess_text), train_data["label"]
X_val, y_val = val_data["message"].apply(preprocess_text), val_data["label"]
X_test, y_test = test_data["message"].apply(preprocess_text), test_data["label"]

#### Define Hyperparameter Grids and Configure MLflow

In [4]:
# Hyperparameter search space, one grid per model name.
# Keys carry the 'clf__' prefix so they can be applied to a pipeline step
# named 'clf' through set_params.
param_grids = {}

# Logistic regression: regularisation strength and solver.
param_grids["LogisticRegression"] = {
    'clf__C': [0.1, 1, 10],
    'clf__solver': ['liblinear', 'saga'],
}

# Random forest: number of trees and depth limit (None = unlimited).
param_grids["RandomForest"] = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10],
}

# XGBoost: learning rate and tree depth.
param_grids["XGBoost"] = {
    'clf__learning_rate': [0.1, 0.01],
    'clf__max_depth': [3, 5],
}

# MLflow setup
# The tracking URI has to be configured BEFORE the experiment is selected:
# set_experiment() looks the experiment up (and creates it if missing) in
# whichever tracking store is active at call time, so calling it first would
# resolve the experiment against the default store instead of this one.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable project directory so the notebook runs elsewhere.
mlflow.set_tracking_uri("file:///g:/Desktop/CMI_DS/semester_IV/applied_machine_learning/assignments/assignment_2/mlruns")
mlflow.set_experiment("SMS_Spam_Classification")

2025/03/04 20:06:46 INFO mlflow.tracking.fluent: Experiment with name 'SMS_Spam_Classification' does not exist. Creating a new experiment.


#### Train and Track Models with MLflow

In [5]:
def train_and_tune(models, param_grids):
    """
    Grid-search every model inside a TF-IDF pipeline and track each fit in MLflow.

    For each entry of ``models`` a parent MLflow run is opened. Every
    combination produced by ``ParameterGrid(param_grids[model_name])`` is
    fitted on the notebook-level ``X_train`` / ``y_train``, scored on
    ``X_val`` / ``y_val`` with average precision (AUCPR), and logged and
    registered as ``SpamModel_<model_name>`` in its own child run. The
    best-scoring pipeline is then logged once more as ``best_model`` in the
    parent run.

    Parameters
    ----------
    models : dict
        Maps a model name to an unfitted classifier that supports
        ``predict_proba``. The passed estimators are never fitted directly;
        each parameter combination works on a fresh clone.
    param_grids : dict
        Maps the same model names to a parameter grid whose keys address the
        pipeline step ``clf`` (e.g. ``clf__C``).

    Returns
    -------
    dict
        Model name -> the fitted ``Pipeline`` that achieved the highest
        validation AUCPR for that model.

    Raises
    ------
    ValueError
        If a model's grid yields no parameter combination with a usable score.
    """
    # Local import keeps this cell self-contained; scikit-learn is already a
    # dependency of the notebook.
    from sklearn.base import clone

    # Loop-invariant MLflow metadata inputs, built once instead of per fit.
    # NOTE(review): the signature/input example describe a DataFrame with a
    # "message" column, while the pipeline itself is fitted on a Series of
    # strings -- confirm that whatever serves the model passes matching input.
    input_example_df = pd.DataFrame({"message": [X_train.iloc[0]]})
    signature_input_df = pd.DataFrame({"message": X_train})
    train_texts = X_train.tolist()

    best_models = {}
    for model_name, estimator in models.items():
        print("\n" + "="*50)
        print(f"Training and Tuning for Model: {model_name}")
        print("="*50)
        with mlflow.start_run(run_name=f"{model_name}_Tuning", nested=True):
            # -inf (rather than 0) so a combination scoring exactly 0 is still
            # recorded as the best one seen so far.
            best_score = float("-inf")
            best_params = None
            best_pipeline = None
            best_signature = None
            for params in ParameterGrid(param_grids[model_name]):
                with mlflow.start_run(run_name=f"{model_name}_Combination", nested=True):
                    # Build a fresh pipeline for every combination. Re-fitting
                    # one shared pipeline object would make `best_pipeline`
                    # silently point at whichever combination was fitted last.
                    pipeline = Pipeline([
                        ('tfidf', TfidfVectorizer()),
                        ('clf', clone(estimator))
                    ])
                    pipeline.set_params(**params)
                    pipeline.fit(X_train, y_train)
                    signature = infer_signature(signature_input_df, pipeline.predict(train_texts))
                    y_proba = pipeline.predict_proba(X_val)
                    # Column 1 holds the probability of the positive class.
                    aucpr = average_precision_score(y_val, y_proba[:, 1])
                    y_pred = pipeline.predict(X_val)
                    print("-"*50)
                    print(f"Parameters: {params}")
                    print(f"Validation AUCPR: {aucpr:.4f}")
                    print("Classification Report:")
                    print(classification_report(y_val, y_pred, zero_division=0))
                    mlflow.log_params(params)
                    mlflow.log_metric("val_aucpr", aucpr)
                    mlflow.sklearn.log_model(
                        pipeline,
                        "model",
                        registered_model_name=f"SpamModel_{model_name}",
                        signature=signature,
                        input_example=input_example_df
                    )
                    if aucpr > best_score:
                        best_score = aucpr
                        best_params = params
                        best_pipeline = pipeline
                        best_signature = signature
            if best_pipeline is None:
                raise ValueError(
                    f"No parameter combination produced a usable score for {model_name}; "
                    "check its entry in param_grids."
                )
            print("="*50)
            print(f"Best Parameters for {model_name}: {best_params}")
            print(f"Best Validation AUCPR: {best_score:.4f}")
            print("="*50)
            mlflow.log_params(best_params)
            mlflow.log_metric("best_val_aucpr", best_score)
            mlflow.sklearn.log_model(
                best_pipeline,
                "best_model",
                registered_model_name=f"SpamModel_{model_name}",
                signature=best_signature,
                input_example=input_example_df
            )
            best_models[model_name] = best_pipeline
    return best_models

In [6]:
# Run training and tuning
# A fixed seed makes the stochastic learners (forest bootstrapping, the
# shuffling done by the liblinear/saga solvers, XGBoost) repeatable, so a
# Restart & Run All gives comparable metrics from run to run.
RANDOM_STATE = 42
best_models = train_and_tune({
    "LogisticRegression": LogisticRegression(random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=RANDOM_STATE)
}, param_grids)


Training and Tuning for Model: LogisticRegression
--------------------------------------------------
Parameters: {'clf__C': 0.1, 'clf__solver': 'liblinear'}
Validation AUCPR: 0.9109
Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       718
           1       0.00      0.00      0.00       118

    accuracy                           0.86       836
   macro avg       0.43      0.50      0.46       836
weighted avg       0.74      0.86      0.79       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'SpamModel_LogisticRegression'.
Created version '1' of model 'SpamModel_LogisticRegression'.


--------------------------------------------------
Parameters: {'clf__C': 0.1, 'clf__solver': 'saga'}
Validation AUCPR: 0.9102
Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       718
           1       0.00      0.00      0.00       118

    accuracy                           0.86       836
   macro avg       0.43      0.50      0.46       836
weighted avg       0.74      0.86      0.79       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_LogisticRegression' already exists. Creating a new version of this model...
Created version '2' of model 'SpamModel_LogisticRegression'.


--------------------------------------------------
Parameters: {'clf__C': 1, 'clf__solver': 'liblinear'}
Validation AUCPR: 0.9605
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       718
           1       0.98      0.71      0.82       118

    accuracy                           0.96       836
   macro avg       0.97      0.85      0.90       836
weighted avg       0.96      0.96      0.95       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_LogisticRegression' already exists. Creating a new version of this model...
Created version '3' of model 'SpamModel_LogisticRegression'.


--------------------------------------------------
Parameters: {'clf__C': 1, 'clf__solver': 'saga'}
Validation AUCPR: 0.9609
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       718
           1       0.98      0.71      0.82       118

    accuracy                           0.96       836
   macro avg       0.97      0.85      0.90       836
weighted avg       0.96      0.96      0.95       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_LogisticRegression' already exists. Creating a new version of this model...
Created version '4' of model 'SpamModel_LogisticRegression'.


--------------------------------------------------
Parameters: {'clf__C': 10, 'clf__solver': 'liblinear'}
Validation AUCPR: 0.9783
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       718
           1       0.98      0.86      0.91       118

    accuracy                           0.98       836
   macro avg       0.98      0.93      0.95       836
weighted avg       0.98      0.98      0.98       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_LogisticRegression' already exists. Creating a new version of this model...
Created version '5' of model 'SpamModel_LogisticRegression'.


--------------------------------------------------
Parameters: {'clf__C': 10, 'clf__solver': 'saga'}
Validation AUCPR: 0.9785
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       718
           1       0.98      0.86      0.91       118

    accuracy                           0.98       836
   macro avg       0.98      0.93      0.95       836
weighted avg       0.98      0.98      0.98       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_LogisticRegression' already exists. Creating a new version of this model...
Created version '6' of model 'SpamModel_LogisticRegression'.


Best Parameters for LogisticRegression: {'clf__C': 10, 'clf__solver': 'saga'}
Best Validation AUCPR: 0.9785


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_LogisticRegression' already exists. Creating a new version of this model...
Created version '7' of model 'SpamModel_LogisticRegression'.



Training and Tuning for Model: RandomForest
--------------------------------------------------
Parameters: {'clf__max_depth': None, 'clf__n_estimators': 100}
Validation AUCPR: 0.9783
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       718
           1       1.00      0.81      0.90       118

    accuracy                           0.97       836
   macro avg       0.99      0.91      0.94       836
weighted avg       0.97      0.97      0.97       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'SpamModel_RandomForest'.
Created version '1' of model 'SpamModel_RandomForest'.


--------------------------------------------------
Parameters: {'clf__max_depth': None, 'clf__n_estimators': 200}
Validation AUCPR: 0.9758
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       718
           1       0.99      0.82      0.90       118

    accuracy                           0.97       836
   macro avg       0.98      0.91      0.94       836
weighted avg       0.97      0.97      0.97       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_RandomForest' already exists. Creating a new version of this model...
Created version '2' of model 'SpamModel_RandomForest'.


--------------------------------------------------
Parameters: {'clf__max_depth': 10, 'clf__n_estimators': 100}
Validation AUCPR: 0.9614
Classification Report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       718
           1       1.00      0.18      0.30       118

    accuracy                           0.88       836
   macro avg       0.94      0.59      0.62       836
weighted avg       0.90      0.88      0.85       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_RandomForest' already exists. Creating a new version of this model...
Created version '3' of model 'SpamModel_RandomForest'.


--------------------------------------------------
Parameters: {'clf__max_depth': 10, 'clf__n_estimators': 200}
Validation AUCPR: 0.9636
Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       718
           1       1.00      0.23      0.37       118

    accuracy                           0.89       836
   macro avg       0.94      0.61      0.66       836
weighted avg       0.90      0.89      0.86       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_RandomForest' already exists. Creating a new version of this model...
Created version '4' of model 'SpamModel_RandomForest'.


Best Parameters for RandomForest: {'clf__max_depth': None, 'clf__n_estimators': 100}
Best Validation AUCPR: 0.9783


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_RandomForest' already exists. Creating a new version of this model...
Created version '5' of model 'SpamModel_RandomForest'.



Training and Tuning for Model: XGBoost
--------------------------------------------------
Parameters: {'clf__learning_rate': 0.1, 'clf__max_depth': 3}
Validation AUCPR: 0.9273
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       718
           1       0.98      0.73      0.83       118

    accuracy                           0.96       836
   macro avg       0.97      0.86      0.91       836
weighted avg       0.96      0.96      0.96       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Successfully registered model 'SpamModel_XGBoost'.
Created version '1' of model 'SpamModel_XGBoost'.


--------------------------------------------------
Parameters: {'clf__learning_rate': 0.1, 'clf__max_depth': 5}
Validation AUCPR: 0.9399
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       718
           1       0.99      0.75      0.85       118

    accuracy                           0.96       836
   macro avg       0.97      0.87      0.91       836
weighted avg       0.96      0.96      0.96       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_XGBoost' already exists. Creating a new version of this model...
Created version '2' of model 'SpamModel_XGBoost'.


--------------------------------------------------
Parameters: {'clf__learning_rate': 0.01, 'clf__max_depth': 3}
Validation AUCPR: 0.8245
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       718
           1       0.98      0.45      0.62       118

    accuracy                           0.92       836
   macro avg       0.95      0.72      0.79       836
weighted avg       0.93      0.92      0.91       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_XGBoost' already exists. Creating a new version of this model...
Created version '3' of model 'SpamModel_XGBoost'.


--------------------------------------------------
Parameters: {'clf__learning_rate': 0.01, 'clf__max_depth': 5}
Validation AUCPR: 0.8629
Classification Report:
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       718
           1       0.98      0.47      0.64       118

    accuracy                           0.92       836
   macro avg       0.95      0.74      0.80       836
weighted avg       0.93      0.92      0.91       836



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_XGBoost' already exists. Creating a new version of this model...
Created version '4' of model 'SpamModel_XGBoost'.


Best Parameters for XGBoost: {'clf__learning_rate': 0.1, 'clf__max_depth': 5}
Best Validation AUCPR: 0.9399


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Registered model 'SpamModel_XGBoost' already exists. Creating a new version of this model...
Created version '5' of model 'SpamModel_XGBoost'.


In [7]:
def evaluate_on_test(client, model_name):
    """
    Evaluate the Production-stage version of a registered model on the test set.

    Loads the model from the source location of the version returned for the
    "Production" stage, scores the notebook-level ``X_test`` / ``y_test``,
    prints the AUCPR and a classification report, and returns the AUCPR.

    Parameters
    ----------
    client : mlflow.tracking.MlflowClient
        Client used to query the model registry.
    model_name : str
        Name of the registered model.

    Returns
    -------
    float
        Average precision (AUCPR) on the test set.

    Raises
    ------
    IndexError
        If no version of ``model_name`` is in the Production stage.
    """
    production_versions = client.get_latest_versions(model_name, stages=["Production"])
    if not production_versions:
        # Same exception type as indexing the empty list would raise, but with
        # a message that says what to do about it.
        raise IndexError(
            f"No version of '{model_name}' is in the Production stage; "
            "promote a version before evaluating."
        )
    model_version = production_versions[0]
    model = mlflow.sklearn.load_model(model_version.source)

    y_proba = model.predict_proba(X_test)
    # Column 1 holds the probability of the positive class.
    aucpr = average_precision_score(y_test, y_proba[:, 1])
    y_pred = model.predict(X_test)

    print(f"\n{model_name} Test Results:")
    print(f"AUCPR: {aucpr:.4f}")
    # zero_division=0 matches the validation reports printed during training.
    print(classification_report(y_test, y_pred, zero_division=0))

    return aucpr

In [8]:
# Registry client used both to pick a version to promote and to evaluate it.
client = mlflow.tracking.MlflowClient()
print("\n=== Final Test Evaluation ===")

for model_name in ["SpamModel_LogisticRegression", "SpamModel_RandomForest", "SpamModel_XGBoost"]:
    # Scan every non-archived version and keep the one whose originating run
    # logged the highest "val_aucpr"; runs without that metric count as 0 and
    # therefore can never be selected.
    best_version, best_aucpr = None, 0
    for candidate in client.search_model_versions(f"name='{model_name}'"):
        if candidate.current_stage == "Archived":
            continue
        candidate_run = client.get_run(candidate.run_id)
        candidate_aucpr = float(candidate_run.data.metrics.get("val_aucpr", 0))
        if candidate_aucpr > best_aucpr:
            best_version, best_aucpr = candidate, candidate_aucpr

    if best_version is not None:
        # Promote the winner, then score it on the held-out test split.
        client.transition_model_version_stage(
            name=model_name,
            version=best_version.version,
            stage="Production"
        )
        evaluate_on_test(client, model_name)
    else:
        print(f"No valid versions found for {model_name}")


=== Final Test Evaluation ===

SpamModel_LogisticRegression Test Results:
AUCPR: 0.9668
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       724
           1       0.97      0.88      0.92       112

    accuracy                           0.98       836
   macro avg       0.98      0.94      0.95       836
weighted avg       0.98      0.98      0.98       836


SpamModel_RandomForest Test Results:
AUCPR: 0.9638
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       724
           1       1.00      0.79      0.88       112

    accuracy                           0.97       836
   macro avg       0.98      0.89      0.93       836
weighted avg       0.97      0.97      0.97       836


SpamModel_XGBoost Test Results:
AUCPR: 0.9033
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       724
           1       0.97      0.74      0.84      