In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import mlflow
from mlflow.models import infer_signature


from dotenv import load_dotenv

# Settings
load_dotenv()
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")



In [2]:
data_path = os.environ.get("PROCESSED_DATA_PATH")
data = pd.read_csv(f"{data_path}/simple_cleaned_train.csv", index_col=0)

In [3]:
data.head()

Unnamed: 0,medical_specialty,transcription,labels,transcription_cleaned_simple
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0,reason visit high pt inr history patient year ...
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1,preoperative diagnosis acetabular fracture lef...
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1,name procedure selective coronary angiography ...
3,Radiology,"REFERRING DIAGNOSIS: , Motor neuron disease.,P...",2,referring diagnosis motor neuron disease perti...
4,Emergency Room Reports,"CHIEF COMPLAINT: , Dental pain.,HISTORY OF PRE...",0,chief complaint dental pain history present il...


# Train Test Splits

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data['transcription_cleaned_simple'], data['labels'], test_size=0.2, random_state=42, stratify=data['labels'])

# Logistic Model

In [5]:
# Basic Logistic Regression Model
base_logistic_model = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())])
# Train the model
base_logistic_model.fit(X_train, y_train)
# Predict on the training set
y_pred = base_logistic_model.predict(X_train)

# Train Set Evaluate
train_report = classification_report(y_train, y_pred, zero_division=1, output_dict=True)

In [15]:
# Predict on the test set
y_test_pred = base_logistic_model.predict(X_test)

# Test Set Evaluate
test_report = classification_report(y_test, y_test_pred, zero_division=1)
print(test_report)


              precision    recall  f1-score   support

           0       1.00      0.00      0.00        13
           1       0.35      0.58      0.44       173
           2       0.28      0.30      0.29        43
           3       1.00      0.00      0.00         7
           4       0.17      0.12      0.14        34
           5       0.16      0.09      0.11        35
           6       0.21      0.21      0.21        58
           7       0.30      0.29      0.29        62
           8       1.00      0.00      0.00        13
           9       0.00      0.00      0.00        16
          10       0.25      0.12      0.16        42
          11       1.00      0.00      0.00        14
          12       1.00      0.00      0.00         4
          13       0.18      0.15      0.16        27
          14       1.00      0.00      0.00         2
          15       1.00      0.00      0.00         9
          16       0.29      0.68      0.41        82
          17       1.00    

In [13]:
train_report

{'0': {'precision': 0.6666666666666666,
  'recall': 0.04,
  'f1-score': 0.07547169811320754,
  'support': 50},
 '1': {'precision': 0.5063613231552163,
  'recall': 0.8652173913043478,
  'f1-score': 0.6388443017656501,
  'support': 690},
 '2': {'precision': 0.5568181818181818,
  'recall': 0.5764705882352941,
  'f1-score': 0.5664739884393063,
  'support': 170},
 '3': {'precision': 1.0,
  'recall': 0.03571428571428571,
  'f1-score': 0.0689655172413793,
  'support': 28},
 '4': {'precision': 0.5555555555555556,
  'recall': 0.5147058823529411,
  'f1-score': 0.5343511450381679,
  'support': 136},
 '5': {'precision': 0.7049180327868853,
  'recall': 0.3049645390070922,
  'f1-score': 0.42574257425742573,
  'support': 141},
 '6': {'precision': 0.5281385281385281,
  'recall': 0.5281385281385281,
  'f1-score': 0.5281385281385281,
  'support': 231},
 '7': {'precision': 0.5874439461883408,
  'recall': 0.5303643724696356,
  'f1-score': 0.5574468085106382,
  'support': 247},
 '8': {'precision': 1.0,
  '

In [12]:
# Start MLflow tracking
with mlflow.start_run():
    # Log parameters
    # For example, log parameters for the TfidfVectorizer and LogisticRegression
    tfidf_params = base_logistic_model.named_steps['tfidf'].get_params()
    mlflow.log_params(tfidf_params)
    
    clf_params = base_logistic_model.named_steps['clf'].get_params()
    mlflow.log_params(clf_params)
    
    # Log metrics
    
    mlflow.log_metric("accuracy", train_report["accuracy"])
    mlflow.log_metric("macro_avg_score", train_report["macro avg"]["f1-score"])
    mlflow.log_metric("macro_avg_precision", train_report["macro avg"]["precision"])
    mlflow.log_metric("macro_avg_recall", train_report["macro avg"]["recall"])
    mlflow.log_metric("weighted_avg_score", train_report["weighted avg"]["f1-score"])
    mlflow.log_metric("weighted_avg_precision", train_report["weighted avg"]["precision"])
    mlflow.log_metric("weighted_avg_recall", train_report["weighted avg"]["recall"])

    # Convert X_train to DataFrame
    X_train_df = X_train.to_frame()

    # Log the model itself
    signature = infer_signature(X_train_df, base_logistic_model.predict(X_train))
    mlflow.sklearn.log_model(base_logistic_model, 'base_logistic_model', signature=signature, input_example=X_train_df, registered_model_name="base_logistic_model")

Successfully registered model 'base_logistic_model'.
2025/02/10 00:32:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: base_logistic_model, version 1


🏃 View run gentle-gnat-774 at: http://127.0.0.1:8080/#/experiments/0/runs/8c1a4a15596449dca169c85fc7dd2ab2
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/0


Created version '1' of model 'base_logistic_model'.


# Naive Bayes

# Support Vector Machines

In [None]:
# Base SVM Model
base_svm_model = Pipeline([('tfidf', TfidfVectorizer()), ('clf', svm.SVC())])