In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import model_selection, naive_bayes
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline



from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
data_path = os.environ.get("PROCESSED_DATA_PATH")
data = pd.read_csv(f"{data_path}/simple_cleaned_train.csv", index_col=0)

In [3]:
data.head()

Unnamed: 0,medical_specialty,transcription,labels,transcription_cleaned_simple
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0,reason visit high pt inr history patient year ...
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1,preoperative diagnosis acetabular fracture lef...
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1,name procedure selective coronary angiography ...
3,Radiology,"REFERRING DIAGNOSIS: , Motor neuron disease.,P...",2,referring diagnosis motor neuron disease perti...
4,Emergency Room Reports,"CHIEF COMPLAINT: , Dental pain.,HISTORY OF PRE...",0,chief complaint dental pain history present il...


# Train Test Splits

In [4]:
# Training and testing data split
X_train, X_test, y_train, y_test = train_test_split(data['transcription_cleaned_simple'], data['labels'], test_size=0.2, random_state=42, stratify=data['labels'])

# Cross-validation setup
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Logistic Model

In [9]:
# Define the base logistic regression model
base_logistic_pipeline = Pipeline(
    [("tfidf", TfidfVectorizer()), ("clf", LogisticRegression())]
)

# Evaluate using cross-validation 
f1_scores = cross_val_score(
    base_logistic_pipeline, X_train, y_train, cv=cv, scoring="f1_macro"
)

print("Cross-validated F1 scores:", f1_scores)
print("Average F1 score:", f1_scores.mean())



# Train the model on the full training data and evaluate on the test set
base_logistic_pipeline.fit(X_train, y_train)

# Predict on the test set
y_test_pred = base_logistic_pipeline.predict(X_test)


# Test Set Evaluate
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))


Cross-validated F1 scores: [0.07987754 0.07580094 0.07729746]
Average F1 score: 0.07765864456475817
Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00        13
           1       0.35      0.58      0.44       173
           2       0.28      0.30      0.29        43
           3       1.00      0.00      0.00         7
           4       0.17      0.12      0.14        34
           5       0.16      0.09      0.11        35
           6       0.21      0.21      0.21        58
           7       0.30      0.29      0.29        62
           8       1.00      0.00      0.00        13
           9       0.00      0.00      0.00        16
          10       0.25      0.12      0.16        42
          11       1.00      0.00      0.00        14
          12       1.00      0.00      0.00         4
          13       0.18      0.15      0.16        27
          14       1.00      0.00      0.00         2
       

# Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer



# Define vectorizers to compare
vectorizers = {
    "TF-IDF": TfidfVectorizer(max_features=10000),
    "Count": CountVectorizer()
}

for name, vectorizer in vectorizers.items():
    print(f"\n--- {name} Vectorizer + Naive Bayes ---")
    
    # Create pipeline
    pipeline = Pipeline([
        ("vectorizer", vectorizer),
        ("clf", MultinomialNB(alpha=0.5))
    ])
    
    # Cross-validation F1 scores
    f1_scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring="f1_macro")
    print("Cross-validated F1 scores:", f1_scores)
    print("Average F1 score:", f1_scores.mean())
    
    # Fit and evaluate on the test set
    pipeline.fit(X_train, y_train)
    y_test_pred = pipeline.predict(X_test)
    
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred, zero_division=1))



--- TF-IDF Vectorizer + Naive Bayes ---
Cross-validated F1 scores: [0.0482423  0.04867372 0.0510881 ]
Average F1 score: 0.04933470622565611
Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00        13
           1       0.41      0.90      0.56       173
           2       0.25      0.14      0.18        43
           3       1.00      0.00      0.00         7
           4       0.36      0.26      0.31        34
           5       0.00      0.00      0.00        35
           6       0.17      0.05      0.08        58
           7       0.37      0.24      0.29        62
           8       1.00      0.00      0.00        13
           9       1.00      0.00      0.00        16
          10       1.00      0.00      0.00        42
          11       1.00      0.00      0.00        14
          12       1.00      0.00      0.00         4
          13       0.00      0.00      0.00        27
          14       1

# Support Vector Machines

In [7]:
# Base SVM Model
base_svm_pipeline = Pipeline(
    [("tfidf", TfidfVectorizer()), ("clf", SVC(C=1.0, kernel="rbf", gamma="auto"))]
)


# Evaluate using cross-validation
f1_scores = cross_val_score(
    base_svm_pipeline, X_train, y_train, cv=cv, scoring="f1_macro"
)

print("Cross-validated F1 scores:", f1_scores)
print("Average F1 score:", f1_scores.mean())


# Train the model on the full training data and evaluate on the test set
base_svm_pipeline.fit(X_train, y_train)

# Predict on the test set
y_test_pred = base_svm_pipeline.predict(X_test)


# Test Set Evaluate
print("Test Classification Report:")
print(classification_report(y_test, y_test_pred, zero_division=1))


Cross-validated F1 scores: [0.00892164 0.00892857 0.00892857]
Average F1 score: 0.008926262514315268
Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00        13
           1       0.22      1.00      0.36       173
           2       1.00      0.00      0.00        43
           3       1.00      0.00      0.00         7
           4       1.00      0.00      0.00        34
           5       1.00      0.00      0.00        35
           6       1.00      0.00      0.00        58
           7       1.00      0.00      0.00        62
           8       1.00      0.00      0.00        13
           9       1.00      0.00      0.00        16
          10       1.00      0.00      0.00        42
          11       1.00      0.00      0.00        14
          12       1.00      0.00      0.00         4
          13       1.00      0.00      0.00        27
          14       1.00      0.00      0.00         2
      