In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import openai
from sklearn import preprocessing
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
import os.path
from collections import defaultdict
from imodelsx import LinearFinetuneClassifier, LinearNgramClassifier

# Load the OpenAI API key from a local file. The location defaults to the
# original key file and can be overridden through the OPENAI_KEY_PATH
# environment variable; the `with` block closes the file handle after reading.
OPENAI_KEY_PATH = os.environ.get("OPENAI_KEY_PATH", "/home/chansingh/.OPENAI_KEY")
with open(OPENAI_KEY_PATH) as key_file:
    openai.api_key = key_file.read().strip()

# Dataset used by the cells below (presumably already cleaned, judging by the
# file name -- TODO confirm).
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("../data/data_clean.pkl")

# Label columns of `df` that can be passed as `lab` to get_classification_data.
LABELS = [
    "categorization___chief_complaint",
    "categorization___specialty",
    "categorization___purpose",
    "categorization___system",
    "categorization___disease",
]

In [None]:
def get_classification_data(
    lab="categorization___chief_complaint",
    random_state=42,
    min_class_count=20,
    test_size=0.25,
):
    """Build a multi-label train/test split from the module-level ``df``.

    The label column ``lab`` is expected to hold one list of class names per
    row. Classes occurring fewer than ``min_class_count`` times are dropped,
    the remaining labels are binarized, and rows without raw paper text are
    removed before splitting.

    Unlike the previous version, the global ``df`` is only read, never
    modified, so its label column keeps the rare classes.

    Parameters
    ----------
    lab : str
        Name of the label column in ``df``.
    random_state : int
        Seed forwarded to ``train_test_split``.
    min_class_count : int
        Minimum number of occurrences for a class to be kept.
    test_size : float
        Fraction of rows held out for testing.

    Returns
    -------
    X_train, X_test : list
        Raw paper texts for the train and test rows.
    y_train, y_test : array
        Binary indicator rows aligned with ``X_train`` / ``X_test``.
    classes : array
        Class names in the column order of the indicator arrays.
    """
    labels = df[lab]

    # Class frequencies are counted over every row of the frame, including
    # rows that are dropped further down for having no text.
    counts = labels.explode().value_counts()
    top_classes = set(counts.index[counts.values >= min_class_count])

    # Filter into a new Series so the global frame keeps its original labels.
    labels_top = labels.apply(lambda row: [c for c in row if c in top_classes])

    # label binarizer
    binarizer = MultiLabelBinarizer()
    y = binarizer.fit_transform(labels_top)

    # Input text: the raw paper text. (An earlier variant assembled the text
    # from title, description and keywords instead.)
    text = df["paper___raw_text"]
    has_text = text.notna().to_numpy()
    X = text[has_text].tolist()
    y = y[has_text]

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, binarizer.classes_

# Build the shared train/test split for the first label column in LABELS
# (the chief-complaint categorization); the cells below reuse these names.
X_train, X_test, y_train, y_test, classes = get_classification_data(lab=LABELS[0])

In [8]:
def get_model(model_name="decision_tree", random_state=42):
    """Return an unfitted text classifier selected by name.

    Supported names:

    - ``"decision_tree"``: TF-IDF features followed by a decision tree.
    - ``"random_forest"``: TF-IDF features followed by a random forest.
    - ``"logistic"``: TF-IDF features followed by a cross-validated logistic
      regression wrapped in ``MultiOutputClassifier``.
    - ``"bert-base-uncased"``: ``LinearFinetuneClassifier`` on the
      ``bert-base-uncased`` checkpoint, wrapped in ``MultiOutputClassifier``.

    Parameters
    ----------
    model_name : str
        One of the names listed above.
    random_state : int
        Seed passed to the underlying estimator.

    Returns
    -------
    An estimator exposing ``fit`` / ``predict``.

    Raises
    ------
    ValueError
        If ``model_name`` is not supported. This previously fell through
        every branch and failed with an ``UnboundLocalError`` on ``pipe``.
    """
    supported = ("decision_tree", "random_forest", "logistic", "bert-base-uncased")
    if model_name not in supported:
        raise ValueError(
            f"Unknown model_name {model_name!r}; expected one of {supported}"
        )

    if model_name == "bert-base-uncased":
        # This model takes the raw text directly, so there is no TF-IDF step.
        return MultiOutputClassifier(
            LinearFinetuneClassifier(
                checkpoint="bert-base-uncased",
                normalize_embs=False,
                random_state=random_state,
                # presumably where computed embeddings are cached between runs
                cache_embs_dir=os.path.expanduser("~/.cache_mdcalc_embeddings"),
            )
        )

    # The remaining models share one TF-IDF front end and differ only in the
    # final estimator.
    if model_name == "decision_tree":
        clf = DecisionTreeClassifier(random_state=random_state)
    elif model_name == "random_forest":
        clf = RandomForestClassifier(random_state=random_state)
    else:  # "logistic"
        # Only the logistic model is wrapped in MultiOutputClassifier;
        # presumably the tree estimators accept the 2-D indicator target
        # directly -- TODO confirm.
        clf = MultiOutputClassifier(LogisticRegressionCV(random_state=random_state))

    return Pipeline([("tfidf", TfidfVectorizer()), ("clf", clf)])


# Fit each candidate model on the shared split and collect its averaged
# test-set metrics: one list per column name, one entry per model.
# NOTE(review): this rebinds `df` from the dataset to a results accumulator,
# so get_classification_data cannot be re-run after this cell without
# reloading the data first.
df = defaultdict(list)
for model_name in ("bert-base-uncased", "decision_tree", "logistic"):
    print(model_name)

    # train on the training split, predict the held-out split
    m = get_model(model_name)
    m.fit(X_train, y_train)
    y_pred = m.predict(X_test)

    df["model_name"].append(model_name)

    # per-class and averaged scores as a nested dict
    rep = classification_report(
        y_test,
        y_pred,
        target_names=classes,
        output_dict=True,
        zero_division=0,
    )

    # keep only the micro- and macro-averaged scores
    for avg in ("micro", "macro"):
        avg_scores = rep[f"{avg} avg"]
        for metric in ("precision", "recall", "f1-score"):
            df[f"{avg}_{metric}"].append(avg_scores[metric])

bert-base-uncased


100%|██████████| 150/150 [00:05<00:00, 25.31it/s]
100%|██████████| 150/150 [00:05<00:00, 25.05it/s]
100%|██████████| 150/150 [00:05<00:00, 25.13it/s]
100%|██████████| 150/150 [00:05<00:00, 25.31it/s]
100%|██████████| 150/150 [00:05<00:00, 25.28it/s]
100%|██████████| 150/150 [00:05<00:00, 25.33it/s]
100%|██████████| 150/150 [00:05<00:00, 25.08it/s]
100%|██████████| 150/150 [00:05<00:00, 25.14it/s]
100%|██████████| 150/150 [00:06<00:00, 24.49it/s]
100%|██████████| 150/150 [00:06<00:00, 24.77it/s]
100%|██████████| 150/150 [00:06<00:00, 24.56it/s]
100%|██████████| 150/150 [00:06<00:00, 24.73it/s]
100%|██████████| 150/150 [00:06<00:00, 24.40it/s]
100%|██████████| 150/150 [00:06<00:00, 24.08it/s]
100%|██████████| 150/150 [00:06<00:00, 24.56it/s]
100%|██████████| 150/150 [00:06<00:00, 24.48it/s]
100%|██████████| 150/150 [00:06<00:00, 24.58it/s]
100%|██████████| 150/150 [00:06<00:00, 24.53it/s]
100%|██████████| 150/150 [00:06<00:00, 24.57it/s]
100%|██████████| 150/150 [00:06<00:00, 24.29it/s]


decision_tree
logistic


In [6]:
# Turn the accumulated metric lists into a table: one row per model,
# one column per collected key.
df = pd.DataFrame(df)

In [7]:
# Display the results with 2-decimal formatting; axis=None computes the
# colour gradient over the whole table rather than per row or per column.
df.style.format(precision=2).background_gradient(cmap='Blues', axis=None)

Unnamed: 0,model_name,micro_precision,micro_recall,micro_f1-score,macro_precision,macro_recall,macro_f1-score
0,bert-base-uncased,0.66,0.13,0.21,0.25,0.08,0.11
1,decision_tree,0.34,0.25,0.29,0.26,0.19,0.21
2,logistic,0.82,0.22,0.35,0.45,0.15,0.21
