In [1]:
from transformers import pipeline
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
cancer_text = pd.read_csv("../data/train_data_01.csv")

In [3]:
cancer_text

Unnamed: 0,bc_diagnosis_description,ha_procedure_description,target
0,Urology other P1,NEPHRECTOMY (ADULT),non_cancer
1,Claudication - stable,"FUSION - INSTRUMENTED, PLATING - POSTERIOR CER...",non_cancer
2,Obst/Gynecology other P3,THERMABLATION - ENDOMETRIAL #2,non_cancer
3,Cleft Palate,PALATE CLOSURE FISTULA,non_cancer
4,Scar without functional restriction,CORRECTION OF ECTROPION,non_cancer
...,...,...,...
86629,Surveillance where assessment period 1 year or...,COLONOSCOPY AND UGI,non_cancer
86630,Testicular cancer - radical orchiectomy,ORCHIDECTOMY SIMPLE(SCROTAL APPROACH),non_cancer
86631,Otolaryngology other P1,EAR EUA/MYRINGOTOMY,non_cancer
86632,Knee - retained hardware - moderate to severe ...,OPEN REDUCTION INTERNAL FIXATION TIBIA,non_cancer


In [4]:
# Load zero-shot classification pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

Device set to use cpu


In [6]:
# Candidate labels
labels = ["This is a cancer surgery procedure", "This is not a cancer surgery procedure"]

In [7]:
# Add context can help the model understand the task better, play around it for a quite while and find the best one
def add_context(procedure):
    if pd.isna(procedure) or procedure.strip() == "":
        return ""
    return f"Procedure performed: {procedure}."

# Use in your pipeline
cancer_text["procedure_context"] = cancer_text["ha_procedure_description"].apply(add_context)

### Run classification

In [8]:
def classify_procedure(text):
    if pd.isna(text) or not str(text).strip():
        return "unknown", 0.0
    result = classifier(str(text), labels)
    return result["labels"][0], float(result["scores"][0])

# Get unique values
unique_texts = cancer_text["procedure_context"].dropna().unique()

# a lookup dictionary with classification results
classification_lookup = {
    text: classify_procedure(text) for text in unique_texts
}

# Step 3: Map back to the full DataFrame
cancer_text["prediction"], cancer_text["confidence"] = zip(*cancer_text["procedure_context"].map(
    lambda x: classification_lookup.get(x, ("unknown", 0.0))
))


In [61]:
unique_texts = cancer_text["procedure_context"].dropna().unique()

In [12]:
#df_unique = df.drop_duplicates(subset=["procedure"], keep="first")

cancer_text.to_csv("../data/llm_baseline_train_data_result.csv", index=False)

In [11]:
cancer_text["prediction"] = cancer_text["prediction"].replace({
    "This is a cancer surgery procedure": "non_cancer",
    "This is not a cancer surgery procedure": "cancer"
})
cancer_text["true_label"] = cancer_text["target"].apply(lambda x: "non_cancer" if x == "non_cancer" else "cancer")

y_true = cancer_text["true_label"]
y_pred = cancer_text["prediction"]

print(confusion_matrix(y_true, y_pred, labels=["cancer", "non_cancer"]))
print(classification_report(y_true, y_pred, target_names=["cancer case", "not a cancer case"]))


[[  251   730]
 [40681 44972]]
                   precision    recall  f1-score   support

      cancer case       0.01      0.26      0.01       981
not a cancer case       0.98      0.53      0.68     85653

         accuracy                           0.52     86634
        macro avg       0.50      0.39      0.35     86634
     weighted avg       0.97      0.52      0.68     86634



In [14]:
from sklearn.metrics import precision_score
precision_score(y_true, y_pred, average='macro')

0.49507953940611865