# Install dependencies  

In [None]:

try:
  import datasets, evaluate, accelerate
  import gradio as gr
except ModuleNotFoundError:
  !pip install -U datasets evaluate accelerate gradio
  import datasets, evaluate, accelerate
  import gradio as gr

import random

import numpy as np
import pandas as pd

import torch
import transformers

print(f"Using transformers version: {transformers.__version__}")
print(f"Using datasets version: {datasets.__version__}")
print(f"Using torch version: {torch.__version__}")

# Loading Dataset(PubMed_20k_RCT)

In [None]:
!git clone https://github.com/Franck-Dernoncourt/pubmed-rct
!ls pubmed-rct

In [None]:
!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/

In [None]:
data_dir="pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"

In [None]:
import os
filenames=[data_dir+filename for filename in os.listdir(data_dir)]
filenames

In [None]:
# Create a function that reads a file and returns its lines of text as a list
def read_file(filename):
  """Read a text file and return its contents as a list of lines.

  Args:
      filename: path of the text file to read.

  Returns:
      A list of strings, one per line of the file, each keeping its
      trailing newline character (as produced by `readlines`).
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(filename, "r", encoding="utf-8") as f:
    return f.readlines()


In [None]:
train_lines=read_file(data_dir+"train.txt")
train_lines[:20]

In [None]:
len(train_lines)

# Preprocessing function

In [None]:
def preprocess_text_with_line_numbers(filename):
  """Returns a list of dictionaries of abstract line data.

  Takes in filename, reads its contents and sorts through each line,
  extracting things like the target label, the text of the sentence,
  how many sentences are in the current abstract and what sentence number
  the target line is.

  Lines starting with "###" are treated as abstract ID lines, whitespace-only
  lines as abstract terminators, and every other line as a
  "<LABEL>\\t<sentence>" pair.

  Args:
      filename: a string of the target text file to read and extract line data
      from.

  Returns:
      A list of dictionaries each containing a line from an abstract,
      the lines label, the lines position in the abstract and the total number
      of lines in the abstract where the line is from. For example:

      [{"target": 'CONCLUSION',
        "text": "the study couldn't have gone better, turns out people are kinder than you think",
        "line_number": 8,
        "total_lines": 8}]

      "line_number" and "total_lines" both count from 0, so "total_lines" is
      the index of the last line of the abstract.

  Raises:
      IndexError: if a labelled line does not contain a tab separator.
  """
  input_lines = read_file(filename) # get all lines from filename
  abstract_samples = [] # samples collected from every abstract in the file
  current_abstract = [] # labelled sentences of the abstract currently being read

  def flush_abstract():
    """Convert the buffered abstract into samples and empty the buffer."""
    last_line_index = len(current_abstract) - 1 # total lines, counted from 0
    for abstract_line_number, abstract_line in enumerate(current_abstract):
      target_text_split = abstract_line.split("\t") # split target label from text
      abstract_samples.append({
          "target": target_text_split[0], # target label
          "text": target_text_split[1].lower(), # sentence text, lowercased
          "line_number": abstract_line_number,
          "total_lines": last_line_index,
      })
    # Emptying the buffer guarantees an abstract is never emitted twice
    # (e.g. when two blank lines follow each other).
    current_abstract.clear()

  # Loop through each line in target file
  for line in input_lines:
    if line.startswith("###"): # ID line: a new abstract begins
      flush_abstract() # keep a previous abstract that had no blank terminator
    elif line.isspace(): # blank line: the current abstract is complete
      flush_abstract()
    else: # otherwise the line contains a labelled sentence
      current_abstract.append(line.rstrip("\r\n"))

  # The file may end without a trailing blank line; keep the last abstract.
  flush_abstract()

  return abstract_samples

In [None]:
%%time
train_samples=preprocess_text_with_line_numbers(data_dir+"train.txt")
val_samples=preprocess_text_with_line_numbers(data_dir+"dev.txt")
test_samples=preprocess_text_with_line_numbers(data_dir+"test.txt")

len(train_samples),len(val_samples),len(test_samples)

In [None]:
import pandas as pd
train_df=pd.DataFrame(train_samples)
val_df=pd.DataFrame(val_samples)
test_df=pd.DataFrame(test_samples)
train_df.head(14)

In [None]:
train_sentences=train_df.text.to_list()
val_sentences=val_df.text.to_list()
test_sentences=test_df.text.to_list()
len(train_sentences),len(val_sentences),len(test_sentences)

In [None]:
#One hot encode labels
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder=OneHotEncoder(sparse_output=False)
train_labels_one_hot=one_hot_encoder.fit_transform(train_df.target.to_numpy().reshape(-1,1))
val_labels_one_hot=one_hot_encoder.transform(val_df.target.to_numpy().reshape(-1,1))
test_labels_one_hot=one_hot_encoder.transform(test_df.target.to_numpy().reshape(-1,1))
train_labels_one_hot.shape,val_labels_one_hot.shape,test_labels_one_hot.shape


In [None]:
import tensorflow as tf
train_lines_numbers_one_hot=tf.one_hot(train_df.line_number,depth=15)
train_total_lines_one_hot=tf.one_hot(train_df.total_lines,depth=20)

In [None]:
#Extract labels ("target" columns) and encode them into numbers

from sklearn.preprocessing import LabelEncoder
label_encoder=LabelEncoder()
train_labels=label_encoder.fit_transform(train_df.target)
val_labels=label_encoder.transform(val_df.target)
test_labels=label_encoder.transform(test_df.target)

In [None]:
num_classes=len(label_encoder.classes_)
class_names=label_encoder.classes_
num_classes,class_names

In [None]:
train_sentences_10_percent=train_sentences[:int(len(train_sentences)*0.1)]
train_labels_10_percent=train_labels[:int(len(train_labels)*0.1)]
train_lines_numbers_10_percent=train_lines_numbers_one_hot[:int(len(train_lines_numbers_one_hot)*0.1)]
train_total_lines_10_percent=train_total_lines_one_hot[:int(len(train_total_lines_one_hot)*0.1)]
len(train_sentences_10_percent),len(train_labels_10_percent),len(train_lines_numbers_10_percent),len(train_total_lines_10_percent)

In [None]:
# creating a dict with text,label,line_number and total lines
dataset=dict()
dataset["text"]=train_sentences_10_percent
dataset["label"]=train_labels_10_percent
dataset["lines_numbers"]=train_lines_numbers_10_percent
dataset["total_lines"]=train_total_lines_10_percent

In [None]:
# converting the dict into datasets with test split 0.2
from datasets import Dataset
dataset=Dataset.from_dict(dataset)
dataset=dataset.train_test_split(test_size=0.2,shuffle=False)
dataset

# Tokenizing

In [None]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained(pretrained_model_name_or_path="distilbert/distilbert-base-uncased",
                                        use_fast=True)
tokenizer

In [None]:
def tokenize_text(examples):
  """
  Tokenize the "text" entry of `examples` with the notebook-level tokenizer.

  Padding and truncation are both enabled; the tokenizer's output is
  returned unchanged.
  """
  tokenizer_options = {"padding": True, "truncation": True}
  texts = examples["text"]
  return tokenizer(texts, **tokenizer_options)

In [None]:
# map our tokenize function to the dataset
tokenized_dataset=dataset.map(function=tokenize_text,
                              batched=True,
                              batch_size=1000)
tokenized_dataset

In [None]:
# Let's visualize one randomly chosen example from the tokenized training split
import random

# randrange excludes its upper bound, so the index is always valid, and the
# result gets its own name so the `random` module is not overwritten by an int.
random_index=random.randrange(len(tokenized_dataset["train"]))
train_data_sample=tokenized_dataset['train'][random_index]
for key,value in train_data_sample.items():
  print(f" {key} : {value}")

In [None]:
import evaluate
from typing import Tuple
import numpy as np

evaluate_metrics=evaluate.load("accuracy")
def evaluate_matrics(predictions_labels: Tuple[np.array,np.array]):
  """
  Compute the metric loaded in `evaluate_metrics` for a (predictions, labels) pair.

  When the predictions array has two or more dimensions it is reduced with
  argmax over axis 1 before being compared with the labels; otherwise it is
  used as-is.
  """
  raw_predictions, references = predictions_labels

  needs_argmax = len(raw_predictions.shape) >= 2
  predicted_ids = np.argmax(raw_predictions, axis=1) if needs_argmax else raw_predictions

  return evaluate_metrics.compute(predictions=predicted_ids, references=references)

In [None]:
# create a dict id2label for classnames and label2id
id2label={idx:label for idx,label in enumerate(class_names)}
label2id={label:idx for idx, label in id2label.items()}
id2label,label2id

# Build the Model

In [None]:
from transformers import AutoModelForSequenceClassification

# Setup model for fine-tuning with classification head(top layers of network)

# The size of the classification head is derived from the label mapping so it
# cannot drift out of sync with `id2label` / `label2id`.
model=AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)

In [None]:
# Inspect the model
model

In [None]:
from pathlib import Path

# create a directory
models_dir=Path("models")
models_dir.mkdir(exist_ok=True)

# create a model save name
model_save_name="SkimLit_Med"

# create model save path
model_save_dir=Path(models_dir,model_save_name)

model_save_dir

In [None]:
from transformers import TrainingArguments

training_args=TrainingArguments(
    output_dir=model_save_dir,
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=0.0001,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=3,
    use_cpu=False,
    seed=42,
    load_best_model_at_end=True,
    logging_strategy="epoch",
    report_to="none"
)

In [None]:
from transformers import Trainer

#Setup Trainer
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=evaluate_matrics
)

In [None]:
results=trainer.train()

In [None]:
# Inspect training metrics
for key,value in results.metrics.items():
  print(f"{key}:{value}")

In [None]:
# Save the model
trainer.save_model(output_dir=model_save_dir)

In [None]:
# Get training history
trainer_history_all=trainer.state.log_history
trainer_history_metrics=trainer_history_all[:-1]
training_history_training_time=trainer_history_all[-1]

trainer_history_metrics[:4]

In [None]:
import pprint

# Split the logged history into training entries and evaluation entries.
# An entry is treated as an evaluation entry when any of its keys contains "eval".
trainer_history_training_set=[]
trainer_history_eval_set=[]

for log_entry in trainer_history_metrics:
  has_eval_key=any("eval" in key for key in log_entry)
  destination=trainer_history_eval_set if has_eval_key else trainer_history_training_set
  destination.append(log_entry)

# Show the first three items in each metric set
pprint.pprint(trainer_history_training_set[:3])
pprint.pprint(trainer_history_eval_set[:3])

In [None]:
# create pandas dataframe for the training and evaluation metrics
trainer_history_training_df=pd.DataFrame(trainer_history_training_set)
trainer_history_eval_df=pd.DataFrame(trainer_history_eval_set)

trainer_history_training_df.head()

In [None]:
# Plot training and evaluate loss

import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
plt.plot(trainer_history_eval_df["epoch"],trainer_history_eval_df["eval_loss"],label="Evaluation loss")
plt.plot(trainer_history_training_df["epoch"],trainer_history_training_df["loss"],label="Training loss")
plt.xlabel("Epoch")
plt.title("Text classification with DistilBert training and evaluation loss over time")
plt.legend()
plt.show()

In [None]:
# Save our model to Hugging face hub
model_upload_url=trainer.push_to_hub(

    commit_message="Uploading skimed sentence classifier model"

)

In [None]:
model_upload_url

In [None]:
predictions_all=trainer.predict(tokenized_dataset['test'])
prediction_values=predictions_all.predictions
prediction_metrics=predictions_all.metrics
prediction_metrics

In [None]:
import torch
from sklearn.metrics import accuracy_score

# 1.Calculate the prediction probability
pred_probs=torch.softmax(torch.tensor(prediction_values),dim=1)

# 2. Get the predicted labels
pred_labels=torch.argmax(pred_probs,dim=1)

# 3.Get the true labels
true_labels=dataset['test']['label']

# 4. Compare predicted labels to true labels
test_accuracy=accuracy_score(y_true=true_labels,
                             y_pred=pred_labels)

print(f"accuracy :{test_accuracy*100}")

In [None]:
# Make a DataFrame of test predictions
test_predictions_df=pd.DataFrame({
    "text":dataset["test"]["text"],
    "true_labels":true_labels,
    "pred_label":pred_labels,
    "pred_prob":torch.max(pred_probs,dim=1).values
})

test_predictions_df.head()

In [None]:
# Show 10 examples with low prediction probability
test_predictions_df.sort_values("pred_prob",ascending=True).head(10)

In [None]:
# Set up the Hugging Face Hub model path
huggingface_model_path="gokulan006/SkimLit_Med"


In [None]:
def set_device():
  """
  Pick the torch device to run on.

  Preference order: CUDA when available, then MPS (Mac) when it is both
  available and built, otherwise CPU.

  Returns:
      A `torch.device` for "cuda", "mps" or "cpu".
  """
  if torch.cuda.is_available():
    return torch.device("cuda")

  # Only consulted when CUDA is not available.
  mps_backend = torch.backends.mps
  if mps_backend.is_available() and mps_backend.is_built():
    return torch.device("mps")

  return torch.device("cpu")


DEVICE=set_device()
print(DEVICE)

In [None]:
import torch
from transformers import pipeline

BATCH_SIZE=32

SkimLit_Med=pipeline(task="text-classification",
                                  model=huggingface_model_path,
                                  device=DEVICE,
                                  top_k=1,
                                  batch_size=BATCH_SIZE)

SkimLit_Med

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Ensure you have the required NLTK tokenizer
nltk.download('punkt_tab')

# Input abstract
abstract = """Home Health Pilot Decreases Readmissions in High Risk Ileostomy PatientsReadmissions to the hospital have come under scrutiny in a new healthcare era. At our institution UHC all-cause 30-day readmission for ileostomies ranged from 15 to 33% (mean 18%). Because ileostomy patients are a high-risk group for readmission, they are an ideal cohort for improvement. The purpose of this pilot was to develop a partnership with a home health agency VNHS in the form of standardized discharge/home health orders that included triggers that would elicit communication back to the surgeon so that a corrective action could be taken before the patient required a readmission. Our goal was to reduce UHC all-cause 30-day readmission in ileostomy patients by 15% (from 18 to 15.3%) in 5 months. The standard order set was vetted and agreed upon by all the stakeholders and implemented. A weekly 15-minute conversation was implemented between the EUH team and VNHS. Because it appeared to be a successful intervention, it was extended to a full year. The readmission rate for VNHS ileostomy patients decreased from 19 to 7%. During the same time, non-VNHS ileostomy patients were receiving standard of care and their readmission rate remained stable, 16 to 20%. Before implementation, VNHS and non-VNHS ileostomy patients had similar readmission rates, 19% and 16%, respectively. During the study period, the total sum cost of readmissions for non-VNHS patients receiving standard of care increased by 58.3%. For patients in the pilot, the readmission costs decreased by 77.6%. In conclusion, we successfully implemented a pilot program that formed a partnership with a home health agency with standardized discharge orders and decreased ileostomy UHC all-cause 30-day readmissions. The pilot was started with a small number of patients, but will be expanded based on this initial success."""

# Tokenize the abstract into sentences
sentences = sent_tokenize(abstract)

# Print each sentence (you can replace this with model inference)
for idx, sentence in enumerate(sentences):
    print(f"Sentence {idx+1}: {sentence}")


In [None]:
# Test the trained model on sentences
SkimLit_Med(sentences)

In [None]:
predicted_labels=[prediction[0]["label"] for prediction in SkimLit_Med(sentences)]
predicted_labels


In [None]:
from pathlib import Path

demos_dir=Path("../demos")
demos_dir.mkdir(exist_ok=True)

SkimLit_Med_demo_dir=Path(demos_dir,"SkimLit")
SkimLit_Med_demo_dir.mkdir(exist_ok=True)

In [None]:
%%writefile ../demos/SkimLit/app.py
import torch
import gradio as gr
import nltk
from typing import Dict, List
from transformers import pipeline
from nltk.tokenize import sent_tokenize


# Recent NLTK releases load the sentence tokenizer from the "punkt_tab"
# resource while older ones use "punkt"; fetch both so sent_tokenize works
# with whichever NLTK version gets installed.
for _tokenizer_resource in ("punkt", "punkt_tab"):
    nltk.download(_tokenizer_resource)


from functools import lru_cache

# Section headings, in the order they are written to the formatted abstract.
_SECTION_ORDER = ("BACKGROUND", "OBJECTIVE", "METHODS", "RESULTS", "CONCLUSIONS")


@lru_cache(maxsize=1)
def _load_classifier():
    """Build the sentence-classification pipeline once and reuse it on later calls."""
    return pipeline(
        task="text-classification",
        model="gokulan006/SkimLit_Med",
        batch_size=32,
        device=0 if torch.cuda.is_available() else -1,
        top_k=None
    )


def skimMed(text: str) -> str:
    """
    Takes an input abstract text and classifies it into sections like
    'OBJECTIVE', 'METHODS', 'RESULTS', 'CONCLUSIONS', and 'BACKGROUND'.

    The abstract is split into sentences, each sentence is labelled by the
    classifier, and sentences sharing a label are grouped under that label.

    Args:
        text: the raw abstract.

    Returns:
        The grouped abstract as "<LABEL>:\\n<sentences>" blocks separated by
        blank lines. Sections with no sentences are omitted. An empty string
        is returned when `text` is empty or contains only whitespace.
    """
    # Nothing to classify: skip loading the model entirely.
    if not text or not text.strip():
        return ""

    # Tokenize the abstract into sentences
    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    # The first entry of each prediction is used as the sentence's label.
    predictions = _load_classifier()(sentences)

    # Known sections come first, in reading order; setdefault keeps a label
    # outside that list from raising KeyError (it is appended after them).
    sections = {label: [] for label in _SECTION_ORDER}
    for sentence, ranked_labels in zip(sentences, predictions):
        sections.setdefault(ranked_labels[0]["label"], []).append(sentence)

    return "\n\n".join(
        f"{label}:\n" + "\n".join(lines)
        for label, lines in sections.items()
        if lines
    )

# 3. Create a Gradio interface
description = """
Medical Abstract Skimmer Automatically Organize Medical Abstracts into Structured Sections (Objective, Methods, Results, Conclusions, Background).

Fine-tuned from [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) on a [PubMed 20k RCT Dataset](https://github.com/Franck-Dernoncourt/pubmed-rct/tree/master/PubMed_20k_RCT).
"""

demo = gr.Interface(
    fn=skimMed,
    inputs=gr.Textbox(lines=10, placeholder="Enter your medical abstract here...", label="Enter Medical Abstract"),
    outputs=gr.Textbox(label="Formatted Medical Abstract"),
    title="📚🩺 Medical Abstract Skimmer 🩺📚",
    description=description,
    theme="soft",
    examples=[
        ["Telemedicine has emerged as a promising solution to address healthcare access issues in rural populations, particularly for patients with chronic diseases. This study aimed to evaluate the effectiveness of telemedicine in managing chronic conditions in rural areas. We conducted a randomized controlled trial with 200 participants diagnosed with hypertension or diabetes in rural areas. Participants were randomly assigned to either a telemedicine group or a usual care group. The telemedicine group received remote consultations, medication management, and monitoring through digital health platforms, while the usual care group continued standard in-person visits. Both groups were followed for six months. The telemedicine group showed a significant improvement in blood pressure control (mean decrease of 15 mmHg) and blood sugar levels (mean decrease of 1.5% in HbA1c) compared to the usual care group, where the improvements were 5 mmHg and 0.5% in HbA1c, respectively. Additionally, the telemedicine group reported higher satisfaction levels and fewer hospital visits than the usual care group. Telemedicine proved to be an effective and acceptable approach for managing chronic diseases in rural populations, demonstrating improvements in both clinical outcomes and patient satisfaction. The findings support the broader implementation of telemedicine in rural healthcare settings."]
    ],
    allow_flagging="never"
)

# 4. Launch the interface
if __name__ == "__main__":
    demo.launch()

In [None]:
%%writefile ../demos/SkimLit/requirements.txt
gradio
torch
transformers
nltk

In [None]:
%%writefile ../demos/SkimLit/README.md
---
title: Medical Abstract Skimmer
emoji:  📚🩺
colorFrom: blue
colorTo: yellow
sdk: gradio
app_file: app.py
pinned: false
license: apache-2.0
---

#  SkimLit_med

Medical Abstract Skimmer automatically organizes medical abstracts into structured sections (Objective, Methods, Results, Conclusions, Background).

Fine-tuned from [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) on a [PubMed 20k RCT Dataset](https://github.com/Franck-Dernoncourt/pubmed-rct/tree/master/PubMed_20k_RCT).



In [None]:
!ls ../demos/SkimLit/

In [None]:
from huggingface_hub import (
    create_repo,
    get_full_repo_name,
    upload_file,
    upload_folder
)


# Settings for the demo upload
LOCAL_DEMO_FOLDER_PATH_TO_UPLOAD = "../demos/SkimLit"
HF_TARGET_SPACE_NAME = "SkimLit"
HF_REPO_TYPE = "space"
HF_SPACE_SDK = "gradio"

# Create the Space repository on Hugging Face Hub.
# exist_ok=True makes re-running this cell safe when the Space already exists.
print(f"[INFO] Creating repo on Hugging Face Hub with name: {HF_TARGET_SPACE_NAME}")
create_repo(
    repo_id=HF_TARGET_SPACE_NAME,
    repo_type=HF_REPO_TYPE,
    space_sdk=HF_SPACE_SDK,
    exist_ok=True
)

# Resolve the "<namespace>/<name>" form of the repo for the logged-in account
full_hf_repo_name = get_full_repo_name(model_id=HF_TARGET_SPACE_NAME)
print(f"[INFO] Full Hugging Face Hub repo name: {full_hf_repo_name}")

# Upload our demo folder to the repo resolved above (not a hard-coded account)
print(f"[INFO] Uploading {LOCAL_DEMO_FOLDER_PATH_TO_UPLOAD} to repo: {full_hf_repo_name}")
folder_upload_url = upload_folder(
    repo_id=full_hf_repo_name,
    folder_path=LOCAL_DEMO_FOLDER_PATH_TO_UPLOAD,
    path_in_repo=".",
    repo_type=HF_REPO_TYPE,
    commit_message="Uploading SkimLit demo app.py"
)
print(f"[INFO] Demo folder successfully uploaded with commit URL: {folder_upload_url}")