# **PYTHON SCRIPT**

In [None]:
def categorize_sentences(report):
    # Define category keywords
    technique_keywords = ["view"]  # Only keep actual imaging technique terms
    finding_keywords = ["clear", "normal", "shift", "opacity", "hila", "diaphragm", "ratio", "clip", "costophrenic angles"]
    inference_keywords = ["advice", "recommendation", "impression", "suggest", "clinical correlation"]

    # Split the report into sentences
    sentences = report.split("\n")

    categories = {"Techniques": [], "Findings": [], "Inference": []}

    for sentence in sentences:
        sentence_lower = sentence.lower()

        # Ensure imaging technique sentences are only categorized under Techniques
        if any(word in sentence_lower for word in technique_keywords):
            categories["Techniques"].append(sentence)
        elif any(word in sentence_lower for word in finding_keywords):
            categories["Findings"].append(sentence)
        elif any(word in sentence_lower for word in inference_keywords):
            categories["Inference"].append(sentence)

    return categories

# Example input
report_text = """X-RAY CHEST P.A. VIEW
No midline shift seen.
Both lung fields are clear.
Cardiothoracic ratio appears normal.
Both hila appear normal.
Bilateral domes of diaphragm & costophrenic angles appear normal.
Multiple wire clips are present.
Advice: Clinical correlation."""

# Run the categorization function
categorized_report = categorize_sentences(report_text)

# Print results
for category, sentences in categorized_report.items():
    print(f"\n{category}:")
    for sentence in sentences:
        print(f"- {sentence}")


Techniques:
- X-RAY CHEST P.A. VIEW

Findings:
- No midline shift seen.
- Both lung fields are clear.
- Cardiothoracic ratio appears normal.
- Both hila appear normal.
- Bilateral domes of diaphragm & costophrenic angles appear normal.
- Multiple wire clips are present.

Inference:
- Advice: Clinical correlation.


### **BIOELECTRA**

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

def generate_radiology_report(transcription):


    print("Step 1: Loading BioBERT Model")
    # Load a publicly available BioBERT NER model
    model_name = "kamalkraj/bioelectra-base-discriminator-pubmed"
    print(f"Loading model from Hugging Face: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    # Create NER pipeline
    print("Step 2: Creating Named Entity Recognition (NER) pipeline...")
    nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    # Extract medical terms from transcription
    print("Step 3: Extracting medical entities from transcription...")
    ner_results = nlp_pipeline(transcription)

    print("Step 4: Categorizing extracted entities into impressions...")
    # Organize extracted entities into categories
    impression = []

    for entity in ner_results:
        print(f"Processing entity: {entity['word']} - Category: {entity['entity_group']}")
        if entity['entity_group'] in ["DIAGNOSIS", "CONDITION"]:
            impression.append(entity['word'])

    print("Step 5: Formatting the final radiology report...")
    # Format the structured report
    report = """
    X-RAY - CHEST PA VIEW

    OBSERVATION:
    - No observations recorded.

    IMPRESSION:
    ➤ {impression}
    """.format(
        impression="\n    - ".join(impression) if impression else "No significant abnormality seen."
    )

    print("Step 6: Radiology report successfully generated.")
    return report


print("Starting the process of radiology report generation...")
transcription_text = """No midline shift seen.
Both lung fields are clear.
Cardiothoracic ratio appears normal.
Both hila appear normal.
Bilateral domes of diaphragm & costophrenic angles appear normal.
Multiple wire clips are present.
Advice: Clinical correlation."""

print("Received transcription input. Beginning analysis...")

formatted_report = generate_radiology_report(transcription_text)

print("\nFinal Generated Report:")
print(formatted_report)
print("Process completed successfully. Report is ready for review.")


Starting the process of radiology report generation...
Received transcription input. Beginning analysis...
Initializing radiology report generation...
Step 1: Loading BioBERT Model
Loading model from Hugging Face: kamalkraj/bioelectra-base-discriminator-pubmed


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step 2: Creating Named Entity Recognition (NER) pipeline...
Step 3: Extracting medical entities from transcription...
Step 4: Categorizing extracted entities into impressions...
Processing entity: no midline - Category: LABEL_1
Processing entity: shift - Category: LABEL_0
Processing entity: seen. both lung - Category: LABEL_1
Processing entity: fields - Category: LABEL_0
Processing entity: are clear. cardiothoracic ratio appears normal. - Category: LABEL_1
Processing entity: both - Category: LABEL_0
Processing entity: hila appear normal. - Category: LABEL_1
Processing entity: bilateral domes - Category: LABEL_0
Processing entity: of diaphragm & - Category: LABEL_1
Processing entity: costophrenic angles - Category: LABEL_0
Processing entity: appear normal. multiple wire - Category: LABEL_1
Processing entity: clips - Category: LABEL_0
Processing entity: are present. - Category: LABEL_1
Processing entity: advice - Category: LABEL_0
Processing entity: : clinical correlation. - Category: LA

In [None]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

def generate_radiology_report(transcription):

    print("Step 1: Loading BioBERT Model")

    model_name = "kamalkraj/bioelectra-base-discriminator-pubmed"
    print(f"Loading model from Hugging Face: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


    print("Step 3: Extracting medical entities from transcription...")
    ner_results = nlp_pipeline(transcription)

    print("Step 4: Categorizing extracted entities into impressions...")

    impression = []

    for entity in ner_results:
        print(f"Processing entity: {entity['word']} - Category: {entity['entity_group']}")
        if entity['entity_group'] in ["DIAGNOSIS", "CONDITION"]:
            impression.append(entity['word'])

    print("Step 5: Formatting the final radiology report...")

    report_template = """
    X-RAY - CHEST PA VIEW

    OBSERVATION:
    The trachea is central.

    The mediastinal and cardiac silhouette are normal.

    Cardiothoracic ratio is normal.

    Cardiophrenic and costophrenic angles are normal.

    Both hila are normal.

    The lung fields are clear.

    Bones of the thoracic cage are normal.

    Soft tissues of the chest wall are normal.

    IMPRESSION:
    ➤ {impression}
    """.format(
        impression="\n    - ".join(impression) if impression else "No significant abnormality seen."
    )

    print("Step 6: Radiology report successfully generated.")
    return report_template


print("Starting the process of radiology report generation...")
transcription_text = """No midline shift seen.
Both lung fields are clear.
Cardiothoracic ratio appears normal.
Both hila appear normal.
Bilateral domes of diaphragm & costophrenic angles appear normal.
Multiple wire clips are present.
Advice: Clinical correlation."""

print("Received transcription input. Beginning analysis...")

formatted_report = generate_radiology_report(transcription_text)


print("\nFinal Generated Report:")
print(formatted_report)
print("Process completed successfully. Report is ready for review.")


Starting the process of radiology report generation...
Received transcription input. Beginning analysis...
Initializing radiology report generation...
Step 1: Loading BioBERT Model
Loading model from Hugging Face: kamalkraj/bioelectra-base-discriminator-pubmed


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step 2: Creating Named Entity Recognition (NER) pipeline...
Step 3: Extracting medical entities from transcription...
Step 4: Categorizing extracted entities into impressions...
Processing entity: no midline shift seen. both lung - Category: LABEL_1
Processing entity: fields - Category: LABEL_0
Processing entity: are clear. cardiothoracic ratio appears normal. both - Category: LABEL_1
Processing entity: hil - Category: LABEL_0
Processing entity: ##a appear normal. bilateral - Category: LABEL_1
Processing entity: domes - Category: LABEL_0
Processing entity: of - Category: LABEL_1
Processing entity: diaphragm - Category: LABEL_0
Processing entity: & costophrenic - Category: LABEL_1
Processing entity: angles - Category: LABEL_0
Processing entity: appear normal. multiple wire clips are present. advice : clinical correlation. - Category: LABEL_1
Step 5: Formatting the final radiology report...
Step 6: Radiology report successfully generated.

Final Generated Report:

    X-RAY - CHEST PA VI

# **Problem:**
Gave the whole template as the output. Failed to process the Transcriptions. No
merge performed.

In [1]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

def generate_radiology_report(transcription):

    print("Step 1: Loading BioBERT Model")

    model_name = "kamalkraj/bioelectra-base-discriminator-pubmed"
    print(f"Loading model from Hugging Face: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    print("Step 3: Extracting medical entities from transcription...")
    ner_results = nlp_pipeline(transcription)

    print("Step 4: Categorizing extracted entities into observations and impressions...")

    observations = []
    impression = []
    finding_keywords = ["clear", "normal", "shift", "opacity", "hila", "diaphragm", "ratio", "clip", "costophrenic angles"]

    for entity in ner_results:
        print(f"Processing entity: {entity['word']} - Category: {entity['entity_group']}")
        if entity['entity_group'] in ["OBSERVATION", "FINDING", "ANATOMY"]:
            observations.append(entity['word'])
        elif entity['entity_group'] in ["DIAGNOSIS", "CONDITION"]:
            impression.append(entity['word'])

    for line in transcription.split("\n"):
        if any(keyword in line.lower() for keyword in finding_keywords):
            observations.append(line)

    print("Step 5: Radiology report successfully generated.")
    return {"Observations": observations, "Impression": impression}

print("Starting the process of radiology report generation...")
transcription_text = """No midline shift seen.
Both lung fields are clear.
Cardiothoracic ratio appears normal.
Both hila appear normal.
Bilateral domes of diaphragm & costophrenic angles appear normal.
Multiple wire clips are present.
Advice: Clinical correlation."""

print("Received transcription input. Beginning analysis...")
formatted_report = generate_radiology_report(transcription_text)

print("\nExtracted Observations and Impressions:")
print(formatted_report)
print("Process completed successfully. Report is ready for review.")


Starting the process of radiology report generation...
Received transcription input. Beginning analysis...
Initializing radiology report generation...
Step 1: Loading BioBERT Model
Loading model from Hugging Face: kamalkraj/bioelectra-base-discriminator-pubmed


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step 3: Extracting medical entities from transcription...
Step 4: Categorizing extracted entities into observations and impressions...
Processing entity: no midline shift - Category: LABEL_0
Processing entity: seen. - Category: LABEL_1
Processing entity: both lung fields - Category: LABEL_0
Processing entity: are - Category: LABEL_1
Processing entity: clear - Category: LABEL_0
Processing entity: . - Category: LABEL_1
Processing entity: cardiothoracic ratio appears normal - Category: LABEL_0
Processing entity: . - Category: LABEL_1
Processing entity: both hila appear normal - Category: LABEL_0
Processing entity: . - Category: LABEL_1
Processing entity: bilateral domes - Category: LABEL_0
Processing entity: of - Category: LABEL_1
Processing entity: diaphragm & costophren - Category: LABEL_0
Processing entity: ##ic - Category: LABEL_1
Processing entity: angles appear normal - Category: LABEL_0
Processing entity: . - Category: LABEL_1
Processing entity: multiple wire clips - Category: LABE

In [8]:
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

def generate_radiology_report(transcription):

    print("Initializing radiology report generation...")
    print("Step 1: Loading BioBERT Model")

    model_name = "kamalkraj/bioelectra-base-discriminator-pubmed"
    print(f"Loading model from Hugging Face: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name)

    nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    print("Step 2: Extracting medical entities from transcription...")
    ner_results = nlp_pipeline(transcription)

    print("Step 3: Categorizing extracted entities...")

    technique_keywords = ["view"]
    finding_keywords = ["clear", "normal", "shift", "opacity", "hila", "diaphragm", "ratio", "clip", "costophrenic angles"]
    inference_keywords = ["advice", "recommendation", "impression", "suggest", "clinical correlation"]

    technique = []
    observations = []
    inference = []

    for line in transcription.split("\n"):
        lower_line = line.lower()
        if any(keyword in lower_line for keyword in technique_keywords):
            technique.append(line)
        elif any(keyword in lower_line for keyword in finding_keywords):
            observations.append(line)
        elif any(keyword in lower_line for keyword in inference_keywords):
            inference.append(line)

    print("Step 4: Formatting the final structured radiology report...")

    report = (
        "TECHNIQUE:\n" + "\n".join(technique) + "\n\n" +
        "OBSERVATION:\n" + ("\n".join(observations) if observations else "No significant observations.") + "\n\n" +
        "INFERENCE:\n" + ("\n".join(inference) if inference else "No significant abnormality seen.")
    )

    print("Step 5: Radiology report successfully generated.")
    return report

transcription_text = """No midline shift seen.
Both lung fields are clear.
Cardiothoracic ratio appears normal.
Both hila appear normal.
Bilateral domes of diaphragm & costophrenic angles appear normal.
Multiple wire clips are present.
Advice: Clinical correlation."""

print("Received transcription input. Beginning analysis...")
formatted_report = generate_radiology_report(transcription_text)

print("\nFinal Generated Report:")
print(formatted_report)


Received transcription input. Beginning analysis...
Initializing radiology report generation...
Step 1: Loading BioBERT Model
Loading model from Hugging Face: kamalkraj/bioelectra-base-discriminator-pubmed


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Step 2: Extracting medical entities from transcription...
Step 3: Categorizing extracted entities...
Step 4: Formatting the final structured radiology report...
Step 5: Radiology report successfully generated.

Final Generated Report:
TECHNIQUE:


OBSERVATION:
No midline shift seen.
Both lung fields are clear.
Cardiothoracic ratio appears normal.
Both hila appear normal.
Bilateral domes of diaphragm & costophrenic angles appear normal.
Multiple wire clips are present.

INFERENCE:
Advice: Clinical correlation.


## Problems:
1.  First no context of the observation i.e not able to understand by the Transcription what radiology test is this.

2. Need list of keywords to identify the observation and inference. Anything out of the list wont be classified.
