In [None]:
!pip install transformers datasets pandas



In [None]:
!ls

Bleed_records_NLP.xlsx	radiologists_report.csv  sample_data


In [None]:
import pandas as pd
from transformers import pipeline
import re

## DistilGPT2

In [None]:
# Text-generation pipeline backed by the distilgpt2 checkpoint.
generator = pipeline(task='text-generation', model='distilgpt2')

def extract_report(text):
    """Ask the text-generation pipeline to extract structured findings from a report.

    Args:
        text: Free-text radiology report that is embedded in the prompt.

    Returns:
        The text the model generated as a continuation of the prompt. The
        prompt itself is not included in the returned string.
    """

    prompt = f"""Extract the following from the radiology report:
    1. Normal or abnormal
    2. Pathologies
    3. Location
    4. Midline shift (yes or no), and if yes, measurement in mm
    5. Bleed subcategory (epidural, subdural, subarachnoid, intraventricular, contusion, hematoma)

    Report:
    {text}

    Provide a clear, structured answer.
    """

    # return_full_text=False drops the echoed prompt. Without it the returned
    # string begins with the prompt (which embeds the report), so regex parsing
    # downstream could match the input report instead of the model's answer.
    outputs = generator(
        prompt,
        num_return_sequences=1,
        do_sample=False,
        max_new_tokens=100,
        return_full_text=False,
    )

    return outputs[0]['generated_text']

def parse_report(generated):
    """Parse "Label: value" lines from model output into a flat record.

    Labels are matched case-insensitively and every value is read from the
    same line as its label, so an empty field never picks up the next line.

    Args:
        generated: Text produced by the model. None or an empty string
            yields the default record.

    Returns:
        dict with the keys "Normal/Abnormal", "Pathologies", "Location",
        "Midline shift" ("Yes" or "No", default "No"),
        "Midline shift measurement" (for example "15 mm"),
        "Bleed subcategory" and "Measurement of abnormal". Fields that are
        not found keep their default value.
    """
    parsed = {
        "Normal/Abnormal": "",
        "Pathologies": "",
        "Location": "",
        "Midline shift": "No",
        "Midline shift measurement": "",
        "Bleed subcategory": "",
        "Measurement of abnormal": ""
    }
    if not generated:
        return parsed
    generated = str(generated)
    flags = re.IGNORECASE

    # `[ \t]*` (not `\s*`) after the colon keeps each capture on the label's own line.
    # Both label spellings used by the prompts ("Normal or abnormal", "Normal/Abnormal") are accepted.
    normal = re.search(r'Normal\s*(?:or|/)\s*abnormal[ \t]*:[ \t]*(.*)', generated, flags)
    path = re.search(r'Pathologies[ \t]*:[ \t]*(.*)', generated, flags)
    location = re.search(r'Location[ \t]*:[ \t]*(.*)', generated, flags)
    # The lookahead skips the separate "Midline shift measurement" label, and
    # `[^:\n]*` stops at the first colon so "yes, measurement: 5 mm" keeps its "yes".
    shift = re.search(r'midline shift(?!\s+measurement)[^:\n]*:[ \t]*(.*)', generated, flags)
    # Lazy prefix so the whole number is captured ("15", "4.5"), not just its last digit.
    shift_measure = re.search(r'midline shift[^\n]*?([0-9]+(?:\.[0-9]+)?)\s*mm', generated, flags)
    subcategory = re.search(r'Bleed subcategory[ \t]*:[ \t]*(.*)', generated, flags)

    # First "measurement ...:" line that is not a midline-shift line; otherwise
    # "Midline shift measurement: ..." would be reported as the abnormality size.
    measurement = None
    for candidate in re.finditer(r'^(.*?)measurement[^:\n]*:[ \t]*(.*)', generated, flags | re.MULTILINE):
        if 'midline shift' not in candidate.group(1).lower():
            measurement = candidate
            break

    if normal:
        parsed["Normal/Abnormal"] = normal.group(1).strip()
    if path:
        parsed["Pathologies"] = path.group(1).strip()
    if location:
        parsed["Location"] = location.group(1).strip()
    if shift:
        parsed["Midline shift"] = "Yes" if "yes" in shift.group(1).lower() else "No"
    if shift_measure:
        parsed["Midline shift measurement"] = shift_measure.group(1) + " mm"
    if subcategory:
        parsed["Bleed subcategory"] = subcategory.group(1).strip()
    if measurement:
        parsed["Measurement of abnormal"] = measurement.group(2).strip()

    return parsed

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def main(input_csv='radiologists_report.csv', output_csv='processed_report_test.csv', n_rows=10):
    """Extract structured fields for the first rows of the report CSV and save them.

    Args:
        input_csv: CSV file to read; it must contain a 'Radiologist Diagnosis' column.
        output_csv: Destination CSV for the original columns plus the parsed fields.
        n_rows: Number of leading rows to process (None processes every row).
    """
    df = pd.read_csv(input_csv)
    sample = df.iloc[:n_rows].reset_index(drop=True)

    parsed_data = []
    for value in sample['Radiologist Diagnosis']:
        # NaN is truthy, so `str(value or '')` would hand the literal text "nan"
        # to the model for a missing cell; map missing values to '' explicitly.
        report = '' if pd.isna(value) else str(value)
        generated = extract_report(report)
        parsed_data.append(parse_report(generated))

    parsed_df = pd.DataFrame(parsed_data)

    # Both frames carry a fresh 0..n-1 index, so the column-wise concat lines up row by row.
    df = pd.concat([sample, parsed_df], axis=1)

    df.to_csv(output_csv, index=False)

    print(f"Processing complete! File saved as {output_csv}")


In [None]:
# Run the extraction defined by main(); it writes processed_report_test.csv.
main()

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:5

Processing complete! File saved as processed_report_test.csv


In [None]:
!ls

processed_report_test.csv  radiologists_report.csv  sample_data


## FLAN-T5

In [None]:
# Text2text-generation pipeline backed by the google/flan-t5-large checkpoint.
generator = pipeline(task="text2text-generation", model="google/flan-t5-large")


def extract_report(text):
    """Send the extraction prompt for one report through the pipeline.

    Args:
        text: Free-text radiology report that is embedded in the prompt.

    Returns:
        The 'generated_text' of the first pipeline output, with surrounding
        whitespace stripped.
    """
    instructions = f"""Extract the following from the radiology report:
    1. Normal or abnormal
    2. Pathologies
    3. Location
    4. Midline shift (yes or no), and if yes, measurement in mm
    5. Bleed subcategory (epidural, subdural, subarachnoid, intraventricular, contusion, hematoma)

    Report:
    {text}

    Provide a clear, structured answer.
    """
    # Deterministic decoding (no sampling), capped at 200 new tokens.
    outputs = generator(instructions, max_new_tokens=200, do_sample=False)
    first_output = outputs[0]
    return first_output['generated_text'].strip()

def parse_report(generated):
    """Parse "Label: value" lines from model output into a flat record.

    Labels are matched case-insensitively and every value is read from the
    same line as its label, so an empty field never picks up the next line.

    Args:
        generated: Text produced by the model. None or an empty string
            yields the default record.

    Returns:
        dict with the keys "Normal/Abnormal", "Pathologies", "Location",
        "Midline shift" ("Yes" or "No", default "No"),
        "Midline shift measurement" (for example "15 mm"),
        "Bleed subcategory" and "Measurement of abnormal". Fields that are
        not found keep their default value.
    """
    parsed = {
        "Normal/Abnormal": "",
        "Pathologies": "",
        "Location": "",
        "Midline shift": "No",
        "Midline shift measurement": "",
        "Bleed subcategory": "",
        "Measurement of abnormal": ""
    }
    if not generated:
        return parsed
    generated = str(generated)
    flags = re.IGNORECASE

    # `[ \t]*` (not `\s*`) after the colon keeps each capture on the label's own line.
    # Both label spellings used by the prompts ("Normal or abnormal", "Normal/Abnormal") are accepted.
    normal = re.search(r'Normal\s*(?:or|/)\s*abnormal[ \t]*:[ \t]*(.*)', generated, flags)
    path = re.search(r'Pathologies[ \t]*:[ \t]*(.*)', generated, flags)
    location = re.search(r'Location[ \t]*:[ \t]*(.*)', generated, flags)
    # The lookahead skips the separate "Midline shift measurement" label, and
    # `[^:\n]*` stops at the first colon so "yes, measurement: 5 mm" keeps its "yes".
    shift = re.search(r'midline shift(?!\s+measurement)[^:\n]*:[ \t]*(.*)', generated, flags)
    # Lazy prefix so the whole number is captured ("15", "4.5"), not just its last digit.
    shift_measure = re.search(r'midline shift[^\n]*?([0-9]+(?:\.[0-9]+)?)\s*mm', generated, flags)
    subcategory = re.search(r'Bleed subcategory[ \t]*:[ \t]*(.*)', generated, flags)

    # First "measurement ...:" line that is not a midline-shift line; otherwise
    # "Midline shift measurement: ..." would be reported as the abnormality size.
    measurement = None
    for candidate in re.finditer(r'^(.*?)measurement[^:\n]*:[ \t]*(.*)', generated, flags | re.MULTILINE):
        if 'midline shift' not in candidate.group(1).lower():
            measurement = candidate
            break

    if normal:
        parsed["Normal/Abnormal"] = normal.group(1).strip()
    if path:
        parsed["Pathologies"] = path.group(1).strip()
    if location:
        parsed["Location"] = location.group(1).strip()
    if shift:
        parsed["Midline shift"] = "Yes" if "yes" in shift.group(1).lower() else "No"
    if shift_measure:
        parsed["Midline shift measurement"] = shift_measure.group(1) + " mm"
    if subcategory:
        parsed["Bleed subcategory"] = subcategory.group(1).strip()
    if measurement:
        parsed["Measurement of abnormal"] = measurement.group(2).strip()

    return parsed

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def main(input_csv='radiologists_report.csv', output_csv='processed_report_flan_t5.csv', n_rows=10):
    """Extract structured fields for the first rows of the report CSV and save them.

    Args:
        input_csv: CSV file to read; it must contain a 'Radiologist Diagnosis' column.
        output_csv: Destination CSV for the original columns plus the parsed fields.
        n_rows: Number of leading rows to process (None processes every row).

    Returns:
        DataFrame holding the processed rows with the parsed columns appended.
    """
    df = pd.read_csv(input_csv)
    sample = df.iloc[:n_rows].reset_index(drop=True)

    parsed_data = []
    for value in sample['Radiologist Diagnosis']:
        # NaN is truthy, so `str(value or '')` would hand the literal text "nan"
        # to the model for a missing cell; map missing values to '' explicitly.
        report = '' if pd.isna(value) else str(value)
        generated = extract_report(report)
        parsed_data.append(parse_report(generated))

    parsed_df = pd.DataFrame(parsed_data)

    # Both frames carry a fresh 0..n-1 index, so the column-wise concat lines up row by row.
    df = pd.concat([sample, parsed_df], axis=1)

    df.to_csv(output_csv, index=False)

    print(f"Processing complete! File saved as {output_csv}")
    return df

if __name__ == "__main__":
    processed_df = main()
    # A bare `processed_df` nested inside the `if` body is not echoed by the
    # notebook (only a cell's last top-level expression is), so show it explicitly.
    try:
        display(processed_df)  # rich table when running under IPython/Jupyter
    except NameError:
        print(processed_df)  # plain-Python fallback where `display` is not defined

Processing complete! File saved as processed_report_flan_t5.csv


## BioGPT

In [None]:
!pip install sacremoses

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/897.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


In [None]:

# Text-generation pipeline backed by microsoft/BioGPT-Large, created with device=0.
generator = pipeline(task="text-generation", model="microsoft/BioGPT-Large", device=0)

def extract_report(text, verbose=True):
    """Prompt the text-generation pipeline for a labelled summary of one report.

    Args:
        text: Free-text radiology report that is embedded in the prompt.
        verbose: When True (the default), print the raw pipeline output for
            debugging.

    Returns:
        The model's answer: the generated text that follows the prompt, with
        surrounding whitespace stripped.
    """

    prompt = f"""You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: {text}

Return answer in the following format:
Normal/Abnormal:
Pathologies:
Location:
Midline shift:
Midline shift measurement:
Bleed subcategory:
Measurement of abnormal:

Answer:"""

    result = generator(prompt, max_new_tokens=256, do_sample=False)[0]['generated_text']

    if verbose:
        print("=== RAW MODEL OUTPUT ===")
        print(result)
        print("=========================")

    # The model may emit further "Answer:" markers of its own; splitting on the
    # last one would keep only the final fragment. Cut off the echoed prompt
    # instead, and fall back to the first marker if the prompt was not echoed
    # verbatim.
    if result.startswith(prompt):
        answer_part = result[len(prompt):]
    else:
        answer_part = result.split("Answer:", 1)[-1]
    return answer_part.strip()



def parse_report(generated):
    """Parse "Label: value" lines from model output into a flat record.

    Labels are matched case-insensitively and every value is read from the
    same line as its label, so an empty field never picks up the next line.

    Args:
        generated: Text produced by the model. None or an empty string
            yields the default record.

    Returns:
        dict with the keys "Normal/Abnormal", "Pathologies", "Location",
        "Midline shift" ("Yes" or "No", default "No"),
        "Midline shift measurement" (for example "15 mm"),
        "Bleed subcategory" and "Measurement of abnormal". Fields that are
        not found keep their default value.
    """
    parsed = {
        "Normal/Abnormal": "",
        "Pathologies": "",
        "Location": "",
        "Midline shift": "No",
        "Midline shift measurement": "",
        "Bleed subcategory": "",
        "Measurement of abnormal": ""
    }
    if not generated:
        return parsed
    generated = str(generated)
    flags = re.IGNORECASE

    # `[ \t]*` (not `\s*`) after the colon keeps each capture on the label's own line.
    # Both label spellings used by the prompts ("Normal or abnormal", "Normal/Abnormal") are accepted.
    normal = re.search(r'Normal\s*(?:or|/)\s*abnormal[ \t]*:[ \t]*(.*)', generated, flags)
    path = re.search(r'Pathologies[ \t]*:[ \t]*(.*)', generated, flags)
    location = re.search(r'Location[ \t]*:[ \t]*(.*)', generated, flags)
    # The lookahead skips the separate "Midline shift measurement" label, and
    # `[^:\n]*` stops at the first colon so "yes, measurement: 5 mm" keeps its "yes".
    shift = re.search(r'midline shift(?!\s+measurement)[^:\n]*:[ \t]*(.*)', generated, flags)
    # Lazy prefix so the whole number is captured ("15", "4.5"), not just its last digit.
    shift_measure = re.search(r'midline shift[^\n]*?([0-9]+(?:\.[0-9]+)?)\s*mm', generated, flags)
    subcategory = re.search(r'Bleed subcategory[ \t]*:[ \t]*(.*)', generated, flags)

    # First "measurement ...:" line that is not a midline-shift line; otherwise
    # "Midline shift measurement: ..." would be reported as the abnormality size.
    measurement = None
    for candidate in re.finditer(r'^(.*?)measurement[^:\n]*:[ \t]*(.*)', generated, flags | re.MULTILINE):
        if 'midline shift' not in candidate.group(1).lower():
            measurement = candidate
            break

    if normal:
        parsed["Normal/Abnormal"] = normal.group(1).strip()
    if path:
        parsed["Pathologies"] = path.group(1).strip()
    if location:
        parsed["Location"] = location.group(1).strip()
    if shift:
        parsed["Midline shift"] = "Yes" if "yes" in shift.group(1).lower() else "No"
    if shift_measure:
        parsed["Midline shift measurement"] = shift_measure.group(1) + " mm"
    if subcategory:
        parsed["Bleed subcategory"] = subcategory.group(1).strip()
    if measurement:
        parsed["Measurement of abnormal"] = measurement.group(2).strip()

    return parsed

def main(input_csv='radiologists_report.csv', output_csv='processed_report_biogpt.csv', n_rows=10):
    """Extract structured fields for the first rows of the report CSV and save them.

    Args:
        input_csv: CSV file to read; it must contain a 'Radiologist Diagnosis' column.
        output_csv: Destination CSV for the original columns plus the parsed fields.
        n_rows: Number of leading rows to process (None processes every row).

    Returns:
        DataFrame holding the processed rows with the parsed columns appended.
    """
    df = pd.read_csv(input_csv)  # Ensure this file exists and has the right column
    sample = df.iloc[:n_rows].reset_index(drop=True)

    parsed_data = []
    for value in sample['Radiologist Diagnosis']:
        # NaN is truthy, so `str(value or '')` would hand the literal text "nan"
        # to the model for a missing cell; map missing values to '' explicitly.
        report = '' if pd.isna(value) else str(value)
        generated = extract_report(report)
        parsed_data.append(parse_report(generated))

    parsed_df = pd.DataFrame(parsed_data)

    # Both frames carry a fresh 0..n-1 index, so the column-wise concat lines up row by row.
    df = pd.concat([sample, parsed_df], axis=1)

    df.to_csv(output_csv, index=False)

    print(f"Processing complete! File saved as {output_csv}")
    return df


Device set to use cuda:0


In [None]:
if __name__ == "__main__":
    processed_df = main()
    # A bare `processed_df` nested inside the `if` body is not echoed by the
    # notebook (only a cell's last top-level expression is), so show it explicitly.
    try:
        display(processed_df)  # rich table when running under IPython/Jupyter
    except NameError:
        print(processed_df)  # plain-Python fallback where `display` is not defined

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: No trauma related injury seen.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Normal. < / FREETEXT > < / ABSTRACT > ▃


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: Normal study.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Normal / Abnormal. < / FREETEXT > < / ABSTRACT > ▃


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: No trauma related injury seen.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Normal. < / FREETEXT > < / ABSTRACT > ▃


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: Fracture left zygoma, left orbital floor, left lateral orbital wall, anteior and lateral wall of maxillary sinus, and bilateral nasal bone.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Normal / Abnormal. < / FREETEXT > < / ABSTRACT > ▃


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: No trauma related injury seen.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Normal. < / FREETEXT > < / ABSTRACT > ▃


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: 7 mm hyperdense focus in right lentiform nucleus wiht mild perilesional edema - likley punctate hemorrhage._x000D_
Tiny hyperdense foci seen in right basal ganglia, with adjacent edema likely small bleed_x000D_
C/w prior scan dated 21-1-21, theer is subtle reduction in size of lesion 7mm to 5mm and subtle reduction in edema)

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Abnormal. _ x000D _ E / w: Abnormal: Location: Midline shift: Midline shift measurement: Bleed subcategory: Measurement of abnormal: Answer: Abnormal. _ x000D _ F / w: Abnormal: Location: Midline shift: Midline shift measurement: Bleed subcategory: Measurement of abnormal: Answer: Abnormal. _ x000D _ G / w: Abnorm

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: Left diffuse SDH seen, ~15mm, extending along falx. Contralateral midline shift to right seen, ~15mm, along with uncal herniation._x000D_
Bilateral frontal haemorrhages seen._x000D_
Bilateral from convexal SAH seen, along with SAH in bilateral sylvian fissure, and parasellar cistern.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Abnormal. < / FREETEXT > < / ABSTRACT > ▃


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: Fracture D2 spinous process._x000D_
Fracture distal end of right radius, at metadiaphyseal region. No articular surface extension of fracture line seen. Fracture styloid process of right ulna. Fracture right pisiform bone.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Normal. < / FREETEXT > < / ABSTRACT > ▃


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: Right frontal scalp hematoma.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Abnormal. < / FREETEXT > < / ABSTRACT > ▃
=== RAW MODEL OUTPUT ===
You are an expert radiology assistant AI. Given a radiology report, extract the following clinical information in plain text format:

Radiology Report: Normal study.

Return answer in the following format:
Normal/Abnormal: 
Pathologies: 
Location: 
Midline shift: 
Midline shift measurement: 
Bleed subcategory: 
Measurement of abnormal:

Answer: Normal / Abnormal. < / FREETEXT > < / ABSTRACT > ▃
Processing complete! File saved as processed_report_biogpt.csv
