In [3]:
!pip install easyocr pdfplumber python-docx transformers pandas


Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.9-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (8.6 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Collecting pdfminer.six==20251230 (from pdfplumber)
  Downloading pdfminer_six

In [4]:
import easyocr
import pdfplumber
import pandas as pd
import re
from docx import Document
from google.colab import files


In [5]:
uploaded = files.upload()

file_path = list(uploaded.keys())[0]
print("Uploaded:", file_path)


Saving archive.zip to archive.zip
Uploaded: archive.zip


In [6]:
reader = easyocr.Reader(['en'], gpu=False)




Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [7]:
# IMAGE OCR
def extract_from_image(path):
    result = reader.readtext(path, detail=0)
    return "\n".join(result)


# PDF
def extract_from_pdf(path):
    text = ""
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text
    return text


# TXT
def extract_from_txt(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


# DOCX
def extract_from_docx(path):
    doc = Document(path)
    return "\n".join([p.text for p in doc.paragraphs])


In [8]:
def extract_text(path):
    if path.lower().endswith(('.png','.jpg','.jpeg')):
        return extract_from_image(path)

    elif path.lower().endswith('.pdf'):
        return extract_from_pdf(path)

    elif path.lower().endswith('.txt'):
        return extract_from_txt(path)

    elif path.lower().endswith('.docx'):
        return extract_from_docx(path)

    else:
        raise Exception("Unsupported file type")


In [9]:
medical_tests = {
    "hemoglobin": {"range": (13, 17), "meaning": "Oxygen carrying protein"},
    "hb": {"range": (13, 17), "meaning": "Oxygen carrying protein"},
    "rbc": {"range": (4.5, 5.9), "meaning": "Red blood cells"},
    "wbc": {"range": (4000, 11000), "meaning": "Infection fighting cells"},
    "platelet": {"range": (150000, 450000), "meaning": "Blood clotting cells"},
    "hct": {"range": (40, 50), "meaning": "Hematocrit percentage"},
    "mcv": {"range": (80, 100), "meaning": "Size of red blood cells"},
}


In [10]:
def extract_medical_values(text):
    text = text.lower()
    results = []

    for test in medical_tests.keys():
        pattern = rf"{test}[^0-9]*([0-9]+\.?[0-9]*)"
        match = re.search(pattern, text)

        if match:
            value = float(match.group(1))
            results.append((test, value))

    return results


In [11]:
def analyze_results(values):
    rows = []
    summary = []

    for test, value in values:
        low, high = medical_tests[test]["range"]
        meaning = medical_tests[test]["meaning"]

        if value < low:
            status = "Low"
            explanation = f"{test.upper()} is low (may indicate deficiency)."

        elif value > high:
            status = "High"
            explanation = f"{test.upper()} is high (may indicate infection/problem)."

        else:
            status = "Normal"
            explanation = f"{test.upper()} is normal."

        rows.append([test.upper(), value, meaning, status])
        summary.append(explanation)

    df = pd.DataFrame(rows, columns=["Test", "Value", "Meaning", "Status"])

    return df, summary


In [13]:
from google.colab import files
import os

uploaded_files = files.upload()

if 'report.jpeg' in uploaded_files:
    file_path = 'report.jpeg'
    print(f"Successfully uploaded {file_path}")
else:
    print("Please ensure you uploaded a file named 'report.jpeg'")
    # If the user uploaded a different file, try to find the first one
    if uploaded_files:
        file_path = list(uploaded_files.keys())[0]
        print(f"Using the first uploaded file: {file_path}")
    else:
        file_path = None
        print("No file was uploaded.")

Saving report.jpeg to report.jpeg
Successfully uploaded report.jpeg


In [14]:
file_path = '/content/report.jpeg' # Update file_path to point to the actual image file

text = extract_text(file_path)

print("\n======= Extracted Text =======\n")
print(text[:1500])  # preview

values = extract_medical_values(text)

df, summary = analyze_results(values)





#balblnni
dattatflaya
DATTATRAY DIAGNOSTIC CENTER, WARDHA
DIAONOsics
Acharya VInobe
phav
Ftural Hespltal, Sawangl (Maghe} Warrth
Hasnn
CENTeR
Test Report
Pallents
MHLUIGAV KISHUR DJHil NOARE
KSID , 2641020817072
Addrcabe
ALLIED SCIENCE MCA ZND YIAR DATCH
Jufko | ZuUZ0HIT
24-15 SAWANOI MECHE WARDHA
IUIld ; 2S0038a404
ComuNo
7844807076
ColkerIed Unt 1161noz6 ilsuma
Uclutt
Mecmtcdaim
On 46/01/026 [102JiP4
Helund
MFD
Sulnnk
EDTA kalh
Ilsjoned
Icuiccu 13,77 #th
TAHoluGr RETORT
Ingestigation
Observed
Utit
Biologicul
Method
Value
Referenee Range
o
Cell couuter
Maka: [3
@lFenleE-Bsem
%o Paedintric M]-Aigm%
Neonalzi (4
PHOTOMETRIC MEASUREMENT
Buh)/+24.Minnils
6 mnonitis) Me
Ml Intnnidl Yarr
Helalu;
Femal
E-J
Kedla
Ws
Namalea (At Dlirh} [QO-
MC
RIC HISTOGRAM
Intant)
manly 98-871
miuaisl Tunf a-c4
cull
Pedliina
MCH
Pica-EM
alin
Neonte
(At Birih} 31-7
Infants(J   (Hb RBC) X IU
InloueiaCJu nfantNc 25-2
Minle :5154457, Fonale
115is"
PrdlNcus
9-J7 & Nejualas
 (At Binb) J  (HtvcT)
MCHC
Joru + Inlual

In [15]:
print("\n======= Medical Test Analysis =======\n")
display(df)






Unnamed: 0,Test,Value,Meaning,Status
0,HB,25.0,Oxygen carrying protein,High
1,RBC,25.0,Red blood cells,High
2,HCT,404.0,Hematocrit percentage,High
3,MCV,950.0,Size of red blood cells,High


In [16]:
print("\n======= Simple Summary =======\n")

for s in summary:
    print("•", s)

print("\nOverall Advice:")
print("Please consult your doctor for accurate medical interpretation.")




• HB is high (may indicate infection/problem).
• RBC is high (may indicate infection/problem).
• HCT is high (may indicate infection/problem).
• MCV is high (may indicate infection/problem).

Overall Advice:
Please consult your doctor for accurate medical interpretation.


In [17]:
!pip install reportlab


Collecting reportlab
  Downloading reportlab-4.4.9-py3-none-any.whl.metadata (1.7 kB)
Downloading reportlab-4.4.9-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.4.9


In [19]:
!pip install transformers




In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Please make sure the generation config includes `forced_bos_token_id=0`. 


Loading weights:   0%|          | 0/511 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [21]:
from google.colab import files

model.save_pretrained("medical_summarizer_model")
tokenizer.save_pretrained("medical_summarizer_model")

!zip -r medical_summarizer_model.zip medical_summarizer_model

files.download("medical_summarizer_model.zip")


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

  adding: medical_summarizer_model/ (stored 0%)
  adding: medical_summarizer_model/tokenizer.json (deflated 82%)
  adding: medical_summarizer_model/config.json (deflated 59%)
  adding: medical_summarizer_model/tokenizer_config.json (deflated 50%)
  adding: medical_summarizer_model/model.safetensors (deflated 41%)
  adding: medical_summarizer_model/generation_config.json (deflated 48%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
from google.colab import files
files.download("medical_summarizer_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>