In [2]:
pip install pymupdf4llm

Collecting pymupdf4llm
  Downloading pymupdf4llm-0.0.27-py3-none-any.whl.metadata (4.8 kB)
Collecting pymupdf>=1.26.3 (from pymupdf4llm)
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf4llm-0.0.27-py3-none-any.whl (30 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, pymupdf4llm
Successfully installed pymupdf-1.26.3 pymupdf4llm-0.0.27


In [13]:
import pandas as pd
import requests
from datetime import datetime, timedelta
import os
import pymupdf4llm
from pathlib import Path


class IMDDataHandler:
    """Look up IMD codes by district name from a tabular file.

    The table is loaded once at construction and kept in ``self.df``. Lookups
    read its ``District`` and ``IMD Code`` columns.
    """

    def __init__(self, filepath):
        """Load the lookup table.

        Args:
            filepath: Location of the table, as a string or path-like object.
                A name ending in ``.csv`` (any letter case) is read with
                ``pandas.read_csv``; anything else goes to
                ``pandas.read_excel``.
        """
        # str() lets pathlib.Path objects through (they have no .endswith);
        # lower() makes "CODES.CSV" take the CSV branch as well.
        if str(filepath).lower().endswith(".csv"):
            self.df = pd.read_csv(filepath)
        else:
            self.df = pd.read_excel(filepath)

    def get_imd_code(self, district):
        """Return the ``IMD Code`` value of the first row matching ``district``.

        Matching ignores letter case and leading/trailing whitespace, both in
        the query and in the ``District`` column.

        Args:
            district: District name to search for.

        Returns:
            The ``IMD Code`` cell of the first matching row.

        Raises:
            ValueError: If no row matches.
        """
        wanted = district.strip().lower()
        # Missing (NaN) district cells stay NaN here and never compare equal.
        names = self.df['District'].str.strip().str.lower()
        match = self.df[names == wanted]
        if not match.empty:
            return match.iloc[0]['IMD Code']
        raise ValueError(f"District '{district}' not found in the data.")


class IMDPDFDownloader:
    """Download district bulletin PDFs from the IMD Agrimet site.

    Downloaded files are written into ``save_dir``, which is created on
    construction if it does not exist.
    """

    def __init__(self, save_dir="downloads"):
        """Remember the target directory and make sure it exists.

        Args:
            save_dir: Directory that receives downloaded PDFs.
        """
        self.base_url = "https://imdagrimet.gov.in/accessData.php?path=Files/District%20AAS%20Bulletin/English%20Bulletin/"
        self.save_dir = save_dir
        os.makedirs(save_dir, exist_ok=True)

    def download_pdf(self, imd_code, date_str, timeout=30):
        """Fetch one bulletin and save it under ``save_dir``.

        Args:
            imd_code: Code placed at the front of the bulletin file name.
            date_str: Date part of the bulletin file name.
            timeout: Seconds passed to ``requests.get`` so a stalled server
                cannot block the notebook forever.

        Returns:
            ``pathlib.Path`` of the saved file, or ``None`` when the request
            failed, the body looks like a "file not found" page, or the HTTP
            status is not 200.
        """
        filename = f"{imd_code}_{date_str}_E.pdf"
        # base_url already ends with "/", so this URL contains "//" before the
        # file name. Left as-is on purpose: that is the form the notebook has
        # been run with.
        url = f"{self.base_url}/{filename}"
        print(filename)
        local_path = Path(self.save_dir) / filename

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            return None

        body = response.content
        # A missing bulletin is detected from the body: either it contains
        # "file not found" or it is too short to be a real document.
        if b"file not found" in body.lower() or len(body.strip()) < 100:
            print(f"PDF not found at {url}")
            return None
        if response.status_code != 200:
            print(f"Unexpected HTTP status {response.status_code} for {url}")
            return None

        local_path.write_bytes(body)
        return local_path

    def try_latest_pdf(self, imd_code, max_days=5):
        """Download the newest available bulletin, walking back from today.

        Tries today's date first, then each earlier day, for ``max_days``
        attempts in total.

        Args:
            imd_code: Code forwarded to ``download_pdf``.
            max_days: Number of consecutive days to try.

        Returns:
            Path of the first bulletin that downloads successfully.

        Raises:
            FileNotFoundError: If none of the attempted dates yields a PDF.
        """
        today = datetime.today()
        for delta in range(max_days):
            date = today - timedelta(days=delta)
            date_str = date.strftime("%Y-%m-%d")
            pdf_path = self.download_pdf(imd_code, date_str)
            if pdf_path:
                print(f"Downloaded PDF for {date_str}")
                return pdf_path
        raise FileNotFoundError(f"No PDF found for IMDCode {imd_code} in the last {max_days} days.")


class IMDPDFProcessor:
    """Converts a single PDF to Markdown text through ``pymupdf4llm``."""

    def __init__(self, pdf_path):
        # Stored exactly as given; it is handed to pymupdf4llm unchanged.
        self.pdf_path = pdf_path

    def extract_markdown(self):
        """Return the result of ``pymupdf4llm.to_markdown`` for ``self.pdf_path``."""
        source = self.pdf_path
        markdown = pymupdf4llm.to_markdown(source)
        return markdown


# ---------------- Example Usage ----------------

def main(district="Ahmedabad", codes_file="IMDCodes.csv"):
    """Print the latest bulletin for one district as Markdown.

    Looks up the district's IMD code, downloads the most recent bulletin PDF
    that is available, converts it with ``IMDPDFProcessor`` and prints the
    result.

    Args:
        district: District name to look up in the codes table.
        codes_file: Path of the codes table; ``IMDDataHandler`` reads names
            ending in ``.csv`` as CSV and everything else as Excel.
    """
    # Load IMD codes
    data_handler = IMDDataHandler(codes_file)
    imd_code = data_handler.get_imd_code(district)

    # Try downloading the most recent PDF
    downloader = IMDPDFDownloader()
    pdf_path = downloader.try_latest_pdf(imd_code)

    # Extract content as markdown using pymupdf4llm
    processor = IMDPDFProcessor(pdf_path)
    markdown_content = processor.extract_markdown()

    print(markdown_content)


if __name__ == "__main__":
    main()


04_0401_2025-08-01_E.pdf
Downloaded PDF for 2025-08-01
|Medium range weather forecast|Col2|Col3|Col4|Col5|Col6|
|---|---|---|---|---|---|
|~~**Parameters/ Date**~~|**02-08-2025**|**03-08-2025**|**04-08-2025**|**05-08-2025**|**06-08-2025**|
|**Rainfall**<br>**(mm) **|**4 **|**4 **|**3 **|**3 **|**2 **|
|~~**Maximum**~~<br>**temperature (0C)**|**34**|**33**|**33**|**33**|**32**|
|**Minimum**<br>**temperature (0C)**|**26**|**26**|**26**|**26**|**26**|
|**Maximum relative**<br>**humidity (%)**|**80**|**80**|**80**|**80**|**80**|
|**Minimum relative**<br>**humidity (%)**|**70**|**70**|**65**|**65**|**70**|
|**Wind speed**<br>**(km/hr)**|**15**|**15**|**16**|**17**|**17**|
|~~**Wind direction**~~<br>**(0) **|**225**|**228**|**231**|**228**|**228**|
|**Cloud cover**<br>**(Octa)**|**6 **|**7 **|**8 **|**8 **|**7 **|
|**Weather summary**|**Weather summary**|**Weather summary**|**Weather summary**|**Weather summary**|**Weather summary**|
|**As per weather forecast received from Meteorological Ce