In [31]:
# debug_mfsr_scrape.py
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import re
import sys
import traceback

# Site root (scheme + host) of the website being scraped.
BASE = "https://www.mfsr.sk"
# Listing page that main() downloads and parses for PDF links.
PAGE = "https://www.mfsr.sk/sk/financie/hodnota-za-peniaze/hodnotenia/ostatne.html"

# Regexes used by extract_date_and_size().
# Dotted dates of the form N.N.YYYY (one or two digits, one or two digits,
# four digits), e.g. "15.8.2025".
date_re = re.compile(r'\b\d{1,2}\.\d{1,2}\.\d{4}\b')
# File sizes: an integer or decimal number (dot or comma separator), optional
# whitespace, then a unit (B, kB/KB/kb, MB/mb, GB/gb), e.g. "436 kB" or "1,2 MB".
size_re = re.compile(r'\b\d+(?:[.,]\d+)?\s*(?:kB|KB|MB|GB|B|kb|mb|gb)\b')

def extract_date_and_size(anchor):
    """Find a date string and a file-size string in the text around ``anchor``.

    The text is gathered, in this order, from the anchor itself, its parent,
    the first 12 items yielded by ``anchor.previous_siblings`` and the first
    6 items yielded by ``anchor.next_siblings``. The non-empty pieces are
    joined with single spaces and searched with ``date_re`` and ``size_re``.

    Args:
        anchor: A node exposing ``get_text``, ``parent``,
            ``previous_siblings`` and ``next_siblings``.

    Returns:
        A ``(date, size)`` tuple holding the first match of each pattern in
        the joined text, with ``None`` in place of a pattern that did not match.
    """
    def _node_text(node):
        # Nodes with get_text() are flattened; anything else is stringified.
        if hasattr(node, "get_text"):
            return node.get_text(" ", strip=True)
        return str(node).strip()

    pieces = []

    # Best effort: a failure to read the anchor's own text is ignored.
    try:
        pieces.append(anchor.get_text(" ", strip=True))
    except Exception:
        pass

    # Best effort: the same applies to the parent's text.
    if anchor.parent:
        try:
            pieces.append(anchor.parent.get_text(" ", strip=True))
        except Exception:
            pass

    pieces.extend(_node_text(node) for node in list(anchor.previous_siblings)[:12])
    pieces.extend(_node_text(node) for node in list(anchor.next_siblings)[:6])

    haystack = " ".join(filter(None, pieces))
    found_date = date_re.search(haystack)
    found_size = size_re.search(haystack)

    date_text = found_date.group(0) if found_date else None
    size_text = found_size.group(0) if found_size else None
    return date_text, size_text

def is_pdf_link(href, link_text):
    """Decide whether a link looks like it points to a PDF document.

    Args:
        href: The anchor's ``href`` value; ``None`` or an empty string means
            "not a PDF link".
        link_text: The anchor's visible text; ``None`` is treated as empty.

    Returns:
        ``True`` when ``href`` is non-empty and either ``href`` contains
        ``.pdf`` or ``link_text`` contains ``pdf`` (both checks are
        case-insensitive); ``False`` otherwise.
    """
    if not href:
        return False
    # A missing label must not raise AttributeError; treat it as empty text.
    label = (link_text or "").lower()
    return ".pdf" in href.lower() or "pdf" in label

def main():
    """Scrape PDF links from PAGE and write them to ``Ostatne.csv``.

    Downloads PAGE, saves the raw HTML to ``mfsr_page_full.html`` and walks
    every ``<h4>``, ``<h5>`` and ``<a>`` element in the order ``find_all``
    returns them. The most recent ``<h4>`` text is used as the sector and the
    most recent ``<h5>`` text as the project name for each PDF link that
    follows. One row per unique (sector, project, type, URL) is collected,
    printed (first 20 rows) and written out as CSV.

    On a non-200 response the body is dumped to ``mfsr_page_non200.html`` and
    the function returns without writing a CSV.

    Raises:
        SystemExit: with status 1 after printing the traceback, if any
            exception occurs while scraping or writing.
    """
    output_csv = "Ostatne.csv"

    # Keyword looked for in the link (or parent) text -> label stored in the
    # "Type" column. The unaccented spelling maps to the accented label so
    # both variants end up in the same category. Order is the match priority.
    type_labels = {
        'hodnotenie': 'hodnotenie',
        'analýza': 'analýza',
        'analyza': 'analýza',
        'štúdia uskutočniteľnosti': 'štúdia uskutočniteľnosti',
    }

    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/117.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9,sk;q=0.8"
        }

        print("Requesting page:", PAGE)
        # The context manager closes the session's connections once the
        # (non-streamed) response has been received.
        with requests.Session() as session:
            resp = session.get(PAGE, headers=headers, timeout=20)

        if resp.status_code != 200:
            print(f"Non-200 status code: {resp.status_code}")
            # Keep the raw body so the failure can be inspected offline.
            with open("mfsr_page_non200.html", "wb") as f:
                f.write(resp.content)
            return

        # Decode with the encoding detected from the body, else UTF-8.
        resp.encoding = resp.apparent_encoding or 'utf-8'
        text = resp.text

        # Snapshot of the fetched page for debugging.
        with open("mfsr_page_full.html", "w", encoding="utf-8") as f:
            f.write(text)

        soup = BeautifulSoup(text, "lxml")

        rows = []
        seen = set()

        # Defaults used for links that appear before any heading.
        last_sector = "Ostatne"
        last_project_name = "UNKNOWN_PROJECT"

        # Headings update the running context; anchors are the candidates.
        for elem in soup.find_all(["h4", "h5", "a"]):
            if elem.name == "h4":
                last_sector = elem.get_text(" ", strip=True)
                continue
            if elem.name == "h5":
                last_project_name = elem.get_text(" ", strip=True)
                continue

            link_text = elem.get_text(" ", strip=True)
            if not is_pdf_link(elem.get("href"), link_text):
                continue

            # Determine the type from the link text first, then fall back to
            # the text of the enclosing element.
            lt_lower = link_text.lower()
            dtype = next(
                (label for word, label in type_labels.items() if word in lt_lower),
                None,
            )
            if not dtype and elem.parent:
                parent_text = elem.parent.get_text(" ", strip=True).lower()
                dtype = next(
                    (label for word, label in type_labels.items() if word in parent_text),
                    None,
                )

            # Resolve against the URL the page was actually served from so
            # document-relative hrefs are handled correctly; absolute and
            # root-relative hrefs resolve exactly as before.
            url = urljoin(resp.url, elem["href"])
            date, size = extract_date_and_size(elem)

            key = (last_sector, last_project_name, dtype, url)
            if key not in seen:
                rows.append((last_sector, last_project_name, dtype or "", url, date or "", size or ""))
                seen.add(key)

        df = pd.DataFrame(rows, columns=["Sector", "Project Name", "Type", "URL", "Date", "File Size"])
        print(f"Total rows collected: {len(df)}")
        if not df.empty:
            print(df.head(20).to_string(index=False))

        df.to_csv(output_csv, index=False, encoding="utf-8-sig")
        # Report the name that was actually written.
        print(f"Saved CSV to {output_csv}")

    except Exception as e:
        print("Exception occurred:", e)
        traceback.print_exc()
        sys.exit(1)

# Entry point: run the scraper when this code is executed directly
# (as a script or in a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()

Requesting page: https://www.mfsr.sk/sk/financie/hodnota-za-peniaze/hodnotenia/ostatne.html
Total rows collected: 19
 Sector                                                                                                     Project Name                     Type                                                                                                                                                                                                                                              URL       Date File Size
Ostatne                                                                Obmena hasičských cisternových striekačiek CAS 30               hodnotenie                                                                                                                                https://www.mfsr.sk/files/sk/financie/hodnota-za-peniaze/hodnotenia/ostatne/hodnotenie_hasicskych_striekaciek.pdf  15.8.2025    436 kB
Ostatne                                                            

In [34]:
# Load the five CSV files in a fixed order. "Ostatne.csv" is the file written
# by main(); the others are presumably produced by equivalent scraper runs.
# The names df_1..df_5 are kept because later cells refer to them.
df_1, df_2, df_3, df_4, df_5 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Ostatne.csv",
        "Obrana.csv",
        "Budovy.csv",
        "Doprava.csv",
        "Informatizacia.csv",
    )
)


In [35]:
# Stack the five per-file frames into one table. ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating each file's own row labels,
# so label-based lookups on df_full are unambiguous.
df_full = pd.concat([df_1, df_2, df_3, df_4, df_5], ignore_index=True)
df_full.head()

Unnamed: 0,Sector,Project Name,Type,URL,Date,File Size
0,Ostatne,Obmena hasičských cisternových striekačiek CAS 30,hodnotenie,https://www.mfsr.sk/files/sk/financie/hodnota-...,15.8.2025,436 kB
1,Ostatne,Obmena hasičských cisternových striekačiek CAS 30,štúdia uskutočniteľnosti,https://www.minv.sk/swift_data/source/hasici_a...,,"1,2 MB"
2,Ostatne,Plán rozvoja Leteckého útvaru MV SR pre oblasť...,hodnotenie,https://www.mfsr.sk/files/sk/financie/hodnota-...,10.7.2025,198 kB
3,Ostatne,Plán rozvoja Leteckého útvaru MV SR pre oblasť...,štúdia uskutočniteľnosti,https://www.mfsr.sk/files/sk/financie/hodnota-...,,784 kB
4,Ostatne,Koncepčný plán obnovy osobných automobilov do ...,hodnotenie,https://www.mfsr.sk/files/sk/financie/hodnota-...,12.6.2024,245 kB


In [36]:
# Keep only the rows that have a Type value, then show how many remain.
df_full = df_full.dropna(subset=["Type"])
df_full.shape[0]

298

In [38]:
# Count how many rows carry each Type label.
df_full['Type'].value_counts()

Type
analýza                     196
hodnotenie                   63
štúdia uskutočniteľnosti     39
Name: count, dtype: int64

In [39]:
# Write the combined table without the index column. "utf-8-sig" puts a UTF-8
# BOM at the start of the file (presumably so spreadsheet tools detect the
# encoding of the Slovak diacritics; confirm with the consumer of this file).
df_full.to_csv("full_mfsr_data.csv", index=False, encoding="utf-8-sig")