In [2]:
import pandas as pd
from bs4 import BeautifulSoup

def extract_notice_block(soup):
    """
    Get the entire relevant part of TED pages.

    Looks up the ``<div id="notice-content">`` element of a parsed page and
    returns its text, using a single space as the separator passed to
    ``getText``.

    Parameters
    ----------
    soup : parsed HTML document exposing ``find`` (a ``BeautifulSoup`` object
        in this notebook).

    Returns
    -------
    str
        The text of the notice element, or the sentinel string ``"Not found"``
        when the page contains no such element.
    """
    notice_div = soup.find("div", {"id" : "notice-content"})
    if notice_div is None:
        # Explicit check instead of a bare ``except``: a bare except also
        # swallowed KeyboardInterrupt, so interrupting a long parsing loop could
        # silently record "Not found". The former sibling fallback repeated the
        # same failed lookup and could never succeed, so it is dropped.
        return "Not found"
    return notice_div.getText(" ")

def extract_summary_block(soup):
    """
    Get the summary of TED notices.

    Looks up the ``<div id="summary-content">`` element of a parsed page and
    returns its text, using a single space as the separator passed to
    ``getText``.

    Parameters
    ----------
    soup : parsed HTML document exposing ``find`` (a ``BeautifulSoup`` object
        in this notebook).

    Returns
    -------
    str
        The text of the summary element, or the sentinel string
        ``"Not found"`` when the page contains no such element.
    """
    summary_div = soup.find("div", {"id" : "summary-content"})
    if summary_div is None:
        # Explicit check instead of a bare ``except`` (which also swallowed
        # KeyboardInterrupt). The former sibling fallback repeated the same
        # failed lookup and could never succeed, so it is dropped.
        return "Not found"
    return summary_div.getText(" ")

# Sample notice used while developing the parsing helpers below; the Windows
# path is normalised to forward slashes.
multi_lot_file = r"C:\Users\OSBPAKSI\Carl Zeiss AG\Data & Analytics AT CH SEE - Data Projects\01_Projects\Tender Analysis\01_Data\projects\Romania\unparsed raw pages\14_1_2025\html_15647-2025.txt".replace("\\", "/")

# The page is parsed while the file is open; the text extraction only needs the parsed tree.
with open(multi_lot_file, "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

notice_raw = extract_notice_block(soup)
summary_raw = extract_summary_block(soup)

# Both raw text blocks of the sample page, keyed by block name.
inner_dict = {"Notice": notice_raw, "Summary": summary_raw}

In [3]:
def get_block(input, block_start_nr, next_block_nr, block_start_r, next_block_r):
    """ 
    This is a generic function to get big chunks of the notice on the TED website.
    There are two major groups, those with roman numerals (r) as headers, and those with arabic numerals (nr).

    The arabic-numeral markers are tried first; the roman-numeral markers are
    only used when the arabic start marker cannot be applied.

    Parameters
    ----------
    input : str
        Raw notice text.
    block_start_nr, block_start_r : str
        Header that opens the block in the arabic / roman layout.
    next_block_nr, next_block_r : str or None
        Header that closes the block in the respective layout. ``None`` means
        "take everything after the start header". If the closing header does
        not occur, the text after the start header is returned uncut.

    Returns
    -------
    str
        The text between the start header and the closing header, or the
        sentinel ``"Not trivial"`` when neither start header is usable.
    """
    for block_start, next_block in ((block_start_nr, next_block_nr), (block_start_r, next_block_r)):
        try:
            # Index 1 is the text right after the first occurrence of the start
            # header; a missing header leaves a single piece -> IndexError.
            after_start = input.split(block_start)[1]
            if next_block is None:
                return after_start
            return after_start.split(next_block)[0]
        except (AttributeError, IndexError, TypeError, ValueError):
            # Only the failures this lookup can produce (input without .split,
            # missing header, non-string or empty marker) trigger the fallback;
            # KeyboardInterrupt and friends are no longer swallowed.
            continue
    return "Not trivial"

def get_buyer(input):
    """Return the buyer part of a notice ("1.  Buyer" layout, else "Section I" layout)."""
    return get_block(
        input,
        block_start_nr="1.  Buyer",
        next_block_nr="2.  Procedure",
        block_start_r="Section I :  Contracting ",  # can be authority or entity
        next_block_r="Section II",
    )


def get_procedure(input):
    """Return the procedure part of a notice ("2.  Procedure" layout, else "Section IV" layout)."""
    return get_block(
        input,
        block_start_nr="2.  Procedure",
        next_block_nr="5.  Lot",
        block_start_r="Section IV :  Procedure",
        next_block_r="Section V",
    )


def get_lot(input):
    """Return the lot part of a notice ("5.  Lot" layout, else "Section II" layout)."""
    return get_block(
        input,
        block_start_nr="5.  Lot",
        next_block_nr="6.  Results",
        block_start_r="Section II :  Object",
        next_block_r="Section IV",
    )


def get_results(input):
    """Return everything after the results header ("6.  Results", else "Section V")."""
    return get_block(
        input,
        block_start_nr="6.  Results",
        next_block_nr=None,
        block_start_r="Section V :  Award of contract",
        next_block_r=None,
    )


def get_other(input):
    """Return everything after the closing header ("11.  Notice", else "Section VI")."""
    return get_block(
        input,
        block_start_nr="11.  Notice",
        next_block_nr=None,
        block_start_r="Section VI :  Complementary information",
        next_block_r=None,
    )


In [4]:
lot_block = get_lot(notice_raw)


In [5]:
lot_block

' \n \n \n \n 5.1.  \n \n \n Lot :\xa0 LOT-0001 \n \n \n Title :  Trusa de chirurgie toracica minim invaziva \n \n \n Description :  cant. acord cadru minim 1 / maxim 3 cant. contract subsecvent minim 1 / maxim 2 Valoare acord cadru: minima 461.766,00 lei fara TVA/ maxima 1.385.298,00 lei fara TVA Valoare contract subsecvent: minima 461.766,00 lei fara TVA/maxima 923.532,00 lei fara TVA \n \n \n Internal identifier :  1 \n \n \n \n 5.1.1.  \n \n \n Purpose \n \n \n Main nature of the contract :  Supplies \n \n \n Main classification \xa0 ( cpv ):  33169000 \xa0 Surgical instruments \n \n \n \n \n 5.1.2.  \n \n \n Place of performance \n \n \n Country subdivision (NUTS) :  Bucureşti \xa0 ( RO321 ) \n \n \n Country :  Romania \n \n \n Additional information :  SUUMC \n \n \n \n \n 5.1.3.  \n \n \n Estimated duration \n \n \n Duration :\xa0 24 \xa0 Months \n \n \n \n \n 5.1.4.  \n \n \n Renewal \n \n \n Maximum renewals :  0 \n \n \n \n \n 5.1.10.  \n \n \n Award criteria \n \n \n Criteri

In [6]:
import re 

def parse_section(input : str, header_list : list, romans : list) -> dict:
    """
    Split a block of notice text into ``{field header: field text}``.

    Every header from ``header_list`` that occurs in ``input`` is located, the
    headers are ordered by their first position in the text (duplicates
    dropped), and each field's text is what lies between its header and the
    next located header (or the end of the text for the last one).

    Parameters
    ----------
    input : str
        Text block to parse.
    header_list : list
        Candidate field headers; matched as plain substrings.
    romans : list
        Section numbering strings (e.g. "5.1.2.") that are cut off when a
        field's text ends with one of them.

    Returns
    -------
    dict
        Whitespace-normalised header -> whitespace-normalised field text. The
        GPA field is split in two when it contains "Additional inf"; the second
        part is stored under "GPA Additional info".
    """
    gpa_header = "The procurement is covered by the Government Procurement Agreement (GPA)"
    gpa_marker = "Additional inf"

    def squeeze(text):
        # Collapse every whitespace run into a single space.
        return re.sub(r'\s+', ' ', text)

    def drop_numbering(text):
        # Cut a trailing section number that belongs to the following sub-section.
        for numbering in romans:
            if text.strip().endswith(numbering):
                text = text.removesuffix(numbering)
        return text

    # (first position, header) pairs sorted by position; dict.fromkeys keeps the
    # order while removing headers listed more than once.
    located = sorted((input.index(header), header) for header in header_list if header in input)
    headers = list(dict.fromkeys(header for _, header in located))

    parsed = dict()
    for header, following in zip(headers, headers[1:] + [None]):
        body = input.split(header)[1]
        if following is not None:
            body = body.split(following)[0]
        body = body.strip()
        if body.startswith(":"):
            body = body[1:].strip()

        if header == gpa_header and gpa_marker in body:
            pieces = body.split(gpa_marker)
            parsed[squeeze(header)] = pieces[0].strip()
            parsed["GPA Additional info"] = drop_numbering(gpa_marker + pieces[1].strip()).strip()
            continue
        if header == "Internet address" and body.startswith("(es)"):
            # just a duplicate for the addresses which we do not need
            continue
        if header == "Description" and body.startswith("of the procurement"):
            # "Description" matched inside "Description of the procurement"
            continue
        parsed[squeeze(header)] = squeeze(drop_numbering(body).replace("\xa0", " ").strip())
    return parsed

# Field headers of the lot part that carry no section number of their own.
# Description field may appear multiple times (not read anyway??)
lot_field_list_unnumbered = [
    "Scope of the procurement",
    "Title",
    "Main CPV code",
    "Type of contract",
    "Short description",
    "Information about lots",
    "Total value of the procurement",
    "Description",
    "Title",
    "Additional CPV code(s)",
    "Place of performance",
    "Description of the procurement",
    "Award criteria",
    "Information about options",
    "Information about European Union funds",
    "Additional information",
    "Type of contract and place of performance or delivery",
    "Information about a framework agreement",
    "Information about framework agreement",
    "CPV code(s)",
    "Information about the Government Procurement Agreement (GPA)",
    "Information about the dynamic purchasing system",
    "Electronic auction",
    "Further information, mediation and review",
    "Date of the conclusion of the contract",
    "Review organisation",
    "TED eSender",
    "Maximum renewals",
    "Value",
    "Estimated value excluding VAT",
    "Organisation whose budget is used to pay for the contract",
    "Organisation executing the payment",
    "General information",
    "The procurement is covered by the Government Procurement Agreement (GPA)",
    "Terms of procurement",
    "Information about review deadlines",
    "Options",
    "Renewal",
    "Strategic procurement",
    "Estimated duration",
    "Information about previous notices",
    "Identifier of the previous notice",
    "Identifier of the part of the previous notice",
    "The buyer reserves the right for additional purchases from the contractor, as described here",
    "Total value of the contract/lot",
]

# Further lot headers, including several spacing variants of the CPV classification labels.
more_lot_fields = [
    "Internal identifier",
    "Purpose",
    "Main nature of the contract",
    "Main classification \xa0 ( cpv )",
    "Additional classification \xa0 ( cpv )",
    "Main classification ( cpv )",
    "Main classification( cpv )",
    "Main classification (cpv)",
    "Award Criteria",
    "Techniques",
    "Framework Agreement",
    "Framework agreement",
    "Additional classification ( cpv )",
]

# Complete header list handed to parse_section for lot blocks.
lot_fields = lot_field_list_unnumbered + more_lot_fields


def subheader_ordered(input: list) -> list: # only removes the first incorrect member (very few unordered anyway)
    """
    Drop the first entry that breaks the ascending order of section numbers.

    The number compared is the second dot-separated component of each entry
    (the "1" in "5.1.10."). When an entry has a smaller number than the one
    before it, the *preceding* entry is removed and the scan stops.
    NOTE(review): only the second component is compared, not the last one -
    confirm this is the intended ordering key.

    Parameters
    ----------
    input : list
        Section number strings such as "5.1.2.". Modified in place when a
        correction is made.

    Returns
    -------
    list
        The same list object, with at most one entry removed.

    Raises
    ------
    IndexError, ValueError
        If an entry has no "." or its second component is not an integer.
    """
    highest_seen = 0
    for position, header in enumerate(input):
        subheader_number = int(header.split(".")[1])
        if highest_seen > subheader_number:
            # The entry removed is the one before the current position; the
            # message now reports that index instead of the current one.
            removed_index = position - 1
            print("Correcting odd sequence of subheader numbers by removing entry at index {index} from: {orig_list}".format(index = removed_index, orig_list = str(input)))
            print(input.pop(removed_index))
            return input
        highest_seen = subheader_number
    return input


def get_lot_sections(input : str) -> list: 
    """
    Collect the section numbers of the lot part that occur in ``input``.

    Matches "5.", one digit, ".", then up to two optional digits and an
    optional trailing "." (for example "5.1.", "5.1.2." or "5.1.10.") and
    passes the matches through ``subheader_ordered``.

    Parameters
    ----------
    input : str
        Text of a lot block.

    Returns
    -------
    list
        Matched section number strings in order of appearance, after the
        ordering correction.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    res = re.findall(r"5\.[0-9]\.[0-9]?[0-9]?\.?", input)
    return subheader_ordered(res)

lot_sections = parse_section(lot_block, lot_fields, get_lot_sections(lot_block))


In [7]:
lot_sections # NOT ENOUGH BC 6 LOTS in total

{'Title': 'Trusa de chirurgie toracica minim invaziva',
 'Description': 'cant. acord cadru minim 1 / maxim 3 cant. contract subsecvent minim 1 / maxim 2 Valoare acord cadru: minima 461.766,00 lei fara TVA/ maxima 1.385.298,00 lei fara TVA Valoare contract subsecvent: minima 461.766,00 lei fara TVA/maxima 923.532,00 lei fara TVA',
 'Internal identifier': '1',
 'Purpose': '',
 'Main nature of the contract': 'Supplies',
 'Main classification ( cpv )': '33169000 Surgical instruments',
 'Place of performance': 'Country subdivision (NUTS) : Bucureşti ( RO321 ) Country : Romania',
 'Additional information': 'SUUMC',
 'Estimated duration': 'Duration : 24 Months',
 'Renewal': '',
 'Maximum renewals': '0',
 'Award criteria': 'Criterion : Type : Price Name : Pretul ofertei Description : Punctajul se acorda astfel: a) Pentru cel mai scazut dintre preturi se acorda punctajul maxim alocat; b) Pentru celelalte preturi ofertate punctajul P(n) se calculeaza proportional, astfel: P(n) = (Pret minim ofer

In [8]:
lot_block.split("5.1.  \n \n \n Lot :")

[' \n \n \n \n ',
 '\xa0 LOT-0001 \n \n \n Title :  Trusa de chirurgie toracica minim invaziva \n \n \n Description :  cant. acord cadru minim 1 / maxim 3 cant. contract subsecvent minim 1 / maxim 2 Valoare acord cadru: minima 461.766,00 lei fara TVA/ maxima 1.385.298,00 lei fara TVA Valoare contract subsecvent: minima 461.766,00 lei fara TVA/maxima 923.532,00 lei fara TVA \n \n \n Internal identifier :  1 \n \n \n \n 5.1.1.  \n \n \n Purpose \n \n \n Main nature of the contract :  Supplies \n \n \n Main classification \xa0 ( cpv ):  33169000 \xa0 Surgical instruments \n \n \n \n \n 5.1.2.  \n \n \n Place of performance \n \n \n Country subdivision (NUTS) :  Bucureşti \xa0 ( RO321 ) \n \n \n Country :  Romania \n \n \n Additional information :  SUUMC \n \n \n \n \n 5.1.3.  \n \n \n Estimated duration \n \n \n Duration :\xa0 24 \xa0 Months \n \n \n \n \n 5.1.4.  \n \n \n Renewal \n \n \n Maximum renewals :  0 \n \n \n \n \n 5.1.10.  \n \n \n Award criteria \n \n \n Criterion :  \n \n Ty

In [30]:
import os 

# Folder with the downloaded Romanian notice pages (one HTML page per .txt file).
folder = "C:/Users/OSBPAKSI/Carl Zeiss AG/Data & Analytics AT CH SEE - Data Projects/01_Projects/Tender Analysis/01_Data/projects/Romania/unparsed raw pages/14_1_2025/"

folder_files = os.listdir(folder)

# Parallel lists: entry k of each list belongs to the same notice file.
lot_blocks = []
res_blocks = []
notices = []
for file in folder_files[:10]:
    if not file.endswith(".txt"):
        continue
    # The unused per-file `inner_dict` reset was removed: it was never read and
    # overwrote the dict built from the sample file earlier in the notebook.
    with open(os.path.join(folder, file), "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    notice_raw = extract_notice_block(soup)
    lot_block = get_lot(notice_raw)
    lot_blocks.append(lot_block)
    res_block = get_results(notice_raw)
    res_blocks.append(res_block)
    notices.append(notice_raw)

In [10]:
folder_files[1]

'html_100080-2024.txt'

In [11]:
# Number of pieces each lot block falls into when cut at the per-lot header
# (one piece before the first header plus one piece per header).
counts = [len(block.split("5.1.  \n \n \n Lot :")) for block in lot_blocks]

In [12]:
lot_blocks[1].split("5.1.  \n \n \n Lot :")[1]

"\xa0 LOT-0001 \n \n \n Title :  Set instrumente Cabinet Chirurgie Orală și BMF \n \n \n Description :  În cadrul proiectului “Dezvoltarea infrastructurii medicale prespitalicești de la nivelul Spitalului Județean de Urgență Miercurea Ciuc, prin dotarea unității de asistență medicală ambulatorie” se vor achiziționa aparate si sisteme medicale pentru dotarea laboratoarelor și cabinetelor medicale ambulatorie deja existente și noi. Set instrumente Cabinet Chirurgie Orală și BMF UM: Buc. Cantitate: 1 P.U. Estimat 60158,8 \n \n \n Internal identifier :  4245305_2023_PAAPD1438226 \n \n \n \n 5.1.1.  \n \n \n Purpose \n \n \n Main nature of the contract :  Supplies \n \n \n Main classification \xa0 ( cpv ):  33169000 \xa0 Surgical instruments \n \n \n \n \n 5.1.2.  \n \n \n Place of performance \n \n \n Country subdivision (NUTS) :  Harghita \xa0 ( RO124 ) \n \n \n Country :  Romania \n \n \n Additional information :  Sediu Spital Județean de Urgență Miercurea Ciuc, str. Dénes László nr. 2, 

In [13]:
counts # correct --> need to parse each lot separately (each lot has its own results section), skip 0th bc empty, then match each result based on the lot number

[4, 24, 2, 2, 5, 78, 2, 7, 3, 2]

In [14]:
single_lot = lot_blocks[1].split("5.1.  \n \n \n Lot :")[1]
parse_section(single_lot, lot_fields, get_lot_sections(single_lot))

{'Title': 'Set instrumente Cabinet Chirurgie Orală și BMF',
 'Description': 'În cadrul proiectului “Dezvoltarea infrastructurii medicale prespitalicești de la nivelul Spitalului Județean de Urgență Miercurea Ciuc, prin dotarea unității de asistență medicală ambulatorie” se vor achiziționa aparate si sisteme medicale pentru dotarea laboratoarelor și cabinetelor medicale ambulatorie deja existente și noi. Set instrumente Cabinet Chirurgie Orală și BMF UM: Buc. Cantitate: 1 P.U. Estimat 60158,8',
 'Internal identifier': '4245305_2023_PAAPD1438226',
 'Purpose': '',
 'Main nature of the contract': 'Supplies',
 'Main classification ( cpv )': '33169000 Surgical instruments',
 'Place of performance': 'Country subdivision (NUTS) : Harghita ( RO124 ) Country : Romania',
 'Additional information': 'Sediu Spital Județean de Urgență Miercurea Ciuc, str. Dénes László nr. 2, jud. Harghita',
 'Estimated duration': 'Duration : 5 Months',
 'Renewal': '',
 'Maximum renewals': '0',
 'General information':

In [15]:
single_lot = lot_blocks[1].split("5.1.  \n \n \n Lot :")[2]
parse_section(single_lot, lot_fields, get_lot_sections(single_lot))

{'Title': 'Suport perfuzie inox cu 4 cârlige',
 'Description': 'În cadrul proiectului “Dezvoltarea infrastructurii medicale prespitalicești de la nivelul Spitalului Județean de Urgență Miercurea Ciuc, prin dotarea unității de asistență medicală ambulatorie” se vor achiziționa aparate si sisteme medicale pentru dotarea laboratoarelor și cabinetelor medicale ambulatorie deja existente și noi. Suport perfuzie inox cu 4 cârlige UM: Buc. Cantitate: 1 P.U. Estimat 249',
 'Internal identifier': '4245305_2023_PAAPD1438226',
 'Purpose': '',
 'Main nature of the contract': 'Supplies',
 'Main classification ( cpv )': '33194100 Devices and instruments for infusion',
 'Place of performance': 'Country subdivision (NUTS) : Harghita ( RO124 ) Country : Romania',
 'Additional information': 'Sediu Spital Județean de Urgență Miercurea Ciuc, str. Dénes László nr. 2, jud. Harghita',
 'Estimated duration': 'Duration : 5 Months',
 'Renewal': '',
 'Maximum renewals': '0',
 'General information': 'Reserved par

In [16]:
folder_files[5]

'html_10354-2024.txt'

In [17]:
test_tender = lot_blocks[1].split("5.1.  \n \n \n Lot :")

In [25]:
def parse_all_lots(tender_lot_block):
    """
    Parse every lot of a tender's lot block separately.

    In Romania, tenders often have multiple, nicely delineated lots. We extract
    them one by one (and match them with results by the lot number).

    Returns a dict mapping the text found between "LOT-" and "Title : " in each
    lot (stripped) to that lot's ``parse_section`` result. Lots that yield the
    same key overwrite each other.
    """
    lot_header = "5.1.  \n \n \n Lot :"
    tender_parsed_lots = {}
    # The piece before the first lot header is not a lot, hence the [1:].
    for lot_text in tender_lot_block.split(lot_header)[1:]:
        after_prefix = lot_text.split("LOT-")[1]
        lot_num = after_prefix.split("Title : ")[0].strip()
        tender_parsed_lots[lot_num] = parse_section(lot_text, lot_fields, get_lot_sections(lot_text))
    return tender_parsed_lots

In [26]:
tender_parsed_lots = parse_all_lots(lot_blocks[1])

In [None]:
many_lot_res = res_blocks[6].split("LOT-") # many lots appear multiple times (10+ times) with various dates?!
# 5 can be split and analyzed one by one
# 6 is still too difficult for get_results

In [93]:

# Labels of the results part that carry values we care about.
res_key_sections = ["Winner selection status", "Maximum value of the framework agreement", "Re-estimated value of the framework agreement", "Winner", "Leader of tendering party", "Official name", "Value of the tender", "Identifier of the contract", "Date of the conclusion of the contract", "Rank in the list of winners", "Value of the lowest admissible tender", "Value of the highest admissible tender"]
# Boilerplate labels and fragments that are stripped as well.
trash = ["The tender was ranked :\xa0 yes", "Identifier of lot or group of lots", "At least one winner was chosen.", "6.1.2.     Information about winners", "Tender identifier", "Tender", "The contract is awarded within a framework agreement", "Framework agreement", "Contract information", ":\xa0", "6.1.4.     Statistical information", "Range of tenders", "\xa0 RON", "    yes         ", "    no           ", "6.1.     Result lot ldentifier", "Received tenders or requests to participate", "Number of tenders or requests to participate received", "Type of received submissions", "from micro, small or medium tenderers", "from tenderers registered in other European Economic Area countries than the country of the buyer", "from tenders registered in countries outside of the European Economic Area", "submitted electronically"]
both = res_key_sections + trash

# For each "LOT-" piece: drop the newlines first (the labels above are written
# without them), then blank out every label in list order.
new = []
for i in many_lot_res:
    new_str = i.replace("\n", "")
    for k in both:
        new_str = new_str.replace(k, "")
    new.append(new_str)

In [None]:
#res_blocks[6].split("7. Modification")
res_blocks[6].split("7.  Modification") # important results section end!!!

[' \n \n \n \n Value of all contracts awarded in this notice :\xa0 2 695 114,58 \xa0 RON \n \n \n \n \n 6.1.  \n \n \n Result lot ldentifier :\xa0 LOT-0000 \n \n \n \n 6.1.2.  \n \n \n Information about winners \n \n \n Winner :\xa0 \n \n \n Leader of tendering party :\xa0 VESTRA INDUSTRY SRL \n \n \n Official name :\xa0 VESTRA INDUSTRY SRL \n \n \n Tender :\xa0 \n \n \n Tender identifier :\xa0 REF_OF: CAN1092949/CIF: RO15969249 \n \n \n Identifier of lot or group of lots :\xa0 LOT-0000 \n \n \n Value of the tender :\xa0 2 695 114,58 \xa0 RON \n \n \n The tender was ranked :\xa0 yes \n \n \n Rank in the list of winners :\xa0 1 \n \n \n Contract information :\xa0 \n \n Identifier of the contract :\xa0 6362/21039 \n \n \n Date of the conclusion of the contract :\xa0 24/11/2022 \n \n \n \n \n \n    ',
 ' \n \n \n \n 7.1.  \n \n \n Modification \n \n \n Identifier of the previous contract award notice :\xa0 65ca476f-6bc5-4c99-8dcc-be72d6e0759a-01 \n \n \n Reason for modification :\xa0 Modi

In [106]:
single_lot_res_block = res_blocks[6].split("7.  Modification")[0]

In [None]:
single_lot_res_block # value of all contracts is not always shown at the beginning

' \n \n \n \n Value of all contracts awarded in this notice :\xa0 2 695 114,58 \xa0 RON \n \n \n \n \n 6.1.  \n \n \n Result lot ldentifier :\xa0 LOT-0000 \n \n \n \n 6.1.2.  \n \n \n Information about winners \n \n \n Winner :\xa0 \n \n \n Leader of tendering party :\xa0 VESTRA INDUSTRY SRL \n \n \n Official name :\xa0 VESTRA INDUSTRY SRL \n \n \n Tender :\xa0 \n \n \n Tender identifier :\xa0 REF_OF: CAN1092949/CIF: RO15969249 \n \n \n Identifier of lot or group of lots :\xa0 LOT-0000 \n \n \n Value of the tender :\xa0 2 695 114,58 \xa0 RON \n \n \n The tender was ranked :\xa0 yes \n \n \n Rank in the list of winners :\xa0 1 \n \n \n Contract information :\xa0 \n \n Identifier of the contract :\xa0 6362/21039 \n \n \n Date of the conclusion of the contract :\xa0 24/11/2022 \n \n \n \n \n \n    '

In [124]:
"6.1.  \n \n \n Result" in res_blocks[1]
"Value of the tender" in res_blocks[8]

True

In [120]:
single_lot_res_block.split("2 695 114")

[' \n \n \n \n Value of all contracts awarded in this notice :\xa0 ',
 ',58 \xa0 RON \n \n \n \n \n 6.1.  \n \n \n Result lot ldentifier :\xa0 LOT-0000 \n \n \n \n 6.1.2.  \n \n \n Information about winners \n \n \n Winner :\xa0 \n \n \n Leader of tendering party :\xa0 VESTRA INDUSTRY SRL \n \n \n Official name :\xa0 VESTRA INDUSTRY SRL \n \n \n Tender :\xa0 \n \n \n Tender identifier :\xa0 REF_OF: CAN1092949/CIF: RO15969249 \n \n \n Identifier of lot or group of lots :\xa0 LOT-0000 \n \n \n Value of the tender :\xa0 ',
 ',58 \xa0 RON \n \n \n The tender was ranked :\xa0 yes \n \n \n Rank in the list of winners :\xa0 1 \n \n \n Contract information :\xa0 \n \n Identifier of the contract :\xa0 6362/21039 \n \n \n Date of the conclusion of the contract :\xa0 24/11/2022 \n \n \n \n \n \n    ']

In [125]:
single_lot_res_block

' \n \n \n \n Value of all contracts awarded in this notice :\xa0 2 695 114,58 \xa0 RON \n \n \n \n \n 6.1.  \n \n \n Result lot ldentifier :\xa0 LOT-0000 \n \n \n \n 6.1.2.  \n \n \n Information about winners \n \n \n Winner :\xa0 \n \n \n Leader of tendering party :\xa0 VESTRA INDUSTRY SRL \n \n \n Official name :\xa0 VESTRA INDUSTRY SRL \n \n \n Tender :\xa0 \n \n \n Tender identifier :\xa0 REF_OF: CAN1092949/CIF: RO15969249 \n \n \n Identifier of lot or group of lots :\xa0 LOT-0000 \n \n \n Value of the tender :\xa0 2 695 114,58 \xa0 RON \n \n \n The tender was ranked :\xa0 yes \n \n \n Rank in the list of winners :\xa0 1 \n \n \n Contract information :\xa0 \n \n Identifier of the contract :\xa0 6362/21039 \n \n \n Date of the conclusion of the contract :\xa0 24/11/2022 \n \n \n \n \n \n    '

In [None]:
# Does each notice state the total awarded value?
kw = "Value of all contracts awarded in this notice" # not there if only a competition
for i in notices:
    print(kw in i)

False
False
False
False
False
True
True
False
False
False


In [131]:
# Look for "Competition" only in the text preceding the TED link line.
for i in notices:
    notice_head = i.split("See the notice on TED website")[0].strip()
    print("Competition" in notice_head)

True
True
True
True
True
False
False
True
False
True


In [134]:
# Raw text of the notices whose head (the text before the TED link line) does
# not mention "Competition".
noncomp_notices = []
for file in folder_files[:100]:
    if not file.endswith(".txt"):
        continue
    # The unused per-file `inner_dict` reset was removed: it was never read and
    # overwrote the dict built from the sample file earlier in the notebook.
    with open(folder + file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    notice_raw = extract_notice_block(soup)
    if "Competition" not in notice_raw.split("See the notice on TED website")[0]:
        noncomp_notices.append(notice_raw)

In [137]:

# Same total-value check, restricted to the non-competition notices.
kw = "Value of all contracts awarded in this notice" # often not there even if type result mb proxy for multi-lot??
for i in noncomp_notices:
    print(kw in i)

True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
True
False
True
True
True
True
True
False
True
False
False
False
True
True
True
True
False
True
False
False
True


In [170]:
# 14 rly doesnt have anything like estimated, lei or ron
kw = "Value of the tender" # more often there (9 and 10 are for supplies (False) but 11 too and it does have value, supplies means product??)
kw2 = "Valoarea total"
kw3 = "Valoarea estimata"
kw4 = "Maximum value of the framework agreement"

# Print the index of every notice that contains none of the value markers.
for i, j in enumerate(noncomp_notices):
    if not any(marker in j for marker in (kw, kw4, kw2, kw3)):
        #53: Garanția de participare (use genAI??)
        print(i)

9
14
15
23
28
29
31
33
36
44
53


In [169]:
noncomp_notices[44].split("estim") #Supplies



['\n \n \n \n \n \n \n 128231-2024 - Result       See the notice on TED website        \n   \n \n \n     128231-2024\n     \n \n \n     128231-2024 - Result \n \n      Romania – Desktop computer – Furnizare echipamente digitale proiect cod F-PNRR-Dotari-2023 Lot 1 – Computere de birou, Lot 2 – Servere, Lot 3 – Computere portabile, Lot 4 – Display interactiv cu suport, Lot 5 – Videoproiector și ecrane de proiecție, Lot 6 – Echipamente periferice, Lot 7 – Imprimante și scanere, Lot 8 – Camere videoconferință și sisteme sunet, Lot 9 – Robot umanoid educational, Lot 10 – Pachete software IT\n      \n \n      OJ S 44/2024 01/03/2024\n      \n \n      Contract or concession award notice – standard regime\n        Supplies \n \n \n \n    1.  Buyer \n \n \n \n 1.1.  \n \n \n Buyer \n \n \n Official name :\xa0 JUDETUL TIMIS \n \n \n Email :\xa0 s.achizitii.publice@cjtimis.ro \n \n \n Legal type of the buyer :  Regional authority \n \n \n Activity of the contracting authority :  General public s

In [None]:
# way to proceed: check if one or many lots (one lot one result too?!)
# when value is available, check if there are multiple lots
# for multilot, combine dates
# separate lots into different entries for statistics

### Austria first

In [176]:
folder = r"C:\Users\OSBPAKSI\Carl Zeiss AG\Data & Analytics AT CH SEE - Data Projects\01_Projects\Tender Analysis\01_Data\projects\Austria\unparsed raw pages\05_01_2025/".replace("\\", "/")
at_files = os.listdir(folder)

In [227]:
def check_lot_type(notice_raw):
    """
    Check if we have non-roman multi-lot, single lot or roman.

    Classification is based on how many times the per-lot header
    "5.1.  \\n \\n \\n Lot :" occurs in the notice text: none -> "likely roman",
    one -> "single-lot non-roman", more -> "multi-lot non-roman".
    """
    lot_header_count = notice_raw.count("5.1.  \n \n \n Lot :")

    if lot_header_count > 1:
        return "multi-lot non-roman"
    if lot_header_count == 1:
        return "single-lot non-roman"
    return "likely roman"

In [None]:
# Raw notice text and detected lot layout for every Austrian page; the two lists are parallel.
notices = [] # try selectolax bc too slow at scale
notice_types = []
for file in at_files:
    if not file.endswith(".txt"):
        continue
    with open(folder + file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    notice_raw = extract_notice_block(soup)
    notices.append(notice_raw)
    #summary = extract_summary_block(soup)
    notice_types.append(check_lot_type(notice_raw))

KeyboardInterrupt: 

In [None]:
notice_raw[150:]

'single-lot non-roman'

In [223]:
notice_raw.split("LOT-")

['\n \n \n \n \n 35/2024 \n \n \n 103477-2024 - Result       See the notice on TED website        \n   \n \n \n     103477-2024\n     \n \n \n     103477-2024 - Result \n \n      Austria – Detection and analysis apparatus – Ex-Ante Transparenzbekanntmachung Blutzellzähl- Diffenrenziersystem\n      \n \n      OJ S 35/2024 19/02/2024\n      \n \n      Contract or concession award notice – standard regime\n        Supplies \n \n \n \n    1.  Buyer \n \n \n \n 1.1.  \n \n \n Buyer \n \n \n Official name :\xa0 Österreichische Gesundheitskasse \n \n \n Email :\xa0 vergabeverfahren@oegk.at \n \n \n Legal type of the buyer :  Body governed by public law \n \n \n Activity of the contracting authority :  Health \n \n \n \n    2.  Procedure \n \n \n \n 2.1.  \n \n \n Procedure \n \n \n Title :\xa0 Ex-Ante Transparenzbekanntmachung Blutzellzähl- Diffenrenziersystem \n \n \n Description :\xa0 Beschaffung von vollautomatischen Blutzellzähl- und Differenziersystemen als Reparaturaustausch für die Med

In [212]:
lot_block = get_lot(notice_raw)
parse_section(lot_block, lot_fields, get_lot_sections(lot_block))

{'Title': 'Ein Stück High – End – Mikroskop',
 'Description': 'High-End-Mikroskop für die Chirurgie',
 'Internal identifier': 'Los 1',
 'Purpose': '',
 'Main nature of the contract': 'Supplies',
 'Main classification ( cpv )': '38510000 Microscopes',
 'Place of performance': 'Country : Austria Anywhere in the given country',
 'General information': 'Reserved participation : Participation is not reserved. Procurement Project not financed with EU Funds.',
 'The procurement is covered by the Government Procurement Agreement (GPA)': 'no This procurement is also suitable for small and medium-sized enterprises (SMEs)',
 'Information about previous notices': '',
 'Identifier of the previous notice': 'e9da3b98-2673-421e-94de-4cb2c536c467-01',
 'Identifier of the part of the previous notice': 'LOT-0001 5.1.9. Selection criteria Criterion : Type : Economic and financial standing The criteria will be used to select the candidates to be invited for the second stage of the procedure Information abo

In [231]:
notice_types#.value_counts()

['single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman',
 'single-lot non-roman']