In [16]:
from unstructured.partition.html import partition_html

In [41]:
from unstructured.partition.md import partition_md


In [28]:
def get_body(elements,
             start_marker='Earthworks, Retaining Walls, and Boundary Walls',
             end_marker='Urban Redevelopment Authority'):
    '''
    This function returns the slice of the document element list that corresponds to the main body of the webpage, in the absence of usage of proper body tags

    The list is scanned in order:
      1. Everything up to and including the first element whose text equals start_marker is skipped.
      2. The body starts at the first element after that whose category is 'Title'.
      3. The body ends just before the next element whose text equals end_marker. If no such element exists, the body runs to the end of the list.

    Parameters:
        elements (list): A list of Unstructured document objects (each exposing .text and .category)
        start_marker (string): Exact text of the page-header element that precedes the body
        end_marker (string): Exact text of the element that follows the body

    Returns:
        list: The sliced elements list containing the document objects from the main body of the webpage. Empty when start_marker is never found or when no 'Title' element follows it before the end of the body.
    '''

    # None (not 0) marks "not found yet": 0 is a valid index and, used as a slice
    # bound, silently produced an empty or header-only result.
    start = None
    end = None
    header_seen = False

    for i, element in enumerate(elements):

        if not header_seen:
            if element.text == start_marker:
                header_seen = True
            continue

        # The Title check comes first on purpose: while the body start is still
        # unknown, a Title element is taken as the start even if its text equals end_marker.
        if start is None and element.category == 'Title':
            start = i
            continue

        if element.text == end_marker:
            end = i
            break

    if start is None:
        # No body start located; return an empty slice of the same sequence type.
        return elements[:0]

    # end is None when the end marker is absent, which slices through to the last element.
    return elements[start:end]


def parse(input_html):
    '''
    This function fetches a webpage, partitions its HTML into Unstructured document objects and returns only the objects that belong to the main body of the page (as selected by get_body).

    Parameters:
        input_html (string): URL of the webpage desired

    Returns:
        list: The document objects from the main body of the webpage
    '''

    # A browser-like User-Agent is sent with the request.
    # NOTE(review): presumably the site rejects the default client User-Agent - confirm.
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}

    elements = partition_html(url=input_html, headers=request_headers)

    return get_body(elements)


In [42]:
# Partition the local Markdown file into Unstructured elements; the chunking cells below consume `elements`.
elements = partition_md(filename="../data/rc.md")

In [22]:
# Inspect the HTML ancestor tags recorded on the first parsed element.
# NOTE(review): `parsed` is assigned by the parse(...) cell further down, so this cell
# raises NameError under Restart & Run All unless that cell is moved above it.
parsed[0].ancestortags

('html', 'body', 'form', 'div', 'div', 'div')

In [18]:
# Exploration: list the attributes available on the first parsed element.
dir(parsed[0])

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_element_id',
 'ancestortags',
 'apply',
 'category',
 'convert_coordinates_to_new_system',
 'embeddings',
 'emphasized_texts',
 'id',
 'id_to_hash',
 'links',
 'metadata',
 'tag',
 'text',
 'text_as_html',
 'to_dict']

In [3]:
from unstructured.chunking.basic import chunk_elements

In [39]:
# Fetch the URA "RC Flat Roofs" guideline page and keep only its main-body elements.
# NOTE(review): get_body() looks for an element whose text is
# 'Earthworks, Retaining Walls, and Boundary Walls' - confirm this page contains it,
# otherwise the result is empty.
parsed = parse("https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Non-Residential/Transport/RC-Flat-Roofs")

In [47]:
# Print the text of every element partitioned from the Markdown file, one element per print call.
for text in elements:
    print(text.text)

RC Flat Roofs
RC flat roofs shall remain inaccessible except for maintenance purposes
only^1^. Activating the rooftop for uses such as roof terraces and
landscaped gardens may be allowed depending on the merits of the
proposal. No structures shall be allowed unless otherwise approved by
URA. Where allowed, structures (including any safety barriers) shall
comply with height controls^2^ and the following guidelines.
^1^ RC flat roofs that are accessible via ladders for maintenance
purposes only are subject to agencies' requirements for safety barriers
to ensure the safety of maintenance personnel. Safety barriers shall not
exceed 1m in height.
^2^ Height controls applicable, include:
Absolute technical height constraints^@^ (e.g. Aviation paths
  restrictions, military and telecommunications installations).
Conservation guidelines (available at URA
  SPACE)
Urban design height controls (available at URA
  SPACE)
^@^ The absolute technical height (Based on Singapore Height Datum
[SHD]) sh

In [43]:
# Basic chunking of the partitioned elements.
# NOTE(review): overlap=100 is presumably the number of characters carried over between
# chunks - confirm against the unstructured chunking documentation.
chunked = chunk_elements(elements, overlap=100)


In [45]:
from unstructured.chunking.title import chunk_by_title

# Title-based chunking of the same elements, using the library's default options.
chunks = chunk_by_title(elements)

In [46]:
# Show each title-based chunk between explicit delimiters so chunk boundaries are visible.
for items in chunks:
    print(f"CHUNK: \n{items.text}\nEND OF CHUNK \n")

CHUNK: 
RC Flat Roofs

RC flat roofs shall remain inaccessible except for maintenance purposes
only^1^. Activating the rooftop for uses such as roof terraces and
landscaped gardens may be allowed depending on the merits of the
proposal. No structures shall be allowed unless otherwise approved by
URA. Where allowed, structures (including any safety barriers) shall
comply with height controls^2^ and the following guidelines.
END OF CHUNK 

CHUNK: 
^1^ RC flat roofs that are accessible via ladders for maintenance
purposes only are subject to agencies' requirements for safety barriers
to ensure the safety of maintenance personnel. Safety barriers shall not
exceed 1m in height.

^2^ Height controls applicable, include:

Absolute technical height constraints^@^ (e.g. Aviation paths
  restrictions, military and telecommunications installations).

Conservation guidelines (available at URA
  SPACE)
END OF CHUNK 

CHUNK: 
Urban design height controls (available at URA
  SPACE)

^@^ The absolute 

In [20]:
# Print the text of every chunk produced by chunk_elements.
# NOTE(review): this cell's execution count is lower than that of the cell assigning
# `chunked`, so the stored output may come from an earlier value - re-run in order to confirm.
for items in chunked:
    print(items.text)

Release of 4th Quarter 2023 real estate statistics

Sale transaction volume for private residential properties decreased by 17% on a quarter-on-quarter basis in 4th Quarter 2023. The total transaction volume in 2023 fell by 13% compared to 2022, and was at its lowest level in seven years, since 2016.
Private residential property rentals declined for the first time in over three years, by 2.1% in 4th Quarter 2023. For 2023 as a whole, private residential property rentals increased by 8.7%, a significant moderation from the 29.7% increase in 2022.
About 4,100 private residential units (including ECs) were completed in 4th Quarter 2023. For the whole of 2023, a total of about 21,300 private residential units were completed, more than twice the number of completions in 2022. This was also the highest annual supply completion since 2016.
The Government has ramped up housing supply via the Government Land Sales (GLS) programme. The Confirmed List supply of private housing in the GLS Programm

In [6]:
# Flatten the parsed elements into one string. Every element's text is followed by a
# single space (including the last one), exactly as the incremental concatenation did;
# str.join builds the result in one pass instead of re-copying the string per element.
txt = "".join(el.text + " " for el in parsed)

In [8]:
# Persist the flattened text so it can be re-partitioned from disk.
# The context manager closes the file even if write() raises, and the explicit UTF-8
# encoding avoids UnicodeEncodeError on platforms whose default encoding cannot
# represent every character of the scraped text.
with open("demofile.txt", "w", encoding="utf-8") as f:
    f.write(txt)

In [9]:
from unstructured.partition.auto import partition

In [10]:
# Re-partition the flattened text file with the auto partitioner (unstructured.partition.auto).
part = partition("demofile.txt")

In [11]:
# Display each re-partitioned element followed by an empty line.
for el in part:
    print(el, end="\n\n")

Release of 4th Quarter 2023 real estate statistics Sale transaction volume for private residential properties decreased by 17% on a quarter-on-quarter basis in 4th Quarter 2023. The total transaction volume in 2023 fell by 13% compared to 2022, and was at its lowest level in seven years, since 2016. Private residential property rentals declined for the first time in over three years, by 2.1% in 4th Quarter 2023. For 2023 as a whole, private residential property rentals increased by 8.7%, a significant moderation from the 29.7% increase in 2022. About 4,100 private residential units (including ECs) were completed in 4th Quarter 2023. For the whole of 2023, a total of about 21,300 private residential units were completed, more than twice the number of completions in 2022. This was also the highest annual supply completion since 2016. The Government has ramped up housing supply via the Government Land Sales (GLS) programme. The Confirmed List supply of private housing in the GLS Programme

In [12]:
from openai import OpenAI

In [13]:
import os

# Read the API key from the environment instead of embedding it in the notebook.
# The previous placeholder (`api_key=#API_KEY`) was a syntax error: `#` starts a comment,
# which left the call unclosed. Raises KeyError if OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [14]:
def chunking(text):
    '''
    This function sends a text chunk to the chat completions API with instructions to split it into Markdown sections (h2 header plus text), prints the response followed by a blank line, and returns it.

    The request is made through the module-level `client`.

    Parameters:
        text (string): The document chunk to be split

    Returns:
        string: The message content of the first completion choice
    '''
    # `client` is only read here, so no `global` declaration is required.
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",
                "content": "I want you to be an expert in processing text into chunks for natural language processing . I am building a vector database of text and I have a document chunk in a string. I want you to split the chunk into header and text. Do not add anything to the paragraphs that is outside the string, and do not re-seqeunce the text, generate the header and text. Return only these header and text in Markdown with h2 headers without any pre-empted welcome or response message."
                },
            {"role": "user",
                "content": f"The text is as follows: {text}"}
        ]
    )

    completion_text = completion.choices[0].message.content
    print(completion_text)
    print()

    # Returned as well as printed so callers can store the generated Markdown.
    return completion_text

In [15]:
# Send each re-partitioned element's text through chunking(), one API request per element.
for el in part:
    chunking(el.text)

## Release of 4th Quarter 2023 real estate statistics
Sale transaction volume for private residential properties decreased by 17% on a quarter-on-quarter basis in 4th Quarter 2023. The total transaction volume in 2023 fell by 13% compared to 2022, and was at its lowest level in seven years, since 2016. Private residential property rentals declined for the first time in over three years, by 2.1% in 4th Quarter 2023. For 2023 as a whole, private residential property rentals increased by 8.7%, a significant moderation from the 29.7% increase in 2022.

## Private residential property completions
About 4,100 private residential units (including ECs) were completed in 4th Quarter 2023. For the whole of 2023, a total of about 21,300 private residential units were completed, more than twice the number of completions in 2022. This was also the highest annual supply completion since 2016.

## Government Land Sales (GLS) Programme
The Government has ramped up housing supply via the Government Lan