#### Scraping

In [119]:
import requests
from bs4 import BeautifulSoup


def scrape_sections(url):
    """
    Fetch ``url``, locate its main content, and split that content into sections.

    A section begins at the first <h1> found in the main content and at every
    <h2> that appears after it. <h2> tags before the first <h1>, and any <h1>
    after the first, do not start a section. A section's body is made of the
    heading's following siblings, up to (not including) the next section heading.

    Returns a list of dicts, one per section, in document order:
      [
        {
          "id":      "section-<index>",
          "title":   "Heading Text",
          "content": [{"type": "h1"|"h2", "text": "...plain text under heading..."}],
          "html":    "...raw HTML under heading...",
          "images":  [...every <img> tag as an HTML string...],
          "img_alt": [...alt text of every <img> that has an alt attribute...],
          "tables":  [...every <table>...</table> as an HTML string...],
          "codes":   [...every <code>...</code> as an HTML string...]
        },
        ...
      ]
    An empty list is returned when no main-content container can be found.

    Raises whatever ``requests`` raises for connection failures and timeouts,
    and (via ``raise_for_status``) for HTTP error status codes.
    """
    # 1) Fetch + parse. The timeout stops a stalled server from hanging the notebook.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # 2) Locate main content, falling back to progressively broader containers.
    # NOTE(review): find("content") matches a <content> *tag*; if an element with
    # id="content" was intended this should be find(id="content") - confirm.
    main = (
        soup.find("main")
        or soup.find(id="main")
        or soup.find("content")
        or soup.body
    )
    if main is None:
        return []

    # 3) Keep only the first <h1> and every <h2> that follows it
    headings = []
    seen_h1 = False
    for tag in main.find_all(["h1", "h2"]):
        if tag.name == "h1" and not seen_h1:
            headings.append(tag)
            seen_h1 = True
        elif tag.name == "h2" and seen_h1:
            headings.append(tag)

    # 4) Slice out each section's content
    output = []
    for idx, heading in enumerate(headings):
        # The next section heading (if any) marks where this section ends.
        end_tag = headings[idx + 1] if idx + 1 < len(headings) else None

        # Collect sibling nodes until the next heading. Only siblings are walked,
        # so a next heading nested under a different parent will not stop the scan.
        content_nodes = []
        for sib in heading.next_siblings:
            # Identity check: bs4 tags compare equal when they merely look alike,
            # and we want to stop at the actual next heading node.
            if sib is end_tag:
                break
            content_nodes.append(sib)

        # Re-parse just this section so the searches below are scoped to it.
        section_html = "".join(str(node) for node in content_nodes)
        section_soup = BeautifulSoup(section_html, "html.parser")

        output.append({
            "id": "section-" + str(idx),
            "title": heading.get_text(strip=True),
            "content": [{
                "type": heading.name,
                "text": section_soup.get_text(separator=" ", strip=True),
            }],
            "html": section_html,
            "images": [str(img) for img in section_soup.find_all("img")],
            # The alt *text* itself (not the whole tag), since this list is what
            # later steps present to the LLM as image alternate text.
            "img_alt": [str(img["alt"]) for img in section_soup.find_all("img", alt=True)],
            "tables": [str(tbl) for tbl in section_soup.find_all("table")],
            "codes": [str(code) for code in section_soup.find_all("code")],
        })

    return output


In [124]:
# Scrape the Gateway Load Balancer overview page into a list of section dicts.
sections = scrape_sections("https://learn.microsoft.com/en-us/azure/load-balancer/gateway-overview")

#### Adding Difficulty Info to Scraped Data

In [131]:
def _extract_first_json_object(text):
    """Return the first JSON object (as a dict) embedded anywhere in ``text``."""
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # Not valid JSON from this brace (e.g. "{{...}}" or prose); try the next one.
        start = text.find("{", start + 1)
    raise ValueError(f"No JSON object found in model response: {text!r}")


def difficulty(sections):
    """
    Annotate every section with the topic, purpose and difficult terms found by the LLM.

    For each section one chat completion is requested. The user message carries
    the text of the whole page (all section titles and texts joined together),
    the section's own text, its tables and its image alt texts. The first JSON
    object in the reply is stored under ``section["topic_difficult_terms"]``.

    Args:
        sections: list of section dicts. Each needs "title", "content"
            (a list whose first item has a "text" key), "tables" and "img_alt".

    Returns:
        The same ``sections`` list, mutated in place (returned for convenience).

    Raises:
        ValueError: if a reply contains no parseable JSON object.
    """
    # Nothing to annotate: skip credential acquisition and client creation.
    if not sections:
        return sections

    full_webpage_text = "\n\n".join(
        f"{section['title']}\n{section['content'][0]['text']}"
        for section in sections
    )

    # The response template below uses single braces: this string is sent as-is
    # (it is never passed through str.format), so doubled braces would reach the
    # model literally and invite a malformed "{{...}}" reply.
    difficulty_detection_prompt = """You are a 'difficultly detection' LLm. You are a component of a larger process designed to simplifiy texts. 

        ## Context
        ### The first LLM of this process is to identify the topic as well as difficult terms and the purpose of the document, the next LLM revises the text based on the user's expertise in the field to make it more comprehensible and easier to read, and the last LLM iteratively improves on the summarization to ensure certain quality requirements are met. 

        **You are the first step in this process, and so your job is to identify the topic and the difficult terms and the overall purpose of the document.**

        ## Instructions
        - You will be provided two bits of information: The full webpage text where the chunk comes from, and the specific text chunk that will later be summarized.

        - You may also be given the alternate text for images or tables if those were present in the section chunk. If those are empty, assume there were no images or tables.

        - **Generate the difficult terms from the text chink. If the tables or alternate image text are available, consider them as part of the text chunk, and use them for listing the difficult terms as well**

        - **Use the full webpage text to detemine the topic. DO NOT USE IT FOR THE DIFFICULT TERMS LIST. Also use the full webpage text to determine the purpose of the text.**

        - Format your response as a valid JSON string, where strings are wrapped in double quotes, using backslash escapes.

        **Structure your response according to the following template: {"topic" : "__topic__", "purpose_of_document" : "__purpose__", "difficult_terms" : []}**

        The topic and purpose must be returned as strings, and the difficult terms must be a list of just the terms. Do not include any other information.

        """

    token = DefaultAzureCredential().get_token("https://cognitiveservices.azure.com/.default").token

    endpoint = "https://aoai-l-eastus2.openai.azure.com/"
    deployment_name = "gpt-4.1"
    api_version = "2024-12-01-preview"

    # NOTE(review): an Entra ID access token is passed as api_key; the SDK also
    # offers azure_ad_token / azure_ad_token_provider - confirm which is intended.
    llm = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=token,
    )

    for section in sections:
        specific_text = section['content'][0]['text']
        tables = section["tables"]
        image_alt_text = section["img_alt"]

        response = llm.chat.completions.create(
            model=deployment_name,
            messages=[
                {
                    "role": "system",
                    "content": difficulty_detection_prompt
                },
                {
                    "role": "user",
                    "content": f"## Full Webpage Text\n{full_webpage_text}\n## Specific Text\n{specific_text}\n## Tables\n{tables}\n## Image Alt Texts\n{image_alt_text}\n"
                }
            ],
            max_tokens=4096
        )

        # The reply may wrap the JSON in prose or code fences, and string values
        # may themselves contain braces, so decode instead of regex-matching.
        response_content = response.choices[0].message.content or ""
        section["topic_difficult_terms"] = _extract_first_json_object(response_content)

    return sections


In [133]:

from azure.identity import DefaultAzureCredential
from openai import AzureOpenAI
import re
import json

def intermediate_rewrite(sections):
    """
    Rewrite each section's text with the LLM for an intermediate-level reader.

    One chat completion is requested per section. The user message is a JSON
    payload with the section's title, text, image alt texts, original HTML, and
    the topic / purpose / difficult terms stored under "topic_difficult_terms".

    Args:
        sections: list of section dicts. Each needs "id", "title", "content"
            (a list whose first item has a "text" key), "img_alt", "html" and
            "topic_difficult_terms" (a dict with "topic", "purpose_of_document"
            and "difficult_terms").

    Returns:
        A new list with one dict per input section:
        {'id': ..., 'title': ..., 'content': [{'type': 'paragraph', 'text': <LLM reply>}]}.
        The input sections are only read, never modified.

    Raises:
        KeyError: if a section has no "topic_difficult_terms" entry.
    """
    # Nothing to rewrite: skip credential acquisition and client creation.
    if not sections:
        return []

    token = DefaultAzureCredential().get_token("https://cognitiveservices.azure.com/.default").token

    endpoint = "https://aoai-l-eastus2.openai.azure.com/"
    deployment_name = "gpt-4.1"
    api_version = "2024-12-01-preview"

    # NOTE(review): an Entra ID access token is passed as api_key; the SDK also
    # offers azure_ad_token / azure_ad_token_provider - confirm which is intended.
    llm = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=token,
    )

    # Two fixes relative to the earlier prompt text: the image-alt field is named
    # 'img_alt' to match the payload key, and the "OMIT ALL CODE" bullet no longer
    # ends in a backslash (a line continuation that merged it with the next bullet).
    system_prompt = """
    You are a content revision and teaching assistant. You are a component of a larger process designed to simplify complicated documentations.

    ## Context
    You are given a section of a technical documentation and the content of the whole page for context. Your task is to simplify the content of that section only, making it easier to understand for a user who is a professional or student in the field, but not familiar with this particular topic.

    ## Instructions
    - **You need to re-write the content given in 'text' only**
    - You have access to the section title ('title'), text content ('text'), image alt ('img_alt') texts of the images within the section, the main topic of the page ('topic'), the purpose of the document (purpose_of_document), the difficult terms in the document (difficult_terms) and the original html of the section in 'html'.
    - Use simple language and short sentences.
    - Do not add any new information and do not omit any existing information, except the CODE. **OMIT ALL CODE**
    - **Retain the hyperlinks and the specific words hyperlinked and keep them in the same <a> wrapper like <a data-linktype="external" href="https://azure.microsoft.com/pricing/details/load-balancer/">Load Balancer pricing</a>**
    - Analyse the purpose of the document in "purpose_of_document" to decide if you need to explain the difficult terms given in "difficult_terms" appearing within that section, for example if it is an instructional document, don't explain the difficult terms; if it's a tutorial, explain a bit depending on the professional/student's potential knowledge.
    - Assume that after the revised section content, the ORIGINAL code, then the images, then the original tables will be displayed, in that order. **Integrate ONLY the references and explanations to them in the revised content, RETURN ONLY PLAIN TEXT CONTENT.**
    - If 'text' is empty, return empty string.
    - Use only these HTML tags to improve the output - <ul>, <li>, <b> and <i> when making points

    - **Return just the revised section content, without any additional text or headings or any text formatting.**
    """

    sections_interm = []
    for section in sections:
        # The metadata is produced by a separate step; fail with a hint if it is missing.
        try:
            metadata = section["topic_difficult_terms"]
        except KeyError:
            raise KeyError(
                f"section {section.get('id')!r} has no 'topic_difficult_terms'; "
                "run difficulty(sections) before rewriting"
            ) from None

        # 1) Build a structured payload
        payload = {
            "title": section["title"],
            "text": section['content'][0]['text'],
            "img_alt": section["img_alt"],
            "topic": metadata["topic"],
            "purpose_of_document": metadata["purpose_of_document"],
            "difficult_terms": metadata["difficult_terms"],
            'html': section['html']
        }

        # 2) Serialize to JSON and 3) send to the LLM
        response = llm.chat.completions.create(
            model=deployment_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": json.dumps(payload)}
            ],
            max_tokens=4096,
            temperature=1.0,
            top_p=1.0,
        )

        # message.content can be None (no text returned); store "" in that case.
        rewritten_text = response.choices[0].message.content or ""
        sections_interm.append({
            'id': section['id'],
            'title': section['title'],
            'content': [{'type': 'paragraph', 'text': rewritten_text}],
        })

    return sections_interm






In [134]:
# Intermediate-level rewrite of every scraped section (one LLM call per section).
# Each section must already carry "topic_difficult_terms", which intermediate_rewrite reads.
sections_intermediate = intermediate_rewrite(sections)

In [147]:

from azure.identity import DefaultAzureCredential
from openai import AzureOpenAI
import re
import json

def beginner_rewrite(sections):
    """
    Rewrite each section's text with the LLM for a reader with no background in the field.

    One chat completion is requested per section. The user message is a JSON
    payload with the section's title, text, image alt texts, original HTML, and
    the topic / purpose / difficult terms stored under "topic_difficult_terms".

    Args:
        sections: list of section dicts. Each needs "id", "title", "content"
            (a list whose first item has a "text" key), "img_alt", "html" and
            "topic_difficult_terms" (a dict with "topic", "purpose_of_document"
            and "difficult_terms").

    Returns:
        A new list with one dict per input section:
        {'id': ..., 'title': ..., 'content': [{'type': 'paragraph', 'text': <LLM reply>}]}.
        The input sections are only read, never modified.

    Raises:
        KeyError: if a section has no "topic_difficult_terms" entry.
    """
    # Nothing to rewrite: skip credential acquisition and client creation.
    if not sections:
        return []

    token = DefaultAzureCredential().get_token("https://cognitiveservices.azure.com/.default").token

    endpoint = "https://aoai-l-eastus2.openai.azure.com/"
    deployment_name = "gpt-4.1"
    api_version = "2024-12-01-preview"

    # NOTE(review): an Entra ID access token is passed as api_key; the SDK also
    # offers azure_ad_token / azure_ad_token_provider - confirm which is intended.
    llm = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=token,
    )

    # Two fixes relative to the earlier prompt text: the image-alt field is named
    # 'img_alt' to match the payload key, and the "OMIT ALL CODE" bullet no longer
    # ends in a backslash (a line continuation that merged it with the next bullet).
    # NOTE(review): the payload also carries 'html', which this prompt never mentions.
    system_prompt = """
    You are a content revision and teaching assistant. You are a component of a larger process designed to simplify complicated documentations.

    ## Context
    You are given a section of a technical documentation and the content of the whole page for context. Your task is to simplify the content of that section only, making it so simple and easy to understand that even a user who is **Zero knowledge** of the field, **No familiarity** with the topic.

    ## Instructions
    - **You need to re-write the content given in 'text' only**
    - You have access to the section title ('title'), text content ('text'), image alt ('img_alt') texts of the images within the section, the main topic of the page ('topic'), the purpose of the document ('purpose_of_document'), the difficult terms in the document ('difficult_terms').
    - Use extremely simple language, examples and short sentences, fit for anyone with basic education to understand.
    - Do not add any new information and do not omit any existing information, except the CODE. **OMIT ALL CODE FROM TEXT**
    - **Retain the hyperlinks and the specific words hyperlinked and keep them in the same <a> wrapper like <a data-linktype="external" href="https://azure.microsoft.com/pricing/details/load-balancer/">Load Balancer pricing</a>**
    - Analyse the purpose of the document in "purpose_of_document" and explain the difficult terms given in "difficult_terms" appearing within that section keeping the purpose in mind.
    - Assume that after the revised section content, the ORIGINAL code, then the images, then the original tables will be displayed, in that order. **Integrate ONLY the references and explanations to them in the revised content, RETURN ONLY PLAIN TEXT CONTENT.**
    - If 'text' is empty, return empty string.
    - Use only these HTML tags to improve the output - <ul>, <li>, <b> and <i> when making points



    - **Return just the revised section content AS PLAIN TEXT, without any additional text or explanations.**
    """

    sections_beg = []
    for section in sections:
        # The metadata is produced by a separate step; fail with a hint if it is missing.
        try:
            metadata = section["topic_difficult_terms"]
        except KeyError:
            raise KeyError(
                f"section {section.get('id')!r} has no 'topic_difficult_terms'; "
                "run difficulty(sections) before rewriting"
            ) from None

        # 1) Build a structured payload
        payload = {
            "title": section["title"],
            "text": section['content'][0]['text'],
            "img_alt": section["img_alt"],
            "topic": metadata["topic"],
            "purpose_of_document": metadata["purpose_of_document"],
            "difficult_terms": metadata["difficult_terms"],
            'html': section['html']
        }

        # 2) Serialize to JSON and 3) send to the LLM
        response = llm.chat.completions.create(
            model=deployment_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": json.dumps(payload)}
            ],
            max_tokens=4096,
            temperature=1.0,
            top_p=1.0,
        )

        # message.content can be None (no text returned); store "" in that case.
        rewritten_text = response.choices[0].message.content or ""
        sections_beg.append({
            'id': section['id'],
            'title': section['title'],
            'content': [{'type': 'paragraph', 'text': rewritten_text}],
        })

    return sections_beg




In [None]:
# Beginner-level rewrite of every scraped section (one LLM call per section).
# Each section must already carry "topic_difficult_terms", which beginner_rewrite reads.
sections_beginner = beginner_rewrite(sections)

#### Summarizers

In [154]:
import requests
from bs4 import BeautifulSoup

# Page to summarize: the same Gateway Load Balancer overview URL that was scraped into `sections`.
url = "https://learn.microsoft.com/en-us/azure/load-balancer/gateway-overview"




def summarizer_func(url):
    """
    Fetch a page and ask the LLM for three summaries of its text.

    The page's full text (``soup.get_text()`` of the whole document) is sent
    three times, each time with a different system prompt: one for readers with
    no knowledge of the field, one for professionals/students new to the topic,
    and one plain summary.

    Args:
        url: address of the page to summarize.

    Returns:
        A dict with the raw LLM replies under the keys 'beginner_level_summary',
        'intermediate_level_summary' and 'advanced_level_summary'.

    Raises whatever ``requests`` raises for connection failures and timeouts,
    and (via ``raise_for_status``) for HTTP error status codes.
    """
    # The timeout stops a stalled server from hanging the notebook; the status
    # check stops an HTTP error page from being summarized as if it were the doc.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    content = soup.get_text()

    token = DefaultAzureCredential().get_token("https://cognitiveservices.azure.com/.default").token

    endpoint = "https://aoai-l-eastus2.openai.azure.com/"
    deployment_name = "gpt-4.1"
    api_version = "2024-12-01-preview"

    # NOTE(review): an Entra ID access token is passed as api_key; the SDK also
    # offers azure_ad_token / azure_ad_token_provider - confirm which is intended.
    llm = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=endpoint,
        api_key=token,
    )

    # In all three prompts the "Do not add any new information" bullet no longer
    # ends in a backslash (a line continuation that merged it with the next bullet).
    # NOTE(review): the prompts refer to "purpose_of_document", but the user message
    # is only the page text - no such field is supplied. Confirm the intent.
    beginner_prompt = """
    You are a content summarization and teaching assistant. You are a component of a larger process designed to simplify complicated documentations.

    ## Context
    You are given the content of a technical documentation. Your task is to simplify the content, making it so simple and easy to understand that even a user who is **Zero knowledge** of the field, **No familiarity** with the topic.

    ## Instructions
    - **You need to generate a **short**, clear summary, explaining the content point-wise, providing only the most important details from the document**
    - Use extremely simple language, examples and short sentences, fit for anyone with basic education to understand.
    - Do not add any new information
    - Analyse the purpose of the document in "purpose_of_document" and keep it in mind for the summary.
    - Use only these HTML tags to improve the output - <ul>, <li>, <b> and <i> when making points



    - **Return just the summary AS PLAIN TEXT, without any additional text or explanations.**
    """

    intermediate_prompt = """
    You are a content summarization and teaching assistant. You are a component of a larger process designed to simplify complicated documentations.

    ## Context
    You are given the content of a technical documentation. Your task is to simplify the content, making it simple and easy to understand for a professional or student in the field, but not having familiarity with the particular topic.

    ## Instructions
    - **You need to generate a **short**, clear summary, explaining the content point-wise, providing only the most important details from the document**
    - Use simple language, examples and short sentences, fit for anyone with basic knowledge to understand.
    - Do not add any new information
    - Analyse the purpose of the document in "purpose_of_document" and keep it in mind for the summary.
    - Use only these HTML tags to improve the output - <ul>, <li>, <b> and <i> when making points


    - **Return just the summary AS PLAIN TEXT, without any additional text or HEADING, start DIRECTLY AT THE SUMMARY.**
    """

    advanced_prompt = """
    You are a content summarization and teaching assistant. You are a component of a larger process designed to simplify complicated documentations.

    ## Context
    You are given the content of a technical documentation. Your task is to summarize the content.

    ## Instructions
    - **You need to generate a **short**, clear summary, explaining the content point-wise, providing only the most important details from the document**
    - Do not add any new information
    - Analyse the purpose of the document in "purpose_of_document" and keep it in mind for the summary.
    - Use only these HTML tags to improve the output - <ul>, <li>, <b> and <i> when making points


    - **Return just the summary AS PLAIN TEXT, without any additional text or HEADING, start DIRECTLY AT THE SUMMARY.**
    """

    # Output key -> system prompt; dict order fixes both call order and result order.
    prompt_by_level = {
        'beginner_level_summary': beginner_prompt,
        'intermediate_level_summary': intermediate_prompt,
        'advanced_level_summary': advanced_prompt,
    }

    summaries = {}
    for level_key, level_prompt in prompt_by_level.items():
        completion = llm.chat.completions.create(
            model=deployment_name,
            messages=[
                {"role": "system", "content": level_prompt},
                {"role": "user", "content": content}
            ],
            max_tokens=4096,
            temperature=1.0,
            top_p=1.0,
        )
        summaries[level_key] = completion.choices[0].message.content

    return summaries





In [155]:
# Generate the beginner / intermediate / advanced page summaries (three LLM calls).
summaries = summarizer_func(url)

In [156]:
# Inspect the beginner-level summary returned by summarizer_func.
summaries['beginner_level_summary']

'<ul>\n  <li><b>Gateway Load Balancer</b> is a special tool in Microsoft Azure that helps you control and manage network traffic for your applications.</li>\n  <li><b>Main Use:</b> It helps you add extra security or analysis tools (like firewalls or systems that spot bad traffic) to your network easily.</li>\n  <li><b>Easy to Use:</b> You can add or remove these tools without much extra work.</li>\n  <li><b>How It Works:</b> All traffic going to your app goes first through these tools, making sure everything is checked and protected.</li>\n  <li><b>Types of Tools Supported:</b> Firewalls, security scanners, tracking traffic, and more.</li>\n  <li><b>Reliable and Efficient:</b> It keeps traffic organized, so information goes out and comes back the same way, making things stable and fast.</li>\n  <li><b>Easy Scaling:</b> You can make your system bigger or smaller as needed without stopping everything.</li>\n  <li><b>Works Across Companies:</b> You can connect apps and tools even if they 

In [157]:
# Inspect the intermediate-level summary returned by summarizer_func.
summaries['intermediate_level_summary']

'<ul>\n  <li><b>Gateway Load Balancer</b> is a type (SKU) of Azure Load Balancer designed to add, manage, and scale high-performance third-party Network Virtual Appliances (NVAs) like firewalls, traffic analyzers, or DDoS protection.</li>\n  <li><b>Easy Integration:</b> Lets you transparently insert NVAs into your application’s network path, so all traffic to/from your app goes through these appliances without complex setup.</li>\n  <li><b>Service Chaining:</b> You can chain a Gateway Load Balancer to public endpoints with just one configuration step, ensuring advanced network functions are always included.</li>\n  <li><b>Maintains Flow Symmetry:</b> Ensures network flows remain symmetrical (both directions use the same path), which is required for most NVAs to work properly.</li>\n  <li><b>Automatic Scaling:</b> Easily scale up or down by adding/removing backend VMs; load balancer automatically updates itself with no extra configuration needed.</li>\n  <li><b>Works with Both Traffic D

In [158]:
# Inspect the advanced-level summary returned by summarizer_func.
summaries['advanced_level_summary']

'<ul>\n  <li><b>Purpose:</b> Gateway Load Balancer is an Azure Load Balancer SKU designed for high performance/high availability with third-party Network Virtual Appliances (NVAs), enabling easy deployment, scaling, and management of NVAs in network paths.</li>\n  <li><b>Key Scenarios:</b> Enables transparent insertion of appliances for use cases such as firewalls, packet analytics, intrusion protection, traffic mirroring, DDoS, and custom appliances.</li>\n  <li><b>Operational Model:</b> Ensures all traffic to/from a public endpoint passes through the appliance before reaching its application using "bump-in-the-wire" technology, maintaining flow symmetry and stickiness for seamless NVA operation.</li>\n  <li><b>Technical Details:</b>\n    <ul>\n      <li>Maintains flow stickiness and symmetry for consistent routes, allowing appliances to operate properly without extra configuration.</li>\n      <li>Health probes monitor all ports and traffic is managed via HA ports rule.</li>\n      <

In [170]:
# Advanced level: no LLM rewrite. Each scraped section's original HTML is wrapped in
# the same {id, title, content: [{type, text}]} shape the rewrite functions return.
advanced_sections = []

for sec in sections:
    # 'text' holds the section's raw HTML (sec['html']), labelled as a 'paragraph'.
    new_sec = {'id' : sec['id'], 'title' : sec['title'], 'content' : [{'type' : 'paragraph', 'text' : sec['html']}]}
    advanced_sections.append(new_sec)

#### Creating Final JSONs with Summaries

In [None]:
# Start the final document with the three page-level summaries, each wrapped as a
# 'paragraph' item, ordered beginner -> intermediate -> advanced.
final_json = {
    'summaries': {
        'content': [
            {'type': 'paragraph', 'text': summaries[level_key]}
            for level_key in (
                'beginner_level_summary',
                'intermediate_level_summary',
                'advanced_level_summary',
            )
        ]
    }
}

In [182]:
# Merge the three per-level section lists into one list of sections.
# The lists are matched purely by position: index i is read from all three, and the
# loop length comes from sections_beginner, so a shorter list raises IndexError.
sections_combined = []
for i in range(len(sections_beginner)):
    section = {}
    # id and title are taken from the beginner list.
    section["id"] = sections_beginner[i]["id"]
    section["title"] = sections_beginner[i]["title"]
    # Content order is fixed: beginner, intermediate, advanced.
    content = [sections_beginner[i]["content"][0], sections_intermediate[i]["content"][0], advanced_sections[i]["content"][0]]
    section["content"] = content

    sections_combined.append(section)

In [183]:
# Inspect the merged per-level sections.
sections_combined

[{'id': 'section-0',
  'title': 'Gateway Load Balancer',
  'content': [{'type': 'paragraph',
    'text': 'Gateway Load Balancer is a special type of Azure Load Balancer. It is designed to help your apps work fast and stay available, especially when you use tools made by other companies called Network Virtual Appliances (NVAs).\n\nWith Gateway Load Balancer, you can:\n<ul>\n  <li>Set up, grow, or manage NVAs easily.</li>\n  <li>Connect your public internet address to the Load Balancer with just one step.</li>\n</ul>\n\nYou can use Gateway Load Balancer to add things like:\n<ul>\n  <li>Firewalls (help block unwanted traffic)</li>\n  <li>Advanced packet analytics (look closely at the data traveling through your network)</li>\n  <li>Intrusion detection and prevention systems (spot and stop attacks)</li>\n  <li>Traffic mirroring (make copies of network traffic for checking)</li>\n  <li>DDoS protection (protect against attacks that flood your service)</li>\n  <li>Custom appliances (other too

In [184]:
# Attach the merged sections next to the 'summaries' entry already in final_json.
final_json["sections"] = sections_combined

In [185]:
# 1) Show the final document in the notebook.
# json.dumps keeps its default ensure_ascii=True here, so non-ASCII characters are
# printed as \uXXXX escapes.
json_str = json.dumps(final_json, indent=4)  # pretty-prints with 4-space indent
print(json_str)

# 2) Save the same document to final_version.json as UTF-8.
# ensure_ascii=False writes non-ASCII characters as-is instead of escaping them.
with open("final_version.json", "w", encoding="utf-8") as f:
    json.dump(final_json, f, ensure_ascii=False, indent=4)

{
    "summaries": {
        "content": [
            {
                "type": "paragraph",
                "text": "<ul>\n  <li><b>Gateway Load Balancer</b> is a special tool in Microsoft Azure that helps you control and manage network traffic for your applications.</li>\n  <li><b>Main Use:</b> It helps you add extra security or analysis tools (like firewalls or systems that spot bad traffic) to your network easily.</li>\n  <li><b>Easy to Use:</b> You can add or remove these tools without much extra work.</li>\n  <li><b>How It Works:</b> All traffic going to your app goes first through these tools, making sure everything is checked and protected.</li>\n  <li><b>Types of Tools Supported:</b> Firewalls, security scanners, tracking traffic, and more.</li>\n  <li><b>Reliable and Efficient:</b> It keeps traffic organized, so information goes out and comes back the same way, making things stable and fast.</li>\n  <li><b>Easy Scaling:</b> You can make your system bigger or smaller as needed