In [1]:
%pip install bs4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


#### Raw HTML from the Page

In [2]:
import requests
from bs4 import BeautifulSoup

url = "https://learn.microsoft.com/en-us/azure/ai-services/openai/tutorials/embeddings?tabs=python-new%2Ccommand-line&pivots=programming-language-python"
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop here on 4xx/5xx instead of parsing an error page as if it were the article.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
# get_text() drops all markup and returns the page's text as one string.
content = soup.get_text()


#### Organized HTML from the Page

In [3]:
import requests
from bs4 import BeautifulSoup

def scrape_sections(url, timeout=30):
    """
    Fetch URL, parse its main content, and split it into sections at the first
    <h1> and every <h2> that follows it.

    Args:
        url: Address of the page to scrape.
        timeout: Seconds `requests` waits on the server before giving up.

    Returns:
        A list of dicts, one per heading, in document order:
          [
            {
              "title":   "Heading Text",
              "tag":     "h1"|"h2",
              "text":    "...plain text under heading...",
              "html":    "...raw HTML under heading...",
              "images":  [...each <img> tag as an HTML string...],
              "img_alt": [...alt text of each <img> that has an alt attribute...],
              "tables":  [...each <table>...</table> as an HTML string...],
              "codes":   [...each <code>...</code> as an HTML string...]
            },
            ...
          ]
        An empty list is returned when no content container can be located.

    Raises:
        requests.HTTPError: the server answered with a 4xx/5xx status.
        requests.RequestException: the request failed or timed out.

    Note:
        A section's content is gathered from the heading's *following siblings*
        only. A heading that is wrapped in its own container therefore yields an
        empty section, and if the next heading is not a sibling the section runs
        to the end of the current parent.
    """
    # 1) Fetch + parse
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # 2) Locate main content, trying the most specific container first.
    main = (
        soup.find("main")
        or soup.find(id="main")
        # or soup.find("article")
        # NOTE(review): this matches a <content> element, not id="content" — confirm intent.
        or soup.find("content")
        or soup.body
    )
    if main is None:
        return []

    # 3) Gather only the first <h1> and then all <h2> that follow it
    sections = []
    seen_h1 = False
    for tag in main.find_all(["h1", "h2"]):
        if tag.name == "h1" and not seen_h1:
            sections.append(tag)
            seen_h1 = True
        elif tag.name == "h2" and seen_h1:
            sections.append(tag)

    # 4) Slice out each section's content
    output = []
    for idx, heading in enumerate(sections):
        # The next kept heading (if any) marks where this section ends.
        end_tag = sections[idx + 1] if idx + 1 < len(sections) else None

        # Collect sibling nodes up to the next heading. Identity (`is`) is required:
        # `==` on Beautiful Soup tags compares markup, so a look-alike node would
        # be mistaken for the end marker.
        content_nodes = []
        for sib in heading.next_siblings:
            if sib is end_tag:
                break
            content_nodes.append(sib)

        # Build a mini-soup of just that section so the searches below stay scoped to it.
        section_html = "".join(str(node) for node in content_nodes)
        section_soup = BeautifulSoup(section_html, "html.parser")

        output.append({
            "title": heading.get_text(strip=True),
            "tag": heading.name,
            "text": section_soup.get_text(separator=" ", strip=True),
            "html": section_html,
            "images": [str(img) for img in section_soup.find_all("img")],
            # Alt *text* only; images without an alt attribute are left out.
            "img_alt": [img["alt"] for img in section_soup.find_all("img", alt=True)],
            "tables": [str(tbl) for tbl in section_soup.find_all("table")],
            "codes": [str(code) for code in section_soup.find_all("code")],
        })

    return output


In [4]:
sections = scrape_sections("https://learn.microsoft.com/en-us/azure/ai-services/openai/tutorials/embeddings?tabs=python-new%2Ccommand-line&pivots=programming-language-python")
# Optional quick preview (left disabled). Each element of `sections` is a flat dict
# returned by scrape_sections, so the fields are read straight from `sec`:
# for sec in sections:
#     print(sec["title"], sec["tag"], "→", sec["text"][:80], "…")
#     print("Images:", sec["images"])
#     print("Tables:", sec["tables"])
#     print("Codes:", sec["codes"])
#     print("Image Alts:", sec["img_alt"])
#     print()


In [5]:
# Show the scraped result: a list with one dict per heading, as returned by scrape_sections.
sections

[{'title': 'Tutorial: Explore Azure OpenAI in Azure AI Foundry Models embeddings and document search',
  'tag': 'h1',
  'text': '',
  'html': '',
  'images': [],
  'img_alt': [],
  'tables': [],
  'codes': []},
 {'title': 'In this article',
  'tag': 'h2',
  'text': '',
  'html': '\n',
  'images': [],
  'img_alt': [],
  'tables': [],
  'codes': []},
 {'title': 'Prerequisites',
  'tag': 'h2',
  'text': "An Azure subscription - Create one for free An Azure OpenAI resource with the text-embedding-ada-002 (Version 2) model deployed. This model is currently only available in certain regions .  If you don't have a resource the process of creating one is documented in our resource deployment guide . Python 3.8 or later version The following Python libraries: openai, num2words, matplotlib, plotly, scipy, scikit-learn, pandas, tiktoken. Jupyter Notebooks",
  'html': '\n<ul>\n<li>An Azure subscription - <a data-linktype="external" href="https://azure.microsoft.com/free/cognitive-services?azure-po

##### We need the output in this exact format. It will be kept consistent: after LLM summarization, the backend will receive the output in the same format, but with the additional keys "begginer_level_summary", "intermediate_level_summary", and "advanced_level_summary".

##### LLM Code (Ignore)

In [6]:
from azure.identity import DefaultAzureCredential
from openai import AzureOpenAI



# Acquire a Microsoft Entra ID access token for the Cognitive Services scope.
# The token is fetched once here and is not refreshed; for long sessions pass
# `azure_ad_token_provider=` to the client instead.
token = DefaultAzureCredential().get_token("https://cognitiveservices.azure.com/.default").token

endpoint = "https://aoai-l-eastus2.openai.azure.com/"
model_name = "gpt-4.1"
deployment_name = "gpt-4.1"
api_version = "2024-12-01-preview"

llm = AzureOpenAI(
    api_version=api_version,
    azure_endpoint=endpoint,
    # An Entra ID bearer token must go through `azure_ad_token`; `api_key` is
    # reserved for the resource's static key and is sent in a different header.
    azure_ad_token=token,
)

ModuleNotFoundError: No module named 'azure'

In [None]:
# List the keys of the first scraped section. scrape_sections builds each dict with the
# keys title, tag, text, html, images, img_alt, tables and codes; the recorded output
# below appears to come from the earlier `{title: info}` shape — re-run to refresh it.
sections[0].keys()

dict_keys(['Tutorial: Explore Azure OpenAI in Azure AI Foundry Models embeddings and document search'])

In [None]:
# Simplified text per section, in document order. Sections with no body are skipped.
section_summaries = []

for section in sections:
    # Each section is a flat dict, so its body is read directly from the "html" key.
    section_html = section["html"]
    if not section_html.strip():
        # Nothing under this heading (e.g. only whitespace) — no point calling the model.
        continue

    # The chat API expects message content as a string or a list of content parts.
    user_content = [{"type": "text", "text": section_html}]

    response = llm.chat.completions.create(
        model= deployment_name,
        messages=[
            {
            "role":"system",
            "content": '''Simplify the content of the following section. \
                            Use simple language and short sentences. \
                            Do not add any new information. \
                            Use the HTML code of the section for context
                '''},
            {
            "role":"user",
            "content": user_content
            }
        ],
        max_tokens=4096,
        temperature = 1.0,
        top_p = 1.0
        )

    # Keep every answer; previously each iteration overwrote the one before it.
    section_summaries.append({
        "title": section["title"],
        "summary": response.choices[0].message.content,
    })
