In [None]:
## Project Overview
This project combines web scraping with OpenAI’s GPT models to summarize online training content. It extracts material from Microsoft’s **Quantum Computing Fundamentals** learning path, cleans it, and generates concise summaries per lesson as well as an overall course summary.  

## Key Features
- Fetches and parses webpages using **requests** and **BeautifulSoup**  
- Produces summaries in multiple languages (e.g., English, Spanish, or any other language) and at varying levels of detail (short, medium, detailed)  
- Summarizes individual lessons on demand or processes entire learning paths  
- Presents results as clean, structured **Markdown** directly in the notebook  

## Tech Stack
- **Model**: GPT-4o-mini  
- **Language**: Python  
- **Libraries**: BeautifulSoup, OpenAI  

## Purpose
This project demonstrates how AI can streamline the understanding of technical documentation and online courses by generating multilingual, customizable summaries.  


In [None]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

# If you get an error running this cell, then please head over to the troubleshooting notebook!

In [None]:
# Load environment variables from .env file (not included)

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Check the key. The whitespace test runs before the prefix test so that a key
# with a stray leading space or tab is reported as a whitespace problem rather
# than as having the wrong prefix.

if not api_key:
    print("No API key was found")
elif api_key.strip() != api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them")
elif not api_key.startswith("sk-proj-"):
    print("An API key was found, but it doesn't start sk-proj-; please check you're using the right key")
else:
    print("API key found and looks good so far!")


In [None]:
# Create the API client used by the summarization cells below. No key is passed
# explicitly; presumably the client reads OPENAI_API_KEY from the environment
# loaded in the previous cell — confirm.
openai = OpenAI()

# If this doesn't work, try Kernel menu >> Restart Kernel and Clear Outputs Of All Cells, then run the cells from the top of this notebook down.


In [None]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A fetched web page reduced to its title and its text content.

    Attributes:
        url: the address that was requested
        title: the page's <title> text, or "No title found" when it is missing or empty
        text: newline-separated page text with script, style, img and input elements removed
    """

    def __init__(self, url, timeout=30):
        """
        Create this Website object from the given url using the BeautifulSoup library

        Args:
            url: address of the page to download.
            timeout: seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Raises:
            requests.HTTPError: the server answered with a 4xx/5xx status.
            requests.RequestException: the request failed or timed out.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly on error responses instead of summarizing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # <title> may be absent, or present without a single string inside it.
        has_title = soup.title is not None and soup.title.string
        self.title = soup.title.string if has_title else "No title found"

        # Documents without a <body> fall back to the whole parsed tree.
        root = soup.body if soup.body is not None else soup
        for irrelevant in root.find_all(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = root.get_text(separator="\n", strip=True)

In [None]:


# Fetch the learning-path landing page and show what was extracted from it:
# the title first, then the cleaned page text.
training_website = Website("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")
print(training_website.title, training_website.text, sep="\n")

In [None]:
# Create a system prompt function that can use different language and length 

def build_system_prompt(language="Spanish", length="short"):
    """
    Build the system prompt that tells the model how to summarize a page.

    Args:
        language: language the model is asked to respond in.
        length: desired level of detail. "short", "medium" and "detailed"
            (case-insensitive) each add an explicit word cap; any other value
            is passed through as a free-form description with no cap.

    Returns:
        The system prompt as a single string.
    """
    # Word caps per level of detail. "short" keeps the 20-word cap this prompt
    # has always used; the larger budgets are starting points and can be tuned.
    word_caps = {"short": 20, "medium": 100, "detailed": 300}
    cap = word_caps.get(str(length).strip().lower())

    prompt = (
        f"You are an assistant that analyzes the contents of a website and provides a {length} summary, "
        "ignoring text that might be navigation related.\n"
    )
    if cap is None:
        # Unknown level of detail: let the description above govern the length.
        prompt += f"Respond in markdown, and respond in {language}.\n"
    else:
        prompt += f"Respond in {cap} words or less, in markdown, and respond in {language}.\n"
    return prompt
    
    
                        

In [None]:
# Create a function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website, language=None):
    """
    Build the user prompt asking for a summary of the given website.

    Args:
        website: object exposing `title` and `text` attributes.
        language: optional language to name in the request. When omitted, no
            language is mentioned here, so a literal "{language}" placeholder
            is never sent to the model.

    Returns:
        The prompt text: a title line, the instructions, then the page text.
    """
    language_clause = f" in {language}" if language else ""
    return (
        f"You are looking at a website titled {website.title}"
        "\nThe contents of this website is as follows; "
        f"please provide a short summary{language_clause} of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
        f"{website.text}"
    )

In [None]:
# Preview the user prompt that would be sent for the landing page fetched earlier.
print(user_prompt_for(training_website))


In [None]:
# NOTE(review): this cell repeats the previous cell verbatim and can be deleted.
print(user_prompt_for(training_website))

## And now let's build useful messages for GPT-4o-mini, using a function

In [None]:

def messages_for(website, language="Spanish", length="short"):
    """
    Assemble the chat messages for one summary request.

    The system message comes from build_system_prompt(language, length) and
    the user message from user_prompt_for(website).

    Returns:
        A two-item list of {"role", "content"} dicts: system first, then user.
    """
    system_message = {"role": "system", "content": build_system_prompt(language, length)}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

## Time to bring it together - the API for OpenAI is very simple!

In [None]:
#call the OpenAI API. 

def summarize(url, language="Spanish", length="short", model="gpt-4o-mini"):
    """
    Fetch a web page and return the model's summary of it.

    Args:
        url: address of the page to summarize.
        language: language the summary is requested in.
        length: requested level of detail for the summary.
        model: name of the chat model to call; defaults to the model this
            notebook has always used.

    Returns:
        The content of the first choice in the chat completion response.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website, language, length),
    )
    return response.choices[0].message.content
    

In [None]:
# Summarize all the lessons in the Microsoft quantum computing training: one summary per lesson, followed by a summary of the training as a whole

def summarize_training(path_url, language="Spanish", length="short"):
    """
    Summarize every lesson linked from a learning path, then the path as a whole.

    Args:
        path_url: address of the learning-path page whose lesson links are collected.
        language: language the summaries are requested in.
        length: requested level of detail for each summary.

    Returns:
        Markdown text: one "### <link>" section per lesson followed by a
        "## General Course Summary" section. If no lesson links are found,
        a short message saying so is returned and no model call is made.
    """
    # NOTE(review): get_links_from_path is not defined in the visible part of
    # this notebook — confirm it is defined before this cell is run.
    links = get_links_from_path(path_url)
    print(f"Found {len(links)} lessons")

    if not links:
        # Nothing to summarize: skip the API call rather than asking the model
        # to summarize an empty list of lessons.
        return f"No lessons were found at {path_url}, so there is nothing to summarize."

    all_summaries = []
    for link in links:
        print(f"Summarizing {link}...")
        summary = summarize(link, language, length)
        all_summaries.append(f"### {link}\n{summary}\n")

    # The per-lesson sections are both fed to the model and returned to the caller.
    lessons_markdown = "\n".join(all_summaries)
    combined_prompt = "Here are summaries of each lesson:\n\n" + lessons_markdown
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": build_system_prompt(language, length)},
            {"role": "user", "content": "Please summarize the entire training path based on these lesson summaries:\n\n" + combined_prompt}
        ]
    )

    return lessons_markdown + "\n\n## General Course Summary\n" + response.choices[0].message.content
    

In [None]:
# Summarize the learning-path landing page using the function's defaults
# (Spanish, short); the returned string is shown as this cell's output.
summarize("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")

In [None]:
# A function to display this nicely in the Jupyter output, using markdown

def display_summary(url, language="Spanish", length="short"):
    """
    Summarize a page and render the result as Markdown in the notebook output.

    Args:
        url: address of the page to summarize.
        language: language the summary is requested in; forwarded to summarize().
        length: requested level of detail; forwarded to summarize().
    """
    summary = summarize(url, language, length)
    display(Markdown(summary))

In [None]:
# Render a summary of the learning-path landing page as Markdown in the cell output.
display_summary("https://learn.microsoft.com/en-us/training/paths/quantum-computing-fundamentals/")