# Web Data Extraction and Summarization Using OpenAI's Latest Model, gpt-5-mini

####         Import Libraries

In [None]:
import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI 

#### Load API key

In [None]:
# Load variables from a .env file into the process environment.
# override=True lets values from the file replace variables that are already set.
load_dotenv(override=True)
# Read the OpenAI key from the environment (None when the variable is not defined).
# NOTE(review): api_key is not referenced again in the visible code; the OpenAI
# client is constructed without arguments, so presumably it reads
# OPENAI_API_KEY from the environment itself -- confirm.
api_key = os.getenv('OPENAI_API_KEY')

#### Scrape website using BeautifulSoup

In [None]:
# HTTP headers sent with the page request made by ScrapeWebsite.
# The User-Agent mimics a desktop Chrome browser -- presumably so that sites
# which reject non-browser clients still serve the page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class ScrapeWebsite:
    """Fetch a web page and expose its title and visible body text.

    Attributes set by ``__init__``:
        url: The address that was requested.
        title: Contents of the page's ``<title>`` tag, or
            ``"No title found"`` when the tag is missing or has no
            plain-string content.
        text: Newline-separated text of ``<body>`` after removing
            ``script``, ``style``, ``img`` and ``input`` elements; an empty
            string when the document has no ``<body>``.
    """

    def __init__(self, url, timeout=30):
        """Download ``url`` and extract its title and body text.

        Args:
            url: Address of the page to scrape.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with a 4xx/5xx status.
        """
        self.url = url
        # Without a timeout a stalled server would block this call forever.
        response = requests.get(self.url, headers=headers, timeout=timeout)
        # Fail loudly instead of scraping (and later summarising) an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # ``.string`` is None when <title> is empty or wraps nested tags.
        title = soup.title.string if soup.title else None
        self.title = title if title else "No title found"

        body = soup.body
        if body is None:
            # Documents without a <body> (e.g. fragments) have no text to extract.
            self.text = ""
            return
        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = body.get_text(separator="\n", strip=True)

#### System Prompt

In [None]:
# System message for the model: describes its role and the output format.
# Built with implicit string concatenation rather than a backslash
# continuation inside the literal, which would embed the next source line's
# indentation into the prompt text.
system_prompt = (
    "You are an analyst that analyses the content of the website "
    "provides summary and ignore text related to navigation. "
    "Respond in markdown."
)

#### User Prompt

In [None]:
def user_prompt_for(website):
    """Build the user message asking for a summary of a scraped page.

    Args:
        website: Object exposing ``title`` and ``text`` attributes.

    Returns:
        A string containing the page title, the summarisation instructions,
        and then the page text separated from the instructions by a blank
        line.
    """
    # Implicit concatenation keeps source indentation out of the prompt, and
    # the trailing blank line stops the page text from being glued onto the
    # last word of the instructions.
    return (
        f"You are looking at a website titled {website.title}"
        "\nThe contents of this website is as follows; "
        "Please provide short summary in Markdown. "
        "Please include news and announcements\n\n"
        f"{website.text}"
    )

#### Format messages in the OpenAI chat format

In [None]:
def messages_for(website):
    """Return the two-message conversation for summarising ``website``.

    The list holds a ``system`` message carrying ``system_prompt`` followed
    by a ``user`` message produced by ``user_prompt_for(website)``.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

#### Summarise the website content using OpenAI's latest model, gpt-5-mini

In [None]:
def summarise(url):
    """Scrape ``url`` and return the model-generated summary.

    The page is fetched with ``ScrapeWebsite``, turned into chat messages by
    ``messages_for``, and sent to the ``gpt-5-mini`` model through the chat
    completions endpoint. The content of the first returned choice is the
    result.
    """
    page = ScrapeWebsite(url)
    client = OpenAI()
    completion = client.chat.completions.create(
        model="gpt-5-mini",
        messages=messages_for(page),
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

#### Show summary as Markdown

In [None]:
def display_summary(url):
    """Summarise the page at ``url`` and render the result as Markdown."""
    rendered = Markdown(summarise(url))
    display(rendered)

#### Output

In [None]:
# Run the full pipeline on a sample news page: summarise it and display the
# resulting Markdown in the notebook output.
display_summary("https://www.firstpost.com/world/united-states/")