# Website Summary and Translation Tool using free open source models

This notebook demonstrates a workflow for extracting content from websites, summarizing it, and translating the summary into a target language using open-source models served by Ollama through its OpenAI-compatible API.

## Features
- **Web Scraping**: Custom functions (`fetch_website_contents`) to retrieve and clean text from web pages using `BeautifulSoup`.
- **AI Summarization & Translation**: Uses the `gpt-oss:120b-cloud` model through Ollama to summarize web content and translate it into languages like Hindi.
- **Free API**: Uses open-source models served through Ollama instead of a paid API.
## Quick Start
1. Run the initialization cells.
2. Use `display_summary_translation(url, language='Hindi')` to process a website.


In [None]:
# imports
from IPython.display import Markdown, display
from openai import OpenAI

# If you get an error running this cell, then please head over to the troubleshooting notebook!

# Helper functions for fetching website contents

In [None]:
from bs4 import BeautifulSoup
import requests


# Standard headers to fetch a website.
# Sent with every requests.get call in the fetch helpers below; the only entry
# is a desktop Chrome User-Agent string.
# NOTE(review): presumably here so that sites which reject the default
# python-requests User-Agent still respond - confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}


def fetch_website_contents(url):
    """
    Return the title and visible body text of the website at the given url.

    The page is downloaded with the module-level ``headers``, parsed with
    BeautifulSoup's "html.parser", and stripped of <script>, <style>, <img>
    and <input> elements before the text is extracted.

    Args:
        url: Address of the page to download.

    Returns:
        The title, a blank line, and the body text (one fragment per line),
        truncated to 2,000 characters as a sensible limit. The title is
        "No title found" when the page has no usable <title>; the body text
        is empty when the page has no <body>.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not answer within the timeout.
    """
    # A timeout keeps the notebook from hanging forever on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")

    # soup.title.string is None for an empty <title> or one containing nested
    # tags; fall back so the concatenation below cannot fail on None + str.
    title = (soup.title.string if soup.title else None) or "No title found"

    if soup.body:
        # Drop elements that carry no readable text before extracting it.
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]


def fetch_website_links(url):
    """
    Return the links on the website at the given url.

    This downloads and parses the page independently of
    fetch_website_contents, so a caller using both parses the page twice.
    That is deliberate, to keep the code in the lab simple; feel free to use
    a class and optimize it!

    Args:
        url: Address of the page to download.

    Returns:
        A list with the ``href`` value of every <a> element on the page, in
        document order. Anchors with a missing or empty ``href`` are skipped.
        Values are returned exactly as written in the page (not resolved
        against ``url``).

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not answer within the timeout.
    """
    # A timeout keeps the notebook from hanging forever on an unresponsive host.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, "html.parser")
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    # Filter out None (no href attribute) and "" (empty href).
    return [href for href in hrefs if href]

In [None]:
# Step 1: Prompt for the Summarizer and Translator
def display_summary_translation(
    url,
    language="Hindi",
    model="gpt-oss:120b-cloud",
    base_url="http://localhost:11434/v1",
):
    """
    Summarize the website at ``url``, translate the summary, and display it.

    The page text comes from fetch_website_contents; the result is rendered
    as Markdown in the notebook output. Nothing is returned.

    Args:
        url: Address of the website to summarize.
        language: Target language named in the system prompt.
        model: Name of the model requested from the endpoint.
        base_url: Base URL of the OpenAI-compatible endpoint; the default
            points at an Ollama server on localhost.
    """
    system_prompt = f"You are an Expert in Summarization and Translation. Your task is to summarize and translate the given text extracted from a website into {language}."
    user_prompt = "Here is the content of the website: " + fetch_website_contents(url)

    # Step 2: Make the messages list
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Step 3: Call Ollama
    # The OpenAI client needs some api_key value; "ollama" is a placeholder.
    # NOTE(review): presumably the Ollama endpoint ignores it - confirm.
    ollama = OpenAI(base_url=base_url, api_key="ollama")
    response = ollama.chat.completions.create(model=model, messages=messages)

    # Step 4: print the result
    display(Markdown(response.choices[0].message.content))

In [None]:
# Example run: summarize https://ollama.com and show the result in Hindi.
display_summary_translation("https://ollama.com", language="Hindi")
