In [1]:
import google.generativeai as genai
import requests
from bs4 import BeautifulSoup
import time
import textwrap
from IPython.display import Markdown, display
import sounddevice as sd
from scipy.io.wavfile import write
from dotenv import load_dotenv
import whisper
import tempfile
import os

In [None]:
load_dotenv()

api_key = os.getenv("API_KEY")

In [73]:
def wrap_text(text, width=100):
    return '\n'.join(textwrap.wrap(text, width=width))

In [74]:
def type_text(text, delay=0.005, width=100):
    wrapped = wrap_text(text, width)
    for line in wrapped.split('\n'):
        for char in line:
            print(char, end='', flush=True)
            time.sleep(delay)
        print()

In [75]:
def extract_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    for tag in soup(["script", "style", "img", "video", "iframe"]):
        tag.decompose()

    text = soup.get_text(separator="\n")
    cleaned_text = ' '.join(text.split())
    return cleaned_text

In [76]:
def summarize_with_gemini(text):
    model = genai.GenerativeModel("models/gemini-1.5-flash")
    prompt = f"""
Please generate a **detailed summary** of the following webpage content:
- Use bullet points wherever appropriate.
- Include subheadings like 'Overview', 'Key Insights', etc.
- Do not mention any images, videos, or links.

Text content:
{text}
"""
    response = model.generate_content(prompt)
    return response.text

In [77]:
def pretty_display(summary, url, use_typing=True):
    heading_main = "📄 SUMMARY OF THE PAGE"
    source_line = f"🔗 URL Entered: {url}"

    if use_typing:
        type_text(heading_main)
        type_text(source_line)
    else:
        display(Markdown(f"## {heading_main}"))
        display(Markdown(f"**URL Entered:** {url}"))

    sections = summary.strip().split("\n\n")

    for section in sections:
        lines = section.strip().split("\n")
        if not lines:
            continue

        heading = lines[0].strip().replace(":", "").upper()
        if use_typing:
            type_text(f"\n🟩 {heading}")
        else:
            display(Markdown(f"### 🟩 *{heading}*"))

        for line in lines[1:]:
            clean_line = line.strip().lstrip("*").strip()
            if clean_line:
                bullet = f"- {clean_line}"
                if use_typing:
                    type_text(bullet)
                else:
                    display(Markdown(bullet))


In [78]:
def spoken_to_url(text):
    replacements = {
        " dot ": ".",
        " dotcom": ".com",
        " slash ": "/",
        " forward slash ": "/",
        " backslash ": "\\",
        " colon ": ":",
        " space ": "",
        " dash ": "-",
        " underscore ": "_"
    }
    text = text.lower()
    for spoken, symbol in replacements.items():
        text = text.replace(spoken, symbol)
    text = text.replace(" ", "")  # Remove extra spaces
    return text


In [79]:

# Record duration in seconds
duration = 8 # seconds
samplerate = 44100  # Hz

print("🎙️ Speak now...")
recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype='int16')
sd.wait()
print("✅ Recording done.")

# Save to temporary file
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
    write(f.name, samplerate, recording)
    audio_path = f.name

# Load Whisper model
model = whisper.load_model("base")  # or "tiny", "small", "medium", "large"

# Transcribe
result = model.transcribe(audio_path)
transcribed_text = result["text"].strip()
print("📝 Raw Transcription:", transcribed_text)

# Convert spoken words to URL
normalized_text = spoken_to_url(transcribed_text)
print("🔗 Normalized URL text:", normalized_text)

# Add https:// if missing
if not normalized_text.startswith("http://") and not normalized_text.startswith("https://"):
    url = "https://" + normalized_text
else:
    url = normalized_text

print("\nFetching content and generating summary from:", url, "\n")

try:
    text = extract_text_from_url(url)
    summary = summarize_with_gemini(text)
    pretty_display(summary, url, use_typing=True)
except Exception as e:
    print("❌ Error fetching or summarizing content:", e)


🎙️ Speak now...
✅ Recording done.
📝 Raw Transcription: BBC.com
🔗 Normalized URL text: bbc.com

Fetching content and generating summary from: https://bbc.com 

📄 SUMMARY OF THE PAGE
🔗 URL Entered: https://bbc.com
 🟩 **OVERVIEW**
 🟩 THIS IS A SUMMARY OF THE BBC NEWS HOMEPAGE, SHOWCASING A SELECTION OF BREAKING NEWS STORIES AND
FEATURES ACROSS VARIOUS CATEGORIES.  THE SITE FEATURES SECTIONS DEDICATED TO NEWS (GLOBAL AND
REGIONAL), SPORT, BUSINESS, INNOVATION, CULTURE, ARTS, TRAVEL, AND EARTH, ALONG WITH AUDIO AND VIDEO
CONTENT.
 🟩 **KEY INSIGHTS (HEADLINES AND NEWS SUMMARIES)**
 🟩 * **POLITICS & WORLD AFFAIRS**  THE DEATH OF A RUSSIAN TRANSPORT MINISTER, HOURS AFTER BEING
DELAY OF TARIFFS IS DISCUSSED IN THE CONTEXT OF INTERNATIONAL TRADE NEGOTIATIONS.  LEAKED AUDIO
SUGGESTS A FORMER BANGLADESH LEADER AUTHORIZED A DEADLY CRACKDOWN ON PROTESTERS.  THE ONGOING
ISRAEL-GAZA CONFLICT IS COVERED, FOCUSING ON EFFORTS FOR A CEASEFIRE.  THE IMPACT OF TEXAS FLOODS IS
DETAILED, WITH A SIGNIFICANT NU

In [36]:
import os

In [37]:
def save_summary_to_markdown(summary, filename="summary.md"):
    with open(filename, "w", encoding="utf-8") as md_file:
        md_file.write("# Website Summary\n\n")
        md_file.write(summary)
    print("✅ Summary saved at:", os.path.abspath("summary.md"))

In [38]:
save_summary_to_markdown(summary)

✅ Summary saved at: c:\Users\Naman Goyal\OneDrive\Desktop\urlSummary\Summarizer\summary.md


In [None]:
def search_keyword_in_summary(keyword, summary_file="summary.md", original_text=None):
    keyword = keyword.lower()
    found_lines = []

    try:
        with open(summary_file, "r", encoding="utf-8") as md_file:
            lines = md_file.readlines()
            for line in lines:
                if keyword in line.lower():
                    found_lines.append(line.strip())

        if found_lines:
            print(f"\n🔍 Keyword *{keyword}* found in summary:\n")
            for match in found_lines:
                print("•", match)
            return

        print(f"❌ Keyword '{keyword}' not found in summary. Searching original content...")

    except FileNotFoundError:
        print("⚠ Summary file not found. Skipping summary search.")

    if original_text:
        if keyword in original_text.lower():
            print(f"\n🔍 Keyword *{keyword}* found in original page content.\n")
            print(f"✅ Keyword '{keyword}' is present in the original URL content.")
        else:
            print(f"❌ The keyword '{keyword}' was not found in the summary or original URL content.")
    else:
        print("⚠ Original text not provided, so could not search beyond summary.")

        
def get_voice_input(duration=3, retries=2):
    for attempt in range(retries + 1):
        print(f"\n🎙️ Speak now ({duration} sec)...")
        samplerate = 44100
        recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype='int16')
        sd.wait()
        print("✅ Voice input captured.")

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            write(f.name, samplerate, recording)
            audio_path = f.name

        result = model.transcribe(audio_path)
        os.remove(audio_path)

        spoken_text = result["text"].strip().lower()
        if spoken_text:
            print(f"🗣️ You said: {spoken_text}")
            return spoken_text
        else:
            print("⚠️ Couldn't understand. Please try speaking again.")

    print("❌ Failed to capture voice input after multiple attempts.")
    return None



In [62]:
keyword = get_voice_input(duration=3)

if keyword:
    search_keyword_in_summary(keyword, summary_file="summary.md", original_text=text)
else:
    print("❌ No valid keyword provided. Skipping search.")




🎙️ Speak now (3 sec)...
✅ Voice input captured.
⚠️ Couldn't understand. Please try speaking again.

🎙️ Speak now (3 sec)...
✅ Voice input captured.
🗣️ You said: india

🔍 Keyword *india* found in summary:

• * **India:**  India's air safety regulator assures that the country's skies remain safe despite rising concerns.


In [54]:
import json
import os
from datetime import datetime

def log_summary(source_url, summary_file_path, summary_text=None, model_used="Gemini"):
    history_file = "history.json"

    if os.path.exists(history_file):
        try:
            with open(history_file, "r") as f:
                history = json.load(f)
            if not isinstance(history, dict):
                history = {}
        except json.JSONDecodeError:
            history = {}
    else:
        history = {}

    summary_entry = {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "source_url": source_url,
        "summary_file": os.path.basename(summary_file_path),
        "used_model": model_used
    }

    if summary_text:
        summary_entry["summary_preview"] = summary_text[:200] + "..."

    history.setdefault("summaries", []).append(summary_entry)

    with open(history_file, "w") as f:
        json.dump(history, f, indent=4)

    print(f"✅ Summary logged to '{history_file}'")


log_summary(url, "summary.md")

✅ Summary logged to 'history.json'
