In [None]:
!pip install selenium

In [None]:
!pip install -q -U google-genai

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time
import os
from dotenv import load_dotenv
from IPython.display import Markdown, display
from google import genai
from google.genai import types



In [None]:

# Read variables from the .env file, overriding values already present in the environment
load_dotenv(override=True)
api_key = os.getenv('GEMINI_API_KEY')

# Report on the key: usable, padded with whitespace, or missing

if api_key and api_key.strip() == api_key:
    print("API key found and looks good so far!")
elif api_key:
    print("An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them - see troubleshooting notebook")
else:
    print("No API key was found - please head over to the troubleshooting notebook in this folder to identify & fix!")

# Next, make a simple call to check that the model is working

In [None]:
# Create the Gemini API client with the key read from GEMINI_API_KEY.
client = genai.Client(api_key=api_key)


In [None]:

# Smoke test: send a trivial prompt and print the reply text to confirm
# that the client and the chosen model respond.
response = client.models.generate_content(
    model="gemini-2.5-flash-preview-05-20",
    contents=["hi gemini"]
)
print(response.text)



In [None]:

class Website:
    """Load a page in Selenium-driven Chrome and extract its title and body text.

    The page is opened in a real browser, given ``wait_time`` seconds to load,
    and its rendered HTML is then parsed with BeautifulSoup.

    Attributes:
        url: The address that was requested.
        wait_time: Seconds slept after navigation before reading the page source.
        title: Text of the ``<title>`` element, or "No title found" when the
            element is missing or has no single string content.
        text: Newline-separated text of ``<body>`` with script, style, img and
            input elements removed; empty string when the page has no ``<body>``.
    """

    def __init__(self, url, driver_path=None, wait_time=3, headless=False):
        """Fetch ``url`` and populate ``title`` and ``text``.

        Args:
            url: Page to load.
            driver_path: Optional path to the chromedriver executable. When
                omitted, ``Service()`` is created without an explicit path.
            wait_time: Seconds to sleep after ``driver.get`` before reading the
                page source.
            headless: When True, pass ``--headless`` to Chrome so the browser
                runs invisibly in the background. Defaults to False because
                some websites (like openai.com) block headless browsers, in
                which case the page may not load correctly and the full
                content may be missing.
        """
        self.url = url
        self.wait_time = wait_time

        # Chrome settings
        options = Options()
        if headless:
            options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # Chrome expects "width,height" here; an "x" separator is not the documented format.
        options.add_argument("--window-size=1920,1080")

        # Driver path
        service = Service(executable_path=driver_path) if driver_path else Service()

        # Start the browser; always shut it down, even if navigation fails,
        # so no Chrome/chromedriver process is left behind.
        driver = webdriver.Chrome(service=service, options=options)
        try:
            driver.get(url)

            # Wait for the page to load
            time.sleep(self.wait_time)

            # Take page source
            html = driver.page_source
        finally:
            driver.quit()

        # Analysis with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # `.string` is None when <title> is empty or contains nested tags.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        body = soup.body
        if body is None:
            # Nothing to extract (e.g. a non-HTML response); avoid calling into None.
            self.text = ""
            return

        # Clean irrelevant tags
        for irrelevant in body(["script", "style", "img", "input"]):
            irrelevant.decompose()

        self.text = body.get_text(separator="\n", strip=True)

In [None]:
# System instruction sent with every summarization request (see summarize()).
# It tells the model to first check that the input is a research paper and then
# to produce a fixed five-section Markdown summary; a worked example is included.
system_prompt = """You are an academic research assistant specialized in summarizing scholarly papers. Follow this workflow rigorously:

Step 1: Document Verification
Verify if the input is a research paper by checking for:

Presence of academic sections (Abstract, Introduction, Methodology, Results, Discussion, References)

Technical/scholarly language

Citations (in-text or bibliography)

Research claims or data analysis
If NOT a research paper:
→ Respond: "This doesn't appear to be a research paper. Please upload peer-reviewed academic literature for summarization."

Step 2: Structured Summary (If verified)
Generate a 5-section summary in this exact format:

1. Research Question
[Identify core problem/gap addressed in 1 sentence]

2. Methodology
[Study design, data sources, analytical techniques in 2 bullet points]

3. Key Findings
[3-4 quantified results with numerical evidence from tables/figures]

4. Limitations
[2 major constraints acknowledged by authors]

5. Significance
[Impact on field & practical implications in 1 sentence]

Critical Rules:
Accuracy Priority: Never invent data. Write "Not specified" for missing elements

Source Anchoring: Cite page/paragraph numbers for claims (e.g., "Fig 3 shows 24% improvement")

Jargon Handling: Simplify complex terms using: [Technical Term → Layman Explanation] inline

Bias Alert: Flag any undeclared funding/sponsorship conflicts

Output Format: Strict Markdown with section headers, 200-word maximum

Example Output:
1. Research Question
How does microplastic concentration affect zebrafish neural development?

2. Methodology

Exposed embryos to 0.1-10μm PET particles (5-100mg/L) for 96h

Quantified gene expression (RT-qPCR) and behavioral assays (Open Field Test)

3. Key Findings
▲ 40% reduction in neuron count at 50mg/L exposure (p<0.01, Fig 2B)
■ 2.3x increase in anxiolytic behavior (Table 3)
▼ 17% downregulation in shha expression (p=0.03)

4. Limitations
    
Used static exposure vs dynamic aquatic environments

Limited proteomic validation

5. Significance
Establishes dose-dependent neurotoxicity thresholds for aquatic toxicology regulations."""

In [None]:
# A function that writes a User Prompt that asks for summaries of websites:

def user_prompt_for(website):
    """Assemble the user message for a fetched page.

    Args:
        website: Object exposing ``title`` and ``text`` attributes.

    Returns:
        One string made of a title line, the summary request, a blank line,
        and then the page text.
    """
    pieces = [
        f"You are looking at a website titled {website.title}",
        "\nThe contents of this website is as follows; please provide a summary of this website in markdown.\n\n",
        website.text,
    ]
    return "".join(pieces)

In [None]:
def summarize(url):
    """Fetch the page at ``url`` and return the model's summary text.

    The page is loaded through ``Website``, turned into a user prompt with
    ``user_prompt_for``, and sent to the model together with ``system_prompt``
    as the system instruction.

    Args:
        url: Address of the page to summarize.

    Returns:
        The ``text`` attribute of the model response.
    """
    page = Website(url)
    generation_config = types.GenerateContentConfig(system_instruction=system_prompt)
    result = client.models.generate_content(
        model="gemini-2.5-flash-preview-05-20",
        contents=user_prompt_for(page),
        config=generation_config,
    )
    return result.text


In [None]:
def display_summary(url):
    """Summarize the page at ``url`` and render the result as Markdown in the notebook."""
    display(Markdown(summarize(url)))
        

In [None]:
# Example run: fetch the article at this URL and render the model's summary.
display_summary("https://onlinelibrary.wiley.com/doi/full/10.1155/2021/8812542")