In [62]:
import ast
import urllib.parse
# NOTE(review): wildcard import kept for backward compatibility; prefer the
# explicit `ast.` prefix in new code.
from ast import *

import requests
from bs4 import BeautifulSoup
from openai import OpenAI

In [63]:
# Search parameters: the region to cover and the subject the news must be about.
location, topic = "san francisco bay area", "economy"

In [64]:
def scrape_article(url, timeout=10):
    """Download a web page and return the text of its paragraphs.

    Paragraphs are taken from the first ``<article>`` element when the page
    has one; otherwise every ``<p>`` tag on the page is used.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Pass ``None`` to wait indefinitely.

    Returns:
        The paragraph texts joined by newlines, with leading and trailing
        whitespace stripped. An empty string when no ``<p>`` tag is found.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout a single unresponsive host would hang the notebook.
    response = requests.get(url, timeout=timeout)

    # The HTTP status is deliberately not checked: whatever body comes back
    # (error pages included) is parsed on a best-effort basis.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Prefer the <article> container; fall back to every <p> on the page.
    article = soup.find('article')
    paragraphs = (article or soup).find_all('p')

    return '\n'.join(p.get_text() for p in paragraphs).strip()

In [65]:
def get_news_search_results(query, num_results, timeout=10):
    """Search Google News and return the headlines with their target links.

    Args:
        query: Free-text search phrase.
        num_results: Number of results requested from Google (``num``).
        timeout: Seconds to wait for Google before ``requests`` gives up.

    Returns:
        A list of ``(headline, url)`` tuples, one per headline found.
        ``url`` is the placeholder ``'No link found'`` when the headline is
        not wrapped in an ``<a>`` tag, and ``'No valid link found'`` when the
        link is not a ``/url?q=`` redirect. The list is empty when the page
        contains no matching headline.

    Raises:
        requests.HTTPError: If Google answers with an error status.
        requests.RequestException: If the request fails or times out.
    """
    # Let requests build the query string so that spaces, '&', '#', ... inside
    # `query` are percent-encoded instead of corrupting the URL.
    response = requests.get(
        "https://www.google.com/search",
        params={"q": query, "tbm": "nws", "num": num_results},
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): these class names are tied to Google's result markup;
    # verify they still match if no results are returned.
    divs = soup.find_all('div', class_='BNeawe vvjwJb AP7Wnd')
    if not divs:
        print("No news results found.")
        # Return an empty list (not None) so callers can always iterate.
        return []

    articles = []
    for div in divs:
        headline = div.text
        # The headline <div> sits inside the <a> that carries the link.
        link_tag = div.find_parent('a')
        if link_tag:
            # An <a> without href yields None; treat it as an invalid link.
            raw_url = link_tag.get('href') or ''
            if raw_url.startswith('/url?q='):
                # Strip the '/url?q=' prefix and the trailing '&sa=' tracking
                # parameters, then undo the percent-encoding.
                url = urllib.parse.unquote(raw_url.split('/url?q=')[1].split('&sa=')[0])
            else:
                url = 'No valid link found'
        else:
            url = 'No link found'

        articles.append((headline, url))

    return articles

In [66]:
# Build the search phrase from the configured location and topic.
query = f"{location} local news in the past week about {topic}"
num_results = 25

# Fetch the (headline, url) pairs from Google News.
news_search_results = get_news_search_results(query, num_results)

# The search may yield nothing (None or an empty list); fall back to an empty
# list so the following cells can always iterate over `articles`.
articles = list(news_search_results or [])



In [67]:
# Display the collected (headline, url) pairs.
articles

[('Downtown San Jose visit activity soars, Oakland jumps, San Francisco nosedives',
  'https://www.mercurynews.com/2024/05/09/economy-san-jose-oakland-downtown-jobs-restaurant-store-tech-property/'),
 ('25000 fans headed to San Francisco for sold out rave with Skrillex, Fred Again',
  'https://abc7news.com/post/sold-rave-25000-fans-headed-san-francisco-skrillex/14888819/'),
 ('The APEC summit is happening this week in San Francisco. What is APEC, anyway?',
  'https://www.nbcbayarea.com/news/local/apec-summit-san-francisco-explained/3369623/'),
 ('How the Tri-Valley drives the local economy, embraces innovation - San Francisco Business Times',
  'https://www.bizjournals.com/sanfrancisco/news/2024/02/16/tri-valley-drives-economy-embraces-innovation.html'),
 ('What Happened to San Francisco, Really?',
  'https://www.newyorker.com/magazine/2023/10/23/what-happened-to-san-francisco-really'),
 ("Protesters Shut Down I-880 Freeway in Oakland as Part of 'Economic Blockade' for Gaza",
  'https:

In [68]:
def prompt_openai(prompt, model="gpt-4o"):
    """Send a single user message to the OpenAI chat API and return the reply.

    The API key is not passed explicitly: the ``OpenAI`` client reads it from
    the ``OPENAI_API_KEY`` environment variable. Set that variable before
    running the notebook.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Name of the chat model to query.

    Returns:
        The content of the first choice's message.
    """
    # SECURITY: a literal API key used to be embedded here. Treat that key as
    # leaked and revoke it; credentials must never be stored in the notebook.
    client = OpenAI()
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content

In [74]:
# Ask the model to group the collected articles into five distinct events.
# The reply is expected to be a bare Python literal (a list of tuples) because
# a later cell parses `response_text` directly.
# NOTE(review): the prompt text has a stray closing quote after "shut down.",
# a doubled period, and "/n" where a newline escape was presumably meant. It is
# left unchanged here because editing the prompt can change the model output.
# NOTE(review): the chat call is made without any search tool, so the "image
# address" values in the reply should be treated as unverified.
prompt = f"""For the following list, give me a list of five important distinct events that are referenced by several articles (i.e. a short blurb). 
{articles} If it's not related to {topic} and/or it's not in {location}, don't account for it. If it's an opinion article or a guide, don't account for it.
It should be formatted as a python array of tuples, with the first item being the event title and the second item the links that pertain to it. The third item should be the image address of an image pertaining to it from a google search of the first item.. 
The event should be something distinct and not a general topic — i.e. the Golden Gate Bridge has shut down."
Please don't do things like '''python or /n, i should be able to assign the output text to a variable
"""
response_text = prompt_openai(prompt)
# print() shows the reply as raw text.
print(response_text)

[
    ("APEC Summit 2023 in San Francisco",
     [
         "https://www.nbcbayarea.com/news/local/apec-summit-san-francisco-explained/3369623/",
         "https://www.sfchronicle.com/sf/article/apec-san-francisco-closures-traffic-18481349.php",
         "https://www.nbcbayarea.com/news/local/san-francisco/san-francisco-streets-after-apec/3395312/",
         "https://www.kqed.org/news/11965942/from-street-closures-to-security-checks-what-to-know-about-sf-apec-2023",
         "https://abc7news.com/sf-chinatown-apec-business-san-francisco/14073667/",
         "https://www.sfchronicle.com/bayarea/article/apec-protest-san-francisco-18483819.php",
         "https://abc7news.com/apec-summit-san-francisco-schedule-asia-pacific-economic-cooperation-2023/14028661/",
         "https://abc7news.com/apec-2023-san-francisco-road-closures-summit-security-sf-safety/14022548/",
         "https://www.sfchronicle.com/sf/article/sf-apec-homeless-encampments-clear-hotspots-18478050.php",
         "https:/

In [70]:
# The reply comes from an external model, so it is parsed strictly as a Python
# literal: unlike eval(), ast.literal_eval() cannot execute arbitrary code.
events = ast.literal_eval(response_text)

In [72]:
# Upper bound on how many linked articles are scraped for each event.
MAX_ARTICLES_PER_EVENT = 5

summaries = []

for event in events:
    # Each event is (title, links, ...); only the first two items are used.
    headline, links = event[0], event[1]

    # Only real web addresses can be fetched. This skips placeholder entries
    # such as 'No link found' that the search step stores for results without
    # a usable URL, which would otherwise make the request raise.
    fetchable_links = [
        link for link in links
        if isinstance(link, str) and link.startswith(("http://", "https://"))
    ]

    # Concatenate the text of the first few articles into one document.
    summary_text = "".join(
        scrape_article(link) for link in fetchable_links[:MAX_ARTICLES_PER_EVENT]
    )

    summary = prompt_openai(f"""Summarize the following text with three to five important bullet points: {summary_text}. Then, add the headline {headline}
    at the top.""")
    # Collapse the reply onto a single line.
    # NOTE(review): this also removes the line breaks between the headline and
    # the bullets, so they run together - confirm that is intended.
    summary = summary.replace("\n", "")
    summaries.append((headline, summary))

summaries

[('2023 APEC Summit in San Francisco',
  "### 2023 APEC Summit in San Francisco- **APEC Summit Details:**  - The United States hosts the annual APEC summit for the first time since 2011.  - Leaders from the 21-member Asia-Pacific Economic Cooperation group discuss trade and economic growth.  - The key event is the meeting between President Joe Biden and Chinese President Xi Jinping amidst strained U.S.-China relations.- **Backdrop and Key Issues:**  - The summit occurs amid tensions from global issues like the Israel-Hamas conflict and Russia's invasion of Ukraine.  - Focus areas include making APEC economies more resilient to climate change and post-pandemic recovery.- **APEC's Role and Recent Developments:**  - APEC promotes trade and economic cooperation without binding agreements.  - There have been heightened security measures and public transit impacts across San Francisco due to the summit.  - **Local Impact and Community Response:**  - Chinatown and other local businesses have 