In [26]:
import ast
import os
import urllib.parse

import requests
from bs4 import BeautifulSoup
from openai import OpenAI


In [27]:
def get_news_search_results(query, num_results):
    """Scrape headlines and target links from a Google News search.

    Args:
        query: Free-text search terms. Passed through ``params`` so that
            ``requests`` URL-encodes it (spaces, ``&``, ``#`` and so on).
        num_results: Value sent as the ``num`` query parameter.

    Returns:
        A list of ``(headline, url)`` tuples, one per matched headline
        element. ``url`` is ``'No link found'`` when the headline has no
        enclosing ``<a>`` tag and ``'No valid link found'`` when the anchor
        has no ``href`` or the ``href`` is not a ``/url?q=`` redirect.
        An empty list is returned when no headline elements are found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(
        "https://www.google.com/search",
        params={"q": query, "tbm": "nws", "num": num_results},
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=10,  # never let a stalled connection hang the notebook
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Headlines are located by CSS class names taken from the result page
    # markup; if the markup changes this lookup simply matches nothing.
    divs = soup.find_all('div', class_='BNeawe vvjwJb AP7Wnd')
    if not divs:
        print("No news results found.")
        # Return an empty list (not None) so callers can always iterate.
        return []

    articles = []
    for div in divs:
        headline = div.text
        # The enclosing anchor, if any, carries the link for this headline.
        link_tag = div.find_parent('a')
        if link_tag:
            # An anchor without an href yields None; treat it as "no valid link".
            raw_url = link_tag.get('href') or ''
            if raw_url.startswith('/url?q='):
                # Result links are redirect URLs of the form
                # '/url?q=<target>&sa=...'; keep and decode only <target>.
                target = raw_url.split('/url?q=')[1].split('&sa=')[0]
                url = urllib.parse.unquote(target)
            else:
                url = 'No valid link found'
        else:
            url = 'No link found'

        articles.append((headline, url))

    return articles

In [28]:
def fetch_links(location, topic, num_results=25):
    """Collect recent local-news search results for a location and topic.

    Args:
        location: Place name inserted into the search query.
        topic: Subject inserted into the search query.
        num_results: Number of results requested from the search
            (defaults to 25).

    Returns:
        A new list containing the items returned by
        ``get_news_search_results``; an empty list when the search
        produced nothing.
    """
    query = f"{location} local news in the past week about {topic}"

    news_search_results = get_news_search_results(query, num_results)

    # The search helper may hand back None when nothing matched; normalise
    # that to an empty list instead of failing on iteration.
    return list(news_search_results or [])


In [29]:
def prompt_openai(prompt, model="gpt-4o"):
    """Send a single user message to the OpenAI chat API and return the reply.

    Args:
        prompt: Text sent as the only message, with role ``"user"``.
        model: Name of the chat model to use (defaults to ``"gpt-4o"``).

    Returns:
        The message content of the first completion choice.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is
            unset or empty.
    """
    # Credentials come from the environment so that no secret is stored in
    # the notebook source or its saved outputs.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it before calling prompt_openai()."
        )

    client = OpenAI(api_key=api_key)
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content

In [30]:
def _parse_events_response(response_text):
    """Parse the model's reply as a Python literal without executing it."""
    if not response_text or not response_text.strip():
        raise ValueError("Model returned an empty response; nothing to parse.")

    text = response_text.strip()
    # The reply may arrive wrapped in a Markdown code fence even though the
    # prompt asks for bare text; drop the opening and closing fence lines.
    if text.startswith("```"):
        lines = text.splitlines()[1:]
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        text = "\n".join(lines).strip()

    try:
        # literal_eval only accepts literals (lists, strings, numbers, ...),
        # so text coming back from the model can never run code here.
        return ast.literal_eval(text)
    except (ValueError, SyntaxError) as exc:
        raise ValueError(
            f"Could not parse model response as a Python literal: {text[:200]!r}"
        ) from exc


def generate_events(location, topic):
    """Ask the model to group fetched news articles into distinct events.

    Args:
        location: Place name used for the news search and in the prompt.
        topic: Subject used for the news search and in the prompt.

    Returns:
        The Python literal parsed from the model's reply. The prompt requests
        a list of ``[event_title, links]`` pairs, but the shape is not
        validated here.

    Raises:
        ValueError: If the reply is empty or is not a valid Python literal.
    """
    articles = fetch_links(location, topic)
    
    prompt = f"""For the following list, give me a list of five important distinct events that are referenced by several articles (i.e. a short blurb). 
    {articles} If it's not related to {topic} and/or it's not in {location}, don't account for it. If it's an opinion article or a guide, don't account for it.
    It should be formatted as a python array of arrays, with the first item being the event title and the second item the links that pertain to it. 
    The event should be something distinct and not a general topic — i.e. the Golden Gate Bridge has shut down."
    Please don't do things like '''python or /n, i should be able to assign the output text to a variable
    """
    response_text = prompt_openai(prompt)
    return _parse_events_response(response_text)

In [31]:
# Run the full pipeline for one sample location/topic pair and display the
# parsed events as the cell's output.
output = generate_events(location="san francisco bay area", topic="economy")
output

[['APEC Summit 2023 in San Francisco',
  ['https://www.nbcbayarea.com/news/local/apec-summit-san-francisco-explained/3369623/',
   'https://www.sfchronicle.com/sf/article/apec-san-francisco-closures-traffic-18481349.php',
   'https://www.nbcbayarea.com/news/local/san-francisco/san-francisco-streets-after-apec/3395312/',
   'https://www.kqed.org/news/11965942/from-street-closures-to-security-checks-what-to-know-about-sf-apec-2023',
   'https://abc7news.com/apec-summit-san-francisco-schedule-asia-pacific-economic-cooperation-2023/14028661/',
   'https://abc7news.com/apec-2023-san-francisco-road-closures-summit-security-sf-safety/14022548/',
   'https://www.sfchronicle.com/sf/article/apec-san-francisco-closures-traffic-18497432.php',
   'https://www.sfchronicle.com/sf/article/sf-apec-homeless-encampments-clear-hotspots-18478050.php',
   'https://www.sfchronicle.com/sf/article/apec-san-francisco-closures-traffic-protests-18494935.php',
   'https://abc7news.com/apec-summit-san-francisco-roa

In [34]:
import re

def fetch_first_image(query):
    """Return the ``src`` of the first absolute-HTTPS image on a Google image search page.

    Args:
        query: Free-text search terms. Passed through ``params`` so that
            ``requests`` URL-encodes it.

    Returns:
        The ``src`` attribute of the first ``<img>`` whose ``src`` starts
        with ``https://``. The string ``"Failed to retrieve search results"``
        is returned when the response status is not 200, and
        ``"No image found"`` when no such ``<img>`` exists in the page.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    # Browser-like User-Agent sent with the search request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(
        "https://www.google.com/search",
        params={"q": query, "tbm": "isch"},
        headers=headers,
        timeout=10,  # never let a stalled connection hang the notebook
    )

    if response.status_code != 200:
        return "Failed to retrieve search results"

    soup = BeautifulSoup(response.text, 'html.parser')

    # Only <img> tags with an absolute https:// src are considered; images
    # embedded inline (e.g. data: URIs) or loaded later by script are skipped.
    first_image = soup.find('img', {'src': re.compile('^https://')})

    if first_image:
        return first_image['src']
    return "No image found"

# Example usage: look up one image for a sample search term and print its URL.
query = "puppies"
first_image_url = fetch_first_image(query)
print(f"First image URL: {first_image_url}")


First image URL: No image found
