# RedFin Summarizer
This is a small project that uses a RedFin scraper to gather information and links for houses on sale, <br>
then uses GPT-5-nano or Llama 3.2 to generate a summary of each listing and place the summaries into a neat table.

>**WARNING**: If using a GPT model, there is a risk of high token usage.

### Redfin Scraper and Relevant Link Finder Implementation
This portion of the code will focus on implementing the scraper for RedFin, which will gather the <br>
listing links from a city-wide search page and return the contents of each listing url link into a combined output.

The list of URLs is then filtered down to only the relevant links, using a prompt similar to the one from the day 5 lesson.

In [None]:
# redfin_scraper.py

# Scraper Imports
import json
import requests
import os
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from urllib.parse import urljoin, urlparse


In [None]:
# API Key Check
# Load variables from a local .env file before reading the key.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Only the key's shape (presence, prefix, length) is checked here; no request is made.
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("Api Key Error.")
else:
    print("API Key is Correct!")

# Model Constants
MODEL_GPT = 'gpt-4o-mini'
MODEL_LLAMA = 'llama3.2'

# OpenAI Client Instantiation
openai = OpenAI()

In [None]:
# Reused Content Scraper w/o Text Limits and Updated Link Scraper
# Browser-like User-Agent header sent with every requests.get() call in this notebook.
# NOTE(review): presumably needed so the site does not reject the default client — confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

# Default root used to turn site-relative hrefs (those starting with "/") into absolute URLs.
BASE_URL = "https://www.redfin.com"

def fetch_website_contents(url, timeout=30):
    """
    Return the title and visible body text of the page at the given url.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        A single string: the page title, a blank line, then the body text.
        The title falls back to "No title found" and the body text to ""
        when the page does not provide them.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    # soup.title.string is None for an empty <title> (or one containing nested
    # tags); fall back so the string concatenation below cannot raise TypeError.
    title = (soup.title.string if soup.title else None) or "No title found"
    if soup.body:
        # Drop elements that carry no readable text before extracting it.
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)


def fetch_website_links(url, base=BASE_URL, timeout=30):
    """
    Return the unique absolute links found on the page at the given URL.

    Args:
        url: Address of the page whose anchors are collected.
        base: Root prepended (via urljoin) to hrefs that start with "/".
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        A list of absolute URLs in first-seen order, without duplicates.
        Empty hrefs, '#' fragments and 'javascript:' pseudo-links are ignored,
        as is any href that is neither http(s) nor site-relative ("/...").
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    absolute_links = []
    seen = set()  # repeated links would only inflate the prompt sent to the model
    for anchor in soup.find_all("a"):
        link = anchor.get("href")
        if not link or link.startswith(("#", "javascript:")):
            continue
        if link.startswith(("http://", "https://")):
            absolute = link
        elif link.startswith("/"):
            absolute = urljoin(base, link)
        else:
            continue
        if absolute not in seen:
            seen.add(absolute)
            absolute_links.append(absolute)
    return absolute_links

In [None]:
# Get Links from Website
# No module-level `url` variable exists in this notebook, so the search page
# is passed explicitly (the same West Hartford page used by the other cells).
links = fetch_website_links("https://www.redfin.com/city/26659/CT/West-Hartford")
# links

In [None]:
# Similar Link System Prompt (Updated for RedFin Website)
# System message for the link-selection call: describes the property-URL shape
# and the JSON structure ({"links": [{"type": ..., "url": ...}]}) the model should return.
# NOTE(review): the example URLs end in /about and /careers, which do not match
# the property-URL pattern described above — consider replacing them.
link_system_prompt = """
You are provided with a list of links found on a RedFin city search webpage.
You are able to decide which of the links are relevant links to a property listing that is for sale,
such links usually include state, city, and address, usually in the form:
'*STATE*/*City*/*Address*/home/*12345678*', but without the '*' characters.

You should respond in JSON as in this example:

{
    "links": [
        {"type": "property link", "url": "https://full.url/goes/here/about"},
        {"type": "property link", "url": "https://another.full.url/careers"}
    ]
}
"""

In [None]:
# Reused Function for User Prompt
def get_links_user_prompt(url):
    """
    Build the user message for the link-selection call.

    The message is a fixed instruction header naming `url`, followed by every
    link returned by fetch_website_links(url), one per line.
    """
    header = (
        f"\nHere is the list of links on the website {url} -\n"
        "Please decide which of these are relevant web links for a property listing, \n"
        "respond with the full https URL in JSON format.\n"
        "Do not include News, Terms of Service, Privacy, or email links.\n"
        "\n"
        "Links (some might be relative links):\n"
        "\n"
    )
    return header + "\n".join(fetch_website_links(url))

In [None]:
# Check User Prompt with Provided URL
# Performs a live HTTP request to the search page and prints the assembled prompt.
print(get_links_user_prompt("https://www.redfin.com/city/26659/CT/West-Hartford"))

In [None]:
# Reused Function for Relevant Link Selection
def select_relevant_links(url):
    """
    Ask the GPT model which links on the page at `url` are property listings.

    Args:
        url: Search page whose links are scraped and sent to the model.

    Returns:
        The model's JSON reply parsed into a dict. The "links" key is always
        present and always a list (empty if the model omitted or mangled it).

    Raises:
        json.JSONDecodeError: if the reply is not valid JSON.
    """
    response = openai.chat.completions.create(
        model=MODEL_GPT,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    # The content may be None, and JSON mode does not enforce the requested
    # schema, so normalise the reply before callers index into it.
    links = json.loads(result or "{}")
    if not isinstance(links, dict):
        links = {}
    if not isinstance(links.get("links"), list):
        links["links"] = []
    print(f"[INFO] Found {len(links['links'])} relevant links")

    return links

In [None]:
# Check If Appropriate Links were Chosen
# Scrapes the search page again and sends its links to the OpenAI API (uses tokens).
select_relevant_links("https://www.redfin.com/city/26659/CT/West-Hartford")

In [None]:
# Reused Function for Combining the Landing Page with Every Relevant Link's Contents
def fetch_page_and_all_relevant_links(url):
    """
    Return one text blob holding the landing page followed by each selected link.

    The landing page at `url` is fetched first, then the model picks the
    relevant links, and finally each picked link is downloaded and appended
    under a "### Link: <type>" heading.
    """
    landing_text = fetch_website_contents(url)
    selection = select_relevant_links(url)
    sections = [f"## Landing Page:\n\n{landing_text}\n## Relevant Links:\n"]
    for entry in selection['links']:
        sections.append(f"\n\n### Link: {entry['type']}\n")
        sections.append(fetch_website_contents(entry["url"]))
    return "".join(sections)

In [None]:
# Check If Output Provides Necessary Information with Relevant Links
# Makes one HTTP request for the search page plus one per link the model selected.
print(fetch_page_and_all_relevant_links("https://www.redfin.com/city/26659/CT/West-Hartford"))

### Analyze Property Data and Generate Report
This portion of the code uses a secondary "one-shot" prompt to generate a detailed report on the <br>
property listings found from the above link search.

General information about the city and simple, readily available details on the properties are <br>
neatly organized in an easy-to-read format with necessary details.

In [None]:
# Similar Secondary System Prompt (Updated for RedFin Website)
# System message for the report-generation call: defines the report layout
# (per-listing table, city summary, averages table) and the markdown output format.
real_estate_system_prompt = """
You are a real estate analysis assistant that analyzes the contents of several relevant pages from a RedFin City-Wide search result webpage.
You create a detailed report on the city and each property listing for prospective buyers. Neatly organize the information for each listing into a table
and include information such as address, price, beds, baths, sqft, features, description, property type, school rating, walking/biking/hiking scores, etc. 
Keep the information relevant to what a buyer must know when purchasing a home. Provide a summary of the city specified and city market notes, 
with a small table showing the averages of prices, square footage, lot size, build year, bed/bath numbers, etc.
If any of the necessary information is not available, keep the heading but fill it with a 'null' occupier. 

Ignore information about RedFin or related to RedFin. Simply focus on the property data and the city.

Respond in markdown without code blocks.
"""

In [None]:
def extract_city_name(url):
    """
    Extract the city name from a Redfin URL and convert hyphens to spaces.

    For paths shaped like '/city/<id>/<STATE>/<City-Name>[/...]' the segment
    three places after 'city' is used, so extra trailing segments do not
    replace the city name. Any other path falls back to its last non-empty
    segment; a URL with no path segments yields ''.

    Example:
        'https://www.redfin.com/city/26659/CT/West-Hartford' -> 'West Hartford'
        'https://www.redfin.com/city/12345/NY/Maine' -> 'Maine'
        'https://www.redfin.com/city/26659/CT/West-Hartford/filter/min-beds=3' -> 'West Hartford'
    """
    segments = [part for part in urlparse(url).path.split("/") if part]
    if not segments:
        return ""
    city_part = segments[-1]  # fallback: last path segment
    if "city" in segments:
        # Layout is city/<id>/<state>/<name>, so the name sits 3 segments later.
        city_index = segments.index("city") + 3
        if city_index < len(segments):
            city_part = segments[city_index]
    return city_part.replace("-", " ")

In [None]:
# Check if City Name is Extracted Correctly (expected: 'West Hartford')
extract_city_name("https://www.redfin.com/city/26659/CT/West-Hartford")

In [None]:
# Reused Function for Creating Secondary User Prompt
def get_real_estate_user_prompt(url):
    """
    Build the user message for the report-generation call.

    The message names the city derived from `url`, gives the report
    instructions, and then appends the scraped landing page and listing pages.
    """
    city_name = extract_city_name(url)
    print(f"[INFO] Looking for property information in {city_name}...")
    intro = (
        f"\nYou are looking at a RedFin search for city called: {city_name}\n"
        "Here are the property listings for sale on its landing page;\n"
        "use this information to build a detailed report of the properties \n"
        "in this city and also provide a short background of the city.\n"
        "in markdown without code blocks.\n\n\n"
    )
    return intro + fetch_page_and_all_relevant_links(url)

In [None]:
# Check If Output Provides Necessary Information
# Runs the full scrape-and-select pipeline again and prints the resulting prompt.
print(get_real_estate_user_prompt("https://www.redfin.com/city/26659/CT/West-Hartford"))

In [None]:
# Reused Function for Creating Final Output
def create_report(url, model="gpt-4.1-mini"):
    """
    Stream a markdown property report for the Redfin search page at `url`.

    Args:
        url: Redfin city search page to scrape and summarise.
        model: Chat model name passed to the API. Defaults to the value that
            was previously hard-coded here.

    Returns:
        None. The report is rendered incrementally in the notebook output.
    """
    stream = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": real_estate_system_prompt},
            {"role": "user", "content": get_real_estate_user_prompt(url)}
        ],
        stream=True
    )
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # A chunk can arrive with an empty choices list; skip it rather than
        # fail on the index below.
        if not chunk.choices:
            continue
        response += chunk.choices[0].delta.content or ''
        # Re-render the accumulated text in place so the report grows as it streams.
        update_display(Markdown(response), display_id=display_handle.display_id)

In [None]:
# Generate and stream the final report for the West Hartford search page.
create_report(url="https://www.redfin.com/city/26659/CT/West-Hartford")