In [125]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json


In [1]:
print("hi")

hi


In [171]:

def get_total_pages(url):
    response = requests.get(url, headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})
    
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        print(response.status_code)
        return None
    
    soup = BeautifulSoup(response.text, 'html.parser')
    page_info = soup.find('span', class_='pageText', attrs={'data-rf-test-name': 'download-and-save-page-number-text'})
    
    if page_info:
        page_text = page_info.text.strip()
        print(f"Page text: {page_text}")
        # Extract the total number of pages using regular expressions
        match = re.search(r'of (\d+)', page_text)
        if match:
            total_pages = int(match.group(1))
            return total_pages
        else:
            print("Total pages not found in text")
            return None
    else:
        print("Page info not found")
        return None

def scrape_redfin(url):
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})
    soup = BeautifulSoup(response.text, 'html.parser')
    house_containers = soup.find_all('div', class_='HomeCardContainer')

    house_data = []

    for container in house_containers:
        if container.get('aria-label') == 'Advertisement':
            continue

        # Initialize variables for address, geo, and link
        street_address = address_locality = address_region = postal_code = address_country = latitude = longitude = None

        # Extract the address from JSON-LD
        script_tag = container.find('script', type='application/ld+json')
        if script_tag:
            try:
                json_data = json.loads(script_tag.string)
                if isinstance(json_data, list):
                    json_data = json_data[0]  # Use the first item if it's a list
                street_address = json_data['address'].get('streetAddress', '')
                address_locality = json_data['address'].get('addressLocality', '')
                address_region = json_data['address'].get('addressRegion', '')
                postal_code = json_data['address'].get('postalCode', '')
                address_country = json_data['address'].get('addressCountry', '')
                latitude = json_data['geo'].get('latitude', None)
                longitude = json_data['geo'].get('longitude', None)
            except (json.JSONDecodeError, KeyError, TypeError):
                pass

        try:
            # Extract the price
            price = container.find('span', {"class": 'bp-Homecard__Price--value'}).text.strip()
        except AttributeError:
            price = None

        try:
            # Extract the number of beds
            beds = container.find('span', {"class": 'bp-Homecard__Stats--beds'}).text.strip()
        except AttributeError:
            beds = None

        try:
            # Extract the number of baths
            baths = container.find('span', {"class": 'bp-Homecard__Stats--baths'}).text.strip()
        except AttributeError:
            baths = None

        try:
            # Extract the square footage
            sqft = container.find('span', {"class": 'bp-Homecard__LockedStat--value'}).text.strip()
        except AttributeError:
            sqft = None

        try:
            # Extract the link
            link = container.find('a', {"class": 'link-and-anchor'})['href']
            full_link = f"https://www.redfin.com{link}"
        except (AttributeError, TypeError):
            full_link = None

        # Append the extracted data to the list
        house_data.append({
            'price': price,
            'beds': beds,
            'baths': baths,
            'sqft': sqft,
            'address': f"{street_address}, {address_locality}, {address_region} {postal_code}, {address_country}",
            'street': street_address,
            'locality': address_locality,
            'region': address_region,
            'zip': postal_code,
            'country': address_country,
            'latitude': latitude,
            'longitude': longitude,
            'link': full_link
        })

    return house_data


# Function to construct the URL based on the selected filters
def redfin_url_filter(filters):
    filter_params = []
    
    # Property type
    property_types = []
    for prop_type, enabled in filters["property_type"].items():
        if enabled:
            property_types.append(prop_type)
    if property_types:
        filter_params.append(f"property-type={'+'.join(property_types)}")
    
    # Price filters
    if "min_price" in filters and filters["min_price"] is not None:
        filter_params.append(f"min-price={filters['min_price']}")
    if "max_price" in filters and filters["max_price"] is not None:
        filter_params.append(f"max-price={filters['max_price']}")
    
    # Beds filters
    if "min_beds" in filters and filters["min_beds"] is not None:
        filter_params.append(f"min-beds={filters['min_beds']}")
    if "max_beds" in filters and filters["max_beds"] is not None:
        filter_params.append(f"max-beds={filters['max_beds']}")
    
    # Baths filters
    if "min_baths" in filters and filters["min_baths"] is not None:
        filter_params.append(f"min-baths={filters['min_baths']}")
    if "max_baths" in filters and filters["max_baths"] is not None:
        filter_params.append(f"max-baths={filters['max_baths']}")
    
    # Construct the final URL
    filter_string = ",".join(filter_params)
    return f"/filter/{filter_string}"

## Begin Script using functions above

In [195]:
# City = "Peoria"
# State = "IL"
zip = 98177

base_url = "https://www.redfin.com/zipcode/"

filters = {
  "property_type": {
        "house": True,
        "townhouse": True,
        "condo": False,
        "multifamily": False,
        "land": False,
        "manufactured": False,
        "co-op": False
    },
    "min_price": None,
    "max_price": None,
    "min_beds": None,
    "max_beds": None,
    "min_baths": None,
    "max_baths": None
}


# Example usage
url_filters = redfin_url_filter(filters)


print(base_url)
print(f"{base_url}{url_filters}")

zips = pd.read_csv("zip_code_database.csv")

if zip == None:

  target_zips = zips[(zips["primary_city"] == City) & (zips["state"] == State)]["zip"].tolist()
else:
    target_zips = [zip]


print(target_zips)

data = []


for zip in target_zips:
    url = f"{base_url}{zip}{url_filters}"
    
    try:
        total_pages = get_total_pages(url)
        if total_pages is None:
            print(f"No pages found for ZIP code {zip}. Skipping.")
            continue
    except Exception as e:
        print(f"Error getting total pages for ZIP code {zip}: {e}")
        continue
    for page_number in range(1,total_pages+1):
        print(zip, " Page: ", page_number)
        if page_number == 1:
            page_url = url
        else:
            page_url = f"{url}/page-{page_number}"

        print(page_url)

        scraped_data = scrape_redfin(page_url)
        
        for record in scraped_data:
            record['search_zip'] = str(zip)

        data.extend(scraped_data)

df = pd.DataFrame(data)

df = df[df['zip'] == df['search_zip']]

df.head()

https://www.redfin.com/zipcode/
https://www.redfin.com/zipcode//filter/property-type=house+townhouse
[98177]
Page text: Viewing page 1 of 1
98177  Page:  1
https://www.redfin.com/zipcode/98177/filter/property-type=house+townhouse


Unnamed: 0,price,beds,baths,sqft,address,street,locality,region,zip,country,latitude,longitude,link,search_zip
0,"$3,360,000",5 beds,5.5 baths,3456,"861 NW 175th St, Shoreline, WA 98177, US",861 NW 175th St,Shoreline,WA,98177,US,47.755255,-122.368715,https://www.redfin.com/WA/Shoreline/861-NW-175...,98177
1,"$975,000",3 beds,2.5 baths,1750,"20316 3rd Ave NW, Shoreline, WA 98177, US",20316 3rd Ave NW,Shoreline,WA,98177,US,47.77713,-122.361046,https://www.redfin.com/WA/Shoreline/20316-3rd-...,98177
2,"$925,000",3 beds,2 baths,1720,"208 NW 177th St, Shoreline, WA 98177, US",208 NW 177th St,Shoreline,WA,98177,US,47.757613,-122.360175,https://www.redfin.com/WA/Shoreline/208-NW-177...,98177
3,"$1,600,000",2 beds,2 baths,2235,"16732 16th Ave NW, Shoreline, WA 98177, US",16732 16th Ave NW,Shoreline,WA,98177,US,47.751582,-122.378856,https://www.redfin.com/WA/Shoreline/16732-16th...,98177
4,"$1,325,000",4 beds,3.5 baths,3070,"138 NW 143rd St, Seattle, WA 98177, US",138 NW 143rd St,Seattle,WA,98177,US,47.732626,-122.359516,https://www.redfin.com/WA/Seattle/138-NW-143rd...,98177


# Try to use stingray to get the same data

In [188]:
#House Initial Info API

house = df['link'].str.replace(r'^https?://[^/]+', '', regex=True)[1]

print(house)

stingray_url = f"http://www.redfin.com/stingray/api/home/details/initialInfo?path={house}"

print(stingray_url)

/WA/Shoreline/20316-3rd-Ave-NW-98177/home/78377
http://www.redfin.com/stingray/api/home/details/initialInfo?path=/WA/Shoreline/20316-3rd-Ave-NW-98177/home/78377


In [191]:
#Full House API (needs listing and property IDS. Can be gotten from the Initial Info API)

pid = 78377

lid = 188760919

new_url = f"https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId={pid}&accessLevel=1&listingId={lid}"

print(new_url)


https://www.redfin.com/stingray/api/home/details/belowTheFold?propertyId=78377&accessLevel=1&listingId=188760919



https://github.com/alientechsw/RedfinPlus/blob/master/docs/REDFIN.md

In [213]:
# GIS Search API

import requests

# Base URL
base_url = "https://www.redfin.com/stingray/api/gis"

# Default parameters
default_params = {
    "al": 1,
    "include_nearby_homes": False,
    "market": "seattle",
    "num_homes": 200,
    "ord": "redfin-recommended-asc",
    "page_number": 1,
    "poly": "-122.54472 47.44109,-122.11144 47.44109,-122.11144 47.78363,-122.54472 47.78363,-122.54472 47.44109",
    "sf": "1,2,3,4,5,6,7",
    "start": 0,
    "status": 1,
    "uipt": "1,2,3,4,5,6,7,8",
    "v": 8,
    "zoomLevel": 11,
    "region_id" : 98177,
    "region_type" : 2
}

# Parameter toggle configuration
param_toggle = {
    "al": True,
    "include_nearby_homes": True,
    "market": False,
    "num_homes": True,
    "ord": True,
    "page_number": True,
    "poly": False,
    "sf": True,
    "start": True,
    "status": True,
    "uipt": True,
    "v": True,
    "zoomLevel": True,
    "region_id" : True,
    "region_type" : True
}

# Function to convert parameters dictionary to query string format
def dict_to_query_string(params):
    return "&".join(f"{key}={value}" for key, value in params.items() if param_toggle.get(key, False))

print(f"{base_url}?{dict_to_query_string(default_params)}")




https://www.redfin.com/stingray/api/gis?al=1&include_nearby_homes=False&num_homes=200&ord=redfin-recommended-asc&page_number=1&sf=1,2,3,5,6,7&start=0&status=1&uipt=1,2,3,4,5,6,7,8&v=8&zoomLevel=11&region_id=98177&region_type=2


In [None]:
https://spys.me/proxy.txt
