In [1]:
import pandas as pd
import geopandas as gpd
import requests
from bs4 import BeautifulSoup
import requests
import json
from datetime import datetime


In [2]:
# Load the boundary layers used for geocoding. Judging by the paths these are
# census block groups, CBSAs and states; geocode_dataframe binds the three
# resulting frames as its default arguments, so this cell must run first.
block_group_path = r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\census_block_group_source_nationwide\v107\blkgrp.gdb"
cbsa_path = r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\cbsa_source\tl_2020_us_cbsa.shp"
state_path = r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\state_source\States_shapefile.shp"

demographic_areas = gpd.read_file(block_group_path)
cbsa_source = gpd.read_file(cbsa_path)
state_source = gpd.read_file(state_path)



In [3]:
def get_target_zips(state, city=None, zip_code=None):
    """
    Return the ZIP codes to scrape for a whole state, one city, or one ZIP.

    Parameters:
    state (str): Value matched against the ``state`` column of the ZIP database.
    city (str, optional): When given (and ``zip_code`` is not), only rows whose
        ``primary_city`` equals this value within ``state`` are returned.
    zip_code (optional): When given it takes precedence over ``state`` and
        ``city`` and is returned unchanged as the only element; the ZIP
        database is not read in that case.

    Returns:
    list: ZIP codes. Values taken from the database are zero-padded,
        5-character strings (e.g. "02134"), so leading zeros survive the CSV
        round trip.
    """
    # An explicit ZIP needs no lookup, so skip the file read entirely.
    if zip_code is not None:
        return [zip_code]

    # Read the ZIP column as text: letting pandas infer an integer dtype would
    # turn "02134" into 2134.
    zips = pd.read_csv(
        r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\zipcode_source\zip_code_database.csv",
        dtype={"zip": str},
    )

    selected = zips["state"] == state
    if city is not None:
        selected = selected & (zips["primary_city"] == city)

    # zfill repairs values whose leading zeros were already stripped upstream.
    return zips.loc[selected, "zip"].dropna().astype(str).str.zfill(5).tolist()


def get_stingray_rgn_id(zip):
    """
    Look up the Redfin region id for a location string (here, a ZIP code).

    Calls the ``stingray/do/query-location`` endpoint, drops everything up to
    the first "&&" in the body, parses the remainder as JSON and returns the
    part of ``payload.exactMatch.id`` that follows the first underscore.

    Parameters:
    zip (str or int): Location to query; interpolated into the URL as-is.
        (The name shadows the builtin inside this function; it is kept because
        it is part of the existing signature.)

    Returns:
    str or None: The region id, or None when the response carries no usable
        exact match.

    Raises:
    IndexError: If the response body contains no "&&" separator.
    ValueError: If the text after the separator is not valid JSON.
    Network errors and timeouts raised by ``requests`` propagate unchanged.
    """
    query_location_api = f"https://www.redfin.com/stingray/do/query-location?location={zip}&v=2"
    response = requests.get(
        query_location_api,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'},
        # Without a timeout one stalled connection blocks the whole scrape.
        timeout=30,
    )
    # Parse the raw text. Running JSON through an HTML parser would decode
    # entities and strip tag-like text inside string values.
    prefix_removed = response.text.split('&&', 1)[1]
    data = json.loads(prefix_removed)
    try:
        return data["payload"]["exactMatch"].get("id").split("_", 1)[1]
    except (KeyError, TypeError, AttributeError, IndexError):
        # Missing payload/exactMatch, a null id, or an id without "_": treat
        # all of them as "no exact match" so the caller can skip this ZIP.
        return None


def build_stingray_gis_params(params):
    """
    Join a parameter mapping into a ``key=value&key=value`` query string.

    Parameters:
    params (dict): Query parameters. Entries whose value is None are omitted.

    Returns:
    str: The remaining pairs, in mapping order, joined with "&" (an empty
        string when nothing remains). Values are formatted with ``str`` rules
        and are NOT URL-escaped, so callers must pass URL-safe values.
    """
    return "&".join(
        f"{key}={value}" for key, value in params.items() if value is not None
    )


def call_stingray_buy_gis(params_url):
    """
    Call Redfin's ``stingray/api/gis`` endpoint and return the decoded JSON.

    Parameters:
    params_url (str): Ready-made query string (without the leading "?"), for
        example the output of ``build_stingray_gis_params``.

    Returns:
    The object decoded from the JSON that follows the first "&&" in the
    response body.

    Raises:
    IndexError: If the response body contains no "&&" separator.
    ValueError: If the text after the separator is not valid JSON.
    Network errors and timeouts raised by ``requests`` propagate unchanged.
    """
    api_url = "https://www.redfin.com/stingray/api/gis"
    url = f"{api_url}?{params_url}"
    response = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'},
        # Without a timeout one stalled connection blocks the whole scrape.
        timeout=30,
    )
    # Parse the raw text. Running JSON through an HTML parser would decode
    # entities (e.g. turning &quot; into a bare quote) and strip tag-like text
    # inside string values, corrupting or breaking the payload.
    prefix_removed = response.text.split('&&', 1)[1]
    data = json.loads(prefix_removed)

    return data


def _stingray_value(record, key):
    """Return ``record[key]["value"]``, or None if the wrapper is missing, null or not a dict."""
    wrapper = record.get(key)
    return wrapper.get('value') if isinstance(wrapper, dict) else None


def parse_stingray_buy_gis(data):
    """
    Flatten a GIS search response into one plain dict per home.

    Parameters:
    data (dict): Decoded response; homes are read from ``data["payload"]["homes"]``.

    Returns:
    list[dict]: One dict per home with the keys "MLS ID", "Status", "Price",
        "HOA Fee", "Square Feet", "Price per Square Foot", "Lot Size",
        "Bedrooms", "Bathrooms", "Location", "Stories", "Address", "City",
        "State", "ZIP Code", "Year Built", "URL", "Latitude" and "Longitude",
        in that order. A field that is absent or null in the response becomes
        None. Returns an empty list when there is no payload or no homes.
    """
    # `or {}` / `or []` also cover keys that are present but explicitly null,
    # which a `.get(key, default)` lookup does not.
    payload = data.get('payload') or {}
    homes = payload.get('homes') or []
    parsed_homes = []

    for home in homes:
        lat_long = _stingray_value(home, 'latLong')
        if not isinstance(lat_long, dict):
            lat_long = {}

        parsed_homes.append({
            "MLS ID": _stingray_value(home, 'mlsId'),
            "Status": home.get('mlsStatus'),
            "Price": _stingray_value(home, 'price'),
            "HOA Fee": _stingray_value(home, 'hoa'),
            "Square Feet": _stingray_value(home, 'sqFt'),
            "Price per Square Foot": _stingray_value(home, 'pricePerSqFt'),
            "Lot Size": _stingray_value(home, 'lotSize'),
            "Bedrooms": home.get('beds'),
            "Bathrooms": home.get('baths'),
            "Location": _stingray_value(home, 'location'),
            "Stories": home.get('stories'),
            "Address": _stingray_value(home, 'streetLine'),
            "City": home.get('city'),
            "State": home.get('state'),
            "ZIP Code": _stingray_value(home, 'postalCode'),
            "Year Built": _stingray_value(home, 'yearBuilt'),
            "URL": home.get('url'),
            "Latitude": lat_long.get('latitude'),
            "Longitude": lat_long.get('longitude'),
        })

    return parsed_homes



def geocode_dataframe(df, latitude_col='Latitude', longitude_col='Longitude', demographics_df = demographic_areas, cbsa_df = cbsa_source, state_df = state_source):
    """
    Tag each row of a DataFrame with the areas its latitude/longitude point falls in.

    Points are built from the coordinate columns in EPSG:4326 and spatially
    left-joined, in turn, against the block-group, CBSA and state layers. The
    boundary layers passed in are not modified.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data to be geocoded.
    latitude_col (str): Name of the column containing latitude values.
    longitude_col (str): Name of the column containing longitude values.
    demographics_df (gpd.GeoDataFrame): Layer with ``FIPS`` and ``geometry`` columns.
    cbsa_df (gpd.GeoDataFrame): Layer with ``GEOID``, ``NAME`` and ``geometry`` columns.
    state_df (gpd.GeoDataFrame): Layer with ``FID``, ``State_Code`` and ``geometry`` columns.

    Returns:
    pd.DataFrame: The input columns plus ``cbg_geoid``, ``cbsa_geoid``,
        ``cbsa_name``, ``state_id`` and ``State_Code``; the geometry column is
        dropped. Rows with no matching area are kept with missing values.
    """
    target_crs = "EPSG:4326"

    # Convert the DataFrame to a GeoDataFrame of points
    points = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df[longitude_col], df[latitude_col]), crs=target_crs
    )

    # Select the needed columns first, then reproject the selection. to_crs
    # without inplace=True returns a new frame, so the caller's layers (by
    # default the module-level ones) keep their CRS and columns.
    block_groups = (
        demographics_df[["FIPS", "geometry"]]
        .to_crs(target_crs)
        .rename(columns={"FIPS": "cbg_geoid"})
    )
    cbsa_areas = (
        cbsa_df[["GEOID", "NAME", "geometry"]]
        .to_crs(target_crs)
        .rename(columns={"GEOID": "cbsa_geoid", "NAME": "cbsa_name"})
    )
    # NOTE(review): the previous rename mapping also listed
    # "State_Name" -> "state_name", but that column is never selected, so the
    # output has always carried "State_Code". Kept that way to preserve the
    # output schema; confirm which column was intended.
    state_areas = (
        state_df[["FID", "State_Code", "geometry"]]
        .to_crs(target_crs)
        .rename(columns={"FID": "state_id"})
    )

    # Perform the spatial joins one layer at a time, discarding the helper
    # index column each join adds.
    # NOTE(review): with a left join, a point matching more than one polygon
    # of a layer presumably produces one output row per match; verify whether
    # boundary points need de-duplicating.
    geocoded_dots = points
    for layer in (block_groups, cbsa_areas, state_areas):
        geocoded_dots = geocoded_dots.sjoin(layer, how="left").drop(columns=["index_right"])

    # Drop the point geometry and hand back a plain DataFrame
    return pd.DataFrame(geocoded_dots.drop(columns=["geometry"]))


In [7]:
# GIS Search API
#
# For every State in States: resolve the target ZIP codes, fetch the listings
# for each ZIP from the GIS endpoint, geocode them, and merge them into
# for_sale_homes.csv (freshly downloaded rows replace stored rows that share
# an MLS ID).

import requests

Zip = None
City = None
States = ["TX"]

for_sale_homes_csv = r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\for_sale_homes.csv"

# Query parameters shared by every request; only region_id changes per ZIP.
# Entries left as None are omitted from the URL by build_stingray_gis_params.
base_params = {
    # ?? Active listings (meaning unconfirmed)
    "al": 1,
    # Include nearby homes
    "include_nearby_homes": "false",
    # Market, i.e. Seattle
    "market": None,
    # Number of homes to retrieve
    # NOTE(review): only page 1 is requested, so a ZIP with more than 350
    # matches is presumably truncated; confirm whether paging is needed.
    "num_homes": 350,
    # How to sort the homes
    "ord": "days-on-redfin-asc",
    "page_number": 1,
    "poly": None,
    # Listing types
    "sf": "1,2,3,4,5,6,7",
    "start": None,
    "status": 9,
    # User-input property types (currently only single family, townhomes : 13)
    "uipt": "1,3",
    # ?? API version (meaning unconfirmed)
    "v": 8,
    "zoomLevel": None,
    # Type of region analyzed
    "region_type": 2,
}


for State in States:
    target_zips = get_target_zips(State, City, Zip)

    print(f"Number of Zipcodes to be Scrubbed in {State}: {len(target_zips)}")

    data = []

    # The loop variable is `zip_code`, not `zip`: at cell level a variable
    # called `zip` would shadow the builtin for the rest of the kernel session.
    for index, zip_code in enumerate(target_zips):
        if index % 100 == 0:
            print(f"{index} Zip Codes Evaluated")

        region_id = get_stingray_rgn_id(zip_code)
        if region_id is None:
            # No exact region match for this ZIP; nothing to query.
            continue

        params = {**base_params, "region_id": region_id}
        url_param = build_stingray_gis_params(params)
        json_data = call_stingray_buy_gis(url_param)
        list_data = parse_stingray_buy_gis(json_data)

        data.extend(list_data)

    if not data:
        # An empty frame has no "MLS ID" column, so the de-duplication below
        # would raise KeyError; there is also nothing to merge.
        print(f"In the State of {State}, 0 Homes were Downloaded. The existing dataset was left unchanged")
        continue

    # NOTE(review): rows whose MLS ID is missing compare equal to each other
    # here, so only the first of them is kept; confirm that is intended.
    df = pd.DataFrame(data).drop_duplicates(subset=["MLS ID"])

    df["updated_date"] = datetime.now().date()
    df = geocode_dataframe(df, "Latitude", 'Longitude')

    existing_homes = pd.read_csv(for_sale_homes_csv, index_col = None)

    # Compare MLS IDs as trimmed strings on both sides
    existing_homes['MLS ID'] = existing_homes['MLS ID'].astype(str).str.strip()
    df['MLS ID'] = df['MLS ID'].astype(str).str.strip()

    # Stored homes that were downloaded again in this run
    already_stored = existing_homes['MLS ID'].isin(df['MLS ID'])
    common_property_ids = existing_homes[already_stored]

    # Drop those from the existing homes so the fresh rows replace them
    existing_homes = existing_homes[~already_stored]

    updated_homes = pd.concat([existing_homes, df], ignore_index=True)

    print(f"In the State of {State}, {df.shape[0]} Homes were Downloaded. {common_property_ids.shape[0]} were duplicates leaving {updated_homes.shape[0]} homes in the whole dataset")

    updated_homes.to_csv(for_sale_homes_csv, index = False)


Number of Zipcodes to be Scrubbed in TX: 2661
0 Zip Codes Evaluated
100 Zip Codes Evaluated
200 Zip Codes Evaluated
300 Zip Codes Evaluated
400 Zip Codes Evaluated
500 Zip Codes Evaluated
600 Zip Codes Evaluated
700 Zip Codes Evaluated
800 Zip Codes Evaluated
900 Zip Codes Evaluated
1000 Zip Codes Evaluated
1100 Zip Codes Evaluated
1200 Zip Codes Evaluated
1300 Zip Codes Evaluated
1400 Zip Codes Evaluated
1500 Zip Codes Evaluated
1600 Zip Codes Evaluated
1700 Zip Codes Evaluated
1800 Zip Codes Evaluated
1900 Zip Codes Evaluated
2000 Zip Codes Evaluated
2100 Zip Codes Evaluated
2200 Zip Codes Evaluated
2300 Zip Codes Evaluated
2400 Zip Codes Evaluated
2500 Zip Codes Evaluated
2600 Zip Codes Evaluated
here
In the State of TX, 180368 Homes were Downloaded. 1380 were duplicates leaving 293236 homes in the whole dataset
