In [60]:
import pandas as pd
import geopandas as gpd
import requests
from bs4 import BeautifulSoup
import requests
import json
from datetime import datetime


def get_target_zips(state, city=None, zip_code=None):
    zips = pd.read_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\zipcode_source\zip_code_database.csv")
    
    if city is None and zip_code is None:
        target_zips = zips[zips["state"] == state]["zip"].tolist()
    elif zip_code is None:
        target_zips = zips[(zips["primary_city"] == city) & (zips["state"] == state)]["zip"].tolist()
    else:
        target_zips = [zip_code]
    
    return target_zips


def get_stingray_rgn_id(zip):
    query_location_api = f"https://www.redfin.com/stingray/do/query-location?location={zip}&v=2"
    response = requests.get(query_location_api, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}) 
    soup = BeautifulSoup(response.text, 'html.parser').text
    prefix_removed = soup.split('&&', 1)[1]
    data = json.loads(prefix_removed)
    try:
        region_id = data["payload"]["exactMatch"].get("id").split("_",1)[1]
        return region_id
    except:
        print(f"No Exact match found for zip: {zip}")
        return None


def build_stingray_gis_params(params):
        return "&".join(f"{key}={value}" for key, value in params.items() if params.get(key) != None)




def call_stingray_rent_gis(params_url):
    api_url = "https://www.redfin.com/stingray/api/v1/search/rentals?"
    url = f"{api_url}?{params_url}"
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'})
    soup = BeautifulSoup(response.text, 'html.parser').text
    # print(url)
    data = json.loads(soup)

    return data


def parse_stingray_rent_gis(data):
    homes = data.get('homes', [])
    parsed_homes = []
    
    for home in homes:
        home_data = home.get('homeData', {})
        rental_data = home.get('rentalExtension', {})
        
        home_info = {
            "Property ID": home_data.get('propertyId'),
            "URL": home_data.get('url'),
            "Property Type": home_data.get('propertyType'),
            # "Photos Info": home_data.get('photosInfo', {}).get('photoRanges'),
            # "Static Map URL": home_data.get('staticMapUrl'),
            # "Has AT&T Fiber": home_data.get('hasAttFiber'),
            "Address": home_data.get('addressInfo', {}).get('formattedStreetLine'),
            "City": home_data.get('addressInfo', {}).get('city'),
            "State": home_data.get('addressInfo', {}).get('state'),
            "ZIP Code": home_data.get('addressInfo', {}).get('zip'),
            "Country Code": home_data.get('addressInfo', {}).get('countryCode'),
            "Latitude": home_data.get('addressInfo', {}).get('centroid', {}).get('centroid', {}).get('latitude'),
            "Longitude": home_data.get('addressInfo', {}).get('centroid', {}).get('centroid', {}).get('longitude'),
            "Rental ID": rental_data.get('rentalId'),
            "Max Beds": rental_data.get('bedRange', {}).get('max'),
            "Max Baths": rental_data.get('bathRange', {}).get('max'),
            "Max Square Feet": rental_data.get('sqftRange', {}).get('max'),
            "Max Rent Price": rental_data.get('rentPriceRange', {}).get('max'),
            # "Last Updated": rental_data.get('lastUpdated'),
            # "Number of Available Units": rental_data.get('numAvailableUnits'),
            # "Status": rental_data.get('status'),
            # "Date Available": rental_data.get('dateAvailable'),
            # "Rental Details Page Type": rental_data.get('rentalDetailsPageType'),
            # "Search Rank Score": rental_data.get('searchRankScore'),
            # "Freshness Timestamp": rental_data.get('freshnessTimestamp'),
            "Description": rental_data.get('description'),
            # "Revenue Per Lead": rental_data.get('revenuePerLead'),
            # "Feed Source Internal ID": rental_data.get('feedSourceInternalId'),
            # "Is Commercial Paid": rental_data.get('isCommercialPaid'),
            # "Feed Original Source": rental_data.get('feedOriginalSource'),
            # "Desktop Phone": rental_data.get('desktopPhone'),
            # "Mobile Web Phone": rental_data.get('mobileWebPhone'),
            # "Mobile App Phone": rental_data.get('mobileAppPhone')
        }
        parsed_homes.append(home_info)
    
    return parsed_homes

def geocode_dataframe(df, latitude_col='Latitude', longitude_col='Longitude'):
    import geopandas as gpd
    import pandas as pd
    """
    Geocode the given DataFrame based on geographic data files.

    Parameters:
    df (pd.DataFrame): DataFrame containing the data to be geocoded.
    longitude_col (str): Name of the column containing longitude values.
    latitude_col (str): Name of the column containing latitude values.

    Returns:
    pd.DataFrame: Geocoded DataFrame.
    """
    # Convert the DataFrame to a GeoDataFrame
    gdf = gpd.GeoDataFrame(
        df, geometry=gpd.points_from_xy(df[longitude_col], df[latitude_col]), crs="EPSG:4326"
    )

    # Load and preprocess demographic areas
    demographic_areas = gpd.read_file(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\census_block_group_source_nationwide\v107\blkgrp.gdb")
    demographic_areas.to_crs("EPSG:4326", inplace=True)
    demographic_areas["GEOID"] = (
        demographic_areas["STATE_FIPS"].astype(str).str.zfill(2)
        + demographic_areas["COUNTY_FIPS"].astype(str).str.zfill(3)
        + demographic_areas["TRACT_FIPS"].astype(str).str.zfill(6)
        + demographic_areas["BLOCKGROUP_FIPS"].astype(str)
    )
    demographic_areas = demographic_areas[["GEOID", "geometry"]].rename(columns={"GEOID": "cbg_geoid"})

    # Load and preprocess CBSA areas
    cbsa_source = gpd.read_file(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\cbsa_source\tl_2020_us_cbsa.shp")
    cbsa_source.to_crs("EPSG:4326", inplace=True)
    cbsa_source = cbsa_source[["GEOID", "NAME", "geometry"]].rename(columns={"GEOID": "cbsa_geoid", "NAME": "cbsa_name"})

    # Load and preprocess state areas
    state_source = gpd.read_file(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\state_source\States_shapefile.shp")
    state_source.to_crs("EPSG:4326", inplace=True)
    state_source = state_source[["FID", "State_Code", "geometry"]].rename(columns={"FID": "state_id", "State_Name": "state_name"})

    # Perform spatial joins
    geocoded_dots = gdf.sjoin(demographic_areas, how="left").drop(["index_right"], axis=1)
    geocoded_dots = geocoded_dots.sjoin(cbsa_source, how='left').drop(["index_right"], axis=1)
    geocoded_dots = geocoded_dots.sjoin(state_source, how='left').drop(["index_right"], axis=1)

    # Drop unnecessary columns
    geocoded_dots = geocoded_dots.drop(['geometry'], axis=1)

    return pd.DataFrame(geocoded_dots)


In [61]:
#Load Spatial Datasets
demographic_areas = gpd.read_file(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\census_block_group_source_nationwide\v107\blkgrp.gdb")
demographic_areas.to_crs("EPSG:4326", inplace=True)

demographic_areas["GEOID"] = \
    demographic_areas["STATE_FIPS"].astype(str).str.zfill(2)  \
    + demographic_areas["COUNTY_FIPS"].astype(str).str.zfill(3) \
    + demographic_areas["TRACT_FIPS"].astype(str).str.zfill(6) \
    + demographic_areas["BLOCKGROUP_FIPS"].astype(str)

demographic_areas = demographic_areas[["GEOID", "geometry"]].rename(columns={"GEOID":"cbg_geoid"})

cbsa_source = gpd.read_file(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\cbsa_source\tl_2020_us_cbsa.shp")
cbsa_source.to_crs("EPSG:4326", inplace=True)
cbsa_source = cbsa_source[["GEOID", "NAME", "geometry"]].rename(columns={"GEOID":"cbsa_geoid", "NAME": "cbsa_name"})

state_source = gpd.read_file(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\state_source\States_shapefile.shp")
state_source.to_crs("EPSG:4326", inplace=True)
state_source = state_source[["FID", "State_Code", "geometry"]].rename(columns={"FID":"state_id", "State_Name": "state_name"})

In [119]:


Zip = None
City = None
States = ["IL", "WI", "MT", "IN", "ID"]

target_zips = []

for State in States:
    target_zips.extend(get_target_zips(State, City, Zip))


print(f"Number of Zipcodes to be Scrubbed: {len(target_zips)}")

data = []

for index, zip in enumerate(target_zips):
    if index % 10 == 0:
        print(f"{index} Zip Codes Evaluated")

    params = {
        #??Active Listings
        "al": 1,
        #Rentals Only
        "isRentals":"true",
        #Include Nearby Homes
        "include_nearby_homes": "false",
        # Market. ie Seattle
        "market": None,
        # Number of homes to retrieve
        "num_homes": 350,
        #How to Sort the homes
        "ord": "days-on-redfin-asc",
        "page_number": 1,
        "poly": None,
        #Listing Types
        "sf": "1,2,3,4,5,6,7",
        "start": None,
        "status": 9,
        # User input property types (currently only single family, townhomes, multifamily : 134)
        "uipt": "1,3,4",
        # ??API Version?
        "v": 8,
        "zoomLevel": None,
        #Type of Region analyzed
        "region_type" : 2,
        "region_id" : get_stingray_rgn_id(zip)
    }

    if params.get("region_id") == None:
        continue
    else:
        url_param = build_stingray_gis_params(params)
        json_data = call_stingray_rent_gis(url_param)
        list_data = parse_stingray_rent_gis(json_data)

    data.extend(list_data)


new_rentals = pd.DataFrame(data)
new_rentals.drop_duplicates(subset=["Property ID"], inplace=True)

new_rentals["updated_date"] = datetime.now().date()


Number of Zipcodes to be Scrubbed: 4209
0 Zip Codes Evaluated
10 Zip Codes Evaluated
20 Zip Codes Evaluated
30 Zip Codes Evaluated
40 Zip Codes Evaluated
50 Zip Codes Evaluated
60 Zip Codes Evaluated
70 Zip Codes Evaluated
80 Zip Codes Evaluated
90 Zip Codes Evaluated
100 Zip Codes Evaluated
110 Zip Codes Evaluated
120 Zip Codes Evaluated
130 Zip Codes Evaluated
140 Zip Codes Evaluated
150 Zip Codes Evaluated
160 Zip Codes Evaluated
170 Zip Codes Evaluated
180 Zip Codes Evaluated
No Exact match found for zip: 60290
190 Zip Codes Evaluated
200 Zip Codes Evaluated
210 Zip Codes Evaluated
220 Zip Codes Evaluated
230 Zip Codes Evaluated
240 Zip Codes Evaluated
250 Zip Codes Evaluated
260 Zip Codes Evaluated
270 Zip Codes Evaluated
280 Zip Codes Evaluated
290 Zip Codes Evaluated
300 Zip Codes Evaluated
310 Zip Codes Evaluated
320 Zip Codes Evaluated
330 Zip Codes Evaluated
No Exact match found for zip: 60569
340 Zip Codes Evaluated
350 Zip Codes Evaluated
360 Zip Codes Evaluated
370 Zip Cod

ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'An established connection was aborted by the software in your host machine', None, 10053, None))

In [121]:

new_rentals = pd.DataFrame(data)
new_rentals.drop_duplicates(subset=["Property ID"], inplace=True)

new_rentals["updated_date"] = datetime.now().date()

display(new_rentals)

Unnamed: 0,Property ID,URL,Property Type,Address,City,State,ZIP Code,Country Code,Latitude,Longitude,Rental ID,Max Beds,Max Baths,Max Square Feet,Max Rent Price,Description,updated_date
0,191464720,/IL/Antioch/445-Donin-Dr-60002/unit-428-108/ap...,5,445 Donin Dr Unit 428 108,Antioch,IL,60002,1,42.491402,-88.098319,198a1738-7532-4a17-9398-302747072a3b,1,1.0,735.0,1095.0,Antioch Manor Apartments is an apartment commu...,2024-07-04
1,191463871,/IL/Antioch/445-Donin-Dr-60002/unit-397-204/ap...,5,445 Donin Dr Unit 397 204,Antioch,IL,60002,1,42.491402,-88.098319,9f4fa2b6-1c67-4995-93be-d43efe7699de,2,1.0,830.0,1250.0,Antioch Manor Apartments is an apartment commu...,2024-07-04
2,181910503,/IL/Antioch/Antioch-Manor-Apartments/apartment...,5,455 Donin Dr,Antioch,IL,60002,1,42.490300,-88.097700,5dd0da47-a556-4aef-a3da-e9a3a287897d,2,1.0,965.0,1350.0,Antioch Manor Apartments is an apartment commu...,2024-07-04
3,14195029,/IL/Antioch/42328-N-Oak-St-60002/home/14195029,6,42328 N Oak St,Antioch,IL,60002,1,42.478843,-88.152840,3c224640-848e-4eef-9ab0-4b98a81062cc,2,1.0,772.0,1950.0,A cozy 2 bedroom cottage with access to channe...,2024-07-04
4,191258358,/IL/Antioch/39937-Hidden-Bunker-Ct-60002/unit-...,6,39937 Hidden Bunker Ct Unit 39937,Antioch,IL,60002,1,42.442073,-88.121548,10b4bb31-84dc-4e62-b5e2-472d8c218d3c,2,1.0,1036.0,1600.0,"WELCOME HOME TO THIS 2 BED, 1 BATH BRIGHT AND ...",2024-07-04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27688,105145323,/MT/Butte/706-W-Broadway-St-59701/home/105145323,6,706 W Broadway St,Butte,MT,59701,1,46.012938,-112.546383,cebcfbad-3c67-4709-a58e-4e2fad66c736,4,2.0,2362.0,2500.0,This beautifully furnished house offers the pe...,2024-07-04
27689,105156304,/MT/Butte/824-S-Main-St-59701/home/105156304,6,824 S Main St,Butte,MT,59701,1,46.003422,-112.534147,c03b1f75-23a7-47cb-815a-0e628e687e09,3,1.0,1191.0,1500.0,"Newly remodeled, 3 bedroom 1 bath house all on...",2024-07-04
27690,189956542,/MT/Butte/1111-S-Wyoming-St-59701/apartment/18...,5,1111 S Wyoming St,Butte,MT,59701,1,46.002162,-112.527933,760993b8-a0d4-437b-9d3e-c533e543a16a,1,1.0,,900.0,Top unit 1 bedroom apartment for rent. Excelle...,2024-07-04
27691,188063217,/MT/Butte/123-W-Broadway-St-59701/apartment/18...,5,123 W Broadway St,Butte,MT,59701,1,46.013651,-112.538724,9f1cbf52-5ad5-4191-b2e6-97bd0f3ae417,0,1.0,,650.0,Recently remodeled studio apartment in secure ...,2024-07-04


In [122]:
print('Beginning Geocoding')


gdf = gpd.GeoDataFrame(
    new_rentals, geometry=gpd.points_from_xy(new_rentals["Longitude"], new_rentals["Latitude"]), crs="EPSG:4326"
)

# Perform spatial joins
geocoded_dots = gdf.sjoin(demographic_areas, how="left").drop(["index_right"], axis=1)
geocoded_dots = geocoded_dots.sjoin(cbsa_source, how='left').drop(["index_right"], axis=1)
geocoded_dots = geocoded_dots.sjoin(state_source, how='left').drop(["index_right"], axis=1)

# Drop unnecessary columns
geocoded_dots = geocoded_dots.drop(['geometry'], axis=1)

new_rentals_geocoded = pd.DataFrame(geocoded_dots)

# display(new_rentals_geocoded)


Beginning Geocoding


In [123]:
print('Beginning Deduping')

existing_rentals = pd.read_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\rentals.csv")



existing_rentals['Property ID'] = existing_rentals['Property ID'].astype(str).str.strip()
new_rentals_geocoded['Property ID'] = new_rentals_geocoded['Property ID'].astype(str).str.strip()

# Identify common Property IDs
common_property_ids = existing_rentals[existing_rentals['Property ID'].isin(new_rentals_geocoded['Property ID'])]

# Filter out these common Property IDs from the existing rentals DataFrame
existing_rentals = existing_rentals[~existing_rentals['Property ID'].isin(common_property_ids['Property ID'])]

updated_rentals = pd.concat([existing_rentals, new_rentals_geocoded], ignore_index=True)

print(new_rentals_geocoded.shape[0], " Rentals Downloaded")
print(common_property_ids.shape[0], " Duplicate Rentals")
print(updated_rentals.shape[0], " Total Rentals in Dataset")


Beginning Deduping
16980  Rentals Downloaded
0  Duplicate Rentals
26616  Total Rentals in Dataset


In [124]:
updated_rentals.to_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\rentals.csv", index=False)