In [2]:
import os
import pandas as pd
from supabase import create_client, Client
from dotenv import load_dotenv
import numpy as np
from redfin_scraping_utils import RentScraper, BuyScraper
from geocoding_utils import Geocoder 
import geopandas as  gpd

In [3]:
#parameters
states = ['WA', 'ID', 'OR', 'MI', 'IL', 'IA', 'WI', 'MN', 'IN']

In [10]:
load_dotenv()

# Create Supabase Client
url: str = os.getenv("SUPABASE_URL")
key: str = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(url, key)

#Initialize Scraper
buy_scraper = BuyScraper()
rent_scraper = RentScraper()

In [13]:
states = ['WA', 'ID', 'OR', 'MI', 'IL', 'IA', 'WI', 'MN', 'IN']

for state in states:
    
    #Initialize New DF for each run
    rent_df = pd.DataFrame()
    # Scrape for-sale listings and append to buy_df
    rent_data = rent_scraper.scrape_state(state=state)
    
    if not rent_data.empty:
        rent_df = pd.concat([rent_df, rent_data], ignore_index=True)
    
    rent_df = rent_df.replace(np.nan, None)
    rent_df = rent_df.drop_duplicates(subset= "property_id")
    records = rent_df.to_dict(orient='records')

    print(f"Scraped {len(rent_data)} for-rent listings for state {state}")


    geocoder = Geocoder(
        rent_df, 
        latitude_col='latitude', 
        longitude_col='longitude'
    )

    df_geocoded = geocoder.geocode_all(
        demographic_areas_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\census_block_group_source_nationwide\v107\blkgrp.gdb",
        cbsa_source_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\cbsa_source\tl_2020_us_cbsa.shp", 
        state_source_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\state_source\States_shapefile.shp"
    )

    print("Writing")

    df_geocoded.to_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\rentals_0926.csv", mode='a')

    print(f"Write Complete")



Scraping 733 Zip Codes in WA
Processing 24% done (183/733 zip codes)
Processing 49% done (366/733 zip codes)
Processing 74% done (549/733 zip codes)
Scraped 10834 for-rent listings for state WA
Writing
Write Complete
Scraping 325 Zip Codes in ID
Processing 24% done (81/325 zip codes)
Processing 49% done (162/325 zip codes)
Processing 74% done (243/325 zip codes)
Scraped 1403 for-rent listings for state ID
Writing
Write Complete
Scraping 492 Zip Codes in OR
Processing 25% done (123/492 zip codes)
Processing 50% done (246/492 zip codes)
Processing 75% done (369/492 zip codes)
Scraped 6371 for-rent listings for state OR
Writing
Write Complete
Scraping 1170 Zip Codes in MI
Processing 24% done (292/1170 zip codes)
Processing 50% done (585/1170 zip codes)
Processing 74% done (877/1170 zip codes)
Scraped 6324 for-rent listings for state MI
Writing
Write Complete
Scraping 1590 Zip Codes in IL
Processing 24% done (397/1590 zip codes)
Processing 50% done (795/1590 zip codes)
Processing 74% done 

In [4]:

for state in states:
    
    #Initialize New DF for each run
    buy_df = pd.DataFrame()
    # Scrape for-sale listings and append to buy_df
    buy_data = buy_scraper.scrape_state(state)
    
    if not buy_data.empty:
        buy_df = pd.concat([buy_df, buy_data], ignore_index=True)
    
    buy_df = buy_df.replace(np.nan, None)
    buy_df = buy_df.drop_duplicates(subset= "property_id")
    records = buy_df.to_dict(orient='records')

    print(f"Scraped {len(buy_data)} for-sale listings for state {state}")

    geocoder = Geocoder(
        buy_df, 
        latitude_col='latitude', 
        longitude_col='longitude'
    )

    df_geocoded = geocoder.geocode_all(
        demographic_areas_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\census_block_group_source_nationwide\v107\blkgrp.gdb",
        cbsa_source_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\cbsa_source\tl_2020_us_cbsa.shp", 
        state_source_path=r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Areas\state_source\States_shapefile.shp"
    )

    

    # print("Writing...")
    # response = (
    #     supabase.table("redfin_listings_bronze")
    #     .upsert(records, on_conflict="property_id")
    #     .execute()
    # )

    print(f"Write Complete")

Scraping 733 Zip Codes in WA
Processing 24% done (183/733 zip codes)
Processing 49% done (366/733 zip codes)
Processing 74% done (549/733 zip codes)
Scraped 37148 for-sale listings for state WA
Writing...
Write Complete
Scraping 325 Zip Codes in ID
Processing 24% done (81/325 zip codes)
Processing 49% done (162/325 zip codes)
Processing 74% done (243/325 zip codes)
Scraped 16945 for-sale listings for state ID
Writing...
Write Complete
Scraping 492 Zip Codes in OR
Processing 25% done (123/492 zip codes)
Processing 50% done (246/492 zip codes)
Processing 75% done (369/492 zip codes)
Scraped 27530 for-sale listings for state OR
Writing...
Write Complete


In [12]:
df = pd.read_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\geocoded_forsale.csv")



with open(r'../train_predict\utils.py') as f:
    exec(f.read())

# Load the selected features
with open(r'../train_predict/selected_features.json', 'r') as f:
    selected_features = json.load(f)


df = df[(df["bedrooms"] >= 1) 
        & (df["bedrooms"] < 6)  
        & (df["bathrooms"] < 4)
        & (df["bathrooms"] >= 1)  
        & (df["square_feet"] < 5000)    
        & (df["state"].notna())]

# Convert Bedrooms to string and clean up
df["bedrooms"] = df["bedrooms"].astype(str).str.split('.').str[0].astype(int)

#Convert bathrooms to .5 increments
df['bathrooms'] = df['bathrooms'].round(1)
df['bathrooms'] = (df['bathrooms'] * 2).round() / 2  # Ensures rounding to nearest 0.5


basic_features = ["square_feet", "bedrooms", "bathrooms"]
basic_metadata = ['mls_id', 'status', 'price', 'hoa_fee', 'lot_size', 
       'location', 'stories', 'address', 'city', 'state', 'zip_code',
       'year_built', 'url', 'latitude', 'longitude', 'updated_date']


  df = pd.read_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\geocoded_forsale.csv")


In [16]:
df = get_median_income_data(df, 'cbg_geoid', r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Rent Training Data\ACSDT5Y2022.B19013-Data.csv")

In [21]:

df = fill_null(df, columns_to_fill=['median_income'], method='median', groupby='state')

# Filter out rows where the state is not in the trained states
states_trained = pd.read_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\rentals.csv")
states = states_trained["State_Code"].unique()
df = df[df["state_code"].isin(states)]


# Generate Rent Benchmarks using KNN models
knn_features = ["latitude", "longitude"]
n_values = [1, 5, 10]
save_location = r'..\Models'
df, benchmark_features = create_knn_benchmark_rent(df, knn_features, target='Rent', n_values=n_values, save_location=save_location, mode='predict')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [24]:
columns_to_encode = ['bedrooms', 'bathrooms']
df, one_hot_features = one_hot_encode_features(
    df, 
    columns_to_encode, 
    mode='predict', 
    drop_first=True, 
    encoder_filename=r'C:\Users\mattl\OneDrive\Desktop\Projects\stoebebirch\Models\one_hot_encoder.pkl', 
    feature_names_filename=r'..\train_predict\encoded_feature_names.json'
)

ValueError: Found unknown categories [4.0] in column 1 during transform