In [56]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import json

# Load utils: run utils.py in this notebook's namespace so the helper
# functions called in later cells are available as globals.
# NOTE(review): exec() of a local file runs whatever that file contains —
# confirm utils.py is trusted, project-owned code.
with open('utils.py') as utils_file:
    utils_source = utils_file.read()
exec(utils_source)

# Load the feature names that were selected during training
with open('selected_features.json', 'r') as features_file:
    selected_features = json.loads(features_file.read())


In [57]:

# Load the for-sale listings to score
df = pd.read_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\for_sale_homes.csv", index_col=False)

# Initial filtering: keep 1-5 bedrooms, 1 to <4 bathrooms, under 5000 sq ft,
# and a known state. Rows with NaN in any compared column fail the comparison
# and are dropped as well.
# .copy() makes the filtered frame independent of the raw one, so the column
# assignments below do not raise SettingWithCopyWarning.
df = df[
    (df["Bedrooms"] >= 1)
    & (df["Bedrooms"] < 6)
    & (df["Bathrooms"] >= 1)
    & (df["Bathrooms"] < 4)
    & (df["Square Feet"] < 5000)
    & (df["State"].notna())
].copy()

# Bedrooms as a whole number. After the filter every value is a non-null
# number in [1, 6), so integer truncation gives the same result as splitting
# the string form on '.' and keeping the leading part.
df["Bedrooms"] = df["Bedrooms"].astype(int)

# Convert bathrooms to 0.5 increments: round to one decimal first, then snap
# to the nearest half.
df['Bathrooms'] = (df['Bathrooms'].round(1) * 2).round() / 2

# Raw numeric model inputs taken straight from the listing
basic_features = ["Square Feet", "Bedrooms", "Bathrooms"]

# Listing columns carried through to the output file (not model inputs)
basic_metadata = ['MLS ID', 'Status', 'Price', 'HOA Fee', 'Lot Size',
       'Location', 'Stories', 'Address', 'City', 'State', 'ZIP Code',
       'Year Built', 'URL', 'Latitude', 'Longitude', 'updated_date']

# The CSV is deliberately not re-read at the end of this cell: doing so would
# throw away the filtering and cleaning performed above.


In [58]:
# Add the median income data, passing the 'cbg_geoid' column name and the
# ACS B19013 table to the helper from utils.py.
income_table_path = r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Rent Training Data\ACSDT5Y2022.B19013-Data.csv"
df = get_median_income_data(df, 'cbg_geoid', income_table_path)

# Fill missing median_income values; the helper is called with method='median'
# and groupby='State' (presumably a per-state median — confirm in utils.py).
df = fill_null(
    df,
    columns_to_fill=['median_income'],
    method='median',
    groupby='State',
)

# Keep only listings whose state appears in the rental training data.
# NOTE(review): earlier cells use the "State" column while this filter uses
# "State_Code" — confirm the listings frame carries both.
states_trained = pd.read_csv(r"C:\Users\mattl\OneDrive\Documents\reibrowser\Database\Redfin Data\rentals.csv")
states = states_trained["State_Code"].unique()
in_trained_state = df["State_Code"].isin(states)
df = df[in_trained_state]


In [59]:

# Generate rent benchmarks from the KNN helper in 'predict' mode, using the
# listing coordinates as features and one benchmark setting per value in
# n_values.
knn_features = ["Latitude", "Longitude"]
n_values = [1, 5, 10]
save_location = r'C:\Users\mattl\OneDrive\Desktop\Projects\stoebebirch\Models'

knn_result = create_knn_benchmark_rent(
    df,
    knn_features,
    target='Rent',
    n_values=n_values,
    save_location=save_location,
    mode='predict',
)
# The helper returns the updated frame and the names of the benchmark columns.
df, benchmark_features = knn_result

In [60]:

# One-hot encode Bedrooms and Bathrooms in 'predict' mode, pointing the helper
# at the saved encoder and the saved encoded-feature-name list.
columns_to_encode = ['Bedrooms', 'Bathrooms']
encoder_path = r'C:\Users\mattl\OneDrive\Desktop\Projects\stoebebirch\Models\one_hot_encoder.pkl'
encoded_names_path = r'.\encoded_feature_names.json'

encoding_result = one_hot_encode_features(
    df,
    columns_to_encode,
    mode='predict',
    drop_first=True,
    encoder_filename=encoder_path,
    feature_names_filename=encoded_names_path,
)
# The helper returns the updated frame and the list of encoded column names.
df, one_hot_features = encoding_result

In [61]:


# Combine features: income, raw listing features, one-hot columns and KNN
# benchmarks, in that order.
potential_features = (
    ["median_income"]
    + basic_features
    + one_hot_features
    + benchmark_features
)

# Drop listings that are missing any model input
df = df.dropna(subset=potential_features)

# Scale the features using the saved scaler
scaler_path = r'C:\Users\mattl\OneDrive\Desktop\Projects\stoebebirch\Models\scaler.pkl'
X = scale_features(
    df[potential_features],
    mode='predict',
    scaler_filename=scaler_path,
)

# Generate polynomial features using the saved polynomial transformer
poly_path = r'C:\Users\mattl\OneDrive\Desktop\Projects\stoebebirch\Models\poly_transformer.pkl'
poly_result = generate_polynomial_features(
    X,
    potential_features,
    mode='predict',
    poly_filename=poly_path,
    degree=2,
)
X, poly_feature_names = poly_result

In [62]:
# Select the features that were identified during the training step
with open(r"./selected_features.json", 'r') as features_file:
    selected_features = json.loads(features_file.read())

# Label the polynomial feature matrix so columns can be picked by name
X = pd.DataFrame(X, columns=poly_feature_names)
X_selected = X[selected_features]

# Load the trained model.
# NOTE(review): joblib is not imported anywhere in this notebook; presumably
# the name comes from the exec of utils.py in the first cell — confirm, or add
# an explicit `import joblib` to the import cell.
model_path = r'C:\Users\mattl\OneDrive\Desktop\Projects\stoebebirch\Models\single_family_rent_predictor.joblib'
model = joblib.load(model_path)

# Predict rent using the selected features and store it on the listings frame
predicted_rent = model.predict(X_selected)
df['predicted_rent'] = predicted_rent

In [63]:

# Output columns: listing metadata, raw features, income and the prediction
output_columns = basic_metadata + basic_features + ["median_income", "predicted_rent"]
df_to_write = df[output_columns]

# Save the predictions to a CSV in the working directory (no index column)
df_to_write.to_csv(r'.\nationwide_predicted_rent_clean.csv', index=False)