### MAIN DOCUMENT

In [1]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

# Load Data

In [None]:
# Home-pricing data: read the CSV and preview the first five rows
zillow = pd.read_csv("zillow_data.csv")
print(zillow.head(n=5))

In [None]:
# Income and rent data: read the CSV and preview the first five rows
acs = pd.read_csv("dp03_1yr.csv")
print(acs.head(n=5))

In [None]:
# Demographic data: read the CSV and preview the first five rows.
# NOTE(review): the file name ends in "1yer" while the income/rent file ends
# in "1yr" -- possibly a typo; confirm against the file on disk.
demographic = pd.read_csv("dp02_1yer.csv")
print(demographic.head(n=5))

In [None]:
# Search-interest data
google_trends = pd.read_csv("google_trends.csv")

# Preview the raw columns before they are renamed
print(google_trends.head(n=5))

# Shorter column names for the two search terms used later on
short_names = {'buy_a_house_fast': 'buy_fast', 'foreclosure_help': 'foreclosure'}
google_trends = google_trends.rename(columns=short_names)

In [None]:
# Merge the four sources into a single frame.
# NOTE(review): the original passed on="" to every merge, which raises
# KeyError unless a column is literally named "". "ZIP_code" is used here
# because every later cell selects df["ZIP_code"]; the original comment said
# "merge for states", so confirm this is the intended key.
MERGE_KEY = "ZIP_code"

# Fail early, naming the offending table, rather than deep inside pandas.
for source_name, source_frame in [
    ("zillow", zillow),
    ("acs", acs),
    ("demographic", demographic),
    ("google_trends", google_trends),
]:
    if MERGE_KEY not in source_frame.columns:
        raise KeyError(f"{source_name} has no {MERGE_KEY!r} column to merge on")

# acs is inner-joined (pricing rows without a match are dropped); demographic
# and google_trends are left-joined (unmatched rows are kept, with NaN in the
# columns those tables contribute).
df = (
    zillow
    .merge(acs, on=MERGE_KEY, how="inner")
    .merge(demographic, on=MERGE_KEY, how="left")
    .merge(google_trends, on=MERGE_KEY, how="left")
)

# Check merged dataset
print(df.head())

# Feature Engineering

In [None]:
# Baseline valuation ratios: price-to-income (PIR) and price-to-rent (PRR)
annual_rent = df["rent"] * 12  # Convert rent to annual rent
df["PIR"] = df["home_price"].div(df["income"])
df["PRR"] = df["home_price"].div(annual_rent)

# Flag potential overvaluation: both ratios must exceed their cut-offs
pir_high = df["PIR"].gt(5)
prr_high = df["PRR"].gt(25)
df["overvalued"] = (pir_high & prr_high).astype(int)

# Check distribution
ratio_cols = ["ZIP_code", "PIR", "PRR", "overvalued"]
print(df[ratio_cols].head())

In [None]:
# Price growth vs. income growth.
# pct_change() compares each row with the row directly above it. On the whole
# merged frame that also compares the last row of one ZIP code with the first
# row of the next, so the change is computed within each ZIP_code group.
# NOTE(review): assumes rows within a ZIP code are already in chronological
# order -- sort by the period column first if they are not. If the data holds
# only one row per ZIP code, every value here will be NaN.
by_zip = df.groupby("ZIP_code", sort=False)
df["price_income_growth_diff"] = (
    by_zip["home_price"].pct_change() - by_zip["income"].pct_change()
)

# Flag regions where home prices are rising significantly faster than incomes.
# NaN differences (e.g. the first row of each ZIP code) compare as False and
# are therefore flagged 0.
df["high_growth_risk"] = (df["price_income_growth_diff"] > 0.1).astype(int)

# Check flagged cases
print(df[["ZIP_code", "price_income_growth_diff", "high_growth_risk"]].head())

In [None]:
# Search-trend sentiment: "buy_fast" interest minus "foreclosure" interest
sentiment = df["buy_fast"].sub(df["foreclosure"])
df["speculation_score"] = sentiment

# A strictly positive score marks the market as speculative (1), otherwise 0
df["speculative_market"] = df["speculation_score"].gt(0).astype(int)

sentiment_cols = ["ZIP_code", "speculation_score", "speculative_market"]
print(df[sentiment_cols].head())

In [None]:
# Min-max scale the continuous risk features, overwriting them in df
scaled_cols = ["PIR", "PRR", "price_income_growth_diff", "speculation_score"]
scaler = MinMaxScaler()
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

print(df.head())

# Data Cleaning

In [None]:
# Identify missing values
print(df.isnull().sum())

# Fill missing numeric values with each column's median.
# numeric_only=True is required: on pandas >= 2.0, DataFrame.median() raises
# TypeError when the frame contains non-numeric columns. Non-numeric columns
# have no median and are left untouched, so they may still contain NaN.
# TODO: decide whether the median is the best imputation strategy here.
numeric_medians = df.median(numeric_only=True)
df = df.fillna(numeric_medians)

In [None]:
# Persist the processed frame to CSV, leaving the row index out of the file
output_path = "processed_housing_data.csv"
df.to_csv(output_path, index=False)
print("Processed data saved successfully!")