In [1]:
import pandas as pd
from datetime import datetime, date
from dateutil.relativedelta import relativedelta
from typing import List
import re

In [2]:
def generate_year_month_range(end_date: date = None, years_back: int = 6) -> List[str]:
    """
    Generate a list of year-month combinations in 'yyyymm' format,
    starting from the specified end date and going back a specified number of years.

    Args:
        end_date (date, optional): The end date to start from. Defaults to today's date.
        years_back (int, optional): Number of years to go back. Defaults to 6.

    Returns:
        List[str]: List of year-month combinations in 'yyyymm' format, sorted in descending order.

    Example:
        >>> generate_year_month_range()  # If today is 2024-11-14
        ['202411', '202410', '202409', ..., '201812']
    """
    # If no end date is provided, use today's date
    if end_date is None:
        end_date = date.today()

    # Calculate start date
    start_date = end_date - relativedelta(years=years_back)

    # Initialize result list
    date_list = []

    # Current date for iteration
    current_date = end_date

    # Generate dates until we reach start date
    while current_date >= start_date:
        # Format date as 'yyyymm'
        date_str = current_date.strftime('%Y%m')
        date_list.append(date_str)
        # Move to previous month
        current_date -= relativedelta(months=1)

    return date_list

# Example usage
if __name__ == "__main__":
    date_list = generate_year_month_range()
    print(f"Generated {len(date_list)} year-month combinations:")
    print(date_list[:12])  # Print first year as example

Generated 73 year-month combinations:
['202503', '202502', '202501', '202412', '202411', '202410', '202409', '202408', '202407', '202406', '202405', '202404']


In [3]:
import requests

In [4]:
# read in the metro_areas.json file as a list
import json
with open('data/metro_areas.json') as f:
    metro_areas = json.load(f)

metro_areas

['Abilene, TX',
 'Akron, OH',
 'Albany, OR',
 'Albany-Schenectady-Troy, NY',
 'Albuquerque, NM',
 'Allentown-Bethlehem-Easton, PA-NJ',
 'Altoona, PA',
 'Amarillo, TX',
 'Ames, IA',
 'Amherst Town-Northampton, MA',
 'Ann Arbor, MI',
 'Appleton, WI',
 'Asheville, NC',
 'Athens-Clarke County, GA',
 'Atlanta-Sandy Springs-Roswell, GA',
 'Atlantic City-Hammonton, NJ',
 'Auburn-Opelika, AL',
 'Baltimore-Columbia-Towson, MD',
 'Barnstable Town, MA',
 'Baton Rouge, LA',
 'Battle Creek, MI',
 'Bay City, MI',
 'Birmingham, AL',
 'Bloomington, IL',
 'Bloomington, IN',
 'Boise City, ID',
 'Boston-Cambridge-Newton, MA-NH',
 'Boulder, CO',
 'Bowling Green, KY',
 'Bremerton-Silverdale-Port Orchard, WA',
 'Bridgeport-Stamford-Danbury, CT',
 'Brownsville-Harlingen, TX',
 'Buffalo-Cheektowaga, NY',
 'Burlington, NC',
 'Burlington-South Burlington, VT',
 'Canton-Massillon, OH',
 'Cape Coral-Fort Myers, FL',
 'Cape Girardeau, MO-IL',
 'Carson City, NV',
 'Cedar Rapids, IA',
 'Chambersburg, PA',
 'Champaig

In [5]:
import pandas as pd
import requests
from datetime import datetime

def check_url_exists(url):
    """Check if a URL exists without downloading the full file"""
    try:
        response = requests.head(url)
        return response.status_code == 200
    except:
        return False

def process_2024_data(url):
    """Process data from 2024 onwards"""
    df = pd.read_excel(url, skiprows=7)
    
    # Clean columns
    df.drop(columns=['Metro /Micro Code', 'Unnamed: 10'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

def process_2022_2023_data(url):
    """Process data from 2022-2023"""
    df = pd.read_excel(url, skiprows=7)
    
    # Remove first row and clean columns
    df = df.iloc[1:]
    df.drop(columns=['Unnamed: 9'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

def process_pre_2022_data(url):
    """Process data from 2021 and earlier"""
    df = pd.read_excel(url, skiprows=7)
    
    # Remove first row and clean columns
    df = df.iloc[1:]
    df.drop(columns=['Monthly Coverage Percent*', 'Unnamed: 10'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

# Initialize an empty list to store all dataframes
all_dfs = []

# Process each date
for date_str in date_list:
    year = int(date_str[:4])
    
    try:
        # Determine URL format and processing function based on year
        if year >= 2024:
            url = f'https://www.census.gov/construction/bps/xls/cbsamonthly_{date_str}.xls'
            process_func = process_2024_data
        else:
            url = f'https://www.census.gov/construction/bps/xls/msamonthly_{date_str}.xls'
            if year >= 2022:
                process_func = process_2022_2023_data
            else:
                process_func = process_pre_2022_data
        
        # Check if URL exists
        if check_url_exists(url):
            try:
                df = process_func(url)
                all_dfs.append(df)
                print(f"Successfully processed data for {date_str}")
            except Exception as e:
                print(f"Error processing {date_str}: {str(e)}")
                continue
        else:
            print(f"No data available for {date_str}")
            continue
            
    except Exception as e:
        print(f"Error with {date_str}: {str(e)}")
        continue

# Concatenate all dataframes if we have any data
if all_dfs:
    homebuilding = pd.concat(all_dfs, ignore_index=True)
    
    # Convert date column to datetime
    homebuilding['date'] = pd.to_datetime(homebuilding['date'], format='%Y%m')
    
    # Sort by date and other relevant columns
    homebuilding = homebuilding.sort_values(['date', 'Name'], ascending=[False, True])
    
    print(f"\nFinal dataset contains {len(homebuilding)} rows from {len(all_dfs)} different months")
    print(f"Date range: {homebuilding['date'].min()} to {homebuilding['date'].max()}")
else:
    print("No data was successfully processed")
    homebuilding = pd.DataFrame()

No data available for 202503
No data available for 202502
Successfully processed data for 202501
Successfully processed data for 202412
Successfully processed data for 202411
Successfully processed data for 202410
Successfully processed data for 202409
Successfully processed data for 202408
Successfully processed data for 202407
Successfully processed data for 202406
Successfully processed data for 202405
Successfully processed data for 202404
Successfully processed data for 202403
Successfully processed data for 202402
Successfully processed data for 202401
Successfully processed data for 202312
Successfully processed data for 202311
Successfully processed data for 202310
Successfully processed data for 202309
Successfully processed data for 202308
Successfully processed data for 202307
Successfully processed data for 202306
Successfully processed data for 202305
Successfully processed data for 202304
Successfully processed data for 202303
Successfully processed data for 202302
Succes

In [6]:
# homebuilding['date'] = pd.to_datetime(homebuilding['date'])
homebuilding.dtypes

CSA                                                  float64
CBSA                                                 float64
Name                                                  object
Total                                                float64
1 Unit                                               float64
2 Units                                              float64
3 and 4 Units                                        float64
5 Units or More                                      float64
Num of Structures With 5 Units or More               float64
Total_ytd                                            float64
1 Unit_ytd                                           float64
2 Units_ytd                                          float64
3 and 4 Units_ytd                                    float64
5 Units or More_ytd                                  float64
Num of Structures With 5 Units or More_ytd           float64
date                                          datetime64[ns]
dtype: object

In [7]:
homebuilding.columns

Index(['CSA', 'CBSA', 'Name', 'Total', '1 Unit', '2 Units', '3 and 4 Units',
       '5 Units or More', 'Num of Structures With 5 Units or More',
       'Total_ytd', '1 Unit_ytd', '2 Units_ytd', '3 and 4 Units_ytd',
       '5 Units or More_ytd', 'Num of Structures With 5 Units or More_ytd',
       'date'],
      dtype='object')

In [8]:
# create a 'multi_total' column that sums the columns for different types of multi-unit structures
homebuilding['multi_total'] = homebuilding['2 Units'] + homebuilding['3 and 4 Units'] + homebuilding['5 Units or More'] + homebuilding['Num of Structures With 5 Units or More']

# create a 'multi_total_ytd' column that sums the columns for different types of multi-unit structures
homebuilding['multi_total_ytd'] =  homebuilding['2 Units_ytd'] + homebuilding['3 and 4 Units_ytd'] + homebuilding['5 Units or More_ytd'] + homebuilding['Num of Structures With 5 Units or More_ytd']

In [9]:
# rename Name to name, Total to total, and Total_ytd to total_ytd
homebuilding.rename(
    columns={
        'Name': 'name',
        'Total': 'total',
        'Total_ytd': 'total_ytd'
    },
    inplace=True
)

homebuilding = homebuilding[['date', 'name', 'total', 'total_ytd', 'multi_total', 'multi_total_ytd']]

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
0,2025-01-01,"Aberdeen, SD",0.0,0.0,0.0,0.0
1,2025-01-01,"Aberdeen, WA",24.0,24.0,0.0,0.0
2,2025-01-01,"Abilene, TX",82.0,82.0,4.0,4.0
3,2025-01-01,"Ada, OK",4.0,4.0,4.0,4.0
4,2025-01-01,"Adrian, MI",8.0,8.0,0.0,0.0
...,...,...,...,...,...,...
30752,2019-11-01,"Worcester, MA-CT",18.0,219.0,2.0,52.0
30753,2019-11-01,"Yakima, WA",52.0,823.0,13.0,476.0
30754,2019-11-01,"York-Hanover, PA",11.0,155.0,4.0,62.0
30755,2019-11-01,"Youngstown-Warren-Boardman, OH-PA",13.0,200.0,0.0,7.0


In [10]:
# find nyc data
homebuilding[homebuilding['name'].str.contains('New York')]

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
596,2025-01-01,"New York-Newark-Jersey City, NY-NJ",4200.0,4200.0,3390.0,3390.0
1517,2024-12-01,"New York-Newark-Jersey City, NY-NJ",6341.0,58580.0,5599.0,46990.0
2438,2024-11-01,"New York-Newark-Jersey City, NY-NJ",4216.0,52213.0,3448.0,41343.0
3359,2024-10-01,"New York-Newark-Jersey City, NY-NJ",4168.0,48063.0,2924.0,37914.0
4280,2024-09-01,"New York-Newark-Jersey City, NY-NJ",3666.0,43928.0,2683.0,35127.0
...,...,...,...,...,...,...
29156,2020-03-01,"New York-Newark-Jersey City, NY-NJ-PA",3410.0,12366.0,2461.0,9753.0
29524,2020-02-01,"New York-Newark-Jersey City, NY-NJ-PA",2929.0,8873.0,2147.0,7250.0
29892,2020-01-01,"New York-Newark-Jersey City, NY-NJ-PA",5982.0,5982.0,5169.0,5169.0
30260,2019-12-01,"New York-Newark-Jersey City, NY-NJ-PA",5708.0,60746.0,5096.0,50701.0


In [11]:
# clean up the name column
homebuilding['name'] = homebuilding['name'].apply(lambda x: x.split(',')[0].split('-')[0] + ',' + x.split(',')[1].split('-')[0])

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
0,2025-01-01,"Aberdeen, SD",0.0,0.0,0.0,0.0
1,2025-01-01,"Aberdeen, WA",24.0,24.0,0.0,0.0
2,2025-01-01,"Abilene, TX",82.0,82.0,4.0,4.0
3,2025-01-01,"Ada, OK",4.0,4.0,4.0,4.0
4,2025-01-01,"Adrian, MI",8.0,8.0,0.0,0.0
...,...,...,...,...,...,...
30752,2019-11-01,"Worcester, MA",18.0,219.0,2.0,52.0
30753,2019-11-01,"Yakima, WA",52.0,823.0,13.0,476.0
30754,2019-11-01,"York, PA",11.0,155.0,4.0,62.0
30755,2019-11-01,"Youngstown, OH",13.0,200.0,0.0,7.0


In [12]:
# sort by date with earliest date first
homebuilding = homebuilding.sort_values('date')



homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
30756,2019-11-01,"Yuma, AZ",80.0,1063.0,0.0,6.0
30505,2019-11-01,"Fayetteville, AR",495.0,6303.0,199.0,2562.0
30506,2019-11-01,"Flagstaff, AZ",48.0,650.0,19.0,194.0
30507,2019-11-01,"Flint, MI",18.0,371.0,0.0,130.0
30508,2019-11-01,"Florence, SC",36.0,664.0,6.0,288.0
...,...,...,...,...,...,...
608,2025-01-01,"Oak Harbor, WA",17.0,17.0,0.0,0.0
607,2025-01-01,"Norwich, CT",11.0,11.0,0.0,0.0
606,2025-01-01,"Norwalk, OH",1.0,1.0,0.0,0.0
604,2025-01-01,"North Port, FL",1470.0,1470.0,796.0,796.0


In [13]:
# Ensure the DataFrame is sorted by 'name' and 'date'
homebuilding = homebuilding.sort_values(by=['name', 'date'])

# Create a 12-month running total column
homebuilding['rt'] = (
    homebuilding.groupby('name')['total']
    .rolling(window=12, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
)

# Create a 12-month running total column for multi-unit structures
homebuilding['multi_rt'] = (
    homebuilding.groupby('name')['multi_total']
    .rolling(window=12, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
)

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt
11052,2024-01-01,"Aberdeen, SD",2.0,2.0,0.0,0.0,2.0,0.0
10131,2024-02-01,"Aberdeen, SD",2.0,4.0,0.0,0.0,4.0,0.0
9210,2024-03-01,"Aberdeen, SD",4.0,8.0,0.0,0.0,8.0,0.0
8289,2024-04-01,"Aberdeen, SD",15.0,23.0,0.0,0.0,23.0,0.0
7368,2024-05-01,"Aberdeen, SD",1.0,24.0,0.0,0.0,24.0,0.0
...,...,...,...,...,...,...,...,...
4604,2024-09-01,"Zanesville, OH",15.0,97.0,12.0,77.0,99.0,77.0
3683,2024-10-01,"Zanesville, OH",16.0,113.0,14.0,91.0,115.0,91.0
2762,2024-11-01,"Zanesville, OH",15.0,128.0,14.0,105.0,130.0,105.0
1841,2024-12-01,"Zanesville, OH",12.0,140.0,12.0,117.0,142.0,117.0


In [14]:
# show the df sorted by multi_total_12_month_running in the max date
homebuilding[homebuilding['date'] == homebuilding['date'].max()].sort_values('multi_rt', ascending=False)

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt
596,2025-01-01,"New York, NY",4200.0,4200.0,3390.0,3390.0,58837.0,47519.0
206,2025-01-01,"Dallas, TX",4876.0,4876.0,1586.0,1586.0,69245.0,23985.0
56,2025-01-01,"Austin, TX",1908.0,1908.0,769.0,769.0,32121.0,15730.0
489,2025-01-01,"Los Angeles, CA",1675.0,1675.0,755.0,755.0,26996.0,15308.0
48,2025-01-01,"Atlanta, GA",3133.0,3133.0,1350.0,1350.0,39409.0,14629.0
...,...,...,...,...,...,...,...,...
611,2025-01-01,"Odessa, TX",107.0,107.0,0.0,0.0,1020.0,0.0
606,2025-01-01,"Norwalk, OH",1.0,1.0,0.0,0.0,38.0,0.0
605,2025-01-01,"North Wilkesboro, NC",16.0,16.0,0.0,0.0,143.0,0.0
135,2025-01-01,"Cañon City, CO",8.0,8.0,0.0,0.0,119.0,0.0


In [15]:
# population

# read in the population data
metros = pd.read_csv('data/zori_metro_long_clean.csv')

# change 'date' to datetime
metros['date'] = pd.to_datetime(metros['date'])

metros

Unnamed: 0,name,date,zori,lat,lng,pop_2023
0,"New York, NY",2015-02-01,2225.296911,40.6943,-73.9249,19498249.0
1,"Los Angeles, CA",2015-02-01,1802.281380,34.1141,-118.4068,12799100.0
2,"Chicago, IL",2015-02-01,1352.236287,41.8375,-87.6866,9262825.0
3,"Dallas, TX",2015-02-01,1082.375956,32.7935,-96.7667,8100037.0
4,"Houston, TX",2015-02-01,1221.669822,29.7860,-95.3885,7510253.0
...,...,...,...,...,...,...
5995,"Salt Lake City, UT",2025-01-01,1656.973108,40.7776,-111.9311,1267864.0
5996,"Hartford, CT",2025-01-01,1870.252402,41.7661,-72.6834,1151543.0
5997,"Buffalo, NY",2025-01-01,1324.995704,42.9018,-78.8487,1155604.0
5998,"Birmingham, AL",2025-01-01,1339.420553,33.5279,-86.7971,1184290.0


In [16]:
# merge the population data with the homebuilding data
# merge on 'name' and 'date'
# only include rows where both 'name' and 'date' are in both dataframes
homebuilding_zori = pd.merge(
    homebuilding,
    metros,
    on=['name', 'date'],
    how='inner'
)

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023
0,2019-11-01,"Atlanta, GA",1819.0,30513.0,117.0,6602.0,1819.0,117.0,1371.279930,33.7628,-84.4220,6307261.0
1,2019-12-01,"Atlanta, GA",2388.0,32729.0,213.0,6732.0,4207.0,330.0,1368.872179,33.7628,-84.4220,6307261.0
2,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,-84.4220,6307261.0
3,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,-84.4220,6307261.0
4,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,-84.4220,6307261.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3145,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,-77.0163,6304975.0
3146,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,-77.0163,6304975.0
3147,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,-77.0163,6304975.0
3148,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,-77.0163,6304975.0


In [17]:
homebuilding_zori['name'].nunique()

50

In [18]:
homebuilding_zori.columns

Index(['date', 'name', 'total', 'total_ytd', 'multi_total', 'multi_total_ytd',
       'rt', 'multi_rt', 'zori', 'lat', 'lng', 'pop_2023'],
      dtype='object')

In [19]:
# create 'total_per_capita' and 'total_ytd_per_capita' columns that divide 'total' and 'total_ytd' by 'pop_2023'
homebuilding_zori['total_pc'] = homebuilding_zori['total'] / homebuilding_zori['pop_2023']
homebuilding_zori['total_ytd_pc'] = homebuilding_zori['total_ytd'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_total_pc'] = homebuilding_zori['multi_total'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_total_ytd_pc'] = homebuilding_zori['multi_total_ytd'] / homebuilding_zori['pop_2023']
homebuilding_zori['rt_pc'] = homebuilding_zori['rt'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_rt_pc'] = homebuilding_zori['multi_rt'] / homebuilding_zori['pop_2023']


# change per capita columns to per 1000
for col in homebuilding_zori.columns:
    if 'pc' in col:
        homebuilding_zori[col] = homebuilding_zori[col] * 1000

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc
0,2019-11-01,"Atlanta, GA",1819.0,30513.0,117.0,6602.0,1819.0,117.0,1371.279930,33.7628,-84.4220,6307261.0,0.288398,4.837758,0.018550,1.046730,0.288398,0.018550
1,2019-12-01,"Atlanta, GA",2388.0,32729.0,213.0,6732.0,4207.0,330.0,1368.872179,33.7628,-84.4220,6307261.0,0.378611,5.189099,0.033771,1.067341,0.667009,0.052321
2,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,-84.4220,6307261.0,0.461690,0.461690,0.154267,0.154267,1.128699,0.206587
3,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,-84.4220,6307261.0,0.434737,0.897537,0.103214,0.247651,1.563436,0.309802
4,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,-84.4220,6307261.0,0.351341,1.249829,0.020136,0.258908,1.914777,0.329937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,-77.0163,6304975.0,0.142744,2.613333,0.035210,1.228712,3.471068,1.758770
3146,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,-77.0163,6304975.0,0.285647,3.014128,0.146234,1.372091,3.463614,1.728318
3147,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,-77.0163,6304975.0,0.178272,3.181297,0.050436,1.412060,3.403186,1.644574
3148,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,-77.0163,6304975.0,0.238859,3.478364,0.122443,1.583353,3.360521,1.589697


In [20]:
# create 'zori_yoy' column that calculates the year-over-year change in 'zori' for each metro area
homebuilding_zori['zori_yoy'] = homebuilding_zori.groupby('name')['zori'].pct_change(12)
homebuilding_zori

  homebuilding_zori['zori_yoy'] = homebuilding_zori.groupby('name')['zori'].pct_change(12)


Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy
0,2019-11-01,"Atlanta, GA",1819.0,30513.0,117.0,6602.0,1819.0,117.0,1371.279930,33.7628,-84.4220,6307261.0,0.288398,4.837758,0.018550,1.046730,0.288398,0.018550,
1,2019-12-01,"Atlanta, GA",2388.0,32729.0,213.0,6732.0,4207.0,330.0,1368.872179,33.7628,-84.4220,6307261.0,0.378611,5.189099,0.033771,1.067341,0.667009,0.052321,
2,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,-84.4220,6307261.0,0.461690,0.461690,0.154267,0.154267,1.128699,0.206587,
3,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,-84.4220,6307261.0,0.434737,0.897537,0.103214,0.247651,1.563436,0.309802,
4,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,-84.4220,6307261.0,0.351341,1.249829,0.020136,0.258908,1.914777,0.329937,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,-77.0163,6304975.0,0.142744,2.613333,0.035210,1.228712,3.471068,1.758770,0.048862
3146,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,-77.0163,6304975.0,0.285647,3.014128,0.146234,1.372091,3.463614,1.728318,0.047143
3147,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,-77.0163,6304975.0,0.178272,3.181297,0.050436,1.412060,3.403186,1.644574,0.044723
3148,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,-77.0163,6304975.0,0.238859,3.478364,0.122443,1.583353,3.360521,1.589697,0.045161


In [21]:
# sort by multi_rt_pc in the max date and filter to this date
homebuilding_zori[homebuilding_zori['date'] == homebuilding_zori['date'].max()].sort_values('rt_pc', ascending=False)

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy
125,2025-01-01,"Austin, TX",1908.0,1908.0,769.0,769.0,32121.0,15730.0,1666.818076,30.3005,-97.7522,2473275.0,0.771447,0.771447,0.310924,0.310924,12.987234,6.359988,-0.030192
2330,2025-01-01,"Raleigh, NC",1067.0,1067.0,72.0,72.0,19040.0,5634.0,1721.833096,35.8324,-78.6429,1509231.0,0.706983,0.706983,0.047706,0.047706,12.615696,3.733027,0.012323
1763,2025-01-01,"Nashville, TN",1569.0,1569.0,578.0,578.0,20019.0,5945.0,1844.593747,36.1715,-86.7842,2102573.0,0.746229,0.746229,0.274901,0.274901,9.521191,2.827488,0.019711
440,2025-01-01,"Charlotte, NC",2304.0,2304.0,903.0,903.0,26486.0,7862.0,1773.988133,35.2083,-80.8303,2805115.0,0.821357,0.821357,0.321912,0.321912,9.442037,2.802737,0.017127
1952,2025-01-01,"Orlando, FL",3632.0,3632.0,2303.0,2303.0,25032.0,9390.0,1979.918361,28.4773,-81.337,2817933.0,1.288888,1.288888,0.817266,0.817266,8.883107,3.33223,0.013264
1196,2025-01-01,"Jacksonville, FL",1065.0,1065.0,64.0,64.0,15158.0,2229.0,1684.285184,30.3322,-81.6749,1713240.0,0.621629,0.621629,0.037356,0.037356,8.847564,1.301044,0.022803
1070,2025-01-01,"Houston, TX",5731.0,5731.0,1596.0,1596.0,65842.0,13396.0,1676.206282,29.786,-95.3885,7510253.0,0.76309,0.76309,0.212509,0.212509,8.766948,1.783695,0.026087
755,2025-01-01,"Dallas, TX",4876.0,4876.0,1586.0,1586.0,69245.0,23985.0,1715.783376,32.7935,-96.7667,8100037.0,0.601973,0.601973,0.195802,0.195802,8.548726,2.961098,0.003167
2078,2025-01-01,"Phoenix, AZ",2687.0,2687.0,201.0,201.0,42627.0,13327.0,1791.494866,33.5722,-112.0892,5070110.0,0.529969,0.529969,0.039644,0.039644,8.40751,2.628543,0.007653
2393,2025-01-01,"Richmond, VA",1313.0,1313.0,815.0,815.0,9160.0,4119.0,1649.262853,37.5295,-77.4756,1349732.0,0.972786,0.972786,0.603824,0.603824,6.786532,3.051717,0.064608


In [22]:
# Create a 'state' column by extracting the state abbreviation from the 'name' column
homebuilding_zori['state'] = homebuilding_zori['name'].str.split(', ').str[-1]

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state
0,2019-11-01,"Atlanta, GA",1819.0,30513.0,117.0,6602.0,1819.0,117.0,1371.279930,33.7628,-84.4220,6307261.0,0.288398,4.837758,0.018550,1.046730,0.288398,0.018550,,GA
1,2019-12-01,"Atlanta, GA",2388.0,32729.0,213.0,6732.0,4207.0,330.0,1368.872179,33.7628,-84.4220,6307261.0,0.378611,5.189099,0.033771,1.067341,0.667009,0.052321,,GA
2,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,-84.4220,6307261.0,0.461690,0.461690,0.154267,0.154267,1.128699,0.206587,,GA
3,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,-84.4220,6307261.0,0.434737,0.897537,0.103214,0.247651,1.563436,0.309802,,GA
4,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,-84.4220,6307261.0,0.351341,1.249829,0.020136,0.258908,1.914777,0.329937,,GA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,-77.0163,6304975.0,0.142744,2.613333,0.035210,1.228712,3.471068,1.758770,0.048862,DC
3146,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,-77.0163,6304975.0,0.285647,3.014128,0.146234,1.372091,3.463614,1.728318,0.047143,DC
3147,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,-77.0163,6304975.0,0.178272,3.181297,0.050436,1.412060,3.403186,1.644574,0.044723,DC
3148,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,-77.0163,6304975.0,0.238859,3.478364,0.122443,1.583353,3.360521,1.589697,0.045161,DC


In [23]:
homebuilding_zori['name'].nunique()

50

In [24]:
regions = pd.read_csv('data/regions.csv')

regions

Unnamed: 0,state,region
0,WA,West
1,OR,West
2,CA,West
3,NV,West
4,ID,West
5,MT,West
6,WY,West
7,UT,West
8,CO,West
9,HI,West


In [25]:


# merge
homebuilding_zori = pd.merge(
    homebuilding_zori,
    regions,
    on='state',
    how='left'
)

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,...,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state,region
0,2019-11-01,"Atlanta, GA",1819.0,30513.0,117.0,6602.0,1819.0,117.0,1371.279930,33.7628,...,6307261.0,0.288398,4.837758,0.018550,1.046730,0.288398,0.018550,,GA,Southeast
1,2019-12-01,"Atlanta, GA",2388.0,32729.0,213.0,6732.0,4207.0,330.0,1368.872179,33.7628,...,6307261.0,0.378611,5.189099,0.033771,1.067341,0.667009,0.052321,,GA,Southeast
2,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,...,6307261.0,0.461690,0.461690,0.154267,0.154267,1.128699,0.206587,,GA,Southeast
3,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,...,6307261.0,0.434737,0.897537,0.103214,0.247651,1.563436,0.309802,,GA,Southeast
4,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,...,6307261.0,0.351341,1.249829,0.020136,0.258908,1.914777,0.329937,,GA,Southeast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,...,6304975.0,0.142744,2.613333,0.035210,1.228712,3.471068,1.758770,0.048862,DC,Northest
3146,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,...,6304975.0,0.285647,3.014128,0.146234,1.372091,3.463614,1.728318,0.047143,DC,Northest
3147,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,...,6304975.0,0.178272,3.181297,0.050436,1.412060,3.403186,1.644574,0.044723,DC,Northest
3148,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,...,6304975.0,0.238859,3.478364,0.122443,1.583353,3.360521,1.589697,0.045161,DC,Northest


In [26]:
# save the final dataset to a csv file
homebuilding_zori.to_csv('data/homebuilding_zori.csv', index=False)