In [1]:
import pandas as pd
from datetime import datetime, date
from dateutil.relativedelta import relativedelta
from typing import List

In [None]:
def generate_year_month_range(end_date: date = None, years_back: int = 6) -> List[str]:
    """
    Generate a list of year-month combinations in 'yyyymm' format,
    starting from the specified end date and going back a specified number of years.

    Args:
        end_date (date, optional): The end date to start from. Defaults to today's date.
        years_back (int, optional): Number of years to go back. Defaults to 6.

    Returns:
        List[str]: List of year-month combinations in 'yyyymm' format, sorted in descending order.

    Example:
        >>> generate_year_month_range()  # If today is 2024-11-14
        ['202411', '202410', '202409', ..., '201812']
    """
    # If no end date is provided, use today's date
    if end_date is None:
        end_date = date.today()

    # Calculate start date
    start_date = end_date - relativedelta(years=years_back)

    # Initialize result list
    date_list = []

    # Current date for iteration
    current_date = end_date

    # Generate dates until we reach start date
    while current_date >= start_date:
        # Format date as 'yyyymm'
        date_str = current_date.strftime('%Y%m')
        date_list.append(date_str)
        # Move to previous month
        current_date -= relativedelta(months=1)

    return date_list

# Example usage
if __name__ == "__main__":
    date_list = generate_year_month_range()
    print(f"Generated {len(date_list)} year-month combinations:")
    print(date_list[:12])  # Print first year as example

Generated 73 year-month combinations:
['202501', '202412', '202411', '202410', '202409', '202408', '202407', '202406', '202405', '202404', '202403', '202402']


In [3]:
import requests



In [4]:
# read in the metro_areas.json file as a list
import json
with open('data/metro_areas.json') as f:
    metro_areas = json.load(f)

metro_areas

['Abilene, TX',
 'Akron, OH',
 'Albany, OR',
 'Albany-Schenectady-Troy, NY',
 'Albuquerque, NM',
 'Allentown-Bethlehem-Easton, PA-NJ',
 'Altoona, PA',
 'Amarillo, TX',
 'Ames, IA',
 'Amherst Town-Northampton, MA',
 'Ann Arbor, MI',
 'Appleton, WI',
 'Asheville, NC',
 'Athens-Clarke County, GA',
 'Atlanta-Sandy Springs-Roswell, GA',
 'Atlantic City-Hammonton, NJ',
 'Auburn-Opelika, AL',
 'Baltimore-Columbia-Towson, MD',
 'Barnstable Town, MA',
 'Baton Rouge, LA',
 'Battle Creek, MI',
 'Bay City, MI',
 'Birmingham, AL',
 'Bloomington, IL',
 'Bloomington, IN',
 'Boise City, ID',
 'Boston-Cambridge-Newton, MA-NH',
 'Boulder, CO',
 'Bowling Green, KY',
 'Bremerton-Silverdale-Port Orchard, WA',
 'Bridgeport-Stamford-Danbury, CT',
 'Brownsville-Harlingen, TX',
 'Buffalo-Cheektowaga, NY',
 'Burlington, NC',
 'Burlington-South Burlington, VT',
 'Canton-Massillon, OH',
 'Cape Coral-Fort Myers, FL',
 'Cape Girardeau, MO-IL',
 'Carson City, NV',
 'Cedar Rapids, IA',
 'Chambersburg, PA',
 'Champaig

In [5]:
import pandas as pd
import requests
from datetime import datetime

def check_url_exists(url):
    """Check if a URL exists without downloading the full file"""
    try:
        response = requests.head(url)
        return response.status_code == 200
    except:
        return False

def process_2024_data(url):
    """Process data from 2024 onwards"""
    df = pd.read_excel(url, skiprows=7)
    
    # Clean columns
    df.drop(columns=['Metro /Micro Code', 'Unnamed: 10'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

def process_2022_2023_data(url):
    """Process data from 2022-2023"""
    df = pd.read_excel(url, skiprows=7)
    
    # Remove first row and clean columns
    df = df.iloc[1:]
    df.drop(columns=['Unnamed: 9'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

def process_pre_2022_data(url):
    """Process data from 2021 and earlier"""
    df = pd.read_excel(url, skiprows=7)
    
    # Remove first row and clean columns
    df = df.iloc[1:]
    df.drop(columns=['Monthly Coverage Percent*', 'Unnamed: 10'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

# Initialize an empty list to store all dataframes
all_dfs = []

# Process each date
for date_str in date_list:
    year = int(date_str[:4])
    
    try:
        # Determine URL format and processing function based on year
        if year >= 2024:
            url = f'https://www.census.gov/construction/bps/xls/cbsamonthly_{date_str}.xls'
            process_func = process_2024_data
        else:
            url = f'https://www.census.gov/construction/bps/xls/msamonthly_{date_str}.xls'
            if year >= 2022:
                process_func = process_2022_2023_data
            else:
                process_func = process_pre_2022_data
        
        # Check if URL exists
        if check_url_exists(url):
            try:
                df = process_func(url)
                all_dfs.append(df)
                print(f"Successfully processed data for {date_str}")
            except Exception as e:
                print(f"Error processing {date_str}: {str(e)}")
                continue
        else:
            print(f"No data available for {date_str}")
            continue
            
    except Exception as e:
        print(f"Error with {date_str}: {str(e)}")
        continue

# Concatenate all dataframes if we have any data
if all_dfs:
    homebuilding = pd.concat(all_dfs, ignore_index=True)
    
    # Convert date column to datetime
    homebuilding['date'] = pd.to_datetime(homebuilding['date'], format='%Y%m')
    
    # Sort by date and other relevant columns
    homebuilding = homebuilding.sort_values(['date', 'Name'], ascending=[False, True])
    
    print(f"\nFinal dataset contains {len(homebuilding)} rows from {len(all_dfs)} different months")
    print(f"Date range: {homebuilding['date'].min()} to {homebuilding['date'].max()}")
else:
    print("No data was successfully processed")
    homebuilding = pd.DataFrame()

No data available for 202501
No data available for 202412


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202411


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202410


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202409


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202408


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202407


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202406


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202405


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202404


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202403


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202402


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202401


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202312


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202311


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202310


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202309


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202308


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202307


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202306


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202305


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202304


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202303
Error processing 202302: HTTP Error 429: Too Many Requests
No data available for 202301
Error processing 202212: HTTP Error 429: Too Many Requests
Error processing 202211: HTTP Error 429: Too Many Requests
No data available for 202210
No data available for 202209
No data available for 202208
No data available for 202207
No data available for 202206
No data available for 202205
No data available for 202204
No data available for 202203
No data available for 202202
No data available for 202201
No data available for 202112
No data available for 202111
No data available for 202110
No data available for 202109
No data available for 202108
No data available for 202107
No data available for 202106
No data available for 202105
No data available for 202104
No data available for 202103
No data available for 202102
No data available for 202101
No data available for 202012
No data available for 202011
No data available for 202010
No data available for 202009
N

In [6]:
# homebuilding['date'] = pd.to_datetime(homebuilding['date'])
homebuilding.dtypes

CSA                                                  float64
CBSA                                                 float64
Name                                                  object
Total                                                float64
1 Unit                                               float64
2 Units                                              float64
3 and 4 Units                                        float64
5 Units or More                                      float64
Num of Structures With 5 Units or More               float64
Total_ytd                                            float64
1 Unit_ytd                                           float64
2 Units_ytd                                          float64
3 and 4 Units_ytd                                    float64
5 Units or More_ytd                                  float64
Num of Structures With 5 Units or More_ytd           float64
date                                          datetime64[ns]
dtype: object

In [7]:
homebuilding.columns

Index(['CSA', 'CBSA', 'Name', 'Total', '1 Unit', '2 Units', '3 and 4 Units',
       '5 Units or More', 'Num of Structures With 5 Units or More',
       'Total_ytd', '1 Unit_ytd', '2 Units_ytd', '3 and 4 Units_ytd',
       '5 Units or More_ytd', 'Num of Structures With 5 Units or More_ytd',
       'date'],
      dtype='object')

In [8]:
# create a 'multi_total' column that sums the columns for different types of multi-unit structures
homebuilding['multi_total'] = homebuilding['1 Unit'] + homebuilding['2 Units'] + homebuilding['3 and 4 Units'] + homebuilding['5 Units or More'] + homebuilding['Num of Structures With 5 Units or More']

# create a 'multi_total_ytd' column that sums the columns for different types of multi-unit structures
homebuilding['multi_total_ytd'] = homebuilding['1 Unit_ytd'] + homebuilding['2 Units_ytd'] + homebuilding['3 and 4 Units_ytd'] + homebuilding['5 Units or More_ytd'] + homebuilding['Num of Structures With 5 Units or More_ytd']

In [9]:
homebuilding

Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 and 4 Units,5 Units or More,Num of Structures With 5 Units or More,Total_ytd,1 Unit_ytd,2 Units_ytd,3 and 4 Units_ytd,5 Units or More_ytd,Num of Structures With 5 Units or More_ytd,date,multi_total,multi_total_ytd
0,999.0,10100.0,"Aberdeen, SD",7.0,3.0,4.0,0.0,0.0,0.0,87.0,71.0,12.0,4.0,0.0,0.0,2024-11-01,7.0,87.0
1,999.0,10140.0,"Aberdeen, WA",13.0,13.0,0.0,0.0,0.0,0.0,262.0,225.0,8.0,12.0,17.0,2.0,2024-11-01,13.0,264.0
2,101.0,10180.0,"Abilene, TX",21.0,17.0,4.0,0.0,0.0,0.0,442.0,346.0,96.0,0.0,0.0,0.0,2024-11-01,21.0,442.0
3,999.0,10220.0,"Ada, OK",0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2024-11-01,0.0,2.0
4,220.0,10300.0,"Adrian, MI",10.0,10.0,0.0,0.0,0.0,0.0,128.0,128.0,0.0,0.0,0.0,0.0,2024-11-01,10.0,128.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13966,999.0,49420.0,"Yakima, WA",64.0,28.0,6.0,30.0,0.0,0.0,131.0,80.0,10.0,41.0,0.0,0.0,2023-03-01,64.0,131.0
13967,276.0,49620.0,"York-Hanover, PA",145.0,68.0,0.0,0.0,77.0,5.0,318.0,210.0,0.0,4.0,104.0,9.0,2023-03-01,150.0,327.0
13968,566.0,49660.0,"Youngstown-Warren-Boardman, OH-PA",28.0,28.0,0.0,0.0,0.0,0.0,67.0,63.0,0.0,4.0,0.0,0.0,2023-03-01,28.0,67.0
13969,472.0,49700.0,"Yuba City, CA",71.0,60.0,0.0,0.0,11.0,2.0,141.0,113.0,0.0,0.0,28.0,5.0,2023-03-01,73.0,146.0


In [None]:
# rename Name to name, Total to total, and Total_ytd to total_ytd
homebuilding.rename(
    columns={
        'Name': 'name',
        'Total': 'total',
        'Total_ytd': 'total_ytd'
    },
    inplace=True
)

homebuilding = homebuilding[['date', 'name', 'total', 'total_ytd', 'multi_total', 'multi_total_ytd']]

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
0,2024-11-01,"Aberdeen, SD",7.0,87.0,7.0,87.0
1,2024-11-01,"Aberdeen, WA",13.0,262.0,13.0,264.0
2,2024-11-01,"Abilene, TX",21.0,442.0,21.0,442.0
3,2024-11-01,"Ada, OK",0.0,2.0,0.0,2.0
4,2024-11-01,"Adrian, MI",10.0,128.0,10.0,128.0
...,...,...,...,...,...,...
13966,2023-03-01,"Yakima, WA",64.0,131.0,64.0,131.0
13967,2023-03-01,"York-Hanover, PA",145.0,318.0,150.0,327.0
13968,2023-03-01,"Youngstown-Warren-Boardman, OH-PA",28.0,67.0,28.0,67.0
13969,2023-03-01,"Yuba City, CA",71.0,141.0,73.0,146.0


In [11]:
# # unify the names of the metro areas for new york city
# # replace all instances of 'New York-Newark-Jersey City, NY-NJ-PA' with 'New York-Newark-Jersey City, NY-NJ' 
# homebuilding['name'] = homebuilding['name'].str.replace('New York-Newark-Jersey City, NY-NJ-PA', 'New York-Newark-Jersey City, NY-NJ')

In [None]:
# sort by date with earliest date first
homebuilding = homebuilding.sort_values('date')



homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
13970,2023-03-01,"Yuma, AZ",19.0,160.0,19.0,160.0
13708,2023-03-01,"Florence, SC",107.0,196.0,107.0,196.0
13709,2023-03-01,"Florence-Muscle Shoals, AL",26.0,61.0,26.0,61.0
13710,2023-03-01,"Fond du Lac, WI",19.0,41.0,20.0,44.0
13711,2023-03-01,"Fort Collins, CO",305.0,871.0,310.0,893.0
...,...,...,...,...,...,...
608,2024-11-01,"Oak Harbor, WA",23.0,383.0,23.0,390.0
607,2024-11-01,"Norwich-New London-Willimantic, CT",12.0,470.0,12.0,479.0
606,2024-11-01,"Norwalk, OH",2.0,37.0,2.0,37.0
604,2024-11-01,"North Port-Bradenton-Sarasota, FL",693.0,13863.0,694.0,13940.0


In [None]:
# Change the name column so 'Austin-Round Rock-Georgetown, TX' becomes 'Austin, TX'
# Delete every character between the first '-' and the ','
homebuilding['name'] = homebuilding['name'].str.replace(r'-.*?,', ',')
homebuilding

  homebuilding['name'] = homebuilding['name'].str.replace(r'-.*?,', ',')


Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
13970,2023-03-01,"Yuma, AZ",19.0,160.0,19.0,160.0
13708,2023-03-01,"Florence, SC",107.0,196.0,107.0,196.0
13709,2023-03-01,"Florence, AL",26.0,61.0,26.0,61.0
13710,2023-03-01,"Fond du Lac, WI",19.0,41.0,20.0,44.0
13711,2023-03-01,"Fort Collins, CO",305.0,871.0,310.0,893.0
...,...,...,...,...,...,...
608,2024-11-01,"Oak Harbor, WA",23.0,383.0,23.0,390.0
607,2024-11-01,"Norwich, CT",12.0,470.0,12.0,479.0
606,2024-11-01,"Norwalk, OH",2.0,37.0,2.0,37.0
604,2024-11-01,"North Port, FL",693.0,13863.0,694.0,13940.0


In [None]:
# change the name of metro areas to match the names in the metro_areas.json file
homebuilding['name'] = homebuilding['name'].str.replace(r'-.*', '')
homebuilding

  homebuilding['name'] = homebuilding['name'].str.replace(r'-.*', '')


Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
13970,2023-03-01,"Yuma, AZ",19.0,160.0,19.0,160.0
13708,2023-03-01,"Florence, SC",107.0,196.0,107.0,196.0
13709,2023-03-01,"Florence, AL",26.0,61.0,26.0,61.0
13710,2023-03-01,"Fond du Lac, WI",19.0,41.0,20.0,44.0
13711,2023-03-01,"Fort Collins, CO",305.0,871.0,310.0,893.0
...,...,...,...,...,...,...
608,2024-11-01,"Oak Harbor, WA",23.0,383.0,23.0,390.0
607,2024-11-01,"Norwich, CT",12.0,470.0,12.0,479.0
606,2024-11-01,"Norwalk, OH",2.0,37.0,2.0,37.0
604,2024-11-01,"North Port, FL",693.0,13863.0,694.0,13940.0


In [15]:
# Ensure the DataFrame is sorted by 'name' and 'date'
homebuilding = homebuilding.sort_values(by=['name', 'date'])

# Create a 12-month running total column
homebuilding['rt'] = (
    homebuilding.groupby('name')['total']
    .rolling(window=12, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
)

# Create a 12-month running total column for multi-unit structures
homebuilding['multi_rt'] = (
    homebuilding.groupby('name')['multi_total']
    .rolling(window=12, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
)

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt
9210,2024-01-01,"Aberdeen, SD",2.0,2.0,2.0,2.0,2.0,2.0
8289,2024-02-01,"Aberdeen, SD",2.0,4.0,2.0,4.0,4.0,4.0
7368,2024-03-01,"Aberdeen, SD",4.0,8.0,4.0,8.0,8.0,8.0
6447,2024-04-01,"Aberdeen, SD",15.0,23.0,15.0,23.0,23.0,23.0
5526,2024-05-01,"Aberdeen, SD",1.0,24.0,1.0,24.0,24.0,24.0
...,...,...,...,...,...,...,...,...
4604,2024-07-01,"Zanesville, OH",14.0,67.0,15.0,71.0,67.0,71.0
3683,2024-08-01,"Zanesville, OH",17.0,82.0,18.0,87.0,84.0,89.0
2762,2024-09-01,"Zanesville, OH",15.0,97.0,16.0,103.0,99.0,105.0
1841,2024-10-01,"Zanesville, OH",16.0,113.0,17.0,120.0,115.0,122.0


In [None]:
# show the df sorted by multi_total_12_month_running in the max date
homebuilding[homebuilding['date'] == homebuilding['date'].max()].sort_values('multi_rt', ascending=False)

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt
206,2024-11-01,"Dallas, TX",4322.0,66043.0,4353.0,66583.0,67954.0,68489.0
380,2024-11-01,"Houston, TX",5099.0,61141.0,5131.0,61460.0,64612.0,64912.0
596,2024-11-01,"New York, NY",4216.0,52213.0,4309.0,53306.0,56155.0,57473.0
653,2024-11-01,"Phoenix, AZ",3654.0,42476.0,3719.0,42819.0,45772.0,46169.0
48,2024-11-01,"Atlanta, GA",2517.0,37058.0,2564.0,37857.0,37694.0,38485.0
...,...,...,...,...,...,...,...,...
314,2024-11-01,"Gallup, NM",0.0,1.0,0.0,1.0,1.0,1.0
459,2024-11-01,"Las Vegas, NM",0.0,0.0,0.0,0.0,0.0,0.0
496,2024-11-01,"Macomb, IL",0.0,0.0,0.0,0.0,0.0,0.0
702,2024-11-01,"Rio Grande City, TX",0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# population

# read in the population data
metros = pd.read_csv('data/zori_metro_long_clean.csv')

# change 'date' to datetime
metros['date'] = pd.to_datetime(metros['date'])

metros

Unnamed: 0,name,date,zori,lat,lng,pop_2023
0,"New York, NY",2015-02-01,2255.133793,40.6943,-73.9249,19498249.0
1,"Los Angeles, CA",2015-02-01,1814.136486,34.1141,-118.4068,12799100.0
2,"Chicago, IL",2015-02-01,1356.915213,41.8375,-87.6866,9262825.0
3,"Dallas, TX",2015-02-01,1090.919667,32.7935,-96.7667,8100037.0
4,"Houston, TX",2015-02-01,1229.410303,29.7860,-95.3885,7510253.0
...,...,...,...,...,...,...
5945,"Salt Lake City, UT",2024-12-01,1650.452457,40.7776,-111.9311,1267864.0
5946,"Hartford, CT",2024-12-01,1874.958152,41.7661,-72.6834,1151543.0
5947,"Buffalo, NY",2024-12-01,1344.539267,42.9018,-78.8487,1155604.0
5948,"Birmingham, AL",2024-12-01,1365.603146,33.5279,-86.7971,1184290.0


In [None]:
# merge the population data with the homebuilding data
# merge on 'name' and 'date'
# only include rows where both 'name' and 'date' are in both dataframes
homebuilding_zori = pd.merge(
    homebuilding,
    metros,
    on=['name', 'date'],
    how='inner'
)

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023
0,2023-03-01,"Atlanta, GA",3248.0,10245.0,3281.0,10362.0,3248.0,3281.0,1852.196831,33.7628,-84.4220,6307261.0
1,2023-04-01,"Atlanta, GA",3018.0,13175.0,3030.0,13300.0,6266.0,6311.0,1862.150304,33.7628,-84.4220,6307261.0
2,2023-05-01,"Atlanta, GA",4461.0,17621.0,4498.0,17782.0,10727.0,10809.0,1869.863596,33.7628,-84.4220,6307261.0
3,2023-06-01,"Atlanta, GA",3429.0,21029.0,3454.0,21214.0,14156.0,14263.0,1875.865516,33.7628,-84.4220,6307261.0
4,2023-07-01,"Atlanta, GA",3524.0,24417.0,3547.0,24623.0,17680.0,17810.0,1876.158151,33.7628,-84.4220,6307261.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1045,2024-07-01,"Washington, DC",1983.0,13587.0,2006.0,13728.0,22572.0,22822.0,2365.897539,38.9047,-77.0163,6304975.0
1046,2024-08-01,"Washington, DC",2137.0,15660.0,2159.0,15822.0,22460.0,22694.0,2381.806738,38.9047,-77.0163,6304975.0
1047,2024-09-01,"Washington, DC",900.0,16477.0,910.0,16648.0,21885.0,22115.0,2390.051806,38.9047,-77.0163,6304975.0
1048,2024-10-01,"Washington, DC",1801.0,19004.0,1813.0,19185.0,21838.0,22060.0,2389.848641,38.9047,-77.0163,6304975.0


In [19]:
homebuilding_zori.columns

Index(['date', 'name', 'total', 'total_ytd', 'multi_total', 'multi_total_ytd',
       'rt', 'multi_rt', 'zori', 'lat', 'lng', 'pop_2023'],
      dtype='object')

In [20]:
# create 'total_per_capita' and 'total_ytd_per_capita' columns that divide 'total' and 'total_ytd' by 'pop_2023'
homebuilding_zori['total_pc'] = homebuilding_zori['total'] / homebuilding_zori['pop_2023']
homebuilding_zori['total_ytd_pc'] = homebuilding_zori['total_ytd'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_total_pc'] = homebuilding_zori['multi_total'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_total_ytd_pc'] = homebuilding_zori['multi_total_ytd'] / homebuilding_zori['pop_2023']
homebuilding_zori['rt_pc'] = homebuilding_zori['rt'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_rt_pc'] = homebuilding_zori['multi_rt'] / homebuilding_zori['pop_2023']


# change per capita columns to per 1000
for col in homebuilding_zori.columns:
    if 'pc' in col:
        homebuilding_zori[col] = homebuilding_zori[col] * 1000

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc
0,2023-03-01,"Atlanta, GA",3248.0,10245.0,3281.0,10362.0,3248.0,3281.0,1852.196831,33.7628,-84.4220,6307261.0,0.514962,1.624318,0.520194,1.642868,0.514962,0.520194
1,2023-04-01,"Atlanta, GA",3018.0,13175.0,3030.0,13300.0,6266.0,6311.0,1862.150304,33.7628,-84.4220,6307261.0,0.478496,2.088862,0.480399,2.108681,0.993458,1.000593
2,2023-05-01,"Atlanta, GA",4461.0,17621.0,4498.0,17782.0,10727.0,10809.0,1869.863596,33.7628,-84.4220,6307261.0,0.707280,2.793764,0.713146,2.819290,1.700738,1.713739
3,2023-06-01,"Atlanta, GA",3429.0,21029.0,3454.0,21214.0,14156.0,14263.0,1875.865516,33.7628,-84.4220,6307261.0,0.543659,3.334094,0.547623,3.363425,2.244397,2.261362
4,2023-07-01,"Atlanta, GA",3524.0,24417.0,3547.0,24623.0,17680.0,17810.0,1876.158151,33.7628,-84.4220,6307261.0,0.558721,3.871253,0.562368,3.903913,2.803119,2.823730
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,2024-07-01,"Washington, DC",1983.0,13587.0,2006.0,13728.0,22572.0,22822.0,2365.897539,38.9047,-77.0163,6304975.0,0.314514,2.154965,0.318161,2.177328,3.580030,3.619681
1046,2024-08-01,"Washington, DC",2137.0,15660.0,2159.0,15822.0,22460.0,22694.0,2381.806738,38.9047,-77.0163,6304975.0,0.338939,2.483753,0.342428,2.509447,3.562266,3.599380
1047,2024-09-01,"Washington, DC",900.0,16477.0,910.0,16648.0,21885.0,22115.0,2390.051806,38.9047,-77.0163,6304975.0,0.142744,2.613333,0.144330,2.640455,3.471068,3.507548
1048,2024-10-01,"Washington, DC",1801.0,19004.0,1813.0,19185.0,21838.0,22060.0,2389.848641,38.9047,-77.0163,6304975.0,0.285647,3.014128,0.287551,3.042835,3.463614,3.498824


In [21]:
# create 'zori_yoy' column that calculates the year-over-year change in 'zori' for each metro area
homebuilding_zori['zori_yoy'] = homebuilding_zori.groupby('name')['zori'].pct_change(12)
homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy
0,2023-03-01,"Atlanta, GA",3248.0,10245.0,3281.0,10362.0,3248.0,3281.0,1852.196831,33.7628,-84.4220,6307261.0,0.514962,1.624318,0.520194,1.642868,0.514962,0.520194,
1,2023-04-01,"Atlanta, GA",3018.0,13175.0,3030.0,13300.0,6266.0,6311.0,1862.150304,33.7628,-84.4220,6307261.0,0.478496,2.088862,0.480399,2.108681,0.993458,1.000593,
2,2023-05-01,"Atlanta, GA",4461.0,17621.0,4498.0,17782.0,10727.0,10809.0,1869.863596,33.7628,-84.4220,6307261.0,0.707280,2.793764,0.713146,2.819290,1.700738,1.713739,
3,2023-06-01,"Atlanta, GA",3429.0,21029.0,3454.0,21214.0,14156.0,14263.0,1875.865516,33.7628,-84.4220,6307261.0,0.543659,3.334094,0.547623,3.363425,2.244397,2.261362,
4,2023-07-01,"Atlanta, GA",3524.0,24417.0,3547.0,24623.0,17680.0,17810.0,1876.158151,33.7628,-84.4220,6307261.0,0.558721,3.871253,0.562368,3.903913,2.803119,2.823730,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,2024-07-01,"Washington, DC",1983.0,13587.0,2006.0,13728.0,22572.0,22822.0,2365.897539,38.9047,-77.0163,6304975.0,0.314514,2.154965,0.318161,2.177328,3.580030,3.619681,0.050730
1046,2024-08-01,"Washington, DC",2137.0,15660.0,2159.0,15822.0,22460.0,22694.0,2381.806738,38.9047,-77.0163,6304975.0,0.338939,2.483753,0.342428,2.509447,3.562266,3.599380,0.050124
1047,2024-09-01,"Washington, DC",900.0,16477.0,910.0,16648.0,21885.0,22115.0,2390.051806,38.9047,-77.0163,6304975.0,0.142744,2.613333,0.144330,2.640455,3.471068,3.507548,0.049168
1048,2024-10-01,"Washington, DC",1801.0,19004.0,1813.0,19185.0,21838.0,22060.0,2389.848641,38.9047,-77.0163,6304975.0,0.285647,3.014128,0.287551,3.042835,3.463614,3.498824,0.047615


In [22]:
# sort by multi_rt_pc in the max date and filter to this date
homebuilding_zori[homebuilding_zori['date'] == homebuilding_zori['date'].max()].sort_values('rt_pc', ascending=False)

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy
41,2024-11-01,"Austin, TX",3059.0,30193.0,3082.0,30494.0,33568.0,33846.0,1708.494615,30.3005,-97.7522,2473275.0,1.236822,12.2077,1.246121,12.329401,13.572288,13.684689,-0.032811
776,2024-11-01,"Raleigh, NC",879.0,18087.0,881.0,18173.0,20304.0,20418.0,1745.296204,35.8324,-78.6429,1509231.0,0.582416,11.984249,0.583741,12.041232,13.453209,13.528744,0.003932
587,2024-11-01,"Nashville, TN",1633.0,18699.0,1639.0,18793.0,19830.0,19941.0,1866.11355,36.1715,-86.7842,2102573.0,0.776667,8.893389,0.779521,8.938096,9.431302,9.484094,0.015025
146,2024-11-01,"Charlotte, NC",1786.0,23814.0,1801.0,23996.0,25936.0,26132.0,1775.160145,35.2083,-80.8303,2805115.0,0.636694,8.489492,0.642041,8.554373,9.245967,9.315839,0.013154
398,2024-11-01,"Jacksonville, FL",785.0,13943.0,785.0,14015.0,15766.0,15863.0,1709.775642,30.3322,-81.6749,1713240.0,0.458196,8.138381,0.458196,8.180407,9.202447,9.259065,0.015999
692,2024-11-01,"Phoenix, AZ",3654.0,42476.0,3719.0,42819.0,45772.0,46169.0,1825.953865,33.5722,-112.0892,5070110.0,0.720694,8.377728,0.733515,8.445379,9.027812,9.106114,0.008782
356,2024-11-01,"Houston, TX",5099.0,61141.0,5131.0,61460.0,64612.0,64912.0,1696.645183,29.786,-95.3885,7510253.0,0.678939,8.141004,0.683199,8.183479,8.603172,8.643118,0.027611
251,2024-11-01,"Dallas, TX",4322.0,66043.0,4353.0,66583.0,67954.0,68489.0,1743.874174,32.7935,-96.7667,8100037.0,0.533578,8.15342,0.537405,8.220086,8.389344,8.455393,0.002903
650,2024-11-01,"Orlando, FL",1388.0,21421.0,1392.0,21597.0,22901.0,23078.0,2010.439552,28.4773,-81.337,2817933.0,0.49256,7.601671,0.493979,7.664128,8.126879,8.189691,0.013152
1007,2024-11-01,"Tampa, FL",921.0,20057.0,923.0,20357.0,22665.0,22987.0,2049.558445,27.9945,-82.4447,3342963.0,0.275504,5.999767,0.276102,6.089508,6.779914,6.876235,0.018009


In [23]:

with open('data/top_metros.json') as f:
    top_metros = json.load(f)

# filter to only the top metros
homebuilding_zori = homebuilding_zori[homebuilding_zori['name'].isin(top_metros)]

In [24]:
homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy
0,2023-03-01,"Atlanta, GA",3248.0,10245.0,3281.0,10362.0,3248.0,3281.0,1852.196831,33.7628,-84.4220,6307261.0,0.514962,1.624318,0.520194,1.642868,0.514962,0.520194,
1,2023-04-01,"Atlanta, GA",3018.0,13175.0,3030.0,13300.0,6266.0,6311.0,1862.150304,33.7628,-84.4220,6307261.0,0.478496,2.088862,0.480399,2.108681,0.993458,1.000593,
2,2023-05-01,"Atlanta, GA",4461.0,17621.0,4498.0,17782.0,10727.0,10809.0,1869.863596,33.7628,-84.4220,6307261.0,0.707280,2.793764,0.713146,2.819290,1.700738,1.713739,
3,2023-06-01,"Atlanta, GA",3429.0,21029.0,3454.0,21214.0,14156.0,14263.0,1875.865516,33.7628,-84.4220,6307261.0,0.543659,3.334094,0.547623,3.363425,2.244397,2.261362,
4,2023-07-01,"Atlanta, GA",3524.0,24417.0,3547.0,24623.0,17680.0,17810.0,1876.158151,33.7628,-84.4220,6307261.0,0.558721,3.871253,0.562368,3.903913,2.803119,2.823730,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,2024-07-01,"Washington, DC",1983.0,13587.0,2006.0,13728.0,22572.0,22822.0,2365.897539,38.9047,-77.0163,6304975.0,0.314514,2.154965,0.318161,2.177328,3.580030,3.619681,0.050730
1046,2024-08-01,"Washington, DC",2137.0,15660.0,2159.0,15822.0,22460.0,22694.0,2381.806738,38.9047,-77.0163,6304975.0,0.338939,2.483753,0.342428,2.509447,3.562266,3.599380,0.050124
1047,2024-09-01,"Washington, DC",900.0,16477.0,910.0,16648.0,21885.0,22115.0,2390.051806,38.9047,-77.0163,6304975.0,0.142744,2.613333,0.144330,2.640455,3.471068,3.507548,0.049168
1048,2024-10-01,"Washington, DC",1801.0,19004.0,1813.0,19185.0,21838.0,22060.0,2389.848641,38.9047,-77.0163,6304975.0,0.285647,3.014128,0.287551,3.042835,3.463614,3.498824,0.047615


In [25]:
# Create a 'state' column by extracting the state abbreviation from the 'name' column
homebuilding_zori['state'] = homebuilding_zori['name'].str.split(', ').str[-1]

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state
0,2023-03-01,"Atlanta, GA",3248.0,10245.0,3281.0,10362.0,3248.0,3281.0,1852.196831,33.7628,-84.4220,6307261.0,0.514962,1.624318,0.520194,1.642868,0.514962,0.520194,,GA
1,2023-04-01,"Atlanta, GA",3018.0,13175.0,3030.0,13300.0,6266.0,6311.0,1862.150304,33.7628,-84.4220,6307261.0,0.478496,2.088862,0.480399,2.108681,0.993458,1.000593,,GA
2,2023-05-01,"Atlanta, GA",4461.0,17621.0,4498.0,17782.0,10727.0,10809.0,1869.863596,33.7628,-84.4220,6307261.0,0.707280,2.793764,0.713146,2.819290,1.700738,1.713739,,GA
3,2023-06-01,"Atlanta, GA",3429.0,21029.0,3454.0,21214.0,14156.0,14263.0,1875.865516,33.7628,-84.4220,6307261.0,0.543659,3.334094,0.547623,3.363425,2.244397,2.261362,,GA
4,2023-07-01,"Atlanta, GA",3524.0,24417.0,3547.0,24623.0,17680.0,17810.0,1876.158151,33.7628,-84.4220,6307261.0,0.558721,3.871253,0.562368,3.903913,2.803119,2.823730,,GA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,2024-07-01,"Washington, DC",1983.0,13587.0,2006.0,13728.0,22572.0,22822.0,2365.897539,38.9047,-77.0163,6304975.0,0.314514,2.154965,0.318161,2.177328,3.580030,3.619681,0.050730,DC
1046,2024-08-01,"Washington, DC",2137.0,15660.0,2159.0,15822.0,22460.0,22694.0,2381.806738,38.9047,-77.0163,6304975.0,0.338939,2.483753,0.342428,2.509447,3.562266,3.599380,0.050124,DC
1047,2024-09-01,"Washington, DC",900.0,16477.0,910.0,16648.0,21885.0,22115.0,2390.051806,38.9047,-77.0163,6304975.0,0.142744,2.613333,0.144330,2.640455,3.471068,3.507548,0.049168,DC
1048,2024-10-01,"Washington, DC",1801.0,19004.0,1813.0,19185.0,21838.0,22060.0,2389.848641,38.9047,-77.0163,6304975.0,0.285647,3.014128,0.287551,3.042835,3.463614,3.498824,0.047615,DC


In [26]:
regions = pd.read_csv('data/regions.csv')

# merge
homebuilding_zori = pd.merge(
    homebuilding_zori,
    regions,
    on='state',
    how='left'
)

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,...,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state,region
0,2023-03-01,"Atlanta, GA",3248.0,10245.0,3281.0,10362.0,3248.0,3281.0,1852.196831,33.7628,...,6307261.0,0.514962,1.624318,0.520194,1.642868,0.514962,0.520194,,GA,Southeast
1,2023-04-01,"Atlanta, GA",3018.0,13175.0,3030.0,13300.0,6266.0,6311.0,1862.150304,33.7628,...,6307261.0,0.478496,2.088862,0.480399,2.108681,0.993458,1.000593,,GA,Southeast
2,2023-05-01,"Atlanta, GA",4461.0,17621.0,4498.0,17782.0,10727.0,10809.0,1869.863596,33.7628,...,6307261.0,0.707280,2.793764,0.713146,2.819290,1.700738,1.713739,,GA,Southeast
3,2023-06-01,"Atlanta, GA",3429.0,21029.0,3454.0,21214.0,14156.0,14263.0,1875.865516,33.7628,...,6307261.0,0.543659,3.334094,0.547623,3.363425,2.244397,2.261362,,GA,Southeast
4,2023-07-01,"Atlanta, GA",3524.0,24417.0,3547.0,24623.0,17680.0,17810.0,1876.158151,33.7628,...,6307261.0,0.558721,3.871253,0.562368,3.903913,2.803119,2.823730,,GA,Southeast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,2024-07-01,"Washington, DC",1983.0,13587.0,2006.0,13728.0,22572.0,22822.0,2365.897539,38.9047,...,6304975.0,0.314514,2.154965,0.318161,2.177328,3.580030,3.619681,0.050730,DC,Northest
1046,2024-08-01,"Washington, DC",2137.0,15660.0,2159.0,15822.0,22460.0,22694.0,2381.806738,38.9047,...,6304975.0,0.338939,2.483753,0.342428,2.509447,3.562266,3.599380,0.050124,DC,Northest
1047,2024-09-01,"Washington, DC",900.0,16477.0,910.0,16648.0,21885.0,22115.0,2390.051806,38.9047,...,6304975.0,0.142744,2.613333,0.144330,2.640455,3.471068,3.507548,0.049168,DC,Northest
1048,2024-10-01,"Washington, DC",1801.0,19004.0,1813.0,19185.0,21838.0,22060.0,2389.848641,38.9047,...,6304975.0,0.285647,3.014128,0.287551,3.042835,3.463614,3.498824,0.047615,DC,Northest


In [27]:
# save the final dataset to a csv file
# homebuilding_zori.to_csv('data/homebuilding_zori.csv', index=False)