In [28]:
import pandas as pd
from datetime import datetime, date
from dateutil.relativedelta import relativedelta
from typing import List

In [29]:
def generate_year_month_range(end_date: date = None, years_back: int = 6) -> List[str]:
    """
    Generate a list of year-month combinations in 'yyyymm' format,
    starting from the specified end date and going back a specified number of years.

    Args:
        end_date (date, optional): The end date to start from. Defaults to today's date.
        years_back (int, optional): Number of years to go back. Defaults to 6.

    Returns:
        List[str]: List of year-month combinations in 'yyyymm' format, sorted in descending order.

    Example:
        >>> generate_year_month_range()  # If today is 2024-11-14
        ['202411', '202410', '202409', ..., '201812']
    """
    # If no end date is provided, use today's date
    if end_date is None:
        end_date = date.today()

    # Calculate start date
    start_date = end_date - relativedelta(years=years_back)

    # Initialize result list
    date_list = []

    # Current date for iteration
    current_date = end_date

    # Generate dates until we reach start date
    while current_date >= start_date:
        # Format date as 'yyyymm'
        date_str = current_date.strftime('%Y%m')
        date_list.append(date_str)
        # Move to previous month
        current_date -= relativedelta(months=1)

    return date_list

# Example usage
if __name__ == "__main__":
    date_list = generate_year_month_range()
    print(f"Generated {len(date_list)} year-month combinations:")
    print(date_list[:12])  # Print first year as example

Generated 73 year-month combinations:
['202412', '202411', '202410', '202409', '202408', '202407', '202406', '202405', '202404', '202403', '202402', '202401']


In [30]:
import requests

In [31]:
# read in the metro_areas.json file as a list
import json
with open('data/metro_areas.json') as f:
    metro_areas = json.load(f)

metro_areas

['Abilene, TX',
 'Akron, OH',
 'Albany, OR',
 'Albany-Schenectady-Troy, NY',
 'Albuquerque, NM',
 'Allentown-Bethlehem-Easton, PA-NJ',
 'Altoona, PA',
 'Amarillo, TX',
 'Ames, IA',
 'Amherst Town-Northampton, MA',
 'Ann Arbor, MI',
 'Appleton, WI',
 'Asheville, NC',
 'Athens-Clarke County, GA',
 'Atlanta-Sandy Springs-Roswell, GA',
 'Atlantic City-Hammonton, NJ',
 'Auburn-Opelika, AL',
 'Baltimore-Columbia-Towson, MD',
 'Barnstable Town, MA',
 'Baton Rouge, LA',
 'Battle Creek, MI',
 'Bay City, MI',
 'Birmingham, AL',
 'Bloomington, IL',
 'Bloomington, IN',
 'Boise City, ID',
 'Boston-Cambridge-Newton, MA-NH',
 'Boulder, CO',
 'Bowling Green, KY',
 'Bremerton-Silverdale-Port Orchard, WA',
 'Bridgeport-Stamford-Danbury, CT',
 'Brownsville-Harlingen, TX',
 'Buffalo-Cheektowaga, NY',
 'Burlington, NC',
 'Burlington-South Burlington, VT',
 'Canton-Massillon, OH',
 'Cape Coral-Fort Myers, FL',
 'Cape Girardeau, MO-IL',
 'Carson City, NV',
 'Cedar Rapids, IA',
 'Chambersburg, PA',
 'Champaig

In [32]:
import pandas as pd
import requests
from datetime import datetime

def check_url_exists(url):
    """Check if a URL exists without downloading the full file"""
    try:
        response = requests.head(url)
        return response.status_code == 200
    except:
        return False

def process_2024_data(url):
    """Process data from 2024 onwards"""
    df = pd.read_excel(url, skiprows=7)
    
    # Clean columns
    df.drop(columns=['Metro /Micro Code', 'Unnamed: 10'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

def process_2022_2023_data(url):
    """Process data from 2022-2023"""
    df = pd.read_excel(url, skiprows=7)
    
    # Remove first row and clean columns
    df = df.iloc[1:]
    df.drop(columns=['Unnamed: 9'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

def process_pre_2022_data(url):
    """Process data from 2021 and earlier"""
    df = pd.read_excel(url, skiprows=7)
    
    # Remove first row and clean columns
    df = df.iloc[1:]
    df.drop(columns=['Monthly Coverage Percent*', 'Unnamed: 10'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

# Initialize an empty list to store all dataframes
all_dfs = []

# Process each date
for date_str in date_list:
    year = int(date_str[:4])
    
    try:
        # Determine URL format and processing function based on year
        if year >= 2024:
            url = f'https://www.census.gov/construction/bps/xls/cbsamonthly_{date_str}.xls'
            process_func = process_2024_data
        else:
            url = f'https://www.census.gov/construction/bps/xls/msamonthly_{date_str}.xls'
            if year >= 2022:
                process_func = process_2022_2023_data
            else:
                process_func = process_pre_2022_data
        
        # Check if URL exists
        if check_url_exists(url):
            try:
                df = process_func(url)
                all_dfs.append(df)
                print(f"Successfully processed data for {date_str}")
            except Exception as e:
                print(f"Error processing {date_str}: {str(e)}")
                continue
        else:
            print(f"No data available for {date_str}")
            continue
            
    except Exception as e:
        print(f"Error with {date_str}: {str(e)}")
        continue

# Concatenate all dataframes if we have any data
if all_dfs:
    homebuilding = pd.concat(all_dfs, ignore_index=True)
    
    # Convert date column to datetime
    homebuilding['date'] = pd.to_datetime(homebuilding['date'], format='%Y%m')
    
    # Sort by date and other relevant columns
    homebuilding = homebuilding.sort_values(['date', 'Name'], ascending=[False, True])
    
    print(f"\nFinal dataset contains {len(homebuilding)} rows from {len(all_dfs)} different months")
    print(f"Date range: {homebuilding['date'].min()} to {homebuilding['date'].max()}")
else:
    print("No data was successfully processed")
    homebuilding = pd.DataFrame()

No data available for 202412
No data available for 202411


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202410


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202409


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202408


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202407


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202406


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202405


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202404


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202403


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202402


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202401


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202312


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202311


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202310


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202309


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202308


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202307


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202306


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202305


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202304


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202303


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202302


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202301


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202212


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202211


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202210


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202209


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202208


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202207


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202206


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202205


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202204


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202203


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202202


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202201


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202112


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202111


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202110


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202109


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202108


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202107


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202106


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202105


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202104


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202103


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202102


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202101


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202012


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202011


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202010


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202009


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202008


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202007


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202006


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202005


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202004


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202003


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202002


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202001


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 201912


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 201911
No data available for 201910
No data available for 201909
No data available for 201908
No data available for 201907
No data available for 201906
No data available for 201905
No data available for 201904
No data available for 201903
No data available for 201902
No data available for 201901
No data available for 201812

Final dataset contains 27994 rows from 60 different months
Date range: 2019-11-01 00:00:00 to 2024-10-01 00:00:00


In [33]:
# homebuilding['date'] = pd.to_datetime(homebuilding['date'])
homebuilding.dtypes

CSA                                                  float64
CBSA                                                 float64
Name                                                  object
Total                                                float64
1 Unit                                               float64
2 Units                                              float64
3 and 4 Units                                        float64
5 Units or More                                      float64
Num of Structures With 5 Units or More               float64
Total_ytd                                            float64
1 Unit_ytd                                           float64
2 Units_ytd                                          float64
3 and 4 Units_ytd                                    float64
5 Units or More_ytd                                  float64
Num of Structures With 5 Units or More_ytd           float64
date                                          datetime64[ns]
dtype: object

In [34]:
homebuilding.columns

Index(['CSA', 'CBSA', 'Name', 'Total', '1 Unit', '2 Units', '3 and 4 Units',
       '5 Units or More', 'Num of Structures With 5 Units or More',
       'Total_ytd', '1 Unit_ytd', '2 Units_ytd', '3 and 4 Units_ytd',
       '5 Units or More_ytd', 'Num of Structures With 5 Units or More_ytd',
       'date'],
      dtype='object')

In [35]:
# create a 'multi_total' column that sums the columns for different types of multi-unit structures
homebuilding['multi_total'] = homebuilding['1 Unit'] + homebuilding['2 Units'] + homebuilding['3 and 4 Units'] + homebuilding['5 Units or More'] + homebuilding['Num of Structures With 5 Units or More']

# create a 'multi_total_ytd' column that sums the columns for different types of multi-unit structures
homebuilding['multi_total_ytd'] = homebuilding['1 Unit_ytd'] + homebuilding['2 Units_ytd'] + homebuilding['3 and 4 Units_ytd'] + homebuilding['5 Units or More_ytd'] + homebuilding['Num of Structures With 5 Units or More_ytd']

In [36]:
homebuilding

Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 and 4 Units,5 Units or More,Num of Structures With 5 Units or More,Total_ytd,1 Unit_ytd,2 Units_ytd,3 and 4 Units_ytd,5 Units or More_ytd,Num of Structures With 5 Units or More_ytd,date,multi_total,multi_total_ytd
0,999.0,10100.0,"Aberdeen, SD",15.0,11.0,4.0,0.0,0.0,0.0,80.0,68.0,8.0,4.0,0.0,0.0,2024-10-01,15.0,80.0
1,999.0,10140.0,"Aberdeen, WA",28.0,25.0,0.0,3.0,0.0,0.0,253.0,216.0,8.0,12.0,17.0,2.0,2024-10-01,28.0,255.0
2,101.0,10180.0,"Abilene, TX",48.0,28.0,20.0,0.0,0.0,0.0,422.0,330.0,92.0,0.0,0.0,0.0,2024-10-01,48.0,422.0
3,999.0,10220.0,"Ada, OK",1.0,1.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,2024-10-01,1.0,2.0
4,220.0,10300.0,"Adrian, MI",12.0,12.0,0.0,0.0,0.0,0.0,118.0,118.0,0.0,0.0,0.0,0.0,2024-10-01,12.0,118.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27989,148.0,49340.0,"Worcester, MA-CT",18.0,16.0,2.0,0.0,0.0,0.0,219.0,172.0,6.0,0.0,41.0,5.0,2019-11-01,18.0,224.0
27990,999.0,49420.0,"Yakima, WA",52.0,40.0,6.0,0.0,6.0,1.0,823.0,368.0,62.0,23.0,370.0,21.0,2019-11-01,53.0,844.0
27991,276.0,49620.0,"York-Hanover, PA",11.0,7.0,0.0,4.0,0.0,0.0,155.0,96.0,6.0,11.0,42.0,3.0,2019-11-01,11.0,158.0
27992,566.0,49660.0,"Youngstown-Warren-Boardman, OH-PA",13.0,13.0,0.0,0.0,0.0,0.0,200.0,193.0,4.0,3.0,0.0,0.0,2019-11-01,13.0,200.0


In [37]:
# rename Name to name, Total to total, and Total_ytd to total_ytd
homebuilding.rename(
    columns={
        'Name': 'name',
        'Total': 'total',
        'Total_ytd': 'total_ytd'
    },
    inplace=True
)

homebuilding = homebuilding[['date', 'name', 'total', 'total_ytd', 'multi_total', 'multi_total_ytd']]

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
0,2024-10-01,"Aberdeen, SD",15.0,80.0,15.0,80.0
1,2024-10-01,"Aberdeen, WA",28.0,253.0,28.0,255.0
2,2024-10-01,"Abilene, TX",48.0,422.0,48.0,422.0
3,2024-10-01,"Ada, OK",1.0,2.0,1.0,2.0
4,2024-10-01,"Adrian, MI",12.0,118.0,12.0,118.0
...,...,...,...,...,...,...
27989,2019-11-01,"Worcester, MA-CT",18.0,219.0,18.0,224.0
27990,2019-11-01,"Yakima, WA",52.0,823.0,53.0,844.0
27991,2019-11-01,"York-Hanover, PA",11.0,155.0,11.0,158.0
27992,2019-11-01,"Youngstown-Warren-Boardman, OH-PA",13.0,200.0,13.0,200.0


In [38]:
# # unify the names of the metro areas for new york city
# # replace all instances of 'New York-Newark-Jersey City, NY-NJ-PA' with 'New York-Newark-Jersey City, NY-NJ' 
# homebuilding['name'] = homebuilding['name'].str.replace('New York-Newark-Jersey City, NY-NJ-PA', 'New York-Newark-Jersey City, NY-NJ')

In [39]:
# sort by date with earliest date first
homebuilding = homebuilding.sort_values('date')



homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
27993,2019-11-01,"Yuma, AZ",80.0,1063.0,80.0,1063.0
27742,2019-11-01,"Fayetteville-Springdale-Rogers, AR-MO",495.0,6303.0,510.0,6400.0
27743,2019-11-01,"Flagstaff, AZ",48.0,650.0,49.0,655.0
27744,2019-11-01,"Flint, MI",18.0,371.0,18.0,372.0
27745,2019-11-01,"Florence, SC",36.0,664.0,37.0,682.0
...,...,...,...,...,...,...
608,2024-10-01,"Oak Harbor, WA",22.0,360.0,22.0,367.0
607,2024-10-01,"Norwich-New London-Willimantic, CT",12.0,446.0,12.0,455.0
606,2024-10-01,"Norwalk, OH",5.0,35.0,5.0,35.0
604,2024-10-01,"North Port-Bradenton-Sarasota, FL",1277.0,13171.0,1284.0,13247.0


In [40]:
# Change the name column so 'Austin-Round Rock-Georgetown, TX' becomes 'Austin, TX'
# Delete every character between the first '-' and the ','
homebuilding['name'] = homebuilding['name'].str.replace(r'-.*?,', ',')
homebuilding

  homebuilding['name'] = homebuilding['name'].str.replace(r'-.*?,', ',')


Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
27993,2019-11-01,"Yuma, AZ",80.0,1063.0,80.0,1063.0
27742,2019-11-01,"Fayetteville, AR-MO",495.0,6303.0,510.0,6400.0
27743,2019-11-01,"Flagstaff, AZ",48.0,650.0,49.0,655.0
27744,2019-11-01,"Flint, MI",18.0,371.0,18.0,372.0
27745,2019-11-01,"Florence, SC",36.0,664.0,37.0,682.0
...,...,...,...,...,...,...
608,2024-10-01,"Oak Harbor, WA",22.0,360.0,22.0,367.0
607,2024-10-01,"Norwich, CT",12.0,446.0,12.0,455.0
606,2024-10-01,"Norwalk, OH",5.0,35.0,5.0,35.0
604,2024-10-01,"North Port, FL",1277.0,13171.0,1284.0,13247.0


In [41]:
# change the name of metro areas to match the names in the metro_areas.json file
homebuilding['name'] = homebuilding['name'].str.replace(r'-.*', '')
homebuilding

  homebuilding['name'] = homebuilding['name'].str.replace(r'-.*', '')


Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
27993,2019-11-01,"Yuma, AZ",80.0,1063.0,80.0,1063.0
27742,2019-11-01,"Fayetteville, AR",495.0,6303.0,510.0,6400.0
27743,2019-11-01,"Flagstaff, AZ",48.0,650.0,49.0,655.0
27744,2019-11-01,"Flint, MI",18.0,371.0,18.0,372.0
27745,2019-11-01,"Florence, SC",36.0,664.0,37.0,682.0
...,...,...,...,...,...,...
608,2024-10-01,"Oak Harbor, WA",22.0,360.0,22.0,367.0
607,2024-10-01,"Norwich, CT",12.0,446.0,12.0,455.0
606,2024-10-01,"Norwalk, OH",5.0,35.0,5.0,35.0
604,2024-10-01,"North Port, FL",1277.0,13171.0,1284.0,13247.0


In [42]:
# Ensure the DataFrame is sorted by 'name' and 'date'
homebuilding = homebuilding.sort_values(by=['name', 'date'])

# Create a 12-month running total column
homebuilding['rt'] = (
    homebuilding.groupby('name')['total']
    .rolling(window=12, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
)

# Create a 12-month running total column for multi-unit structures
homebuilding['multi_rt'] = (
    homebuilding.groupby('name')['multi_total']
    .rolling(window=12, min_periods=1)
    .sum()
    .reset_index(level=0, drop=True)
)

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt
8289,2024-01-01,"Aberdeen, SD",2.0,2.0,2.0,2.0,2.0,2.0
7368,2024-02-01,"Aberdeen, SD",2.0,4.0,2.0,4.0,4.0,4.0
6447,2024-03-01,"Aberdeen, SD",4.0,8.0,4.0,8.0,8.0,8.0
5526,2024-04-01,"Aberdeen, SD",15.0,23.0,15.0,23.0,23.0,23.0
4605,2024-05-01,"Aberdeen, SD",1.0,24.0,1.0,24.0,24.0,24.0
...,...,...,...,...,...,...,...,...
4604,2024-06-01,"Zanesville, OH",19.0,53.0,20.0,56.0,53.0,56.0
3683,2024-07-01,"Zanesville, OH",14.0,67.0,15.0,71.0,67.0,71.0
2762,2024-08-01,"Zanesville, OH",17.0,82.0,18.0,87.0,84.0,89.0
1841,2024-09-01,"Zanesville, OH",15.0,97.0,16.0,103.0,99.0,105.0


In [43]:
# show the df sorted by multi_total_12_month_running in the max date
homebuilding[homebuilding['date'] == homebuilding['date'].max()].sort_values('multi_rt', ascending=False)

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt
206,2024-10-01,"Dallas, TX",5893.0,61723.0,5951.0,62233.0,68778.0,69311.0
380,2024-10-01,"Houston, TX",6757.0,55935.0,6791.0,56223.0,63568.0,63842.0
596,2024-10-01,"New York, NY",4168.0,48063.0,4290.0,49068.0,54042.0,55326.0
653,2024-10-01,"Phoenix, AZ",3701.0,38818.0,3729.0,39096.0,46554.0,46950.0
48,2024-10-01,"Atlanta, GA",2634.0,34555.0,2660.0,35306.0,37734.0,38529.0
...,...,...,...,...,...,...,...,...
314,2024-10-01,"Gallup, NM",0.0,1.0,0.0,1.0,1.0,1.0
496,2024-10-01,"Macomb, IL",0.0,0.0,0.0,0.0,0.0,0.0
459,2024-10-01,"Las Vegas, NM",0.0,0.0,0.0,0.0,0.0,0.0
312,2024-10-01,"Galesburg, IL",0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# population

# read in the population data
metros = pd.read_csv('data/zori_metro_long_clean.csv')

# change 'date' to datetime
metros['date'] = pd.to_datetime(metros['date'])

metros

Unnamed: 0,name,date,zori,lat,lng,pop_2023
0,"New York, NY",2015-02-01,2255.133793,40.6943,-73.9249,19498249.0
1,"Los Angeles, CA",2015-02-01,1814.136486,34.1141,-118.4068,12799100.0
2,"Chicago, IL",2015-02-01,1356.915213,41.8375,-87.6866,9262825.0
3,"Dallas, TX",2015-02-01,1090.919667,32.7935,-96.7667,8100037.0
4,"Houston, TX",2015-02-01,1229.410303,29.7860,-95.3885,7510253.0
...,...,...,...,...,...,...
74608,"Mitchell, SD",2024-12-01,737.500000,43.7294,-98.0337,
74609,"Wahpeton, ND",2024-12-01,766.500000,46.2722,-96.6118,
74610,"Jamestown, ND",2024-12-01,1013.611111,46.9063,-98.6937,
74611,"Portales, NM",2024-12-01,1015.158730,34.1754,-103.3565,


In [45]:
# merge the population data with the homebuilding data
# merge on 'name' and 'date'
# only include rows where both 'name' and 'date' are in both dataframes
homebuilding_zori = pd.merge(
    homebuilding,
    metros,
    on=['name', 'date'],
    how='inner'
)

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023
0,2024-01-01,"Aberdeen, SD",2.0,2.0,2.0,2.0,2.0,2.0,,45.4649,-98.4686,
1,2024-02-01,"Aberdeen, SD",2.0,4.0,2.0,4.0,4.0,4.0,,45.4649,-98.4686,
2,2024-03-01,"Aberdeen, SD",4.0,8.0,4.0,8.0,8.0,8.0,,45.4649,-98.4686,
3,2024-04-01,"Aberdeen, SD",15.0,23.0,15.0,23.0,23.0,23.0,,45.4649,-98.4686,
4,2024-05-01,"Aberdeen, SD",1.0,24.0,1.0,24.0,24.0,24.0,,45.4649,-98.4686,
...,...,...,...,...,...,...,...,...,...,...,...,...
24705,2024-06-01,"Zanesville, OH",19.0,53.0,20.0,56.0,53.0,56.0,959.566063,39.9565,-82.0132,
24706,2024-07-01,"Zanesville, OH",14.0,67.0,15.0,71.0,67.0,71.0,964.356198,39.9565,-82.0132,
24707,2024-08-01,"Zanesville, OH",17.0,82.0,18.0,87.0,84.0,89.0,968.839603,39.9565,-82.0132,
24708,2024-09-01,"Zanesville, OH",15.0,97.0,16.0,103.0,99.0,105.0,979.434252,39.9565,-82.0132,


In [46]:
homebuilding_zori.columns

Index(['date', 'name', 'total', 'total_ytd', 'multi_total', 'multi_total_ytd',
       'rt', 'multi_rt', 'zori', 'lat', 'lng', 'pop_2023'],
      dtype='object')

In [48]:
# create 'total_per_capita' and 'total_ytd_per_capita' columns that divide 'total' and 'total_ytd' by 'pop_2023'
homebuilding_zori['total_pc'] = homebuilding_zori['total'] / homebuilding_zori['pop_2023']
homebuilding_zori['total_ytd_pc'] = homebuilding_zori['total_ytd'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_total_pc'] = homebuilding_zori['multi_total'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_total_ytd_pc'] = homebuilding_zori['multi_total_ytd'] / homebuilding_zori['pop_2023']
homebuilding_zori['rt_pc'] = homebuilding_zori['rt'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_rt_pc'] = homebuilding_zori['multi_rt'] / homebuilding_zori['pop_2023']


# change per capita columns to per 1000
for col in homebuilding_zori.columns:
    if 'pc' in col:
        homebuilding_zori[col] = homebuilding_zori[col] * 1000

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc
0,2024-01-01,"Aberdeen, SD",2.0,2.0,2.0,2.0,2.0,2.0,,45.4649,-98.4686,,,,,,,
1,2024-02-01,"Aberdeen, SD",2.0,4.0,2.0,4.0,4.0,4.0,,45.4649,-98.4686,,,,,,,
2,2024-03-01,"Aberdeen, SD",4.0,8.0,4.0,8.0,8.0,8.0,,45.4649,-98.4686,,,,,,,
3,2024-04-01,"Aberdeen, SD",15.0,23.0,15.0,23.0,23.0,23.0,,45.4649,-98.4686,,,,,,,
4,2024-05-01,"Aberdeen, SD",1.0,24.0,1.0,24.0,24.0,24.0,,45.4649,-98.4686,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24705,2024-06-01,"Zanesville, OH",19.0,53.0,20.0,56.0,53.0,56.0,959.566063,39.9565,-82.0132,,,,,,,
24706,2024-07-01,"Zanesville, OH",14.0,67.0,15.0,71.0,67.0,71.0,964.356198,39.9565,-82.0132,,,,,,,
24707,2024-08-01,"Zanesville, OH",17.0,82.0,18.0,87.0,84.0,89.0,968.839603,39.9565,-82.0132,,,,,,,
24708,2024-09-01,"Zanesville, OH",15.0,97.0,16.0,103.0,99.0,105.0,979.434252,39.9565,-82.0132,,,,,,,


In [49]:
# create 'zori_yoy' column that calculates the year-over-year change in 'zori' for each metro area
homebuilding_zori['zori_yoy'] = homebuilding_zori.groupby('name')['zori'].pct_change(12)
homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy
0,2024-01-01,"Aberdeen, SD",2.0,2.0,2.0,2.0,2.0,2.0,,45.4649,-98.4686,,,,,,,,
1,2024-02-01,"Aberdeen, SD",2.0,4.0,2.0,4.0,4.0,4.0,,45.4649,-98.4686,,,,,,,,
2,2024-03-01,"Aberdeen, SD",4.0,8.0,4.0,8.0,8.0,8.0,,45.4649,-98.4686,,,,,,,,
3,2024-04-01,"Aberdeen, SD",15.0,23.0,15.0,23.0,23.0,23.0,,45.4649,-98.4686,,,,,,,,
4,2024-05-01,"Aberdeen, SD",1.0,24.0,1.0,24.0,24.0,24.0,,45.4649,-98.4686,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24705,2024-06-01,"Zanesville, OH",19.0,53.0,20.0,56.0,53.0,56.0,959.566063,39.9565,-82.0132,,,,,,,,
24706,2024-07-01,"Zanesville, OH",14.0,67.0,15.0,71.0,67.0,71.0,964.356198,39.9565,-82.0132,,,,,,,,
24707,2024-08-01,"Zanesville, OH",17.0,82.0,18.0,87.0,84.0,89.0,968.839603,39.9565,-82.0132,,,,,,,,
24708,2024-09-01,"Zanesville, OH",15.0,97.0,16.0,103.0,99.0,105.0,979.434252,39.9565,-82.0132,,,,,,,,


In [51]:
homebuilding_zori[homebuilding_zori['name'] == 'Austin, TX']

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy
1440,2019-11-01,"Austin, TX",1967.0,29479.0,1972.0,29815.0,1967.0,1972.0,1426.857721,30.3005,-97.7522,2473275.0,0.795302,11.919014,0.797323,12.054867,0.795302,0.797323,
1441,2019-12-01,"Austin, TX",2607.0,32025.0,2643.0,32394.0,4574.0,4615.0,1423.209683,30.3005,-97.7522,2473275.0,1.054068,12.948419,1.068624,13.097613,1.84937,1.865947,
1442,2020-01-01,"Austin, TX",4393.0,4393.0,4457.0,4457.0,8967.0,9072.0,1424.692204,30.3005,-97.7522,2473275.0,1.776187,1.776187,1.802064,1.802064,3.625557,3.668011,
1443,2020-02-01,"Austin, TX",3446.0,7429.0,3487.0,7524.0,12413.0,12559.0,1427.926264,30.3005,-97.7522,2473275.0,1.393294,3.00371,1.409872,3.04212,5.018852,5.077883,
1444,2020-03-01,"Austin, TX",2700.0,10546.0,2714.0,10654.0,15113.0,15273.0,1434.921945,30.3005,-97.7522,2473275.0,1.09167,4.263982,1.09733,4.307649,6.110521,6.175213,
1445,2020-04-01,"Austin, TX",3309.0,13875.0,3341.0,14015.0,18422.0,18614.0,1442.682331,30.3005,-97.7522,2473275.0,1.337902,5.609971,1.35084,5.666576,7.448424,7.526054,
1446,2020-05-01,"Austin, TX",2935.0,16782.0,2967.0,16953.0,21357.0,21581.0,1440.253657,30.3005,-97.7522,2473275.0,1.186686,6.785335,1.199624,6.854474,8.635109,8.725677,
1447,2020-06-01,"Austin, TX",2254.0,19052.0,2268.0,19237.0,23611.0,23849.0,1431.306248,30.3005,-97.7522,2473275.0,0.911342,7.703147,0.917003,7.777946,9.546452,9.64268,
1448,2020-07-01,"Austin, TX",3979.0,23073.0,4033.0,23312.0,27590.0,27882.0,1424.48081,30.3005,-97.7522,2473275.0,1.608798,9.328926,1.630631,9.425559,11.15525,11.273312,
1449,2020-08-01,"Austin, TX",3253.0,26325.0,3288.0,26599.0,30843.0,31170.0,1427.103889,30.3005,-97.7522,2473275.0,1.31526,10.643782,1.329411,10.754566,12.47051,12.602723,


In [52]:
# sort by multi_rt_pc in the max date and filter to this date
homebuilding_zori[homebuilding_zori['date'] == homebuilding_zori['date'].max()].sort_values('multi_rt_pc', ascending=False)

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,lng,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy
18475,2024-10-01,"Raleigh, NC",1454.0,17208.0,1460.0,17292.0,21084.0,21218.0,1755.977429,35.8324,-78.6429,1509231.0,0.963405,11.401833,0.967380,11.457491,13.970028,14.058815,0.001350
1499,2024-10-01,"Austin, TX",2696.0,27343.0,2715.0,27603.0,32893.0,33176.0,1725.891345,30.3005,-97.7522,2473275.0,1.090053,11.055382,1.097735,11.160506,13.299370,13.413793,-0.033223
15931,2024-10-01,"Nashville, TN",1120.0,17069.0,1124.0,17158.0,19783.0,19899.0,1877.896629,36.1715,-86.7842,2102573.0,0.532681,8.118149,0.534583,8.160478,9.408948,9.464118,0.015124
11045,2024-10-01,"Jacksonville, FL",1437.0,13158.0,1439.0,13230.0,16079.0,16184.0,1711.673640,30.3322,-81.6749,1713240.0,0.838762,7.680185,0.839929,7.722211,9.385142,9.446429,0.010405
4283,2024-10-01,"Charlotte, NC",1745.0,22028.0,1749.0,22195.0,26032.0,26224.0,1784.142216,35.2083,-80.8303,2805115.0,0.622078,7.852797,0.623504,7.912332,9.280190,9.348636,0.014107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24545,2024-10-01,"York, PA",147.0,1169.0,148.0,1192.0,1319.0,1343.0,1310.424632,39.9651,-76.7315,,,,,,,,0.057850
24605,2024-10-01,"Youngstown, OH",36.0,245.0,36.0,245.0,291.0,291.0,1010.451213,41.0993,-80.6463,,,,,,,,0.076995
24639,2024-10-01,"Yuba City, CA",57.0,721.0,57.0,723.0,781.0,783.0,2121.886939,39.1357,-121.6383,,,,,,,,0.049433
24699,2024-10-01,"Yuma, AZ",129.0,1192.0,131.0,1198.0,1426.0,1437.0,1505.926194,32.5995,-114.5491,,,,,,,,0.047207


In [53]:
# save the final dataset to a csv file
homebuilding_zori.to_csv('data/homebuilding_zori.csv', index=False)