In [35]:
import pandas as pd
from datetime import datetime, date
from dateutil.relativedelta import relativedelta
from typing import List

In [36]:
def generate_year_month_range(end_date: date = None, years_back: int = 6) -> List[str]:
    """
    Generate a list of year-month combinations in 'yyyymm' format,
    starting from the specified end date and going back a specified number of years.

    Args:
        end_date (date, optional): The end date to start from. Defaults to today's date.
        years_back (int, optional): Number of years to go back. Defaults to 6.

    Returns:
        List[str]: List of year-month combinations in 'yyyymm' format, sorted in descending order.

    Example:
        >>> generate_year_month_range()  # If today is 2024-11-14
        ['202411', '202410', '202409', ..., '201812']
    """
    # If no end date is provided, use today's date
    if end_date is None:
        end_date = date.today()

    # Calculate start date
    start_date = end_date - relativedelta(years=years_back)

    # Initialize result list
    date_list = []

    # Current date for iteration
    current_date = end_date

    # Generate dates until we reach start date
    while current_date >= start_date:
        # Format date as 'yyyymm'
        date_str = current_date.strftime('%Y%m')
        date_list.append(date_str)
        # Move to previous month
        current_date -= relativedelta(months=1)

    return date_list

# Example usage
if __name__ == "__main__":
    date_list = generate_year_month_range()
    print(f"Generated {len(date_list)} year-month combinations:")
    print(date_list[:12])  # Print first year as example

Generated 73 year-month combinations:
['202411', '202410', '202409', '202408', '202407', '202406', '202405', '202404', '202403', '202402', '202401', '202312']


In [37]:
import requests

In [38]:
# read in the metro_areas.json file as a list
import json
with open('metro_areas.json') as f:
    metro_areas = json.load(f)

metro_areas

['Abilene, TX',
 'Akron, OH',
 'Albany, OR',
 'Albany-Schenectady-Troy, NY',
 'Albuquerque, NM',
 'Allentown-Bethlehem-Easton, PA-NJ',
 'Altoona, PA',
 'Amarillo, TX',
 'Ames, IA',
 'Amherst Town-Northampton, MA',
 'Ann Arbor, MI',
 'Appleton, WI',
 'Asheville, NC',
 'Athens-Clarke County, GA',
 'Atlanta-Sandy Springs-Roswell, GA',
 'Atlantic City-Hammonton, NJ',
 'Auburn-Opelika, AL',
 'Baltimore-Columbia-Towson, MD',
 'Barnstable Town, MA',
 'Baton Rouge, LA',
 'Battle Creek, MI',
 'Bay City, MI',
 'Birmingham, AL',
 'Bloomington, IL',
 'Bloomington, IN',
 'Boise City, ID',
 'Boston-Cambridge-Newton, MA-NH',
 'Boulder, CO',
 'Bowling Green, KY',
 'Bremerton-Silverdale-Port Orchard, WA',
 'Bridgeport-Stamford-Danbury, CT',
 'Brownsville-Harlingen, TX',
 'Buffalo-Cheektowaga, NY',
 'Burlington, NC',
 'Burlington-South Burlington, VT',
 'Canton-Massillon, OH',
 'Cape Coral-Fort Myers, FL',
 'Cape Girardeau, MO-IL',
 'Carson City, NV',
 'Cedar Rapids, IA',
 'Chambersburg, PA',
 'Champaig

In [42]:
import pandas as pd
import requests
from datetime import datetime

def check_url_exists(url):
    """Check if a URL exists without downloading the full file"""
    try:
        response = requests.head(url)
        return response.status_code == 200
    except:
        return False

def process_2024_data(url):
    """Process data from 2024 onwards"""
    df = pd.read_excel(url, skiprows=7)
    
    # Clean columns
    df.drop(columns=['Metro /Micro Code', 'Unnamed: 10'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

def process_2022_2023_data(url):
    """Process data from 2022-2023"""
    df = pd.read_excel(url, skiprows=7)
    
    # Remove first row and clean columns
    df = df.iloc[1:]
    df.drop(columns=['Unnamed: 9'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

def process_pre_2022_data(url):
    """Process data from 2021 and earlier"""
    df = pd.read_excel(url, skiprows=7)
    
    # Remove first row and clean columns
    df = df.iloc[1:]
    df.drop(columns=['Monthly Coverage Percent*', 'Unnamed: 10'], inplace=True)
    
    # Clean column names and data
    df.columns = df.columns.str.replace('.1', '_ytd')
    df['Name'] = df['Name'].str.rstrip()
    
    # Add date
    df['date'] = url[-10:-4]
    
    return df

# Initialize an empty list to store all dataframes
all_dfs = []

# Process each date
for date_str in date_list:
    year = int(date_str[:4])
    
    try:
        # Determine URL format and processing function based on year
        if year >= 2024:
            url = f'https://www.census.gov/construction/bps/xls/cbsamonthly_{date_str}.xls'
            process_func = process_2024_data
        else:
            url = f'https://www.census.gov/construction/bps/xls/msamonthly_{date_str}.xls'
            if year >= 2022:
                process_func = process_2022_2023_data
            else:
                process_func = process_pre_2022_data
        
        # Check if URL exists
        if check_url_exists(url):
            try:
                df = process_func(url)
                all_dfs.append(df)
                print(f"Successfully processed data for {date_str}")
            except Exception as e:
                print(f"Error processing {date_str}: {str(e)}")
                continue
        else:
            print(f"No data available for {date_str}")
            continue
            
    except Exception as e:
        print(f"Error with {date_str}: {str(e)}")
        continue

# Concatenate all dataframes if we have any data
if all_dfs:
    homebuilding = pd.concat(all_dfs, ignore_index=True)
    
    # Convert date column to datetime
    homebuilding['date'] = pd.to_datetime(homebuilding['date'], format='%Y%m')
    
    # Sort by date and other relevant columns
    homebuilding = homebuilding.sort_values(['date', 'Name'], ascending=[False, True])
    
    print(f"\nFinal dataset contains {len(homebuilding)} rows from {len(all_dfs)} different months")
    print(f"Date range: {homebuilding['date'].min()} to {homebuilding['date'].max()}")
else:
    print("No data was successfully processed")
    homebuilding = pd.DataFrame()

No data available for 202411
No data available for 202410


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202409


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202408


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202407
Error processing 202406: <urlopen error EOF occurred in violation of protocol (_ssl.c:1129)>


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202405


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202404


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202403


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202402


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202401


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202312


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202311


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202310


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202309


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202308


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202307


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202306


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202305


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202304


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202303


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202302


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202301


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202212


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202211


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202210


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202209


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202208


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202207


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202206


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202205


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202204


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202203


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202202


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202201


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202112


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202111


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202110


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202109


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202108


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202107


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202106


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202105


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202104


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202103


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202102


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202101


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202012


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202011


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202010


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202009


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202008


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202007


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202006


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202005


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202004


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202003


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202002


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 202001


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 201912


  df.columns = df.columns.str.replace('.1', '_ytd')


Successfully processed data for 201911
No data available for 201910
No data available for 201909
No data available for 201908
No data available for 201907
No data available for 201906
No data available for 201905
No data available for 201904
No data available for 201903
No data available for 201902
No data available for 201901
No data available for 201812
No data available for 201811

Final dataset contains 26152 rows from 58 different months
Date range: 2019-11-01 00:00:00 to 2024-09-01 00:00:00


In [43]:
# homebuilding['date'] = pd.to_datetime(homebuilding['date'])
homebuilding.dtypes

CSA                                                  float64
CBSA                                                 float64
Name                                                  object
Total                                                float64
1 Unit                                               float64
2 Units                                              float64
3 and 4 Units                                        float64
5 Units or More                                      float64
Num of Structures With 5 Units or More               float64
Total_ytd                                            float64
1 Unit_ytd                                           float64
2 Units_ytd                                          float64
3 and 4 Units_ytd                                    float64
5 Units or More_ytd                                  float64
Num of Structures With 5 Units or More_ytd           float64
date                                          datetime64[ns]
dtype: object

In [44]:
homebuilding.columns

Index(['CSA', 'CBSA', 'Name', 'Total', '1 Unit', '2 Units', '3 and 4 Units',
       '5 Units or More', 'Num of Structures With 5 Units or More',
       'Total_ytd', '1 Unit_ytd', '2 Units_ytd', '3 and 4 Units_ytd',
       '5 Units or More_ytd', 'Num of Structures With 5 Units or More_ytd',
       'date'],
      dtype='object')

In [45]:
# create a 'multi_total' column that sums the columns for different types of multi-unit structures
homebuilding['multi_total'] = homebuilding['1 Unit'] + homebuilding['2 Units'] + homebuilding['3 and 4 Units'] + homebuilding['5 Units or More'] + homebuilding['Num of Structures With 5 Units or More']

# create a 'multi_total_ytd' column that sums the columns for different types of multi-unit structures
homebuilding['multi_total_ytd'] = homebuilding['1 Unit_ytd'] + homebuilding['2 Units_ytd'] + homebuilding['3 and 4 Units_ytd'] + homebuilding['5 Units or More_ytd'] + homebuilding['Num of Structures With 5 Units or More_ytd']

In [46]:
homebuilding

Unnamed: 0,CSA,CBSA,Name,Total,1 Unit,2 Units,3 and 4 Units,5 Units or More,Num of Structures With 5 Units or More,Total_ytd,1 Unit_ytd,2 Units_ytd,3 and 4 Units_ytd,5 Units or More_ytd,Num of Structures With 5 Units or More_ytd,date,multi_total,multi_total_ytd
0,999.0,10100.0,"Aberdeen, SD",3.0,3.0,0.0,0.0,0.0,0.0,65.0,57.0,4.0,4.0,0.0,0.0,2024-09-01,3.0,65.0
1,999.0,10140.0,"Aberdeen, WA",40.0,20.0,2.0,6.0,12.0,1.0,226.0,192.0,8.0,9.0,17.0,2.0,2024-09-01,41.0,228.0
2,101.0,10180.0,"Abilene, TX",38.0,32.0,6.0,0.0,0.0,0.0,374.0,302.0,72.0,0.0,0.0,0.0,2024-09-01,38.0,374.0
3,999.0,10220.0,"Ada, OK",1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2024-09-01,1.0,1.0
4,220.0,10300.0,"Adrian, MI",10.0,10.0,0.0,0.0,0.0,0.0,106.0,106.0,0.0,0.0,0.0,0.0,2024-09-01,10.0,106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26147,148.0,49340.0,"Worcester, MA-CT",18.0,16.0,2.0,0.0,0.0,0.0,219.0,172.0,6.0,0.0,41.0,5.0,2019-11-01,18.0,224.0
26148,999.0,49420.0,"Yakima, WA",52.0,40.0,6.0,0.0,6.0,1.0,823.0,368.0,62.0,23.0,370.0,21.0,2019-11-01,53.0,844.0
26149,276.0,49620.0,"York-Hanover, PA",11.0,7.0,0.0,4.0,0.0,0.0,155.0,96.0,6.0,11.0,42.0,3.0,2019-11-01,11.0,158.0
26150,566.0,49660.0,"Youngstown-Warren-Boardman, OH-PA",13.0,13.0,0.0,0.0,0.0,0.0,200.0,193.0,4.0,3.0,0.0,0.0,2019-11-01,13.0,200.0


In [47]:
# rename Name to name, Total to total, and Total_ytd to total_ytd
homebuilding.rename(
    columns={
        'Name': 'name',
        'Total': 'total',
        'Total_ytd': 'total_ytd'
    },
    inplace=True
)

homebuilding = homebuilding[['date', 'name', 'total', 'total_ytd', 'multi_total', 'multi_total_ytd']]

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd
0,2024-09-01,"Aberdeen, SD",3.0,65.0,3.0,65.0
1,2024-09-01,"Aberdeen, WA",40.0,226.0,41.0,228.0
2,2024-09-01,"Abilene, TX",38.0,374.0,38.0,374.0
3,2024-09-01,"Ada, OK",1.0,1.0,1.0,1.0
4,2024-09-01,"Adrian, MI",10.0,106.0,10.0,106.0
...,...,...,...,...,...,...
26147,2019-11-01,"Worcester, MA-CT",18.0,219.0,18.0,224.0
26148,2019-11-01,"Yakima, WA",52.0,823.0,53.0,844.0
26149,2019-11-01,"York-Hanover, PA",11.0,155.0,11.0,158.0
26150,2019-11-01,"Youngstown-Warren-Boardman, OH-PA",13.0,200.0,13.0,200.0


In [None]:
# # unify the names of the metro areas for new york city
# # replace all instances of 'New York-Newark-Jersey City, NY-NJ-PA' with 'New York-Newark-Jersey City, NY-NJ' 
# homebuilding['name'] = homebuilding['name'].str.replace('New York-Newark-Jersey City, NY-NJ-PA', 'New York-Newark-Jersey City, NY-NJ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  homebuilding['name'] = homebuilding['name'].str.replace('New York-Newark-Jersey City, NY-NJ-PA', 'New York-Newark-Jersey City, NY-NJ')


In [None]:
# sort by date with earliest date first
homebuilding = homebuilding.sort_values('date')



homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,total_yoy,total_ytd_yoy,multi_total_yoy,multi_total_ytd_yoy
26151,2019-11-01,"Yuma, AZ",80.0,1063.0,80.0,1063.0,,,,
25900,2019-11-01,"Fayetteville-Springdale-Rogers, AR-MO",495.0,6303.0,510.0,6400.0,,,,
25901,2019-11-01,"Flagstaff, AZ",48.0,650.0,49.0,655.0,,,,
25902,2019-11-01,"Flint, MI",18.0,371.0,18.0,372.0,,,,
25903,2019-11-01,"Florence, SC",36.0,664.0,37.0,682.0,,,,
...,...,...,...,...,...,...,...,...,...,...
608,2024-09-01,"Oak Harbor, WA",43.0,327.0,46.0,334.0,,,,
607,2024-09-01,"Norwich-New London-Willimantic, CT",195.0,434.0,200.0,443.0,,,,
606,2024-09-01,"Norwalk, OH",4.0,30.0,4.0,30.0,,,,
604,2024-09-01,"North Port-Bradenton-Sarasota, FL",1260.0,11894.0,1272.0,11963.0,,,,


In [56]:
# Change the name column so 'Austin-Round Rock-Georgetown, TX' becomes 'Austin, TX'
# Delete every character between the first '-' and the ','
homebuilding['name'] = homebuilding['name'].str.replace(r'-.*?,', ',')
homebuilding

  homebuilding['name'] = homebuilding['name'].str.replace(r'-.*?,', ',')


Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,total_yoy,total_ytd_yoy,multi_total_yoy,multi_total_ytd_yoy
26151,2019-11-01,"Yuma, AZ",80.0,1063.0,80.0,1063.0,,,,
25900,2019-11-01,"Fayetteville, AR-MO",495.0,6303.0,510.0,6400.0,,,,
25901,2019-11-01,"Flagstaff, AZ",48.0,650.0,49.0,655.0,,,,
25902,2019-11-01,"Flint, MI",18.0,371.0,18.0,372.0,,,,
25903,2019-11-01,"Florence, SC",36.0,664.0,37.0,682.0,,,,
...,...,...,...,...,...,...,...,...,...,...
608,2024-09-01,"Oak Harbor, WA",43.0,327.0,46.0,334.0,,,,
607,2024-09-01,"Norwich, CT",195.0,434.0,200.0,443.0,,,,
606,2024-09-01,"Norwalk, OH",4.0,30.0,4.0,30.0,,,,
604,2024-09-01,"North Port, FL",1260.0,11894.0,1272.0,11963.0,,,,


In [None]:
# change the name of metro areas to match the names in the metro_areas.json file
homebuilding['name'] = homebuilding['name'].str.replace(r'-.*', '')
homebuilding

  homebuilding['name'] = homebuilding['name'].str.replace(r'-.*', '')


Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,total_yoy,total_ytd_yoy,multi_total_yoy,multi_total_ytd_yoy
26151,2019-11-01,"Yuma, AZ",80.0,1063.0,80.0,1063.0,,,,
25900,2019-11-01,"Fayetteville, AR",495.0,6303.0,510.0,6400.0,,,,
25901,2019-11-01,"Flagstaff, AZ",48.0,650.0,49.0,655.0,,,,
25902,2019-11-01,"Flint, MI",18.0,371.0,18.0,372.0,,,,
25903,2019-11-01,"Florence, SC",36.0,664.0,37.0,682.0,,,,
...,...,...,...,...,...,...,...,...,...,...
608,2024-09-01,"Oak Harbor, WA",43.0,327.0,46.0,334.0,,,,
607,2024-09-01,"Norwich, CT",195.0,434.0,200.0,443.0,,,,
606,2024-09-01,"Norwalk, OH",4.0,30.0,4.0,30.0,,,,
604,2024-09-01,"North Port, FL",1260.0,11894.0,1272.0,11963.0,,,,


In [59]:
# create a 'total_yoy' column that calculates the year-over-year change in 'total' for each metro area
homebuilding['total_yoy'] = homebuilding.groupby('name')['total'].pct_change(12)
homebuilding['total_ytd_yoy'] = homebuilding.groupby('name')['total_ytd'].pct_change(12)
homebuilding['multi_total_yoy'] = homebuilding.groupby('name')['multi_total'].pct_change(12)
homebuilding['multi_total_ytd_yoy'] = homebuilding.groupby('name')['multi_total_ytd'].pct_change(12)

In [62]:
homebuilding[homebuilding['name']=='New York, NY']

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,total_yoy,total_ytd_yoy,multi_total_yoy,multi_total_ytd_yoy
26023,2019-11-01,"New York, NY",6967.0,55381.0,7092.0,56746.0,,,,
25655,2019-12-01,"New York, NY",5708.0,60746.0,5840.0,62149.0,,,,
25287,2020-01-01,"New York, NY",5982.0,5982.0,6087.0,6087.0,,,,
24919,2020-02-01,"New York, NY",2929.0,8873.0,3018.0,9066.0,,,,
24551,2020-03-01,"New York, NY",3410.0,12366.0,3485.0,12633.0,,,,
24183,2020-04-01,"New York, NY",1571.0,14277.0,1647.0,14611.0,,,,
23815,2020-05-01,"New York, NY",2886.0,17936.0,2970.0,18346.0,,,,
23447,2020-06-01,"New York, NY",5370.0,22906.0,5473.0,23406.0,,,,
23079,2020-07-01,"New York, NY",4738.0,29027.0,4857.0,29645.0,,,,
22711,2020-08-01,"New York, NY",3382.0,33409.0,3478.0,34130.0,,,,


In [66]:
# population

# read in the population data
metros = pd.read_csv('heat_zori_metro_clean.csv', index_col=0)

metros

Unnamed: 0,RegionID,SizeRank,name,RegionType,StateName,pop_2023,date,heat,lat,lng,zori
0,102001,0,United States,country,,335893238,2018-01-31,49.0,,,1405.842990
1,394913,1,"New York, NY",msa,NY,19498249,2018-01-31,53.0,40.6943,-73.9249,2520.028880
2,753899,2,"Los Angeles, CA",msa,CA,12799100,2018-01-31,68.0,34.1141,-118.4068,2190.975308
3,394463,3,"Chicago, IL",msa,IL,9262825,2018-01-31,48.0,41.8375,-87.6866,1509.843282
4,394514,4,"Dallas, TX",msa,TX,8100037,2018-01-31,55.0,32.7935,-96.7667,1289.240890
...,...,...,...,...,...,...,...,...,...,...,...
4450,394669,49,"Hartford, CT",msa,CT,1151543,2024-09-30,83.0,41.7661,-72.6834,1920.139081
4451,395167,54,"Tucson, AZ",msa,AZ,1063162,2024-09-30,50.0,32.1541,-110.8787,1621.090639
4452,395031,52,"Rochester, NY",msa,NY,1052087,2024-09-30,137.0,43.1680,-77.6162,1456.431005
4453,753924,55,"Urban Honolulu, HI",msa,HI,989408,2024-09-30,37.0,,,2693.099648


In [67]:
metros = metros[['name', 'pop_2023', 'date', 'lat', 'lng', 'zori']]
metros

Unnamed: 0,name,pop_2023,date,lat,lng,zori
0,United States,335893238,2018-01-31,,,1405.842990
1,"New York, NY",19498249,2018-01-31,40.6943,-73.9249,2520.028880
2,"Los Angeles, CA",12799100,2018-01-31,34.1141,-118.4068,2190.975308
3,"Chicago, IL",9262825,2018-01-31,41.8375,-87.6866,1509.843282
4,"Dallas, TX",8100037,2018-01-31,32.7935,-96.7667,1289.240890
...,...,...,...,...,...,...
4450,"Hartford, CT",1151543,2024-09-30,41.7661,-72.6834,1920.139081
4451,"Tucson, AZ",1063162,2024-09-30,32.1541,-110.8787,1621.090639
4452,"Rochester, NY",1052087,2024-09-30,43.1680,-77.6162,1456.431005
4453,"Urban Honolulu, HI",989408,2024-09-30,,,2693.099648


In [68]:
# change 'date' from the last day of the month to the first day of the month
metros['date'] = pd.to_datetime(metros['date'])
metros['date'] = metros['date'] + pd.offsets.MonthBegin(-1)
metros

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metros['date'] = pd.to_datetime(metros['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  metros['date'] = metros['date'] + pd.offsets.MonthBegin(-1)


Unnamed: 0,name,pop_2023,date,lat,lng,zori
0,United States,335893238,2018-01-01,,,1405.842990
1,"New York, NY",19498249,2018-01-01,40.6943,-73.9249,2520.028880
2,"Los Angeles, CA",12799100,2018-01-01,34.1141,-118.4068,2190.975308
3,"Chicago, IL",9262825,2018-01-01,41.8375,-87.6866,1509.843282
4,"Dallas, TX",8100037,2018-01-01,32.7935,-96.7667,1289.240890
...,...,...,...,...,...,...
4450,"Hartford, CT",1151543,2024-09-01,41.7661,-72.6834,1920.139081
4451,"Tucson, AZ",1063162,2024-09-01,32.1541,-110.8787,1621.090639
4452,"Rochester, NY",1052087,2024-09-01,43.1680,-77.6162,1456.431005
4453,"Urban Honolulu, HI",989408,2024-09-01,,,2693.099648


In [69]:
# merge the population data with the homebuilding data
# merge on 'name' and 'date'
# only include rows where both 'name' and 'date' are in both dataframes
homebuilding_zori = pd.merge(
    homebuilding,
    metros,
    on=['name', 'date'],
    how='inner'
)

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,total_yoy,total_ytd_yoy,multi_total_yoy,multi_total_ytd_yoy,pop_2023,lat,lng,zori
0,2019-11-01,"Grand Rapids, MI",121.0,2779.0,121.0,2838.0,,,,,1162950,42.9619,-85.6562,1142.228554
1,2019-11-01,"Denver, CO",1314.0,16824.0,1319.0,16940.0,,,,,3005131,39.7620,-104.8758,1608.646985
2,2019-11-01,"Detroit, MI",588.0,7076.0,600.0,7205.0,,,,,4342304,42.3834,-83.1024,1076.702325
3,2019-11-01,"Jacksonville, FL",1011.0,13995.0,1015.0,14075.0,,,,,1713240,30.3322,-81.6749,1242.301895
4,2019-11-01,"Kansas City, MO",552.0,8319.0,559.0,8393.0,,,,,2221343,39.1238,-94.5541,1058.463568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3127,2024-09-01,"New York, NY",3666.0,43928.0,3774.0,44825.0,-0.161482,0.480752,-0.159840,0.474264,19498249,40.6943,-73.9249,3397.349448
3128,2024-09-01,"New Orleans, LA",142.0,1257.0,143.0,1264.0,-0.619303,-0.494978,-0.619681,-0.495409,962165,30.0687,-89.9288,1635.125963
3129,2024-09-01,"Nashville, TN",2205.0,15949.0,2216.0,16034.0,-0.220848,-0.127755,-0.230823,-0.133766,2102573,36.1715,-86.7842,1898.956138
3130,2024-09-01,"Orlando, FL",1994.0,18573.0,2012.0,18729.0,-0.105830,0.005794,-0.100983,0.006448,2817933,28.4773,-81.3370,2066.918191


In [72]:
# delete all columns with _yoy in the name
homebuilding_zori = homebuilding_zori[homebuilding_zori.columns.drop(list(homebuilding_zori.filter(regex='_yoy')))]

homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,pop_2023,lat,lng,zori
0,2019-11-01,"Grand Rapids, MI",121.0,2779.0,121.0,2838.0,1162950,42.9619,-85.6562,1142.228554
1,2019-11-01,"Denver, CO",1314.0,16824.0,1319.0,16940.0,3005131,39.7620,-104.8758,1608.646985
2,2019-11-01,"Detroit, MI",588.0,7076.0,600.0,7205.0,4342304,42.3834,-83.1024,1076.702325
3,2019-11-01,"Jacksonville, FL",1011.0,13995.0,1015.0,14075.0,1713240,30.3322,-81.6749,1242.301895
4,2019-11-01,"Kansas City, MO",552.0,8319.0,559.0,8393.0,2221343,39.1238,-94.5541,1058.463568
...,...,...,...,...,...,...,...,...,...,...
3127,2024-09-01,"New York, NY",3666.0,43928.0,3774.0,44825.0,19498249,40.6943,-73.9249,3397.349448
3128,2024-09-01,"New Orleans, LA",142.0,1257.0,143.0,1264.0,962165,30.0687,-89.9288,1635.125963
3129,2024-09-01,"Nashville, TN",2205.0,15949.0,2216.0,16034.0,2102573,36.1715,-86.7842,1898.956138
3130,2024-09-01,"Orlando, FL",1994.0,18573.0,2012.0,18729.0,2817933,28.4773,-81.3370,2066.918191


In [73]:
homebuilding_zori.columns

Index(['date', 'name', 'total', 'total_ytd', 'multi_total', 'multi_total_ytd',
       'pop_2023', 'lat', 'lng', 'zori'],
      dtype='object')

In [None]:
# create 'total_per_capita' and 'total_ytd_per_capita' columns that divide 'total' and 'total_ytd' by 'pop_2023'
homebuilding_zori['total_per_capita'] = homebuilding_zori['total'] / homebuilding_zori['pop_2023']
homebuilding_zori['total_ytd_per_capita'] = homebuilding_zori['total_ytd'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_total_per_capita'] = homebuilding_zori['multi_total'] / homebuilding_zori['pop_2023']
homebuilding_zori['multi_total_ytd_per_capita'] = homebuilding_zori['multi_total_ytd'] / homebuilding_zori['pop_2023']

# change per capita columns to per 1000
for col in homebuilding_zori.columns:
    if 'per_capita' in col:
        homebuilding_zori[col] = homebuilding_zori[col] * 1000

homebuilding_zori

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  homebuilding_zori['total_per_capita'] = homebuilding_zori['total'] / homebuilding_zori['pop_2023']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  homebuilding_zori['total_ytd_per_capita'] = homebuilding_zori['total_ytd'] / homebuilding_zori['pop_2023']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,pop_2023,lat,lng,zori,total_per_capita,total_ytd_per_capita,multi_total_per_capita,multi_total_ytd_per_capita
0,2019-11-01,"Grand Rapids, MI",121.0,2779.0,121.0,2838.0,1162950,42.9619,-85.6562,1142.228554,0.000104,0.002390,0.000104,0.002440
1,2019-11-01,"Denver, CO",1314.0,16824.0,1319.0,16940.0,3005131,39.7620,-104.8758,1608.646985,0.000437,0.005598,0.000439,0.005637
2,2019-11-01,"Detroit, MI",588.0,7076.0,600.0,7205.0,4342304,42.3834,-83.1024,1076.702325,0.000135,0.001630,0.000138,0.001659
3,2019-11-01,"Jacksonville, FL",1011.0,13995.0,1015.0,14075.0,1713240,30.3322,-81.6749,1242.301895,0.000590,0.008169,0.000592,0.008215
4,2019-11-01,"Kansas City, MO",552.0,8319.0,559.0,8393.0,2221343,39.1238,-94.5541,1058.463568,0.000248,0.003745,0.000252,0.003778
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3127,2024-09-01,"New York, NY",3666.0,43928.0,3774.0,44825.0,19498249,40.6943,-73.9249,3397.349448,0.000188,0.002253,0.000194,0.002299
3128,2024-09-01,"New Orleans, LA",142.0,1257.0,143.0,1264.0,962165,30.0687,-89.9288,1635.125963,0.000148,0.001306,0.000149,0.001314
3129,2024-09-01,"Nashville, TN",2205.0,15949.0,2216.0,16034.0,2102573,36.1715,-86.7842,1898.956138,0.001049,0.007585,0.001054,0.007626
3130,2024-09-01,"Orlando, FL",1994.0,18573.0,2012.0,18729.0,2817933,28.4773,-81.3370,2066.918191,0.000708,0.006591,0.000714,0.006646


In [75]:
# create 'zori_yoy' column that calculates the year-over-year change in 'zori' for each metro area
homebuilding_zori['zori_yoy'] = homebuilding_zori.groupby('name')['zori'].pct_change(12)
homebuilding_zori

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,pop_2023,lat,lng,zori,total_per_capita,total_ytd_per_capita,multi_total_per_capita,multi_total_ytd_per_capita,zori_yoy
0,2019-11-01,"Grand Rapids, MI",121.0,2779.0,121.0,2838.0,1162950,42.9619,-85.6562,1142.228554,0.000104,0.002390,0.000104,0.002440,
1,2019-11-01,"Denver, CO",1314.0,16824.0,1319.0,16940.0,3005131,39.7620,-104.8758,1608.646985,0.000437,0.005598,0.000439,0.005637,
2,2019-11-01,"Detroit, MI",588.0,7076.0,600.0,7205.0,4342304,42.3834,-83.1024,1076.702325,0.000135,0.001630,0.000138,0.001659,
3,2019-11-01,"Jacksonville, FL",1011.0,13995.0,1015.0,14075.0,1713240,30.3322,-81.6749,1242.301895,0.000590,0.008169,0.000592,0.008215,
4,2019-11-01,"Kansas City, MO",552.0,8319.0,559.0,8393.0,2221343,39.1238,-94.5541,1058.463568,0.000248,0.003745,0.000252,0.003778,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3127,2024-09-01,"New York, NY",3666.0,43928.0,3774.0,44825.0,19498249,40.6943,-73.9249,3397.349448,0.000188,0.002253,0.000194,0.002299,0.031432
3128,2024-09-01,"New Orleans, LA",142.0,1257.0,143.0,1264.0,962165,30.0687,-89.9288,1635.125963,0.000148,0.001306,0.000149,0.001314,0.035785
3129,2024-09-01,"Nashville, TN",2205.0,15949.0,2216.0,16034.0,2102573,36.1715,-86.7842,1898.956138,0.001049,0.007585,0.001054,0.007626,0.012344
3130,2024-09-01,"Orlando, FL",1994.0,18573.0,2012.0,18729.0,2817933,28.4773,-81.3370,2066.918191,0.000708,0.006591,0.000714,0.006646,0.011371


In [80]:

# Sort values by 'name' and 'date' to facilitate proper group operations
homebuilding_zori = homebuilding_zori.sort_values(by=['name', 'date'])

# Remove duplicate date entries within each 'name' group by aggregating
homebuilding_zori = homebuilding_zori.groupby(['name', 'date'], as_index=False).agg({'total': 'sum'})

# Create a column for 'total_lastyear' using rolling sum grouped by 'name'
homebuilding_zori['total_lastyear'] = (
    homebuilding_zori.set_index('date')  # Temporarily set 'date' as index
    .groupby('name')['total']           # Group by 'name'
    .rolling('365D', closed='both')     # Rolling window of 365 days
    .sum()                              # Calculate the rolling sum
    .reset_index(level=0, drop=True)    # Drop the index to align with original DataFrame
)

homebuilding_zori

  homebuilding_zori['total_lastyear'] = (


ValueError: cannot reindex on an axis with duplicate labels

In [77]:
# save the final dataset to a csv file
homebuilding_zori.to_csv('homebuilding_zori.csv', index=False)