In [1]:
import pandas as pd

In [2]:
metros = pd.read_csv('data/zori_metro_long_clean.csv')

metros

Unnamed: 0,name,date,zori,lat,lng,pop_2023
0,"New York, NY",2015-02-01,2233.141451,40.6943,-73.9249,19498249.0
1,"Los Angeles, CA",2015-02-01,1809.922576,34.1141,-118.4068,12799100.0
2,"Chicago, IL",2015-02-01,1400.342984,41.8375,-87.6866,9262825.0
3,"Dallas, TX",2015-02-01,1095.378833,32.7935,-96.7667,8100037.0
4,"Houston, TX",2015-02-01,1217.199895,29.7860,-95.3885,7510253.0
...,...,...,...,...,...,...
6095,"Salt Lake City, UT",2025-03-01,1662.887724,40.7776,-111.9311,1267864.0
6096,"Hartford, CT",2025-03-01,1861.878310,41.7661,-72.6834,1151543.0
6097,"Buffalo, NY",2025-03-01,1331.626117,42.9018,-78.8487,1155604.0
6098,"Birmingham, AL",2025-03-01,1388.316805,33.5279,-86.7971,1184290.0


In [3]:
# get state name col from `name` col with everything after the last comma and space
metros['state'] = metros['name'].str.extract(r',\s(.*)$')

metros

Unnamed: 0,name,date,zori,lat,lng,pop_2023,state
0,"New York, NY",2015-02-01,2233.141451,40.6943,-73.9249,19498249.0,NY
1,"Los Angeles, CA",2015-02-01,1809.922576,34.1141,-118.4068,12799100.0,CA
2,"Chicago, IL",2015-02-01,1400.342984,41.8375,-87.6866,9262825.0,IL
3,"Dallas, TX",2015-02-01,1095.378833,32.7935,-96.7667,8100037.0,TX
4,"Houston, TX",2015-02-01,1217.199895,29.7860,-95.3885,7510253.0,TX
...,...,...,...,...,...,...,...
6095,"Salt Lake City, UT",2025-03-01,1662.887724,40.7776,-111.9311,1267864.0,UT
6096,"Hartford, CT",2025-03-01,1861.878310,41.7661,-72.6834,1151543.0,CT
6097,"Buffalo, NY",2025-03-01,1331.626117,42.9018,-78.8487,1155604.0,NY
6098,"Birmingham, AL",2025-03-01,1388.316805,33.5279,-86.7971,1184290.0,AL


In [4]:
# import regions df
regions = pd.read_csv('data/regions.csv')

In [5]:
# merge metros and regions on state
metros = metros.merge(regions, on='state', how='left')

metros

Unnamed: 0,name,date,zori,lat,lng,pop_2023,state,region
0,"New York, NY",2015-02-01,2233.141451,40.6943,-73.9249,19498249.0,NY,Northest
1,"Los Angeles, CA",2015-02-01,1809.922576,34.1141,-118.4068,12799100.0,CA,West
2,"Chicago, IL",2015-02-01,1400.342984,41.8375,-87.6866,9262825.0,IL,Midwest
3,"Dallas, TX",2015-02-01,1095.378833,32.7935,-96.7667,8100037.0,TX,Southwest
4,"Houston, TX",2015-02-01,1217.199895,29.7860,-95.3885,7510253.0,TX,Southwest
...,...,...,...,...,...,...,...,...
6095,"Salt Lake City, UT",2025-03-01,1662.887724,40.7776,-111.9311,1267864.0,UT,West
6096,"Hartford, CT",2025-03-01,1861.878310,41.7661,-72.6834,1151543.0,CT,Northest
6097,"Buffalo, NY",2025-03-01,1331.626117,42.9018,-78.8487,1155604.0,NY,Northest
6098,"Birmingham, AL",2025-03-01,1388.316805,33.5279,-86.7971,1184290.0,AL,Southeast


In [6]:
# Set the base zori value for each name in January 2016 to 100
base_zori = metros[metros['date'] == '2020-01-01'].set_index('name')['zori']

# Calculate the zori_index
metros['zori_index'] = metros.apply(lambda row: (row['zori'] / base_zori[row['name']]) * 100 if row['name'] in base_zori else None, axis=1)

metros

Unnamed: 0,name,date,zori,lat,lng,pop_2023,state,region,zori_index
0,"New York, NY",2015-02-01,2233.141451,40.6943,-73.9249,19498249.0,NY,Northest,88.184278
1,"Los Angeles, CA",2015-02-01,1809.922576,34.1141,-118.4068,12799100.0,CA,West,77.573702
2,"Chicago, IL",2015-02-01,1400.342984,41.8375,-87.6866,9262825.0,IL,Midwest,88.188961
3,"Dallas, TX",2015-02-01,1095.378833,32.7935,-96.7667,8100037.0,TX,Southwest,81.067659
4,"Houston, TX",2015-02-01,1217.199895,29.7860,-95.3885,7510253.0,TX,Southwest,89.899152
...,...,...,...,...,...,...,...,...,...
6095,"Salt Lake City, UT",2025-03-01,1662.887724,40.7776,-111.9311,1267864.0,UT,West,134.304165
6096,"Hartford, CT",2025-03-01,1861.878310,41.7661,-72.6834,1151543.0,CT,Northest,147.002740
6097,"Buffalo, NY",2025-03-01,1331.626117,42.9018,-78.8487,1155604.0,NY,Northest,141.076102
6098,"Birmingham, AL",2025-03-01,1388.316805,33.5279,-86.7971,1184290.0,AL,Southeast,134.122980


In [7]:
# in name, remove the state name and the last comma and space
metros['name'] = metros['name'].str.replace(r',\s.*$', '', regex=True)

metros

Unnamed: 0,name,date,zori,lat,lng,pop_2023,state,region,zori_index
0,New York,2015-02-01,2233.141451,40.6943,-73.9249,19498249.0,NY,Northest,88.184278
1,Los Angeles,2015-02-01,1809.922576,34.1141,-118.4068,12799100.0,CA,West,77.573702
2,Chicago,2015-02-01,1400.342984,41.8375,-87.6866,9262825.0,IL,Midwest,88.188961
3,Dallas,2015-02-01,1095.378833,32.7935,-96.7667,8100037.0,TX,Southwest,81.067659
4,Houston,2015-02-01,1217.199895,29.7860,-95.3885,7510253.0,TX,Southwest,89.899152
...,...,...,...,...,...,...,...,...,...
6095,Salt Lake City,2025-03-01,1662.887724,40.7776,-111.9311,1267864.0,UT,West,134.304165
6096,Hartford,2025-03-01,1861.878310,41.7661,-72.6834,1151543.0,CT,Northest,147.002740
6097,Buffalo,2025-03-01,1331.626117,42.9018,-78.8487,1155604.0,NY,Northest,141.076102
6098,Birmingham,2025-03-01,1388.316805,33.5279,-86.7971,1184290.0,AL,Southeast,134.122980


In [8]:
metros.to_csv('data/zori_metro_indexed_long.csv', index=False)

In [9]:
# import the homebuilding data
homebuilding = pd.read_csv('data/homebuilding_zori.csv')

homebuilding

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,...,pop_2023,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state,region
0,2019-11-01,"Atlanta, GA",1819.0,30513.0,117.0,6602.0,1819.0,117.0,1371.279930,33.7628,...,6307261.0,0.288398,4.837758,0.018550,1.046730,0.288398,0.018550,,GA,Southeast
1,2019-12-01,"Atlanta, GA",2388.0,32729.0,213.0,6732.0,4207.0,330.0,1368.872179,33.7628,...,6307261.0,0.378611,5.189099,0.033771,1.067341,0.667009,0.052321,,GA,Southeast
2,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,...,6307261.0,0.461690,0.461690,0.154267,0.154267,1.128699,0.206587,,GA,Southeast
3,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,...,6307261.0,0.434737,0.897537,0.103214,0.247651,1.563436,0.309802,,GA,Southeast
4,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,...,6307261.0,0.351341,1.249829,0.020136,0.258908,1.914777,0.329937,,GA,Southeast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,...,6304975.0,0.142744,2.613333,0.035210,1.228712,3.471068,1.758770,0.048862,DC,Northest
3146,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,...,6304975.0,0.285647,3.014128,0.146234,1.372091,3.463614,1.728318,0.047143,DC,Northest
3147,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,...,6304975.0,0.178272,3.181297,0.050436,1.412060,3.403186,1.644574,0.044723,DC,Northest
3148,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,...,6304975.0,0.238859,3.478364,0.122443,1.583353,3.360521,1.589697,0.045161,DC,Northest


In [10]:
# Pivot the homebuilding dataframe to have dates as rows and metro areas as columns
pivot_homebuilding = homebuilding.pivot(index='date', columns='name', values='total')

# Find the earliest date where all values are non-null
earliest_date = pivot_homebuilding.dropna().index.min()

print(earliest_date)

2019-11-01


In [11]:
# Filter the data to include only records from 2020 onwards
homebuilding_2020 = homebuilding[homebuilding['date'] >= '2020-01-01']

# Calculate the running total for each metro area
homebuilding_2020['running_total'] = homebuilding_2020.groupby('name')['multi_total'].cumsum()

homebuilding_2020

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  homebuilding_2020['running_total'] = homebuilding_2020.groupby('name')['multi_total'].cumsum()


Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,...,total_pc,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state,region,running_total
2,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,...,0.461690,0.461690,0.154267,0.154267,1.128699,0.206587,,GA,Southeast,973.0
3,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,...,0.434737,0.897537,0.103214,0.247651,1.563436,0.309802,,GA,Southeast,1624.0
4,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,...,0.351341,1.249829,0.020136,0.258908,1.914777,0.329937,,GA,Southeast,1751.0
5,2020-04-01,"Atlanta, GA",1827.0,9633.0,108.0,1648.0,13904.0,2189.0,1390.457345,33.7628,...,0.289666,1.527287,0.017123,0.261286,2.204443,0.347060,,GA,Southeast,1859.0
6,2020-05-01,"Atlanta, GA",1970.0,11641.0,354.0,1922.0,15874.0,2543.0,1393.296553,33.7628,...,0.312338,1.845651,0.056126,0.304728,2.516782,0.403186,,GA,Southeast,2213.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,...,0.142744,2.613333,0.035210,1.228712,3.471068,1.758770,0.048862,DC,Northest,66651.0
3146,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,...,0.285647,3.014128,0.146234,1.372091,3.463614,1.728318,0.047143,DC,Northest,67573.0
3147,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,...,0.178272,3.181297,0.050436,1.412060,3.403186,1.644574,0.044723,DC,Northest,67891.0
3148,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,...,0.238859,3.478364,0.122443,1.583353,3.360521,1.589697,0.045161,DC,Northest,68663.0


In [12]:
# add the zori_index from the metros df to the homebuilding data. They will be merged on the date and name columns
homebuilding_2020 = homebuilding_2020.merge(metros[['name', 'date', 'zori_index']], on=['name', 'date'], how='left')

homebuilding_2020

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,...,total_ytd_pc,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state,region,running_total,zori_index
0,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,...,0.461690,0.154267,0.154267,1.128699,0.206587,,GA,Southeast,973.0,
1,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,...,0.897537,0.103214,0.247651,1.563436,0.309802,,GA,Southeast,1624.0,
2,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,...,1.249829,0.020136,0.258908,1.914777,0.329937,,GA,Southeast,1751.0,
3,2020-04-01,"Atlanta, GA",1827.0,9633.0,108.0,1648.0,13904.0,2189.0,1390.457345,33.7628,...,1.527287,0.017123,0.261286,2.204443,0.347060,,GA,Southeast,1859.0,
4,2020-05-01,"Atlanta, GA",1970.0,11641.0,354.0,1922.0,15874.0,2543.0,1393.296553,33.7628,...,1.845651,0.056126,0.304728,2.516782,0.403186,,GA,Southeast,2213.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3045,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,...,2.613333,0.035210,1.228712,3.471068,1.758770,0.048862,DC,Northest,66651.0,
3046,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,...,3.014128,0.146234,1.372091,3.463614,1.728318,0.047143,DC,Northest,67573.0,
3047,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,...,3.181297,0.050436,1.412060,3.403186,1.644574,0.044723,DC,Northest,67891.0,
3048,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,...,3.478364,0.122443,1.583353,3.360521,1.589697,0.045161,DC,Northest,68663.0,


In [13]:
# create running total per 10k people (pop_2023 is population)
homebuilding_2020['running_total_per_10k'] = (homebuilding_2020['running_total'] / homebuilding_2020['pop_2023']) * 10000

homebuilding_2020

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,...,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state,region,running_total,zori_index,running_total_per_10k
0,2020-01-01,"Atlanta, GA",2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,...,0.154267,0.154267,1.128699,0.206587,,GA,Southeast,973.0,,1.542666
1,2020-02-01,"Atlanta, GA",2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,...,0.103214,0.247651,1.563436,0.309802,,GA,Southeast,1624.0,,2.574810
2,2020-03-01,"Atlanta, GA",2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,...,0.020136,0.258908,1.914777,0.329937,,GA,Southeast,1751.0,,2.776165
3,2020-04-01,"Atlanta, GA",1827.0,9633.0,108.0,1648.0,13904.0,2189.0,1390.457345,33.7628,...,0.017123,0.261286,2.204443,0.347060,,GA,Southeast,1859.0,,2.947397
4,2020-05-01,"Atlanta, GA",1970.0,11641.0,354.0,1922.0,15874.0,2543.0,1393.296553,33.7628,...,0.056126,0.304728,2.516782,0.403186,,GA,Southeast,2213.0,,3.508655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3045,2024-09-01,"Washington, DC",900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,...,0.035210,1.228712,3.471068,1.758770,0.048862,DC,Northest,66651.0,,105.711759
3046,2024-10-01,"Washington, DC",1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,...,0.146234,1.372091,3.463614,1.728318,0.047143,DC,Northest,67573.0,,107.174097
3047,2024-11-01,"Washington, DC",1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,...,0.050436,1.412060,3.403186,1.644574,0.044723,DC,Northest,67891.0,,107.678460
3048,2024-12-01,"Washington, DC",1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,...,0.122443,1.583353,3.360521,1.589697,0.045161,DC,Northest,68663.0,,108.902890


In [14]:
# in name, remove the state name and the last comma and space
homebuilding_2020['name'] = homebuilding_2020['name'].str.replace(r',\s.*$', '', regex=True)

In [15]:
homebuilding_2020

Unnamed: 0,date,name,total,total_ytd,multi_total,multi_total_ytd,rt,multi_rt,zori,lat,...,multi_total_pc,multi_total_ytd_pc,rt_pc,multi_rt_pc,zori_yoy,state,region,running_total,zori_index,running_total_per_10k
0,2020-01-01,Atlanta,2912.0,2912.0,973.0,973.0,7119.0,1303.0,1368.798323,33.7628,...,0.154267,0.154267,1.128699,0.206587,,GA,Southeast,973.0,,1.542666
1,2020-02-01,Atlanta,2742.0,5661.0,651.0,1562.0,9861.0,1954.0,1373.768759,33.7628,...,0.103214,0.247651,1.563436,0.309802,,GA,Southeast,1624.0,,2.574810
2,2020-03-01,Atlanta,2216.0,7883.0,127.0,1633.0,12077.0,2081.0,1382.102627,33.7628,...,0.020136,0.258908,1.914777,0.329937,,GA,Southeast,1751.0,,2.776165
3,2020-04-01,Atlanta,1827.0,9633.0,108.0,1648.0,13904.0,2189.0,1390.457345,33.7628,...,0.017123,0.261286,2.204443,0.347060,,GA,Southeast,1859.0,,2.947397
4,2020-05-01,Atlanta,1970.0,11641.0,354.0,1922.0,15874.0,2543.0,1393.296553,33.7628,...,0.056126,0.304728,2.516782,0.403186,,GA,Southeast,2213.0,,3.508655
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3045,2024-09-01,Washington,900.0,16477.0,222.0,7747.0,21885.0,11089.0,2383.224478,38.9047,...,0.035210,1.228712,3.471068,1.758770,0.048862,DC,Northest,66651.0,,105.711759
3046,2024-10-01,Washington,1801.0,19004.0,922.0,8651.0,21838.0,10897.0,2382.780218,38.9047,...,0.146234,1.372091,3.463614,1.728318,0.047143,DC,Northest,67573.0,,107.174097
3047,2024-11-01,Washington,1124.0,20058.0,318.0,8903.0,21457.0,10369.0,2372.798426,38.9047,...,0.050436,1.412060,3.403186,1.644574,0.044723,DC,Northest,67891.0,,107.678460
3048,2024-12-01,Washington,1506.0,21931.0,772.0,9983.0,21188.0,10023.0,2365.752183,38.9047,...,0.122443,1.583353,3.360521,1.589697,0.045161,DC,Northest,68663.0,,108.902890


In [16]:
homebuilding_2020.to_csv('data/homebuilding_zori_indexed.csv', index=False)

In [60]:
# set realtor url
realtor_url = "https://econdata.s3-us-west-2.amazonaws.com/Reports/Core/RDC_Inventory_Core_Metrics_Metro_History.csv"

realtor_data = pd.read_csv(realtor_url)

realtor_data

Unnamed: 0,month_date_yyyymm,cbsa_code,cbsa_title,HouseholdRank,median_listing_price,median_listing_price_mm,median_listing_price_yy,active_listing_count,active_listing_count_mm,active_listing_count_yy,...,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,quality_flag
0,202412,35620,"New York-Newark-Jersey City, NY-NJ-PA",1,749000,-0.0013,0.0202,29297,-0.1344,0.0028,...,1648167,0.0067,-0.0630,50240,-0.0274,0.1692,0.7317,0.1921,0.2503,0.0
1,202412,31080,"Los Angeles-Long Beach-Anaheim, CA",2,1094000,-0.0319,-0.0055,12686,-0.1700,0.2630,...,2398115,0.0180,-0.0783,18822,-0.1603,0.1937,0.4864,0.0222,-0.0788,0.0
2,202412,16980,"Chicago-Naperville-Elgin, IL-IN-WI",3,347450,-0.0346,-0.0073,14155,-0.1598,0.0636,...,515696,-0.0257,-0.0355,22540,-0.1740,0.0425,0.5951,-0.0286,-0.0320,0.0
3,202412,19100,"Dallas-Fort Worth-Arlington, TX",4,422450,-0.0130,-0.0289,22744,-0.1079,0.3111,...,600314,-0.0159,-0.0448,30828,-0.1221,0.2187,0.3557,-0.0229,-0.0994,0.0
4,202412,26420,"Houston-The Woodlands-Sugar Land, TX",5,361405,-0.0098,0.0039,26614,-0.0360,0.2450,...,487031,-0.0129,-0.0344,35275,-0.0490,0.1991,0.3262,-0.0203,-0.0492,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94549,201607,18780,"Craig, CO",923,174900,,,173,,,...,204604,,,195,,,0.1272,,,
94550,201607,46900,"Vernon, TX",924,139000,,,3,,,...,172800,,,5,,,0.6667,,,
94551,201607,49820,"Zapata, TX",925,104000,,,27,,,...,121803,,,27,,,,,,
94552,201607,37780,"Pecos, TX",926,117700,,,5,,,...,101975,,,5,,,,,,


In [61]:
# Extract the first part of the cbsa_title before the first hyphen and the first state abbreviation
realtor_data['cbsa_title'] = realtor_data['cbsa_title'].str.replace(r'^(.*?)-.*?,\s(\w{2}).*$', r'\1, \2')

realtor_data

  realtor_data['cbsa_title'] = realtor_data['cbsa_title'].str.replace(r'^(.*?)-.*?,\s(\w{2}).*$', r'\1, \2')


Unnamed: 0,month_date_yyyymm,cbsa_code,cbsa_title,HouseholdRank,median_listing_price,median_listing_price_mm,median_listing_price_yy,active_listing_count,active_listing_count_mm,active_listing_count_yy,...,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,quality_flag
0,202412,35620,"New York, NY",1,749000,-0.0013,0.0202,29297,-0.1344,0.0028,...,1648167,0.0067,-0.0630,50240,-0.0274,0.1692,0.7317,0.1921,0.2503,0.0
1,202412,31080,"Los Angeles, CA",2,1094000,-0.0319,-0.0055,12686,-0.1700,0.2630,...,2398115,0.0180,-0.0783,18822,-0.1603,0.1937,0.4864,0.0222,-0.0788,0.0
2,202412,16980,"Chicago, IL",3,347450,-0.0346,-0.0073,14155,-0.1598,0.0636,...,515696,-0.0257,-0.0355,22540,-0.1740,0.0425,0.5951,-0.0286,-0.0320,0.0
3,202412,19100,"Dallas, TX",4,422450,-0.0130,-0.0289,22744,-0.1079,0.3111,...,600314,-0.0159,-0.0448,30828,-0.1221,0.2187,0.3557,-0.0229,-0.0994,0.0
4,202412,26420,"Houston, TX",5,361405,-0.0098,0.0039,26614,-0.0360,0.2450,...,487031,-0.0129,-0.0344,35275,-0.0490,0.1991,0.3262,-0.0203,-0.0492,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94549,201607,18780,"Craig, CO",923,174900,,,173,,,...,204604,,,195,,,0.1272,,,
94550,201607,46900,"Vernon, TX",924,139000,,,3,,,...,172800,,,5,,,0.6667,,,
94551,201607,49820,"Zapata, TX",925,104000,,,27,,,...,121803,,,27,,,,,,
94552,201607,37780,"Pecos, TX",926,117700,,,5,,,...,101975,,,5,,,,,,


In [62]:
# convert 'month_date_yyyymm' to datetime
realtor_data['month_date_yyyymm'] = pd.to_datetime(realtor_data['month_date_yyyymm'], format='%Y%m')

realtor_data

Unnamed: 0,month_date_yyyymm,cbsa_code,cbsa_title,HouseholdRank,median_listing_price,median_listing_price_mm,median_listing_price_yy,active_listing_count,active_listing_count_mm,active_listing_count_yy,...,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,quality_flag
0,2024-12-01,35620,"New York, NY",1,749000,-0.0013,0.0202,29297,-0.1344,0.0028,...,1648167,0.0067,-0.0630,50240,-0.0274,0.1692,0.7317,0.1921,0.2503,0.0
1,2024-12-01,31080,"Los Angeles, CA",2,1094000,-0.0319,-0.0055,12686,-0.1700,0.2630,...,2398115,0.0180,-0.0783,18822,-0.1603,0.1937,0.4864,0.0222,-0.0788,0.0
2,2024-12-01,16980,"Chicago, IL",3,347450,-0.0346,-0.0073,14155,-0.1598,0.0636,...,515696,-0.0257,-0.0355,22540,-0.1740,0.0425,0.5951,-0.0286,-0.0320,0.0
3,2024-12-01,19100,"Dallas, TX",4,422450,-0.0130,-0.0289,22744,-0.1079,0.3111,...,600314,-0.0159,-0.0448,30828,-0.1221,0.2187,0.3557,-0.0229,-0.0994,0.0
4,2024-12-01,26420,"Houston, TX",5,361405,-0.0098,0.0039,26614,-0.0360,0.2450,...,487031,-0.0129,-0.0344,35275,-0.0490,0.1991,0.3262,-0.0203,-0.0492,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94549,2016-07-01,18780,"Craig, CO",923,174900,,,173,,,...,204604,,,195,,,0.1272,,,
94550,2016-07-01,46900,"Vernon, TX",924,139000,,,3,,,...,172800,,,5,,,0.6667,,,
94551,2016-07-01,49820,"Zapata, TX",925,104000,,,27,,,...,121803,,,27,,,,,,
94552,2016-07-01,37780,"Pecos, TX",926,117700,,,5,,,...,101975,,,5,,,,,,


In [63]:
realtor_data.columns

Index(['month_date_yyyymm', 'cbsa_code', 'cbsa_title', 'HouseholdRank',
       'median_listing_price', 'median_listing_price_mm',
       'median_listing_price_yy', 'active_listing_count',
       'active_listing_count_mm', 'active_listing_count_yy',
       'median_days_on_market', 'median_days_on_market_mm',
       'median_days_on_market_yy', 'new_listing_count', 'new_listing_count_mm',
       'new_listing_count_yy', 'price_increased_count',
       'price_increased_count_mm', 'price_increased_count_yy',
       'price_reduced_count', 'price_reduced_count_mm',
       'price_reduced_count_yy', 'pending_listing_count',
       'pending_listing_count_mm', 'pending_listing_count_yy',
       'median_listing_price_per_square_foot',
       'median_listing_price_per_square_foot_mm',
       'median_listing_price_per_square_foot_yy', 'median_square_feet',
       'median_square_feet_mm', 'median_square_feet_yy',
       'average_listing_price', 'average_listing_price_mm',
       'average_listing_price

In [64]:
realtor_data = realtor_data[['month_date_yyyymm','cbsa_code','cbsa_title', 'new_listing_count']]

realtor_data.columns = ['date', 'id', 'name', 'new_listings']

realtor_data

Unnamed: 0,date,id,name,new_listings
0,2024-12-01,35620,"New York, NY",7948
1,2024-12-01,31080,"Los Angeles, CA",4324
2,2024-12-01,16980,"Chicago, IL",4936
3,2024-12-01,19100,"Dallas, TX",6278
4,2024-12-01,26420,"Houston, TX",7070
...,...,...,...,...
94549,2016-07-01,18780,"Craig, CO",28
94550,2016-07-01,46900,"Vernon, TX",0
94551,2016-07-01,49820,"Zapata, TX",4
94552,2016-07-01,37780,"Pecos, TX",0


In [65]:
# import the top_metros data. It's a JSON with a list.
import json

with open('data/top_metros.json') as f:
    top_metros = json.load(f)

# restrict the realtor data to only the top metros
realtor_data = realtor_data[realtor_data['name'].isin(top_metros)]

realtor_data

Unnamed: 0,date,id,name,new_listings
0,2024-12-01,35620,"New York, NY",7948
1,2024-12-01,31080,"Los Angeles, CA",4324
2,2024-12-01,16980,"Chicago, IL",4936
3,2024-12-01,19100,"Dallas, TX",6278
4,2024-12-01,26420,"Houston, TX",7070
...,...,...,...,...
93673,2016-07-01,15380,"Buffalo, NY",1248
93674,2016-07-01,25540,"Hartford, CT",1956
93676,2016-07-01,13820,"Birmingham, AL",1936
93678,2016-07-01,41620,"Salt Lake City, UT",1984


In [69]:
# convert metros.date to datetime
metros['date'] = pd.to_datetime(metros['date'])

metros.dtypes

name                  object
date          datetime64[ns]
zori                 float64
lat                  float64
lng                  float64
pop_2023             float64
state                 object
region                object
zori_index           float64
dtype: object

In [70]:
# add the pop_2023 column to the realtor data. It will be merged on the name and date columns
realtor_data = realtor_data.merge(metros[['name', 'date', 'pop_2023']], on=['name', 'date'], how='left')

In [71]:
# sort by date, then name with earliest date first
realtor_data = realtor_data.sort_values(['date', 'name'])

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023
4553,2016-07-01,12060,"Atlanta, GA",11888,6307261.0
4570,2016-07-01,12420,"Austin, TX",3668,2473275.0
4564,2016-07-01,12580,"Baltimore, MD",5048,2834316.0
4587,2016-07-01,13820,"Birmingham, AL",1936,1184290.0
4554,2016-07-01,14460,"Boston, MA",6268,4919179.0
...,...,...,...,...,...
35,2024-12-01,41940,"San Jose, CA",352,1945767.0
13,2024-12-01,42660,"Seattle, WA",1608,4044837.0
16,2024-12-01,45300,"Tampa, FL",4614,3342963.0
32,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0


In [72]:
# Calculate the cumulative sum of new_listings for each id since the earliest date
realtor_data['cumulative_new_listings'] = realtor_data.groupby('id')['new_listings'].cumsum()

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings
4553,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888
4570,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668
4564,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048
4587,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936
4554,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268
...,...,...,...,...,...,...
35,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238
13,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438
16,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218
32,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784


In [73]:
# add a state column that extracts the state from the name column
realtor_data['state'] = realtor_data['name'].str.extract(r',\s(.*)$')

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings,state
4553,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888,GA
4570,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668,TX
4564,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048,MD
4587,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936,AL
4554,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268,MA
...,...,...,...,...,...,...,...
35,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238,CA
13,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438,WA
16,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218,FL
32,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784,VA


In [75]:
# add region col
realtor_data = realtor_data.merge(regions, on='state', how='left')

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings,state,region
0,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888,GA,Southeast
1,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668,TX,Southwest
2,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048,MD,Northest
3,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936,AL,Southeast
4,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268,MA,Northest
...,...,...,...,...,...,...,...,...
4585,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238,CA,West
4586,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438,WA,West
4587,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218,FL,Southeast
4588,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784,VA,Southeast


In [76]:
# add zori and zori_index from metros to realtor_data
realtor_data = realtor_data.merge(metros[['name', 'date', 'zori', 'zori_index']], on=['name', 'date'], how='left')

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings,state,region,zori,zori_index
0,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888,GA,Southeast,1133.885137,82.561987
1,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668,TX,Southwest,1286.087094,90.271224
2,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048,MD,Northest,1374.881054,94.719129
3,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936,AL,Southeast,922.773482,90.478772
4,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268,MA,Northest,2129.591065,91.341245
...,...,...,...,...,...,...,...,...,...,...
4585,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238,CA,West,3296.470892,111.343869
4586,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438,WA,West,2220.617862,124.087570
4587,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218,FL,Southeast,2057.590695,152.851085
4588,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784,VA,Southeast,1764.030482,142.402366


In [77]:
# make the zori_index an index of zori where july 2016 is 100
base_zori = metros[metros['date'] == '2016-07-01'].set_index('name')['zori']

realtor_data['zori_index'] = realtor_data.apply(lambda row: (row['zori'] / base_zori[row['name']]) * 100 if row['name'] in base_zori else None, axis=1)

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings,state,region,zori,zori_index
0,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888,GA,Southeast,1133.885137,100.000000
1,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668,TX,Southwest,1286.087094,100.000000
2,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048,MD,Northest,1374.881054,100.000000
3,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936,AL,Southeast,922.773482,100.000000
4,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268,MA,Northest,2129.591065,100.000000
...,...,...,...,...,...,...,...,...,...,...
4585,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238,CA,West,3296.470892,120.084045
4586,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438,WA,West,2220.617862,143.926821
4587,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218,FL,Southeast,2057.590695,180.133399
4588,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784,VA,Southeast,1764.030482,156.396560


In [78]:
# create a new_listings per 10k people column and cumulative_new_listings per 10k people column
realtor_data['new_per_10k'] = (realtor_data['new_listings'] / realtor_data['pop_2023']) * 10000

realtor_data['cumulative_per_10k'] = (realtor_data['cumulative_new_listings'] / realtor_data['pop_2023']) * 10000

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings,state,region,zori,zori_index,new_per_10k,cumulative_per_10k
0,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888,GA,Southeast,1133.885137,100.000000,18.848118,18.848118
1,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668,TX,Southwest,1286.087094,100.000000,14.830538,14.830538
2,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048,MD,Northest,1374.881054,100.000000,17.810294,17.810294
3,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936,AL,Southeast,922.773482,100.000000,16.347347,16.347347
4,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268,MA,Northest,2129.591065,100.000000,12.741964,12.741964
...,...,...,...,...,...,...,...,...,...,...,...,...
4585,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238,CA,West,3296.470892,120.084045,1.809055,669.340163
4586,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438,WA,West,2220.617862,143.926821,3.975438,1224.865180
4587,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218,FL,Southeast,2057.590695,180.133399,13.802127,1846.320166
4588,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784,VA,Southeast,1764.030482,156.396560,7.900764,1565.515069


In [79]:
# add a yoy change column for zori
realtor_data['zori_yoy'] = realtor_data.groupby('name')['zori'].pct_change(12)

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings,state,region,zori,zori_index,new_per_10k,cumulative_per_10k,zori_yoy
0,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888,GA,Southeast,1133.885137,100.000000,18.848118,18.848118,
1,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668,TX,Southwest,1286.087094,100.000000,14.830538,14.830538,
2,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048,MD,Northest,1374.881054,100.000000,17.810294,17.810294,
3,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936,AL,Southeast,922.773482,100.000000,16.347347,16.347347,
4,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268,MA,Northest,2129.591065,100.000000,12.741964,12.741964,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4585,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238,CA,West,3296.470892,120.084045,1.809055,669.340163,0.046295
4586,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438,WA,West,2220.617862,143.926821,3.975438,1224.865180,0.038656
4587,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218,FL,Southeast,2057.590695,180.133399,13.802127,1846.320166,0.024556
4588,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784,VA,Southeast,1764.030482,156.396560,7.900764,1565.515069,0.050527


In [80]:
# create a new_listings_yoy column
realtor_data['new_listings_yoy'] = realtor_data.groupby('name')['new_listings'].pct_change(12)

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings,state,region,zori,zori_index,new_per_10k,cumulative_per_10k,zori_yoy,new_listings_yoy
0,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888,GA,Southeast,1133.885137,100.000000,18.848118,18.848118,,
1,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668,TX,Southwest,1286.087094,100.000000,14.830538,14.830538,,
2,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048,MD,Northest,1374.881054,100.000000,17.810294,17.810294,,
3,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936,AL,Southeast,922.773482,100.000000,16.347347,16.347347,,
4,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268,MA,Northest,2129.591065,100.000000,12.741964,12.741964,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4585,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238,CA,West,3296.470892,120.084045,1.809055,669.340163,0.046295,-0.185185
4586,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438,WA,West,2220.617862,143.926821,3.975438,1224.865180,0.038656,-0.073733
4587,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218,FL,Southeast,2057.590695,180.133399,13.802127,1846.320166,0.024556,0.080056
4588,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784,VA,Southeast,1764.030482,156.396560,7.900764,1565.515069,0.050527,0.168874


In [82]:
# Create a mask for dates since January 2020
mask = realtor_data['date'] >= '2020-01-01'

# Initialize the new column with NaN values
realtor_data['cumulative_since_2020'] = pd.NA

# Calculate the cumulative sum for dates since January 2020
realtor_data.loc[mask, 'cumulative_since_2020'] = realtor_data[mask].groupby('name')['new_listings'].cumsum()

realtor_data

Unnamed: 0,date,id,name,new_listings,pop_2023,cumulative_new_listings,state,region,zori,zori_index,new_per_10k,cumulative_per_10k,zori_yoy,new_listings_yoy,cumulative_since_2020
0,2016-07-01,12060,"Atlanta, GA",11888,6307261.0,11888,GA,Southeast,1133.885137,100.000000,18.848118,18.848118,,,
1,2016-07-01,12420,"Austin, TX",3668,2473275.0,3668,TX,Southwest,1286.087094,100.000000,14.830538,14.830538,,,
2,2016-07-01,12580,"Baltimore, MD",5048,2834316.0,5048,MD,Northest,1374.881054,100.000000,17.810294,17.810294,,,
3,2016-07-01,13820,"Birmingham, AL",1936,1184290.0,1936,AL,Southeast,922.773482,100.000000,16.347347,16.347347,,,
4,2016-07-01,14460,"Boston, MA",6268,4919179.0,6268,MA,Northest,2129.591065,100.000000,12.741964,12.741964,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4585,2024-12-01,41940,"San Jose, CA",352,1945767.0,130238,CA,West,3296.470892,120.084045,1.809055,669.340163,0.046295,-0.185185,75106
4586,2024-12-01,42660,"Seattle, WA",1608,4044837.0,495438,WA,West,2220.617862,143.926821,3.975438,1224.865180,0.038656,-0.073733,258246
4587,2024-12-01,45300,"Tampa, FL",4614,3342963.0,617218,FL,Southeast,2057.590695,180.133399,13.802127,1846.320166,0.024556,0.080056,349814
4588,2024-12-01,47260,"Virginia Beach, VA",1412,1787169.0,279784,VA,Southeast,1764.030482,156.396560,7.900764,1565.515069,0.050527,0.168874,153312


In [85]:
# make it numeric, coerce errors to NaN
realtor_data['cumulative_since_2020'] = pd.to_numeric(realtor_data['cumulative_since_2020'], errors='coerce')

realtor_data.dtypes

date                       datetime64[ns]
id                                  int64
name                               object
new_listings                        int64
pop_2023                          float64
cumulative_new_listings             int64
state                              object
region                             object
zori                              float64
zori_index                        float64
new_per_10k                       float64
cumulative_per_10k                float64
zori_yoy                          float64
new_listings_yoy                  float64
cumulative_since_2020             float64
dtype: object

In [87]:
# make it per 10k people
realtor_data['cumulative_since_2020'] = (realtor_data['cumulative_since_2020'] / realtor_data['pop_2023']) * 10000

In [88]:
realtor_data.to_csv('data/realtor_zori_indexed.csv', index=False)