### Section 1: Redfin data wrangling
Turn the Redfin data into a limited dataset that has year, quarter, zip, and median sale price.
This data only goes back to 2012.

In [178]:
import pandas as pd
df = pd.read_csv("data/zip_code_market_tracker.tsv000", sep='\t')

In [179]:
df.head()

Unnamed: 0,period_begin,period_end,period_duration,region_type,region_type_id,table_id,is_seasonally_adjusted,region,city,state,...,sold_above_list_yoy,price_drops,price_drops_mom,price_drops_yoy,off_market_in_two_weeks,off_market_in_two_weeks_mom,off_market_in_two_weeks_yoy,parent_metro_region,parent_metro_region_metro_code,last_updated
0,2022-06-01,2022-08-31,90,zip code,2,30725,f,Zip Code: 71923,,Arkansas,...,-0.090769,,,,0.25,-0.016667,0.083333,"Arkadelphia, AR",11660,2023-04-09 14:55:08
1,2019-12-01,2020-02-29,90,zip code,2,21126,f,Zip Code: 49265,,Michigan,...,-0.185294,,,,0.2,0.2,-0.3,"Adrian, MI",10300,2023-04-09 14:55:08
2,2012-07-01,2012-09-30,90,zip code,2,34119,f,Zip Code: 78727,,Texas,...,0.0,,,,,,,"Austin, TX",12420,2023-04-09 14:55:08
3,2012-11-01,2013-01-31,90,zip code,2,13119,f,Zip Code: 31503,,Georgia,...,,,,,,,,"Waycross, GA",48180,2023-04-09 14:55:08
4,2016-04-01,2016-06-30,90,zip code,2,34063,f,Zip Code: 78648,,Texas,...,,,,,,,,"Austin, TX",12420,2023-04-09 14:55:08


In [180]:
len(df)
min(df['period_end'])

'2012-03-31'

In [181]:
df.columns

Index(['period_begin', 'period_end', 'period_duration', 'region_type',
       'region_type_id', 'table_id', 'is_seasonally_adjusted', 'region',
       'city', 'state', 'state_code', 'property_type', 'property_type_id',
       'median_sale_price', 'median_sale_price_mom', 'median_sale_price_yoy',
       'median_list_price', 'median_list_price_mom', 'median_list_price_yoy',
       'median_ppsf', 'median_ppsf_mom', 'median_ppsf_yoy', 'median_list_ppsf',
       'median_list_ppsf_mom', 'median_list_ppsf_yoy', 'homes_sold',
       'homes_sold_mom', 'homes_sold_yoy', 'pending_sales',
       'pending_sales_mom', 'pending_sales_yoy', 'new_listings',
       'new_listings_mom', 'new_listings_yoy', 'inventory', 'inventory_mom',
       'inventory_yoy', 'months_of_supply', 'months_of_supply_mom',
       'months_of_supply_yoy', 'median_dom', 'median_dom_mom',
       'median_dom_yoy', 'avg_sale_to_list', 'avg_sale_to_list_mom',
       'avg_sale_to_list_yoy', 'sold_above_list', 'sold_above_list_mom',
 

In [182]:
df['property_type'].unique()

array(['All Residential', 'Townhouse', 'Single Family Residential',
       'Multi-Family (2-4 Unit)', 'Condo/Co-op'], dtype=object)

In [183]:
count = df['property_type'].value_counts()
count

All Residential              2318669
Single Family Residential    2274100
Condo/Co-op                   894931
Townhouse                     737934
Multi-Family (2-4 Unit)       686305
Name: property_type, dtype: int64

In [184]:
# create year and quarter columns from the dates
df['period_begin'] = pd.to_datetime(df['period_begin'])
df['period_end'] = pd.to_datetime(df['period_end'])
df['year'] = df['period_end'].dt.year
df['quarter'] = df['period_end'].dt.quarter

In [185]:
df['zip_code']= df['region'].str.split(':').str[-1].str.strip()
df['zip_code']=df['zip_code'].astype(float)

In [186]:
quarterly_prices_by_zip = df[['state', 'zip_code', 'year', 'quarter', 'median_sale_price', 'homes_sold', 'property_type', 'months_of_supply']]

In [187]:
quarterly_prices_by_zip.to_csv('data/quarterly_prices_by_zip.csv', index = False)

In [188]:
quarterly_prices_by_zip.head()

Unnamed: 0,state,zip_code,year,quarter,median_sale_price,homes_sold,property_type,months_of_supply
0,Arkansas,71923.0,2022,3,167500.0,50.0,All Residential,
1,Michigan,49265.0,2020,1,294000.0,20.0,All Residential,
2,Texas,78727.0,2012,3,140000.0,1.0,Townhouse,
3,Georgia,31503.0,2013,1,60000.0,1.0,Single Family Residential,
4,Texas,78648.0,2016,2,229500.0,1.0,Multi-Family (2-4 Unit),


In [189]:
# Check for duplicate entries by zip code for a given year and quarter
#duplicates = quarterly_prices_by_zip.duplicated(subset=['zip_code', 'year', 'quarter'], keep=False)

# Print the duplicate entries
#print(duplicates)
# here we find that every row is a duplicate. 


# data exploration to see how to break out and sum values by zip code

In [190]:
oregon_test = quarterly_prices_by_zip[(quarterly_prices_by_zip["zip_code"]==97202) & (quarterly_prices_by_zip['year']==2018) & (quarterly_prices_by_zip['quarter']==3)]

In [191]:
oregon_test

Unnamed: 0,state,zip_code,year,quarter,median_sale_price,homes_sold,property_type,months_of_supply
903510,Oregon,97202.0,2018,3,550000.0,155.0,Single Family Residential,
1750930,Oregon,97202.0,2018,3,536000.0,151.0,Single Family Residential,
1853912,Oregon,97202.0,2018,3,355500.0,12.0,Condo/Co-op,
2011276,Oregon,97202.0,2018,3,532500.0,201.0,All Residential,
2287109,Oregon,97202.0,2018,3,540000.0,15.0,Townhouse,
3356739,Oregon,97202.0,2018,3,484900.0,11.0,Townhouse,
3555966,Oregon,97202.0,2018,3,545000.0,161.0,Single Family Residential,
4394259,Oregon,97202.0,2018,3,525000.0,194.0,All Residential,
4630975,Oregon,97202.0,2018,3,511500.0,13.0,Multi-Family (2-4 Unit),
4948472,Oregon,97202.0,2018,3,525000.0,195.0,All Residential,


In [192]:
oregon_test.describe()

Unnamed: 0,zip_code,year,quarter,median_sale_price,homes_sold,months_of_supply
count,15.0,15.0,15.0,15.0,15.0,0.0
mean,97202.0,2018.0,3.0,491426.666667,78.666667,
std,0.0,0.0,0.0,68363.317405,83.533284,
min,97202.0,2018.0,3.0,335000.0,11.0,
25%,97202.0,2018.0,3.0,479325.0,13.5,
50%,97202.0,2018.0,3.0,525000.0,16.0,
75%,97202.0,2018.0,3.0,534250.0,158.0,
max,97202.0,2018.0,3.0,550000.0,201.0,


In [193]:
oregon_test['homes_sold'].sum()

oregon_test.groupby(['property_type']).agg({'property_type':'count'})

Unnamed: 0_level_0,property_type
property_type,Unnamed: 1_level_1
All Residential,3
Condo/Co-op,3
Multi-Family (2-4 Unit),3
Single Family Residential,3
Townhouse,3


In [194]:
oregon_test2 = quarterly_prices_by_zip[(quarterly_prices_by_zip["zip_code"]==97405) & (quarterly_prices_by_zip['year']>2020)]
oregon_test2

Unnamed: 0,state,zip_code,year,quarter,median_sale_price,homes_sold,property_type,months_of_supply
969033,Oregon,97405.0,2021,2,210000.0,9.0,Condo/Co-op,
1009962,Oregon,97405.0,2021,1,335000.0,3.0,Townhouse,
1216542,Oregon,97405.0,2022,4,377000.0,2.0,Townhouse,
1501749,Oregon,97405.0,2022,1,360000.0,5.0,Townhouse,
1623045,Oregon,97405.0,2022,4,535000.0,5.0,Multi-Family (2-4 Unit),
...,...,...,...,...,...,...,...,...
6749298,Oregon,97405.0,2021,1,435000.0,147.0,Single Family Residential,
6761902,Oregon,97405.0,2022,4,727000.0,5.0,Multi-Family (2-4 Unit),
6764936,Oregon,97405.0,2022,1,320000.0,11.0,Condo/Co-op,
6775911,Oregon,97405.0,2021,4,481431.0,187.0,Single Family Residential,


In [195]:
oregon_grouped = oregon_test2.groupby(['zip_code', 'year']).agg({'median_sale_price': 'mean', 'homes_sold': 'sum'}).reset_index()
print(oregon_grouped)

   zip_code  year  median_sale_price  homes_sold
0   97405.0  2021      416562.183333      4730.0
1   97405.0  2022      468069.093220      4222.0
2   97405.0  2023      435043.666667       630.0


In [196]:
oregon_grouped

Unnamed: 0,zip_code,year,median_sale_price,homes_sold
0,97405.0,2021,416562.183333,4730.0
1,97405.0,2022,468069.09322,4222.0
2,97405.0,2023,435043.666667,630.0


# creating the yearly dataset we want
We will be dropping multi-family from the analysis and just looking at single family
We will be summing sales reported per quarter, but then taking a weighted median value based on the number of observations
This should give us a dataset that has zipcode, year, normalized sale price, and homes sold, that are not multifamily
Then we can merge this with the rest of the data and use these prices as baselines for estimating previous year prices off of HPI data

In [197]:
# dropping the multi-family
quarterly_single_family_by_zip = quarterly_prices_by_zip.drop(quarterly_prices_by_zip[quarterly_prices_by_zip['property_type'] == 'Multi-Family (2-4 Unit)'].index)
quarterly_single_family_by_zip['weighted_mean_sale_price'] = quarterly_single_family_by_zip['median_sale_price']*quarterly_single_family_by_zip['homes_sold']
yearly_single_family_by_zip = quarterly_single_family_by_zip.groupby(['zip_code', 'year']).agg({'weighted_mean_sale_price':'sum', 'homes_sold': 'sum'}).reset_index()

In [269]:
yearly_single_family_by_zip['weighted_mean_sale_price'] = yearly_single_family_by_zip['weighted_mean_sale_price']/yearly_single_family_by_zip['homes_sold']
yearly_single_family_by_zip['zip_code'] = yearly_single_family_by_zip['zip_code'].astype(float)
yearly_single_family_by_zip

Unnamed: 0,zip_code,year,weighted_mean_sale_price,homes_sold
0,501.0,2012,5000.000000,6.0
1,501.0,2013,7083.333333,6.0
2,501.0,2015,2465.240000,10.0
3,501.0,2016,51500.000000,2.0
4,501.0,2017,9710.437500,4.0
...,...,...,...,...
237993,99725.0,2016,6250.000000,6.0
237994,99725.0,2018,3333.333333,6.0
237995,99725.0,2020,1800.000000,6.0
237996,99725.0,2021,4083.333333,6.0


In [270]:
oregon_test3 = yearly_single_family_by_zip[(yearly_single_family_by_zip["zip_code"]==97405) & (yearly_single_family_by_zip['year']>2010)]
oregon_test3['weighted_mean_sale_price']= round(oregon_test3['weighted_mean_sale_price'], 0).astype(int)
oregon_test3['homes_sold']=oregon_test3['homes_sold'].astype(int)
oregon_test3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oregon_test3['weighted_mean_sale_price']= round(oregon_test3['weighted_mean_sale_price'], 0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oregon_test3['homes_sold']=oregon_test3['homes_sold'].astype(int)


Unnamed: 0,zip_code,year,weighted_mean_sale_price,homes_sold
230360,97405.0,2012,0,2953
230361,97405.0,2013,0,3882
230362,97405.0,2014,0,3763
230363,97405.0,2015,0,4750
230364,97405.0,2016,0,5086
230365,97405.0,2017,0,4948
230366,97405.0,2018,0,4928
230367,97405.0,2019,0,4557
230368,97405.0,2020,0,4201
230369,97405.0,2021,0,4646


### Section 2: merge HPI data going back as far as the data exists


In [271]:
# read in the house price index by zip code (this is yearly not quarterly)
hpi_zip5_df = pd.read_excel('data/HPI_AT_BDL_ZIP5.xlsx', skiprows=6, sheet_name='ZIP5')
#hpi_zip5_df.head()

In [272]:
# calculate an HPI with a 2012 base

#hpi_zip5_df.head(30)

In [273]:
#problem = hpi_zip5_df[hpi_zip5_df['HPI'] == '.']
#problem

# replacing string '.' with a value i can find later as i make everything into floats later on
hpi_zip5_df['HPI'] = hpi_zip5_df['HPI'].replace('.', '-1')


In [274]:
# Filter the data to include only the rows with year 2012
df_2012 = hpi_zip5_df[hpi_zip5_df['Year'] == 2012]
df_2012['HPI'] = df_2012['HPI'].astype(float)
# Find the HPI value for 2012 for each zip code
df_2012['HPI with 2012 base'] = 100
df_2012['HPI from 2012'] = df_2012['HPI']
df_2012

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2012['HPI'] = df_2012['HPI'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2012['HPI with 2012 base'] = 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2012['HPI from 2012'] = df_2012['HPI']


Unnamed: 0,Five-Digit ZIP Code,Year,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base,HPI with 2012 base,HPI from 2012
28,1001,2012,-4.40,279.95,149.40,146.43,100,279.95
68,1002,2012,-2.37,348.54,184.84,153.00,100,348.54
99,1005,2012,-4.70,150.95,.,122.34,100,150.95
137,1007,2012,-1.67,262.24,164.68,149.66,100,262.24
158,1008,2012,-0.03,122.54,.,.,100,122.54
...,...,...,...,...,...,...,...,...
630422,99712,2012,-0.03,152.05,202.00,138.88,100,152.05
630466,99801,2012,2.80,353.65,254.27,152.90,100,353.65
630491,99824,2012,4.83,156.12,.,138.38,100,156.12
630529,99835,2012,2.80,298.33,276.46,157.45,100,298.33


In [275]:
# merge the 2012 value onto the full zip hpi data
hpi_zip5_df_merged = pd.merge(hpi_zip5_df, df_2012[['Five-Digit ZIP Code', 'HPI from 2012']], on= 'Five-Digit ZIP Code')

In [276]:
hpi_zip5_df_merged.head(100)

Unnamed: 0,Five-Digit ZIP Code,Year,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base,HPI from 2012
0,1001,1984,.,100.00,53.37,52.31,279.95
1,1001,1985,16.00,116.00,61.91,60.68,279.95
2,1001,1986,14.21,132.48,70.70,69.30,279.95
3,1001,1987,21.08,160.41,85.61,83.90,279.95
4,1001,1988,17.63,188.68,100.69,98.69,279.95
...,...,...,...,...,...,...,...
95,1005,2008,2.67,201.24,.,163.10,150.95
96,1005,2009,-11.86,177.37,.,143.75,150.95
97,1005,2010,-2.56,172.83,.,140.07,150.95
98,1005,2011,-8.36,158.38,.,128.37,150.95


In [277]:
# now calculate hpi based on 2012 for every row
hpi_zip5_df_merged['HPI']=hpi_zip5_df_merged['HPI'].astype(float)
hpi_zip5_df_merged['HPI with 2012 base'] = hpi_zip5_df_merged['HPI']/hpi_zip5_df_merged['HPI from 2012']


In [278]:
# convert the zip code to float for merging later
hpi_zip5_df_merged['Five-Digit ZIP Code']=hpi_zip5_df_merged['Five-Digit ZIP Code'].astype(float)
hpi_zip5_df_merged.head(100)

Unnamed: 0,Five-Digit ZIP Code,Year,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base,HPI from 2012,HPI with 2012 base
0,1001.0,1984,.,100.00,53.37,52.31,279.95,0.357207
1,1001.0,1985,16.00,116.00,61.91,60.68,279.95,0.414360
2,1001.0,1986,14.21,132.48,70.70,69.30,279.95,0.473227
3,1001.0,1987,21.08,160.41,85.61,83.90,279.95,0.572995
4,1001.0,1988,17.63,188.68,100.69,98.69,279.95,0.673977
...,...,...,...,...,...,...,...,...
95,1005.0,2008,2.67,201.24,.,163.10,150.95,1.333157
96,1005.0,2009,-11.86,177.37,.,143.75,150.95,1.175025
97,1005.0,2010,-2.56,172.83,.,140.07,150.95,1.144949
98,1005.0,2011,-8.36,158.38,.,128.37,150.95,1.049222


In [279]:
#merged_prices_hpi = pd.merge(hpi_zip5_df_merged, yearly_single_family_by_zip[['zip_code', 'weighted_mean_sale_price']], left_on='Five-Digit ZIP Code', right_on='zip_code')
merged_prices_hpi = hpi_zip5_df_merged.merge(yearly_single_family_by_zip[['zip_code', 'year', 'weighted_mean_sale_price']], left_on=['Five-Digit ZIP Code', 'Year'], right_on=['zip_code', 'year'])

merged_prices_hpi


Unnamed: 0,Five-Digit ZIP Code,Year,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base,HPI from 2012,HPI with 2012 base,zip_code,year,weighted_mean_sale_price
0,1001.0,2012,-4.40,279.95,149.40,146.43,279.95,1.000000,1001.0,2012,0.272763
1,1001.0,2013,1.09,283.00,151.03,148.03,279.95,1.010895,1001.0,2013,0.189874
2,1001.0,2014,0.89,285.52,152.37,149.35,279.95,1.019896,1001.0,2014,0.139873
3,1001.0,2015,1.98,291.17,155.39,152.30,279.95,1.040079,1001.0,2015,0.157378
4,1001.0,2016,2.67,298.96,159.55,156.38,279.95,1.067905,1001.0,2016,0.122202
...,...,...,...,...,...,...,...,...,...,...,...
164573,99712.0,2018,5.54,171.64,228.03,156.77,152.05,1.128839,99712.0,2018,1.235424
164574,99712.0,2019,-0.28,171.16,227.39,156.33,152.05,1.125682,99712.0,2019,1.669118
164575,99712.0,2020,1.73,174.12,231.32,159.03,152.05,1.145150,99712.0,2020,1.951323
164576,99712.0,2021,7.93,187.92,249.65,171.64,152.05,1.235909,99712.0,2021,2.786951


In [280]:
sample1 = yearly_single_family_by_zip[yearly_single_family_by_zip['zip_code']==97405]
sample1

Unnamed: 0,zip_code,year,weighted_mean_sale_price,homes_sold
230360,97405.0,2012,0.026766,2953.0
230361,97405.0,2013,0.017354,3882.0
230362,97405.0,2014,0.018886,3763.0
230363,97405.0,2015,0.012006,4750.0
230364,97405.0,2016,0.011351,5086.0
230365,97405.0,2017,0.012721,4948.0
230366,97405.0,2018,0.013929,4928.0
230367,97405.0,2019,0.01774,4557.0
230368,97405.0,2020,0.022595,4201.0
230369,97405.0,2021,0.021564,4646.0


In [281]:
sample = merged_prices_hpi[merged_prices_hpi['Five-Digit ZIP Code']==97405]
sample

Unnamed: 0,Five-Digit ZIP Code,Year,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base,HPI from 2012,HPI with 2012 base,zip_code,year,weighted_mean_sale_price
159429,97405.0,2012,-1.65,604.3,235.08,137.76,604.3,1.0,97405.0,2012,0.026766
159430,97405.0,2013,3.46,625.22,243.22,142.53,604.3,1.034619,97405.0,2013,0.017354
159431,97405.0,2014,5.46,659.35,256.5,150.31,604.3,1.091097,97405.0,2014,0.018886
159432,97405.0,2015,3.75,684.08,266.12,155.94,604.3,1.132021,97405.0,2015,0.012006
159433,97405.0,2016,6.6,729.24,283.69,166.24,604.3,1.206752,97405.0,2016,0.011351
159434,97405.0,2017,7.62,784.81,305.3,178.91,604.3,1.298709,97405.0,2017,0.012721
159435,97405.0,2018,7.22,841.49,327.35,191.83,604.3,1.392504,97405.0,2018,0.013929
159436,97405.0,2019,5.69,889.33,345.96,202.73,604.3,1.47167,97405.0,2019,0.01774
159437,97405.0,2020,5.06,934.36,363.48,213.0,604.3,1.546186,97405.0,2020,0.022595
159438,97405.0,2021,13.66,1062.01,413.13,242.1,604.3,1.757422,97405.0,2021,0.021564


### Section 3: Recession data


In [282]:
recession_df = pd.read_csv("data/recessionflag.csv")
recession_df['DATE'] = pd.to_datetime(recession_df['DATE'])
recession_df['year'] = recession_df['DATE'].dt.year
recession_df['quarter'] = recession_df['DATE'].dt.quarter
recession_df = recession_df.rename(columns={'JHDUSRGDPBR':'recession_flag'})
recession_df.head()

Unnamed: 0,DATE,recession_flag,year,quarter
0,1967-10-01,0.0,1967,4
1,1968-01-01,0.0,1968,1
2,1968-04-01,0.0,1968,2
3,1968-07-01,0.0,1968,3
4,1968-10-01,0.0,1968,4
