In [113]:
import pandas as pd
import numpy as np

In [114]:
# State labels to keep, exactly as they are matched against the STATE column
# (single states plus the multi-state combinations listed here).
target_states = ['FL','NC','GA','VA','MD','SC','WV','DC',
                 'GA-SC','NC-SC', 'TN-GA', 'TN-VA', 'GA-AL',
                 'MD-WV', 'WV-KY-OH', 'SC-NC', 'MD-DE', 'VA-NC',
                 'DC-VA-MD-WV', 'WV-OH', 'VA-WV']

# MSA -> ZIP lookup table; STATE and MSA are whitespace-trimmed before filtering.
msa_zip = pd.read_csv('MSA_to_ZIP.csv')
msa_zip['STATE'] = msa_zip['STATE'].str.strip()
msa_zip['MSA'] = msa_zip['MSA'].str.strip()
# .copy() makes the filtered frame independent of the full table, so the
# column assignments performed on it afterwards do not raise
# SettingWithCopyWarning (or silently act on a temporary).
msa_zip = msa_zip[msa_zip['STATE'].isin(target_states)].copy()

In [115]:
# Normalise the MSA label that is later used as the join key:
# hyphens become spaces, then every period is removed.
msa_label = msa_zip['MSA'].str.replace('-', ' ', regex=False)
msa_zip['MSA'] = msa_label.str.replace('.', '', regex=False)
# Hold the ZIPS column with pandas' dedicated string dtype.
msa_zip['ZIPS'] = msa_zip['ZIPS'].astype('string')

In [116]:
# Master table indexed by CBSA_name; the cells below attach one indicator
# column to it at a time.
msa_columns = ['MSA', 'STATE', 'ZIPS']
data = msa_zip.set_index('CBSA_name')[msa_columns]
data.head()

Unnamed: 0_level_0,MSA,STATE,ZIPS
CBSA_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Albany, GA",Albany,GA,"[31703, 31701, 31721, 31763, 31704, 39842, 317..."
"Albemarle, NC",Albemarle,NC,"[28097, 28124, 28002, 28137, 28127, 28107, 280..."
"Americus, GA",Americus,GA,"[31806, 31719, 31058, 31711, 31081, 31803, 310..."
"Arcadia, FL",Arcadia,FL,"[34266, 34269, 34267, 33865, 34268]"
"Asheville, NC",Asheville,NC,"[28753, 28804, 28709, 28791, 28730, 28715, 288..."


## FRED House Price Index Data

In [117]:
# NOTE(review): the URL embeds a signed query string (`sig=`) whose `se=` field
# is 2021-12-25 -- presumably an expired access token; confirm and move it out
# of the notebook.
hpi_url = 'https://nycdsacapstone2021.blob.core.windows.net/fromsmh/House_Price_Index_MSA.csv?sp=r&st=2021-12-13T00:02:49Z&se=2021-12-25T08:02:49Z&spr=https&sv=2020-08-04&sr=b&sig=sciIDIM7bTIZY6FEDhaR2U8MlhAWU1sIBohmuQhAN8g%3D'

# Read with 'Date' as the index, then transpose so the original column headers
# become rows; reset_index() exposes them as a column called 'index', which is
# renamed to 'Name'.
HPI = (
    pd.read_csv(hpi_url, index_col='Date')
    .T
    .reset_index()
    .rename(columns={'index':'Name','Date':'index'})
)

In [118]:
# Remove the series-name boilerplate so only the metro name (plus any trailing
# state codes) is left. These are literal substrings, hence regex=False.
for boilerplate in ['all_transactions_house_price_index_for_', '_msa']:
    HPI['Name'] = HPI['Name'].str.replace(boilerplate, '', regex=False)

# State codes that may trail a series name (also reused by the market-hotness
# cells further down).
# NOTE(review): 'WVD' is presumably what remains of a '..._wv_msad' name once
# '_msa' has been cut out above -- confirm against the raw column names.
replace_states = ['FL','NC','GA','VA','MD','SC','WV','DC','TN','KY','OH','DE','AL','WVD']

# One anchored pattern strips every trailing '_<state>' token in a single pass,
# however many are chained (e.g. '_dc_va_md_wv'), instead of re-running one
# regex per state a fixed number of times.
state_suffix = '(?:_(?:' + '|'.join(code.lower() for code in replace_states) + '))+$'
HPI['Name'] = (
    HPI['Name']
    .str.replace(state_suffix, '', regex=True)
    .str.replace('_', ' ', regex=False)
)

In [119]:
# Title-case the cleaned names so they line up with the MSA labels in `data`.
# The vectorised .str accessor replaces the per-element lambda and leaves
# missing values as missing instead of raising on them.
HPI['Name'] = HPI['Name'].str.title()

In [120]:
# Single-date snapshot of the index, keyed by the cleaned metro name and
# exposed as one column called 'HPI'.
hpi_date = '2021-07-01'
HPI_latest = HPI.set_index('Name')[[hpi_date]].rename(columns={hpi_date: 'HPI'})

In [121]:
# Left-join the HPI snapshot onto the master table by metro name.
# Any 'HPI' column left over from a previous execution is dropped first, so
# re-running this cell does not pile up 'HPI_x' / 'HPI_y' columns.
# NOTE(review): the key carries no state, so metros that share a name would
# all match and duplicate rows -- confirm HPI_latest.index is unique.
data = data.drop(columns='HPI', errors='ignore').merge(
    HPI_latest, how='left', left_on='MSA', right_index=True)

In [122]:
# Preview the first five rows to eyeball the newly attached HPI column.
data.head(n=5)

Unnamed: 0_level_0,MSA,STATE,ZIPS,HPI
CBSA_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Albany, GA",Albany,GA,"[31703, 31701, 31721, 31763, 31704, 39842, 317...",187.61
"Albemarle, NC",Albemarle,NC,"[28097, 28124, 28002, 28137, 28127, 28107, 280...",
"Americus, GA",Americus,GA,"[31806, 31719, 31058, 31711, 31081, 31803, 310...",
"Arcadia, FL",Arcadia,FL,"[34266, 34269, 34267, 33865, 34268]",
"Asheville, NC",Asheville,NC,"[28753, 28804, 28709, 28791, 28730, 28715, 288...",373.46


## Market Hotness

In [123]:
# NOTE(review): the URL embeds a signed query string (`sig=`) whose `se=` field
# is 2021-12-25 -- presumably an expired access token; confirm and move it out
# of the notebook.
market_url = 'https://nycdsacapstone2021.blob.core.windows.net/fromsmh/Market_Hotness_Indicators_MSA.csv?sp=r&st=2021-12-13T00:04:00Z&se=2021-12-25T08:04:00Z&spr=https&sv=2020-08-04&sr=b&sig=JmSQ5ji9hpr%2FqGKqIx5ZvFQN8q0g2ypH1glJ1%2FcsU20%3D'
snapshot_date = '2021-11-01'

# Transpose so each original column header becomes a row label, and keep only
# the single snapshot-date column.
market = pd.read_csv(market_url).set_index('Date').T[[snapshot_date]]
market.index = market.index.str.replace('_cbsa', '', regex=False)

# Discard the 'percent_change_' rows with a boolean mask rather than an
# in-place drop on a derived slice; .copy() leaves an independent frame.
is_pct_change = market.index.str.contains('percent_change_', regex=False)
market = market[~is_pct_change].copy()

* Demand Score

In [124]:
# Rows of `market` whose label contains 'demand_score', with the snapshot
# column renamed to 'Demand_score'.
demand_score = market.filter(regex='demand_score', axis=0).rename(columns={'2021-11-01':'Demand_score'})

# Reduce each label to a title-cased metro name: drop the literal prefix, strip
# every trailing '_<state>' token in one anchored pass (rather than repeating
# one regex per state a fixed number of times), then turn '_' into spaces.
state_suffix = '(?:_(?:' + '|'.join(code.lower() for code in replace_states) + '))+$'
demand_score.index = (
    demand_score.index
    .str.replace('market_hotness_demand_score_in_', '', regex=False)
    .str.replace(state_suffix, '', regex=True)
    .str.replace('_', ' ', regex=False)
    .str.title()
)

In [125]:
# Left-join the demand score by metro name. A 'Demand_score' column from an
# earlier execution is dropped first so the cell can be re-run without
# creating '_x' / '_y' duplicates.
# NOTE(review): the key has no state component -- confirm demand_score.index
# is unique, otherwise rows in `data` are duplicated.
data = data.drop(columns='Demand_score', errors='ignore').merge(
    demand_score, how='left', left_on='MSA', right_index=True)

* Supply Score

In [126]:
# Rows of `market` whose label contains 'supply_score', with the snapshot
# column renamed to 'Supply_score'.
supply_score = market.filter(regex='supply_score', axis=0).rename(columns={'2021-11-01':'Supply_score'})

# Reduce each label to a title-cased metro name: drop the literal prefix, strip
# every trailing '_<state>' token in one anchored pass (rather than repeating
# one regex per state a fixed number of times), then turn '_' into spaces.
state_suffix = '(?:_(?:' + '|'.join(code.lower() for code in replace_states) + '))+$'
supply_score.index = (
    supply_score.index
    .str.replace('market_hotness_supply_score_in_', '', regex=False)
    .str.replace(state_suffix, '', regex=True)
    .str.replace('_', ' ', regex=False)
    .str.title()
)

In [127]:
# Left-join the supply score by metro name. A 'Supply_score' column from an
# earlier execution is dropped first so the cell can be re-run without
# creating '_x' / '_y' duplicates.
# NOTE(review): the key has no state component -- confirm supply_score.index
# is unique, otherwise rows in `data` are duplicated.
data = data.drop(columns='Supply_score', errors='ignore').merge(
    supply_score, how='left', left_on='MSA', right_index=True)

* Listing Views

In [128]:
# Rows of `market` for listing views per property relative to the United
# States, with the snapshot column renamed to 'listviews_vs_US'.
listing_views = market.filter(regex='listing_views_per_property_versus_the_united_states', axis=0).\
    rename(columns={'2021-11-01':'listviews_vs_US'})

# Reduce each label to a title-cased metro name: drop the literal prefix, strip
# every trailing '_<state>' token in one anchored pass (rather than repeating
# one regex per state a fixed number of times), then turn '_' into spaces.
state_suffix = '(?:_(?:' + '|'.join(code.lower() for code in replace_states) + '))+$'
listing_views.index = (
    listing_views.index
    .str.replace('market_hotness_listing_views_per_property_versus_the_united_states_in_', '', regex=False)
    .str.replace(state_suffix, '', regex=True)
    .str.replace('_', ' ', regex=False)
    .str.title()
)

In [129]:
# Left-join listing views by metro name. A 'listviews_vs_US' column from an
# earlier execution is dropped first so the cell can be re-run without
# creating '_x' / '_y' duplicates.
# NOTE(review): the key has no state component -- confirm listing_views.index
# is unique, otherwise rows in `data` are duplicated.
data = data.drop(columns='listviews_vs_US', errors='ignore').merge(
    listing_views, how='left', left_on='MSA', right_index=True)

* Median Days on Market

In [130]:
# Rows of `market` for median days on market, with the snapshot column renamed
# to 'med_days_on_mkt'.
mdays_on_mkt = market.filter(regex='market_hotness_median_days_on_market_in_', axis=0).\
    rename(columns={'2021-11-01':'med_days_on_mkt'})

# Reduce each label to a title-cased metro name: drop the literal prefix, strip
# every trailing '_<state>' token in one anchored pass (rather than repeating
# one regex per state a fixed number of times), then turn '_' into spaces.
state_suffix = '(?:_(?:' + '|'.join(code.lower() for code in replace_states) + '))+$'
mdays_on_mkt.index = (
    mdays_on_mkt.index
    .str.replace('market_hotness_median_days_on_market_in_', '', regex=False)
    .str.replace(state_suffix, '', regex=True)
    .str.replace('_', ' ', regex=False)
    .str.title()
)

In [131]:
# Left-join median days on market by metro name. A 'med_days_on_mkt' column
# from an earlier execution is dropped first so the cell can be re-run without
# creating '_x' / '_y' duplicates.
# NOTE(review): the key has no state component -- confirm mdays_on_mkt.index
# is unique, otherwise rows in `data` are duplicated.
data = data.drop(columns='med_days_on_mkt', errors='ignore').merge(
    mdays_on_mkt, how='left', left_on='MSA', right_index=True)

* Nielsen Household Rank

In [132]:
# Rows of `market` for the Nielsen household rank, with the snapshot column
# renamed to 'nielson_rank' (spelling kept: it is the persisted column name).
nielson = market.filter(regex='nielsen_household_rank_in_', axis=0).\
    rename(columns={'2021-11-01':'nielson_rank'})

# Reduce each label to a title-cased metro name: drop the literal prefix, strip
# every trailing '_<state>' token in one anchored pass (rather than repeating
# one regex per state a fixed number of times), then turn '_' into spaces.
state_suffix = '(?:_(?:' + '|'.join(code.lower() for code in replace_states) + '))+$'
nielson.index = (
    nielson.index
    .str.replace('market_hotness_nielsen_household_rank_in_', '', regex=False)
    .str.replace(state_suffix, '', regex=True)
    .str.replace('_', ' ', regex=False)
    .str.title()
)

In [133]:
# Left-join the Nielsen rank by metro name. A 'nielson_rank' column from an
# earlier execution is dropped first so the cell can be re-run without
# creating '_x' / '_y' duplicates.
# NOTE(review): the key has no state component -- confirm nielson.index is
# unique, otherwise rows in `data` are duplicated.
data = data.drop(columns='nielson_rank', errors='ignore').merge(
    nielson, how='left', left_on='MSA', right_index=True)

In [135]:
# Preview the first five rows with all market-hotness columns attached.
data.head(n=5)

Unnamed: 0_level_0,MSA,STATE,ZIPS,HPI,Demand_score,Supply_score,listviews_vs_US,med_days_on_mkt,nielson_rank
CBSA_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"Albany, GA",Albany,GA,"[31703, 31701, 31721, 31763, 31704, 39842, 317...",187.61,49.16388,52.173913,0.9959,45.0,279.0
"Albemarle, NC",Albemarle,NC,"[28097, 28124, 28002, 28137, 28127, 28107, 280...",,,,,,
"Americus, GA",Americus,GA,"[31806, 31719, 31058, 31711, 31081, 31803, 310...",,,,,,
"Arcadia, FL",Arcadia,FL,"[34266, 34269, 34267, 33865, 34268]",,,,,,
"Asheville, NC",Asheville,NC,"[28753, 28804, 28709, 28791, 28730, 28715, 288...",373.46,71.571906,7.023411,1.2686,63.0,106.0


## Unemployment

In [216]:
# NOTE(review): the URL embeds a signed query string (`sig=`) whose `se=` field
# is 2021-12-25 -- presumably an expired access token; confirm and move it out
# of the notebook.
unemp_url = 'https://nycdsacapstone2021.blob.core.windows.net/fromsmh/BEA_unemployment.xlsx?sp=r&st=2021-12-13T01:25:18Z&se=2021-12-25T09:25:18Z&spr=https&sv=2020-08-04&sr=b&sig=dTinH9c9Ig4MPNpL51Dx4UrLu%2Fqg3mSqGyQWYcatoJE%3D'

# header=2: the column names are taken from the third row of the sheet.
unemp = pd.read_excel(unemp_url, header=2)

# Keep only the rows for Year 2021, Month 10. .copy() yields an independent
# frame so the column assignments that follow do not trigger
# SettingWithCopyWarning.
unemp = unemp[(unemp['Year'] == 2021) & (unemp['Month'] == 10)].copy()

In [217]:
# Remove the literal ' MSA' marker from every area label.
area_label = unemp['Area']
unemp['Area'] = area_label.str.replace(' MSA', '', regex=False)

In [218]:
# Split 'Area' at commas: the first piece is the metro name, the second the
# state label. Done column-wise instead of a per-row .loc loop; a label with
# no comma gets a missing STATE (and is filtered out below) rather than
# raising IndexError.
area_parts = unemp['Area'].str.split(',')
unemp = unemp.assign(
    MSA=area_parts.str[0],
    STATE=area_parts.str[1].str.strip(),
)
# Keep only the target states; .copy() so later column assignments act on an
# independent frame.
unemp = unemp[unemp['STATE'].isin(target_states)].copy()

In [224]:
# Normalise the join key the same way data.MSA was normalised (hyphens become
# spaces, every period removed). Previously only '. ' was rewritten, so a
# period not followed by a space survived here but not in data.MSA and the
# two keys could never match.
unemp['MSA'] = (
    unemp['MSA']
    .str.replace('-', ' ', regex=False)
    .str.replace('.', '', regex=False)
)

In [None]:
# Unemployment rate keyed by metro name, already carrying its final column
# name so no in-place rename is needed after the join.
ue_rate = unemp.set_index('MSA')['Unemployment Rate'].rename('UE_rate')

# Left-join onto the master table. A 'UE_rate' column from an earlier
# execution is dropped first so re-running the cell cannot produce duplicate
# 'UE_rate' columns.
# NOTE(review): the key has no state component -- confirm ue_rate.index is
# unique, otherwise rows in `data` are duplicated.
data = data.drop(columns='UE_rate', errors='ignore').merge(
    ue_rate, how='left', left_on='MSA', right_index=True)

In [253]:
# Persist the assembled per-MSA table (index included) next to the notebook.
output_csv = 'MSA_data.csv'
data.to_csv(output_csv)