In [1]:
import pandas as pd
import re
# Show floats without decimals. The fully qualified option name is required:
# in pandas >= 1.4 the bare 'precision' pattern also matches
# 'styler.format.precision', so set_option raises OptionError.
pd.set_option('display.precision', 0)

# Extracting Per Capita Personal Income

In [2]:
# Excel workbook downloaded from bea.gov: "Table 2. Per Capita Personal Income
# by Metropolitan Area, 2016 - 2018"
census_file = 'https://www.bea.gov/system/files/2019-11/lapi1119msa.xlsx'
# Local backup:
#census_file = 'data/lapi1119msa.xlsx'
# The first spreadsheet column (the area names) becomes the index; the other
# columns have no usable header and come through as 'Unnamed: 1' ... 'Unnamed: 7'
census_df = pd.read_excel(census_file, index_col=0)
census_df

Unnamed: 0_level_0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
"Table 2. Per Capita Personal Income by Metropolitan Area, 2016 - 2018",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,Per capita personal income1,,,,Percent change from preceding period,,
,Dollars,,,Rank in United States,Percent change,,Rank in United States
,2016,2017,2018,2018,2017,2018,2018
United States,49870,51885,54446,--,4,5,--
Metropolitan portion,51742,53864,56527,--,4,5,--
Nonmetropolitan portion,38422,39690,41552,--,3,5,--
Metropolitan Statistical Areas2,,,,,,,
"Abilene, TX",40140,40862,43140,256,2,6,57
"Akron, OH",45515,47343,49423,129,4,4,211
"Albany, GA",35485,36522,37500,358,3,3,374


# Cleanup
This file needs a little cleanup: the only things we care about are the metro areas (cities) and their values for the year 2018.

In [3]:
# Reduce the raw sheet to one row per metro area holding only its 2018 income.
census_df = (
    census_df
    # Rows with any empty cell are spreadsheet header / section-label rows
    .dropna(axis='rows')
    # Keep 'Unnamed: 3' only (the 2018 dollar figures); drop every other column
    .drop(columns=['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7'])
    # National aggregates are not metro areas
    .drop(['United States', 'Metropolitan portion', 'Nonmetropolitan portion'])
    # Skip the first remaining row (presumably the row of year labels, which
    # has no empty cells and therefore survives dropna)
    .iloc[1:]
    # Give the value column and the index descriptive (and easy to call) names
    .rename(columns={'Unnamed: 3': 'per_capita_income'})
    .rename_axis('metro_area')
)

In [4]:
# Display the cleaned frame: index 'metro_area', single column 'per_capita_income'
census_df

Unnamed: 0_level_0,per_capita_income
metro_area,Unnamed: 1_level_1
"Abilene, TX",43140
"Akron, OH",49423
"Albany, GA",37500
"Albany-Lebanon, OR",42891
"Albany-Schenectady-Troy, NY",58104
"Albuquerque, NM",42536
"Alexandria, LA",43995
"Allentown-Bethlehem-Easton, PA-NJ",54120
"Altoona, PA",46743
"Amarillo, TX",46131


Note: Next we need to reduce the metro areas that list multiple cities (or multiple states) to a single primary city and state, so that the names match the housing data we are going to combine them with.

In [5]:
# Build a copy to transform; reset_index() returns a new frame in which
# 'metro_area' is a regular column (census_df itself is left unchanged)
census_transform_df = census_df.reset_index()

In [6]:
# Collapse every multi-city / multi-state metro name to its primary city and
# first state. Each name goes through three steps, in this order:
#   1. remove everything from the first '-' through the last ','
#      (keeps the primary city)
#   2. do the same for names that use '/' instead of '-'
#   3. any '-' still left separates states, so keep only the text before it
# Steps 1 and 2 also swallow the comma between city and state; the next cell
# puts it back.
census_transform_df['metro_area'] = [
    re.sub(r'/.*,', '', re.sub(r'-.*,', '', metro_name)).split('-')[0]
    for metro_name in census_transform_df['metro_area']
]

In [7]:
# Now, we traverse the cities again to clean up an issue created by the
# previous step: shortened names lost the comma between city and state.
for ind in census_transform_df.index:
    metro_name = census_transform_df.at[ind, 'metro_area']

    # Entries that still have their comma need no repair
    if ',' in metro_name:
        continue

    # The last ' ' sits just before the state, so that is where the comma goes.
    # rpartition returns an empty separator when the name contains no space at
    # all; such a name is left untouched (slicing with rfind's -1 result would
    # truncate it and append a second copy of the whole name).
    city, space, state = metro_name.rpartition(' ')
    if space:
        # Write it back to the dataframe
        census_transform_df.at[ind, 'metro_area'] = city + ", " + state

In [8]:
# Spot-check the first ten names: each should now read "<primary city>, <state>"
census_transform_df.head(10)

Unnamed: 0,metro_area,per_capita_income
0,"Abilene, TX",43140
1,"Akron, OH",49423
2,"Albany, GA",37500
3,"Albany, OR",42891
4,"Albany, NY",58104
5,"Albuquerque, NM",42536
6,"Alexandria, LA",43995
7,"Allentown, PA",54120
8,"Altoona, PA",46743
9,"Amarillo, TX",46131


In [9]:
# Save the cleaned income table to the working directory. to_csv is called with
# its defaults, so the integer index is written as the first (unnamed) column.
census_transform_df.to_csv('income.csv')

# Extracting Home Sales Data

In [None]:
# Path to the Zillow Home Sales .CSV file (relative to the notebook's working directory)
zillow = "data/zillow_sales_data.csv"

# Read our Zillow Home Sales data into pandas and preview the first ten rows
zillow_df = pd.read_csv(zillow)
zillow_df.head(10)

In [None]:
# Keep only the rows whose RegionType is exactly "Msa" (the US metro areas);
# every other region type is discarded
zillow_df = zillow_df[zillow_df["RegionType"].eq("Msa")]

zillow_df

In [None]:
# Obtain the column names; the next cell selects the 2018 month-end columns by name
zillow_df.columns

In [None]:
# Keep the region name plus the twelve 2018 month-end columns of the
# Zillow Home Sales dataframe; everything else is left behind
reduced_zillow_df = zillow_df.loc[:, [
    "RegionName",
    "2018-01-31", "2018-02-28", "2018-03-31", "2018-04-30",
    "2018-05-31", "2018-06-30", "2018-07-31", "2018-08-31",
    "2018-09-30", "2018-10-31", "2018-11-30", "2018-12-31",
]]
reduced_zillow_df

In [None]:
# Rename the column headers: RegionName becomes metro_area and each month-end
# date column becomes "Median Home Price <month>/18". The index is renumbered
# from 0 so the result no longer carries the row labels of the filtered frame.
final_zillow_df = (
    reduced_zillow_df
    .rename(columns={
        "RegionName": "metro_area",
        "2018-01-31": "Median Home Price 1/18",
        "2018-02-28": "Median Home Price 2/18",
        "2018-03-31": "Median Home Price 3/18",
        "2018-04-30": "Median Home Price 4/18",
        "2018-05-31": "Median Home Price 5/18",
        "2018-06-30": "Median Home Price 6/18",
        "2018-07-31": "Median Home Price 7/18",
        "2018-08-31": "Median Home Price 8/18",
        "2018-09-30": "Median Home Price 9/18",
        "2018-10-31": "Median Home Price 10/18",
        "2018-11-30": "Median Home Price 11/18",
        "2018-12-31": "Median Home Price 12/18",
    })
    .reset_index(drop=True)
)

final_zillow_df

In [None]:
# Combine multi-city metros to match with the income data
# Build a copy to transform. reset_index() also inserts the old index as a
# column named 'index'; a later cleanup cell deletes that column again.
zillow_transform_df = final_zillow_df.reset_index()

In [None]:
# Collapse every multi-city / multi-state metro name to its primary city and
# first state. Each name goes through two steps, in this order:
#   1. remove everything from the first '-' through the last ','
#      (keeps the primary city)
#   2. any '-' still left separates states, so keep only the text before it
# Step 1 also swallows the comma between city and state; the next cell puts it
# back.
zillow_transform_df['metro_area'] = [
    re.sub(r'-.*,', '', metro_name).split('-')[0]
    for metro_name in zillow_transform_df['metro_area']
]

In [None]:
# Now, we traverse the cities again to clean up an issue created by the
# previous step: shortened names lost the comma between city and state.
for ind in zillow_transform_df.index:
    metro_name = zillow_transform_df.at[ind, 'metro_area']

    # Entries that still have their comma need no repair
    if ',' in metro_name:
        continue

    # The last ' ' sits just before the state, so that is where the comma goes.
    # rpartition returns an empty separator when the name contains no space at
    # all; such a name is left untouched (slicing with rfind's -1 result would
    # truncate it and append a second copy of the whole name).
    city, space, state = metro_name.rpartition(' ')
    if space:
        # Write it back to the dataframe
        zillow_transform_df.at[ind, 'metro_area'] = city + ", " + state

zillow_transform_df.head(10)

In [None]:
# Cleanup the DataFrame: discard the 'index' column that reset_index() added
# when the transform copy was built
zillow_transform_df = zillow_transform_df.drop(columns='index')
zillow_transform_df

In [None]:
# Add average column
sales_average_df = zillow_transform_df.copy()

# Average only the monthly price columns. Summing the whole frame would pull in
# the 'metro_area' strings (a TypeError on pandas >= 2.0), and dividing a sum
# by a fixed 12 understates the average for any metro with missing months.
# mean() skips missing values, so each row is averaged over the months it has.
monthly_prices = sales_average_df.filter(like='Median Home Price')
sales_average_df['2018 Sales Average'] = monthly_prices.mean(axis=1)

# Fully qualified option name: the bare 'precision' pattern is ambiguous in
# pandas >= 1.4 and raises OptionError.
pd.set_option('display.precision', 0)
sales_average_df.head()

In [None]:
# Save the sales table (monthly prices plus the 2018 average) to the working
# directory. to_csv is called with its defaults, so the index is written as
# the first column.
sales_average_df.to_csv('sales.csv')

# Extracting Rental Data

In [None]:
# Read in the Zillow rental CSV (path is relative to the notebook's working
# directory) and preview the first rows
rent_data = pd.read_csv('data/Zillow_Rental.csv')
rent_data.head()

In [None]:
# Drop all monthly data except 2018.
# Monthly columns are the ones named 'YYYY-MM'. They are picked by pattern
# rather than from a hard-coded list, so the cell neither fails with KeyError
# nor leaves stray months behind when the file covers a different date range.
# Columns that are not named like a month are always kept.
other_year_columns = [
    column for column in rent_data.columns
    if re.fullmatch(r'\d{4}-\d{2}', str(column)) and not str(column).startswith('2018-')
]
rent_df = rent_data.drop(columns=other_year_columns)


rent_df.head()

In [None]:
# Drop the United States row, identified here by its index label 0.
# NOTE(review): run this cell only once. After the following cell renumbers
# the index, label 0 belongs to a metro area and a re-run would drop it.
rent_df = rent_df.drop(index=0)
rent_df.head()

In [None]:
# Reset the index so the first remaining row is labelled 0 again;
# drop=True discards the old labels instead of keeping them as a column
rent_df = rent_df.reset_index(drop=True)
rent_df.head()

In [None]:
# Further column consolidation: drop the identifier / rank columns, then give
# the region name and the twelve 2018 months descriptive headers
rent_df = (
    rent_df
    .drop(columns=['RegionID', 'SizeRank'])
    .rename(columns={
        "RegionName": "metro_area",
        "2018-01": "Median Home Rent 1/2018",
        "2018-02": "Median Home Rent 2/2018",
        "2018-03": "Median Home Rent 3/2018",
        "2018-04": "Median Home Rent 4/2018",
        "2018-05": "Median Home Rent 5/2018",
        "2018-06": "Median Home Rent 6/2018",
        "2018-07": "Median Home Rent 7/2018",
        "2018-08": "Median Home Rent 8/2018",
        "2018-09": "Median Home Rent 9/2018",
        "2018-10": "Median Home Rent 10/2018",
        "2018-11": "Median Home Rent 11/2018",
        "2018-12": "Median Home Rent 12/2018",
    })
)
rent_df.head()

In [None]:
# Handling of multi-city named metro_areas
rent_transform_df = rent_df.copy()

# Reduce each name to its primary city and first state, using the same two
# steps as the income and home-sales cells so the three tables end up with
# matching keys:
#   1. remove everything from the first '-' through the last ','
#      (keeps the primary city; this also swallows the comma)
#   2. any '-' still left separates states, so keep only the text before it
rent_transform_df['metro_area'] = [
    re.sub(r'-.*,', '', metro_name).split('-')[0]
    for metro_name in rent_transform_df['metro_area']
]

# Confirming the missing comma issue in the shortened names
rent_transform_df.head(10)

In [None]:
# Replace the missing comma between city and state
for ind in rent_transform_df.index:
    metro_name = rent_transform_df.at[ind, 'metro_area']

    # Entries that still have their comma need no repair
    if ',' in metro_name:
        continue

    # The last ' ' sits just before the state, so that is where the comma goes.
    # rpartition returns an empty separator when the name contains no space at
    # all; such a name is left untouched (slicing with rfind's -1 result would
    # truncate it and append a second copy of the whole name).
    city, space, state = metro_name.rpartition(' ')
    if space:
        rent_transform_df.at[ind, 'metro_area'] = city + ", " + state

rent_transform_df.head(10)

In [None]:
# Add a column holding each row's average 2018 rent
rent_average_df = rent_transform_df.copy()

# Average only the monthly rent columns. Summing the whole frame would pull in
# the 'metro_area' strings (a TypeError on pandas >= 2.0), and dividing a sum
# by a fixed 12 understates the average for any metro with missing months.
# mean() skips missing values, so each row is averaged over the months it has.
monthly_rents = rent_average_df.filter(like='Median Home Rent')
rent_average_df['2018 Rent Average'] = monthly_rents.mean(axis=1)

# Fully qualified option name: the bare 'precision' pattern is ambiguous in
# pandas >= 1.4 and raises OptionError.
pd.set_option('display.precision', 0)
rent_average_df.head()

In [None]:
# Save the rent table (monthly rents plus the 2018 average) to the working
# directory. to_csv is called with its defaults, so the index is written as
# the first column.
rent_average_df.to_csv('rent.csv')