In [1]:
import pandas as pd
import re

In [2]:
# Source workbook (hosted on bea.gov): per-capita personal income by metropolitan area
census_file = 'https://www.bea.gov/system/files/2019-11/lapi1119msa.xlsx'
# Offline fallback -- uncomment to read a local copy instead of downloading:
#census_file = 'data/lapi1119msa.xlsx'

# The first spreadsheet column (area names) becomes the index; the sheet's
# title/header/footnote rows are still mixed in with the data at this point.
census_df = pd.read_excel(io=census_file, index_col=0)
census_df

Unnamed: 0_level_0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
"Table 2. Per Capita Personal Income by Metropolitan Area, 2016 - 2018",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,Per capita personal income1,,,,Percent change from preceding period,,
,Dollars,,,Rank in United States,Percent change,,Rank in United States
,2016,2017.0,2018.0,2018,2017,2018.0,2018
United States,49870,51885.0,54446.0,--,4,4.9,--
Metropolitan portion,51742,53864.0,56527.0,--,4.1,4.9,--
...,...,...,...,...,...,...,...
"Yuba City, CA",40729,41270.0,42925.0,260,1.3,4.0,282
"Yuma, AZ",33141,35049.0,35682.0,374,5.8,1.8,380
1. Per capita personal income was computed using Census Bureau midyear population estimates. Estimates reflect county population estimates available as of March 2019.,,,,,,,
"2. The personal income level shown for the United States is derived as the sum of the county estimates. It differs from the estimate of personal income in the national income and product accounts because of differences in coverage, in the methodologies used to prepare the estimates, and in the timing of the availability of source data.",,,,,,,


# Cleanup
This file needs a little cleanup: the only things we care about are the metro areas (cities) and their per-capita income values for 2018.

In [3]:
# Reduce the raw sheet to one row per metro area and a single 2018 income column.
census_df = (
    census_df
    # Any row containing a NaN goes (partial header rows and footnotes have empty cells)
    .dropna(axis='rows')
    # Keep only 'Unnamed: 3' (the 2018 dollars column); discard the other year/rank/percent columns
    .drop(columns=['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7'])
    # National / aggregate summary rows are not metro areas
    .drop(['United States', 'Metropolitan portion', 'Nonmetropolitan portion'])
    # Skip the first remaining row -- presumably the leftover year-label header row
    # (2016/2017/2018 ...), which has no NaN and so survives dropna; verify against the raw sheet
    .iloc[1:]
    # Give the index a descriptive, easy-to-reference name
    .rename_axis('metro_area')
    # Give the value column a descriptive, easy-to-reference name
    .rename(columns={'Unnamed: 3': 'per_capita_income'})
)

In [4]:
# Display the cleaned frame: index is 'metro_area', single column 'per_capita_income'
census_df

Unnamed: 0_level_0,per_capita_income
metro_area,Unnamed: 1_level_1
"Abilene, TX",43140.0
"Akron, OH",49423.0
"Albany, GA",37500.0
"Albany-Lebanon, OR",42891.0
"Albany-Schenectady-Troy, NY",58104.0
...,...
"Yakima, WA",43379.0
"York-Hanover, PA",50113.0
"Youngstown-Warren-Boardman, OH-PA",42443.0
"Yuba City, CA",42925.0


Note: Next we need to normalize the metro areas that list multiple cities and/or multiple states, reducing each one to a single "City, ST" name so it can be matched against the housing data we are going to combine it with.

In [5]:
# Work on a separate frame: reset_index() returns a new DataFrame in which the
# index (the metro-area names) becomes an ordinary column that can be edited
census_transform_df = census_df.reset_index(drop=False)

In [7]:
def _primary_city_and_state(metro_name):
    """Reduce a multi-city / multi-state metro label to its first city and first state.

    Example: 'Albany-Schenectady-Troy, NY' -> 'Albany NY'.

    NOTE: when a '-' or '/' occurs before the comma, the comma itself is removed
    together with the secondary cities, so the result has no city/state comma.
    NOTE: a hyphen inside a single city's name is treated like a separator too,
    so such names are also truncated at the first '-'.

    Args:
        metro_name: metro-area label as a string.

    Returns:
        The shortened label (str).
    """
    # Cut everything from the first '-' through the last ',' (keeps the primary city)
    shortened = re.sub(r'-.*,', '', metro_name)

    # Same treatment for labels that separate cities with '/' instead of '-'
    shortened = re.sub(r'/.*,', '', shortened)

    # Any '-' still present sits between state codes, so keep the first state only
    return shortened.split('-')[0]


# Apply the reduction to every metro-area label and write the result back to the column
census_transform_df['metro_area'] = census_transform_df['metro_area'].map(_primary_city_and_state)

In [8]:
def _insert_state_comma(metro_name):
    """Re-insert ', ' between city and state for a label that contains no comma.

    Labels that already contain a comma are returned unchanged. Otherwise the
    last space in the label is taken to be the city/state boundary and is
    replaced with ', ' (e.g. 'New York NY' -> 'New York, NY').

    A label with neither a comma nor a space is returned unchanged: there is
    no boundary to split on, and slicing around rfind(' ') == -1 would
    corrupt the text (e.g. 'Foo' -> 'Fo, Foo').

    Args:
        metro_name: metro-area label as a string.

    Returns:
        The label with a comma before its last space-separated token, or the
        input itself when no change applies.
    """
    if ',' in metro_name:
        return metro_name

    # Split on the last occurrence of ' ' (this is just before the state)
    city, space, state = metro_name.rpartition(' ')
    if not space:
        # No space found: leave the label as it is rather than mangling it
        return metro_name

    return city + ', ' + state


# Second pass over the metro-area labels: the earlier clean-up leaves some
# entries with no comma between city and state, so put it back
census_transform_df['metro_area'] = census_transform_df['metro_area'].map(_insert_state_comma)

In [9]:
# Spot-check the first ten rows of the transformed frame
census_transform_df.head(10)

Unnamed: 0,metro_area,per_capita_income
0,"Abilene, TX",43140.0
1,"Akron, OH",49423.0
2,"Albany, GA",37500.0
3,"Albany, OR",42891.0
4,"Albany, NY",58104.0
5,"Albuquerque, NM",42536.0
6,"Alexandria, LA",43995.0
7,"Allentown, PA",54120.0
8,"Altoona, PA",46743.0
9,"Amarillo, TX",46131.0


In [10]:
# Save the transformed metro-area / income table as CSV in the working directory.
# The integer row index is written as well (to_csv's default), as an unnamed first column.
census_transform_df.to_csv(path_or_buf='test2.csv')