In [1]:
import pandas
import os.path

In [2]:
def getData(fromCache=True):
    """Return the cleaned 2010 census table, rebuilding it from raw extracts if needed.

    The cleaned table is assembled from five raw CSV extracts stored under
    ``../data/census/Census_2010/`` (relative to the current working
    directory). A handful of columns is taken from each extract, the pieces
    are outer-joined on the raw ``Id2`` column, and the columns are renamed
    to short snake_case names (``Id2`` becomes ``GEOID``). The result is
    written to ``census2010_clean.csv`` in the same directory so later calls
    can skip the rebuild.

    Parameters
    ----------
    fromCache : bool, default True
        When truthy and ``census2010_clean.csv`` already exists, that file is
        read and returned as-is. When falsy, or when the file is missing, the
        table is rebuilt from the raw extracts and the cache file is
        (over)written.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: GEOID, population_total, home_mortgages,
        home_owners, renters, median_age, total_households,
        family_households, single_households, average_household_size,
        population_hispanic_latino, population_white, population_black,
        population_native_american, population_asian.

    Raises
    ------
    Whatever ``pandas.read_csv`` raises if a raw extract is missing, and
    ``KeyError`` if an extract lacks one of the expected column labels.
    """
    dataDirectory = '../data/census/Census_2010/'
    outputPath = os.path.join(dataDirectory, 'census2010_clean.csv')

    # Logical `and`, not bitwise `&`: any truthy flag enables the cache, and
    # the filesystem is not touched at all when the cache is disabled.
    if fromCache and os.path.isfile(outputPath):
        return pandas.read_csv(outputPath, header=0)

    # One entry per raw extract:
    #   (path relative to dataDirectory, [(raw column label, clean column name), ...])
    # Keeping each raw label next to its clean name means the selection and
    # the rename can never drift apart.
    rawTables = [
        # Retrieved 10-07-2016 from http://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/P1/0500000US04007.15000|0500000US04013.15000|0500000US04021.15000|0500000US04025.15000
        ('total_population/DEC_10_SF1_P1_with_ann.csv',
         [('Total', 'population_total')]),
        # Retrieved 10-07-2016 from http://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/H11/0500000US04007.15000|0500000US04013.15000|0500000US04021.15000|0500000US04025.15000
        ('occupied_housing_tenure/DEC_10_SF1_H11_with_ann.csv',
         [('Owned with a mortgage or a loan', 'home_mortgages'),
          ('Owned free and clear', 'home_owners'),
          ('Renter occupied', 'renters')]),
        # Retrieved 10-07-2016 from http://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/P13/0500000US04007.15000|0500000US04013.15000|0500000US04021.15000|0500000US04025.15000
        ('median_age_sex/DEC_10_SF1_P13_with_ann.csv',
         [('Median age -- - Both sexes', 'median_age')]),
        # Retrieved 10-07-2016 from http://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/QTP11/0500000US04007.15000|0500000US04013.15000|0500000US04021.15000|0500000US04025.15000
        ('households_families/DEC_10_SF1_QTP11_with_ann.csv',
         [('Number; HOUSEHOLD TYPE - Total households', 'total_households'),
          ('Number; HOUSEHOLD TYPE - Total households - Family households [1]',
           'family_households'),
          ('Number; HOUSEHOLD SIZE - Total households - 1-person household',
           'single_households'),
          ('Number; HOUSEHOLD SIZE - Total households - Average household size',
           'average_household_size')]),
        # NOTE(review): the previous comment here repeated the QTP11 URL, but
        # the file read is the P11 table. Presumably retrieved 10-07-2016 from
        # http://factfinder.census.gov/bkmk/table/1.0/en/DEC/10_SF1/P11/0500000US04007.15000|0500000US04013.15000|0500000US04021.15000|0500000US04025.15000
        # -- TODO confirm.
        ('hispanic_latino/DEC_10_SF1_P11_with_ann.csv',
         [('Hispanic or Latino', 'population_hispanic_latino'),
          ('Not Hispanic or Latino: - Population of one race: - White alone',
           'population_white'),
          ('Not Hispanic or Latino: - Population of one race: - Black or African American alone',
           'population_black'),
          ('Not Hispanic or Latino: - Population of one race: - American Indian and Alaska Native alone',
           'population_native_american'),
          ('Not Hispanic or Latino: - Population of one race: - Asian alone',
           'population_asian')]),
    ]

    selectedData = None
    columnNames = {'Id2': 'GEOID'}
    for relativePath, columnPairs in rawTables:
        # header=1: the second line of each extract supplies the column labels.
        rawTable = pandas.read_csv(os.path.join(dataDirectory, relativePath), header=1)
        wantedColumns = rawTable[['Id2'] + [rawLabel for rawLabel, _ in columnPairs]]

        if selectedData is None:
            selectedData = wantedColumns
        else:
            # Outer join keeps Id2 values that are missing from some extracts.
            selectedData = selectedData.merge(wantedColumns, on='Id2', how='outer')

        columnNames.update(columnPairs)

    selectedData = selectedData.rename(columns=columnNames)

    # Refresh the cache so the next call can return early.
    selectedData.to_csv(outputPath, index=False)
    return selectedData