In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# default cleaning method until proven otherwise
def clean_census_frame(csv_path, head=False, reset=True, set_index=False):
    '''
    Read a Census-style csv and promote its first data row to the column header.

    inputs)
        >> csv_path
            > path to the csv file
        >> head
            > default=False
            > when truthy
                >> passed to .head()
                    > only the first {head} rows are returned
        >> reset
            > default=True
                >> .reset_index() is called once the label row is removed
                    > the previous index is kept as a column (drop is not requested)
            > any value that does not compare equal to True
                >> index is left as-is (it then starts at 1)
        >> set_index
            > default=False
            > when truthy
                >> passed to .set_index() on the cleaned frame
    output)
        >> dataframe whose column labels are the values that sat in row 0 of the csv body
            > e.g. 'Estimate; SEX AND AGE - Total population' instead of 'HC01_VC03'
    how)
        1. read the csv with low_memory=False
        2. work on a copy of what was read
        3. use row 0 (readable names) as the new column labels
        4. keep every row after row 0
        5. optionally reset the index, set a new index, and/or truncate with .head()
    '''
    raw = pd.read_csv(csv_path, low_memory=False)

    # edit a copy; the frame as read is left alone
    working = raw.copy()

    # row 0 carries the readable labels -> make them the header, then leave that row out
    working.columns = working.iloc[0]
    cleaned = working[1:]

    # deliberate equality test (not truthiness): only values equal to True trigger the reset
    if reset == True:
        cleaned = cleaned.reset_index()

    if set_index:
        cleaned = cleaned.set_index(set_index)

    return cleaned.head(head) if head else cleaned
    
def bring_the_5yr_acs_2k11_thru_2k17(list_of_paths=None):
    '''
    inputs)
        >> list_of_paths
            > default=None
                >> uses the seven 5-year ACS DP05 files for 2011 thru 2017
                    > ../data/acs/aff_download/ACS_{yy}_5YR_DP05_with_ann.csv , yy = 11..17
            > otherwise
                >> iterable of csv paths, one per raw dataframe
    output)
        >> list of modified dataframes
            > same order as the paths
            > each keeps only the columns whose label appears in every frame
            > each has been through .reset_index() (old index kept as a column)
    function)
        1. load each csv with clean_census_frame(path, reset=False)
        2. intersect the column labels of all frames
        3. drop from each frame the columns that are not in that intersection
        4. reset each frame's index
    note)
        >> labels repeated inside one frame are all kept when the label is common,
           so frames can still end up with different column counts
    '''
    if list_of_paths is None:
        list_of_paths = [
            f'../data/acs/aff_download/ACS_{yy}_5YR_DP05_with_ann.csv'
            for yy in range(11, 18)
        ]

    # clean_census_frame already returns a fresh frame, so no extra copies are needed
    frames = [clean_census_frame(path, reset=False) for path in list_of_paths]
    if not frames:
        return []

    # labels present in every frame
    # explicit Index.intersection: the `&` operator between Index objects is no longer
    # a set operation in current pandas; computed once instead of once per label
    common_tags = frames[0].columns
    for frame in frames[1:]:
        common_tags = common_tags.intersection(frame.columns)

    trimmed = []
    for frame in frames:
        # labels this frame has that at least one other frame lacks
        uncommon = [tag for tag in frame.columns if tag not in common_tags]
        trimmed.append(frame.drop(uncommon, axis=1).reset_index())

    return trimmed

In [2]:
# load the seven ACS frames (2011-2017, in that order), trimmed to the column labels shared by all years
test = bring_the_5yr_acs_2k11_thru_2k17()

In [4]:
# shallow copy: a new list holding the same DataFrame objects as `test`
test_000 = list(test)

In [5]:
# report the size of each year's frame
# .shape gives both counts directly and does not need a second data row to exist
for frame in test_000:
    n_rows, n_cols = frame.shape
    print(f'{n_rows} rows x {n_cols} columns')

33120 rows x 220 columns
33120 rows x 220 columns
33120 rows x 212 columns
33120 rows x 212 columns
33120 rows x 212 columns
33120 rows x 212 columns
33120 rows x 212 columns


In [6]:
# Check the column layout across years:
#   - 2011 and 2012 (positions 0-1) share one layout
#   - 2013 thru 2017 (positions 2-6) share another
#   - the two layouts are not the same
# Index.equals compares the labels in order and is safe when lengths differ,
# unlike element-wise `==` between two Index objects of unequal length.
column_sets = [frame.columns for frame in test_000]
early_layout, late_layout = column_sets[0], column_sets[2]

early_match = all(cols.equals(early_layout) for cols in column_sets[:2])
late_match = all(cols.equals(late_layout) for cols in column_sets[2:])

if early_match and late_match and not early_layout.equals(late_layout):
    print('pretty ok to assume\n2011-2012 are identical and 2013-2017 are identical\nbut 2011-2012 and 2013-2017 are different')
        

pretty ok to assume
2011-2012 are identical and 2013-2017 are identical
but 2011-2012 and 2013-2017 are different


In [7]:
# one representative frame per column layout: 2011 (position 0) and 2015 (position 4)
_2k11, _2k15 = test_000[0], test_000[4]

In [9]:
# total column count per frame (repeated labels included)
_2k11.shape[1], _2k15.shape[1]

(220, 212)

In [10]:
# number of distinct column labels per frame (repeated labels counted once)
tuple(len(set(frame.columns)) for frame in (_2k11, _2k15))

(204, 204)

In [13]:
# single pass over the 2015 labels:
#   first sighting of a label -> stored in out_15
#   any later sighting        -> printed and counted in repeat_15
out_15, repeat_15 = [], 0
seen_15 = set()
for label in _2k15.columns:
    if label in seen_15:
        print(label)
        repeat_15 += 1
        continue
    seen_15.add(label)
    out_15.append(label)
repeat_15

Estimate; SEX AND AGE - 18 years and over
Margin of Error; SEX AND AGE - 18 years and over
Percent; SEX AND AGE - 18 years and over
Percent Margin of Error; SEX AND AGE - 18 years and over
Estimate; SEX AND AGE - 65 years and over
Margin of Error; SEX AND AGE - 65 years and over
Percent; SEX AND AGE - 65 years and over
Percent Margin of Error; SEX AND AGE - 65 years and over


8

In [14]:
# single pass over the 2011 labels:
#   first sighting of a label -> stored in out_11
#   any later sighting        -> printed and counted in repeat_11
out_11, repeat_11 = [], 0
seen_11 = set()
for label in _2k11.columns:
    if label in seen_11:
        print(label)
        repeat_11 += 1
        continue
    seen_11.add(label)
    out_11.append(label)
repeat_11

Estimate; SEX AND AGE - 18 years and over
Margin of Error; SEX AND AGE - 18 years and over
Percent; SEX AND AGE - 18 years and over
Percent Margin of Error; SEX AND AGE - 18 years and over
Estimate; SEX AND AGE - 65 years and over
Margin of Error; SEX AND AGE - 65 years and over
Percent; SEX AND AGE - 65 years and over
Percent Margin of Error; SEX AND AGE - 65 years and over
Estimate; RACE - One race
Margin of Error; RACE - One race
Percent; RACE - One race
Percent Margin of Error; RACE - One race
Estimate; RACE - Two or more races
Margin of Error; RACE - Two or more races
Percent; RACE - Two or more races
Percent Margin of Error; RACE - Two or more races


16