In [2]:
import pandas as pd
import os
import glob

In [3]:
def create_fips_code(df):
    fips_code = []
    for i in range(df['state fips code'].shape[0]):
        # combine the statecode and countycode to create the fips_code
        string = '{:d}{:03d}'.format(df['state fips code'].iloc[i], df['county fips code'].iloc[i])
        fips_code.append(int(string)) 
    return fips_code

In [4]:
def load_data_from_dir(data_dir):
    file_wildcard = os.path.join(data_dir, '*.csv')
    for i, filepath in enumerate(reversed(glob.glob(file_wildcard))):
        year = filepath.split('/')[-1].split('\\')[-1].split('_')[0]
        print(i, filepath, year)

        # load the data
        data = pd.read_csv(filepath, encoding='utf-8')
        print(data.shape)
        # convert all columns to lower case
        data.columns = map(str.lower, data.columns)
        # calculate the fips_code and insert as 1st column
        data.insert(loc=0, column='County Code', value=create_fips_code(data))
        # insert the year as the 2nd column
        data.insert(loc=1, column='year', value=int(year))
        
        # concatenate dataframes together
        if i == 0:
            df = data
            orig_columns = data.columns
        else:
            df = pd.concat([df, data])
        
    return df

In [5]:
def return_not_matches(a, b):
    return [[x for x in a if x not in b], [x for x in b if x not in a]]

In [31]:
# load the data
data_2016 = pd.read_csv('data/census/2016_census.csv', encoding='utf-8')
data_2015 = pd.read_csv('data/census/2015_census.csv', encoding='utf-8')
print(data_2016.shape)
print(data_2015.shape)
# convert all columns to lower case
data_2016.columns = map(str.lower, data_2016.columns)
data_2015.columns = map(str.lower, data_2015.columns)
# calculate the fips_code and insert as 1st column
data_2016.insert(loc=0, column='County Code', value=create_fips_code(data_2016))
data_2015.insert(loc=0, column='County Code', value=create_fips_code(data_2015))
# insert the year as the 2nd column
data_2016.insert(loc=1, column='year', value=int(2016))
data_2015.insert(loc=1, column='year', value=int(2015))

(3220, 1342)
(3220, 1342)


In [32]:
def replace_substring_in_list(str_list, target_substring, replace_substring):
    new_list = []
    for s in str_list:
        if target_substring in s:
            s = s.replace(target_substring, replace_substring)
        new_list.append(s)
    return new_list

In [39]:
data_2016.columns = replace_substring_in_list(data_2016.columns, '2016 ', '')
data_2016.columns = replace_substring_in_list(data_2016.columns, 'all people!!', '')
data_2016.columns = replace_substring_in_list(data_2016.columns, 'poverty level!!18 years and over!!', 'poverty level!!')

data_2015.columns = replace_substring_in_list(data_2015.columns, '2015 ', '')
data_2015.columns = replace_substring_in_list(data_2015.columns, 'poverty level!!65 years and over!!', 'poverty level!!')

In [42]:
tmp = return_not_matches(data_2016.columns, data_2015.columns)

print(len(tmp[0]))
print(len(tmp[1]))
print(tmp[0])
print()
print(tmp[1])

2
2
['estimate!!year householder moved into unit!!occupied housing units!!moved in 2015 or later', 'percent!!year householder moved into unit!!occupied housing units!!moved in 2015 or later']

['estimate!!year householder moved into unit!!occupied housing units!!moved in or later', 'percent!!year householder moved into unit!!occupied housing units!!moved in or later']


In [43]:
# concatenate data into a single dataframe
df = pd.concat([data_2016, data_2015])

In [44]:
print(df.shape)

(6440, 1346)
