In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [None]:
# default cleaning method until proven otherwise
def clean_census_frame(csv_path , head=False , reset=True , set_index=False ):
    '''
    Load a census csv whose first data row holds the readable column names.

    inputs) 
        >> csv_path
            > path to csv
        >> head
            > default=False
            > if truthy (an integer)
                >> returns only the first {head} rows (using .head() method)
                    > instead of entire dataframe
        >> reset
            > default=True
                >> resets index after taking out the label row
                    > reset_index() keeps the old index as a new 'index' column
            > if falsy
                >> will not reset index (remaining rows keep their original labels)
        >> set_index
            > default=False
            > if truthy
                >> will set_index of new df to set_index
    output)
        >> dataframe whose columns are the readable names, without the row they came from
    how)
        1. reads in csv with low_memory=False
        2. locates readable column names and non-readable names
            > readable
                    > e.g. Estimate; SEX AND AGE - Total population
                >> assumes they are currently in row 0
            > non-readable
                    > e.g. HC01_VC03
                >> assumes they are currently == dataframe.columns
        3. replaces dataframe.columns (non-readable) with readable column names
            > and drops the old 0th row (row where readable names were stored)
    '''
    # load data; the frame is local to this call, so no defensive copy is needed
    df = pd.read_csv( csv_path , low_memory=False )

    # reset column names to current 0th row values
    df.columns = df.iloc[0]
    # frame without the row the names were taken from
    clean_df = df[1:]

    # default: renumber the rows
    if reset:
        clean_df = clean_df.reset_index()

    # optional: promote a column to the index
    if set_index:
        clean_df = clean_df.set_index(set_index)

    if head:
        # return first {head} rows of dataframe
        return clean_df.head(head)
    # return dataframe
    return clean_df
  

In [None]:
  
def test_non_unique(column_names):
    '''
    input) 
        >> list of column names {column_names}
            > columns to check for duplicate instances
    output)
        >> indexed list of names occouring more than once 
    '''
    # store first instance
    first_occour = []
    # store 2nd+ instance(s)
    non_unique = []
    # we're going to want index
    for i,_ in enumerate(column_names):
        # not first time
        if _ not in first_occour:
            first_occour.append(_)
        # if not first, tag&bag
        else:
            non_unique.append([i,_])
    # output index w/ non-first instances
    return non_unique

In [None]:
# def bring_the_5yr_acs_2k11_thru_2k17():
'''
inputs)
    >> list_of_paths
        > paths to each raw dataframe
output)
    >> list of modified dataframes
function)
    1. load and copy data
    2. 
'''
# One frame per ACS 5-year release; reset=False keeps the original row labels.
# 2011
y2k11 = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv', reset=False)
# 2012
y2k12 = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_12_5YR_DP05_with_ann.csv', reset=False)
# 2013
y2k13 = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_13_5YR_DP05_with_ann.csv', reset=False)
# 2014
y2k14 = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_14_5YR_DP05_with_ann.csv', reset=False)
# 2015
y2k15 = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_15_5YR_DP05_with_ann.csv', reset=False)
# 2016
y2k16 = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_16_5YR_DP05_with_ann.csv', reset=False)
# 2017
y2k17 = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_17_5YR_DP05_with_ann.csv', reset=False)

In [None]:
# Working copies (2011..2017, in order) so the loaded frames stay untouched.
y11, y12, y13, y14, y15, y16, y17 = (
    loaded.copy()
    for loaded in (y2k11, y2k12, y2k13, y2k14, y2k15, y2k16, y2k17)
)

In [None]:
'''identify columns'''
# Column labels of every working frame, 2011..2017 in order.
tags11, tags12, tags13, tags14, tags15, tags16, tags17 = (
    frame.columns
    for frame in (y11, y12, y13, y14, y15, y16, y17)
)

In [None]:
# Per-year column labels gathered in chronological order (2011 first, 2017 last).
tags = [tags11,tags12,tags13,tags14,tags15,tags16,tags17]

In [None]:
# For each year: total number of column labels vs. number of distinct labels.
# A gap between the two numbers means some labels occur more than once.
for tag in tags:
    print(f'len = {len(tag)}\nunique = {len(set(tag))}')

In [None]:
# Index.all() only says whether every label is truthy, so comparing two .all()
# results cannot detect differing labels. Index.equals compares the labels
# themselves (same values, same order, same length).

# 2011 == 2012
if not tags11.equals(tags12):
    raise Exception('tags11 != tags12')

# 2013 == 2014
if not tags13.equals(tags14):
    raise Exception('tags13 != tags14')

# 2015 == 2016
if not tags15.equals(tags16):
    raise Exception('tags15 != tags16')

In [None]:
'''identify common columns'''
# collection of columns appearing in all 7 dataframes 2011-2017
# The shared labels are computed once as a plain set. Using `&` between Index
# objects as a set intersection is not supported by current pandas, and doing it
# inside each comprehension repeated the work for every single label.
_shared_tags = set(tags11).intersection(tags12, tags13, tags14, tags15, tags16, tags17)

# Each list keeps the year's own label order (and its own repeats).
# 2017
a = [t for t in tags17 if t in _shared_tags]
# 2012
z = [iii for iii in tags12 if iii in _shared_tags]
# 2011
b = [tt for tt in tags11 if tt in _shared_tags]
# 2014
y = [ii for ii in tags14 if ii in _shared_tags]
# 2013
c = [ttt for ttt in tags13 if ttt in _shared_tags]
# 2016
x = [i for i in tags16 if i in _shared_tags]
# 2015
d = [tttt for tttt in tags15 if tttt in _shared_tags]

# list of all common columns (in order of starting year 2011-2017)
collect = [b,z,c,y,d,x,a]

In [None]:
# One (size, distinct, size - distinct) triple per year, in the order of `collect`.
d = [
    (len(names), len(set(names)), len(names) - len(set(names)))
    for names in collect
]

years = ['2011','2012','2013','2014','2015','2016','2017']
cols = ['list','set','diff']

# rows = years, columns = list length / set length / their difference
q = pd.DataFrame(d, index=years, columns=cols)

In [None]:
# Display the per-year summary table (columns: 'list', 'set', 'diff').
q

- ***notes***:
    - all same set len
    - 2013-2017 each have 8 more entries in the list than in the set (8 non-unique)
    - 2011 and 2012 have 8 more non-unique than 2013-2017 (16 total)
- ***actions***:
    - ideal
        - leave 8 non-unique in all frames
        - remove non-unique 9-16 from 2011 and 2012
    - else
        - remove all non-unique values 


In [None]:
"""drop each frame's uncommon columns, reset index"""
# Every frame is reduced to the labels in `a`, then given a fresh row index.
# Selecting with a list already returns a new frame, so no prior copy is needed.
# 2011
k11 = y11[list(a)].reset_index()
# 2012
k12 = y12[list(a)].reset_index()
# 2013
k13 = y13[list(a)].reset_index()
# 2014
k14 = y14[list(a)].reset_index()
# 2015 -- must be selected from the 2015 frame itself (not from k13)
k15 = y15[list(a)].reset_index()
# 2016
k16 = y16[list(a)].reset_index()
# 2017
k17 = y17[list(a)].reset_index()

In [None]:
# Independent copies of the seven reduced frames, 2011..2017.
group = (k11.copy(),k12.copy(),k13.copy(),k14.copy(),k15.copy(),k16.copy(),k17.copy())
for _ in group:
    # info() writes its report itself and returns None, so it is called directly;
    # wrapping it in print() would add a stray "None" after every report.
    _.info()
    # blank line between reports
    print()

In [None]:
# Position-by-position check of the 2012 labels against 2011.
for i, _label in enumerate(k12.columns):
    if _label == k11.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k12.columns[i]}\n!=\n{k11.columns[i]}\n')

In [None]:
# Position-by-position check of the 2013 labels against 2011.
for i, _label in enumerate(k13.columns):
    if _label == k11.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k13.columns[i]}\n!=\n{k11.columns[i]}\n')

In [None]:
# Position-by-position check of the 2014 labels against 2011.
for i, _label in enumerate(k14.columns):
    if _label == k11.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k14.columns[i]}\n!=\n{k11.columns[i]}\n')

In [None]:
# Position-by-position check of the 2014 labels against 2012.
for i, _label in enumerate(k14.columns):
    if _label == k12.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k14.columns[i]}\n!=\n{k12.columns[i]}\n')

In [None]:
# Position-by-position check of the 2014 labels against 2015.
for i in range(len(k14.columns)):
    if k14.columns[i] != k15.columns[i]:
        # let us know where first non match is
        # (the message reports the 2015 label, i.e. the one actually compared)
        raise Exception(f'{i}\n{k14.columns[i]}\n!=\n{k15.columns[i]}\n')

In [None]:
# Position-by-position check of the 2014 labels against 2016.
for i, _label in enumerate(k14.columns):
    if _label == k16.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k14.columns[i]}\n!=\n{k16.columns[i]}\n')

In [None]:
# Position-by-position check of the 2014 labels against 2017.
for i, _label in enumerate(k14.columns):
    if _label == k17.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k14.columns[i]}\n!=\n{k17.columns[i]}\n')

In [None]:
# Position-by-position check of the 2015 labels against 2011.
for i, _label in enumerate(k15.columns):
    if _label == k11.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k15.columns[i]}\n!=\n{k11.columns[i]}\n')

In [None]:
# Position-by-position check of the 2016 labels against 2011.
for i, _label in enumerate(k16.columns):
    if _label == k11.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k16.columns[i]}\n!=\n{k11.columns[i]}\n')

In [None]:
# Position-by-position check of the 2017 labels against 2011.
for i, _label in enumerate(k17.columns):
    if _label == k11.columns[i]:
        continue
    # let us know where first non match is
    raise Exception(f'{i}\n{k17.columns[i]}\n!=\n{k11.columns[i]}\n')

In [None]:
# True only when 2015 and 2014 carry the same labels in the same order.
# (Comparing the two .all() results would only compare "are all labels truthy".)
k15.columns.equals(k14.columns)

- ***notes***:
    - getting too interesting for ipynb
- ***actions***:
    - drop all duplicate columns
    - unless dropping 8 from 2011/2012 somehow works
- ***extra***:
    - old-fashioned enumerated for loop: find columns that are 100% the same
        - reset, restart, &continue below
            - just because it's late, we're going to have fun and try to do it all in one run
                - code as if was py with 1 attempt (aka note well or die)
                - I want straight into model from here, nothing less

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

"""
STEP 0 >> imports; def clean
"""

# default cleaning method until proven otherwise
def clean_census_frame(csv_path , head=False , reset=True , set_index=False ):
    '''
    Load a census csv whose first data row holds the readable column names.

    inputs) 
        >> csv_path
            > path to csv
        >> head
            > default=False
            > if truthy (an integer)
                >> returns only the first {head} rows (using .head() method)
                    > instead of entire dataframe
        >> reset
            > default=True
                >> resets index after taking out the label row
                    > reset_index() keeps the old index as a new 'index' column
            > if falsy
                >> will not reset index (remaining rows keep their original labels)
        >> set_index
            > default=False
            > if truthy
                >> will set_index of new df to set_index
    output)
        >> dataframe whose columns are the readable names, without the row they came from
    how)
        1. reads in csv with low_memory=False
        2. locates readable column names and non-readable names
            > readable
                    > e.g. Estimate; SEX AND AGE - Total population
                >> assumes they are currently in row 0
            > non-readable
                    > e.g. HC01_VC03
                >> assumes they are currently == dataframe.columns
        3. replaces dataframe.columns (non-readable) with readable column names
            > and drops the old 0th row (row where readable names were stored)
    '''
    # load data; the frame is local to this call, so no defensive copy is needed
    df = pd.read_csv( csv_path , low_memory=False )

    # reset column names to current 0th row values
    df.columns = df.iloc[0]
    # frame without the row the names were taken from
    clean_df = df[1:]

    # default: renumber the rows
    if reset:
        clean_df = clean_df.reset_index()

    # optional: promote a column to the index
    if set_index:
        clean_df = clean_df.set_index(set_index)

    if head:
        # return first {head} rows of dataframe
        return clean_df.head(head)
    # return dataframe
    return clean_df


'''
STEP 1 >> load data, reset; make copies
'''

# load -- reset is left at its default, so each frame gets a renumbered row index
# 2011
twenty_eleven = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_11_5YR_DP05_with_ann.csv')
# 2012
twenty_twelve = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_12_5YR_DP05_with_ann.csv')
# 2013
twenty_thirteen = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_13_5YR_DP05_with_ann.csv')
# 2014
twenty_fourteen = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_14_5YR_DP05_with_ann.csv')
# 2015
twenty_fifteen = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_15_5YR_DP05_with_ann.csv')
# 2016
twenty_sixteen = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_16_5YR_DP05_with_ann.csv')
# 2017
twenty_seventeen = clean_census_frame(csv_path='../../data/American_Community_Survey/aff_download/ACS_17_5YR_DP05_with_ann.csv')

# copy -- working frames 2011..2017, in the same order as the loads above
y2k11, y2k12, y2k13, y2k14, y2k15, y2k16, y2k17 = (
    loaded.copy()
    for loaded in (
        twenty_eleven,
        twenty_twelve,
        twenty_thirteen,
        twenty_fourteen,
        twenty_fifteen,
        twenty_sixteen,
        twenty_seventeen,
    )
)

In [None]:
# [len(common_tags), len(set(common_tags))] , [len(_test_tags_), len(set(_test_tags_))]
# Number of distinct column labels in 2011 and in 2017.
# NOTE(review): tags11 / tags17 are only assigned in a cell above the restart cell
# in the visible notebook; on a fresh run starting there they would be undefined.
len(set(tags11)),len(set(tags17))

In [None]:
'''identify non common columns for specific frames'''
# NOTE(review): `common_tags` is not assigned anywhere in the visible notebook;
# it has to exist in the kernel before this cell runs.
def _not_shared(year_tags):
    '''Labels of one year that are absent from common_tags, in their original order.'''
    return [tag for tag in year_tags if tag not in common_tags]

# 2011
uncommon_11 = _not_shared(tags11)
# 2012
uncommon_12 = _not_shared(tags12)
# 2013
uncommon_13 = _not_shared(tags13)
# 2014
uncommon_14 = _not_shared(tags14)
# 2015
uncommon_15 = _not_shared(tags15)
# 2016
uncommon_16 = _not_shared(tags16)
# 2017
uncommon_17 = _not_shared(tags17)

In [None]:
# collect uncommon
# One list of year-specific labels per year, ordered 2011..2017.
uncommon = [uncommon_11,uncommon_12,uncommon_13,uncommon_14,uncommon_15,uncommon_16,uncommon_17]

In [None]:
# For each year: how many labels are year-specific, and how many of those are distinct.
for uncommon_tag in uncommon:
    print(f'len = {len(uncommon_tag)}\nunique = {len(set(uncommon_tag))}')

In [None]:
"""drop each frame's uncommon columns, reset index"""
def _without(frame, labels):
    '''Copy {frame}, drop the columns named in {labels}, then renumber the rows.'''
    trimmed = frame.copy().drop(labels, axis=1)
    return trimmed.reset_index()

# 2011
k11 = _without(y11, uncommon_11)
# 2012
k12 = _without(y12, uncommon_12)
# 2013
k13 = _without(y13, uncommon_13)
# 2014
k14 = _without(y14, uncommon_14)
# 2015
k15 = _without(y15, uncommon_15)
# 2016
k16 = _without(y16, uncommon_16)
# 2017
k17 = _without(y17, uncommon_17)

In [None]:
# Prints the info() report of the 2011, 2013, 2015 and 2017 frames.
# info() returns None, so the cell's own value is a tuple of four None.
y11.info(),y13.info(),y15.info(),y17.info()

In [None]:
"""don't forget, 2011 and 2012 have extra repeats, check for non-unique column instances in new dfs"""
# [position, label] pairs for every repeated label, per year.
# Only the column labels are read, so the frames are not copied first.
# 2011
a=test_non_unique(y11.columns)
# 2012
b=test_non_unique(y12.columns)
# 2013
c=test_non_unique(y13.columns)
# 2014
d=test_non_unique(y14.columns)
# 2015
e=test_non_unique(y15.columns)
# 2016
f=test_non_unique(y16.columns)
# 2017
g=test_non_unique(y17.columns)

In [None]:
# collection of all repeats which occur in all 7 dataframes (len==8)
# expected:
#   [[80, 'Estimate; SEX AND AGE - 18 years and over'],
#    [81, 'Margin of Error; SEX AND AGE - 18 years and over'],
#    [82, 'Percent; SEX AND AGE - 18 years and over'],
#    [83, 'Percent Margin of Error; SEX AND AGE - 18 years and over'],
#    [84, 'Estimate; SEX AND AGE - 65 years and over'],
#    [85, 'Margin of Error; SEX AND AGE - 65 years and over'],
#    [86, 'Percent; SEX AND AGE - 65 years and over'],
#    [87, 'Percent Margin of Error; SEX AND AGE - 65 years and over']]
# Each [position, label] pair of 2017 must be present in every other year's list.
# (Testing `pair in [a, b, c, d, e, f]` would compare the pair with the whole
# lists and never match.)
common_repeats = [_ for _ in g if all(_ in other for other in (a, b, c, d, e, f))]

# identify repeats occurring only in 2011 and 2012 (already checked are not unique to self)
# expected:
#   [[100, 'Estimate; RACE - One race'],
#    [101, 'Margin of Error; RACE - One race'],
#    [102, 'Percent; RACE - One race'],
#    [103, 'Percent Margin of Error; RACE - One race'],
#    [188, 'Estimate; RACE - Two or more races'],
#    [189, 'Margin of Error; RACE - Two or more races'],
#    [190, 'Percent; RACE - Two or more races'],
#    [191, 'Percent Margin of Error; RACE - Two or more races']]
# in 2011 and 2012, and in none of the later years
first_two_only = [
    i for i in a
    if i in b and not any(i in later for later in (c, d, e, f, g))
]

In [None]:
# Show both repeat lists, separated by a blank line.
print(f'{common_repeats}\n\n{first_two_only}')

In [None]:
# NOTE(review): first_two_only holds [position, label] pairs (the shape produced by
# test_non_unique), not plain column labels, so passing it to drop() is unlikely to
# remove the intended columns -- confirm before relying on this cell.
# NOTE(review): y11 / y12 are rebuilt from themselves, so re-running this cell
# applies drop() and reset_index() a second time.
# 2011
y11 = y11.copy().drop(first_two_only,axis=1).reset_index()
# # 2012
y12 = y12.copy().drop(first_two_only,axis=1).reset_index()

In [None]:
# Prints the info() report of the 2011, 2012, 2013, 2015 and 2017 frames.
# info() returns None, so the cell's own value is a tuple of five None.
y11.info(),y12.info(),y13.info(),y15.info(),y17.info()

In [None]:
# positions of the repeated RACE columns found only in 2011 and 2012
instances = [100,101,102,103,188,189,190,191]

def _drop_repeat_positions(frame, positions):
    '''
    Return {frame} without the columns sitting at {positions}.

    A column is removed only when its label already occurred further left, so the
    first occurrence of every label survives and running the cell a second time
    does not remove unrelated columns.  Dropping by label instead
    (frame.drop(frame.columns[positions], axis=1)) would remove every column
    carrying that label, including the first occurrence.
    '''
    # True for the 2nd+ occurrence of a label
    is_repeat = frame.columns.duplicated()
    at_position = np.zeros(frame.shape[1], dtype=bool)
    at_position[positions] = True
    return frame.iloc[:, ~(at_position & is_repeat)]

# adjust 2011
y11 = _drop_repeat_positions(y11, instances)
# adjust 2012
y12 = _drop_repeat_positions(y12, instances)


# return common_repeats,first_two_only  # [y11,y12,y13,y14,y15,y16,y17]

In [None]:
# NOTE(review): the `def bring_the_5yr_acs_2k11_thru_2k17():` line is commented out
# where it appears in this notebook, so this call relies on the function being
# defined somewhere else in the kernel.
test = bring_the_5yr_acs_2k11_thru_2k17()

In [None]:
# Display whatever bring_the_5yr_acs_2k11_thru_2k17() returned.
test

In [None]:
# Report every frame in `test`; info() prints by itself and returns None,
# so it is not wrapped in print() (that would emit "None" after each report).
for i in test:
    i.info()

In [None]:
# Size of common_tags.
# NOTE(review): common_tags is not assigned anywhere in the visible notebook.
len(common_tags)

In [None]:
# Copy of `test` to experiment on.
# NOTE(review): presumably `test` is a list of dataframes, in which case this is a
# shallow copy and both lists share the same frames -- confirm.
test_000 = test.copy()

In [None]:
# Row and column count of every frame; .shape works for any number of rows,
# whereas reading row 1 just to count columns fails on frames with fewer than two rows.
for i in range(len(test_000)):
    n_rows, n_cols = test_000[i].shape
    print(f'{n_rows} rows x {n_cols} columns')

In [None]:
# Column labels per frame; position 0..6 corresponds to 2011..2017.
_cols = [frame.columns for frame in test_000]

# 2011 and 2012 carry identical labels.  Index.equals also covers the length
# check and never raises, while `==` between Index objects of different length does.
_early_match = _cols[0].equals(_cols[1])
# 2013 matches 2014, and 2014 matches 2015
_mid_match = _cols[2].equals(_cols[3]) and _cols[3].equals(_cols[4])
# 2016 matches 2014, and 2017 matches 2015
_late_match = _cols[5].equals(_cols[3]) and _cols[6].equals(_cols[4])
# the two groups differ in width
# NOTE(review): the second half compares 2016 with 2017; comparing 2016 with
# itself would always be true -- confirm this is the intended pair.
_groups_differ = len(_cols[0]) != len(_cols[5]) and len(_cols[5]) == len(_cols[6])

if _early_match and _mid_match and _late_match and _groups_differ:
    print('pretty ok to assume\n2011-2012 are identical and 2013-2017 are identical\nbut 2011-2012 and 2013-2017 are different')
        

In [None]:
# first and fifth entries of test_000 (2011 and 2015 when ordered 2011..2017)
_2k11, _2k15 = test_000[0], test_000[4]

In [None]:
# Total number of columns in each of the two frames.
len(_2k11.columns), len(_2k15.columns)

In [None]:
# Number of distinct column labels in each of the two frames.
len(set(_2k11.columns)), len(set(_2k15.columns))

In [None]:
# out_15 collects each label once; repeat_15 counts every later occurrence.
out_15, repeat_15 = [], 0
for i in _2k15.columns:
    if i in out_15:
        # already collected: show it and count it
        print(i)
        repeat_15 = repeat_15 + 1
        continue
    out_15.append(i)
repeat_15

In [None]:
# out_11 collects each label once; repeat_11 counts every later occurrence.
out_11, repeat_11 = [], 0
for i in _2k11.columns:
    if i in out_11:
        # already collected: show it and count it
        print(i)
        repeat_11 = repeat_11 + 1
        continue
    out_11.append(i)
repeat_11

In [None]:
'''identify columns'''
# One frame per release, original row labels kept (reset=False).
# 2011
a = clean_census_frame(csv_path='../data/acs/aff_download/ACS_11_5YR_DP05_with_ann.csv', reset=False)
# 2012
b = clean_census_frame(csv_path='../data/acs/aff_download/ACS_12_5YR_DP05_with_ann.csv', reset=False)
# 2013
c = clean_census_frame(csv_path='../data/acs/aff_download/ACS_13_5YR_DP05_with_ann.csv', reset=False)
# 2014
d = clean_census_frame(csv_path='../data/acs/aff_download/ACS_14_5YR_DP05_with_ann.csv', reset=False)
# 2015
e = clean_census_frame(csv_path='../data/acs/aff_download/ACS_15_5YR_DP05_with_ann.csv', reset=False)
# 2016
f = clean_census_frame(csv_path='../data/acs/aff_download/ACS_16_5YR_DP05_with_ann.csv', reset=False)
# 2017
g = clean_census_frame(csv_path='../data/acs/aff_download/ACS_17_5YR_DP05_with_ann.csv', reset=False)

In [None]:
# column count of every loaded frame, 2011..2017
for j in (a, b, c, d, e, f, g):
    width = len(j.columns)
    print(width)

In [None]:
'''identify common columns'''
# collection of columns appearing in all 7 dataframes 2011-2017
# Built with plain set intersection, computed once; `&` between Index objects
# is not a set intersection in current pandas.
common = set(g.columns).intersection(
    a.columns, b.columns, c.columns, d.columns, e.columns, f.columns
)

In [None]:
'''identify non common columns for specific frames'''
def _outside_common(frame):
    '''Column labels of {frame} that are not in `common`, in column order.'''
    return [tag for tag in frame.columns if tag not in common]

# 2011
u11 = _outside_common(a)
# 2012
u12 = _outside_common(b)
# 2013
u13 = _outside_common(c)
# 2014
u14 = _outside_common(d)
# 2015
u15 = _outside_common(e)
# 2016
u16 = _outside_common(f)
# 2017
u17 = _outside_common(g)

In [None]:
"""drop each frame's uncommon columns, reset index"""
def _keep_common(frame, extra_labels):
    '''Copy {frame}, drop the columns named in {extra_labels}, then renumber the rows.'''
    trimmed = frame.copy().drop(extra_labels, axis=1)
    return trimmed.reset_index()

# 2011
aa = _keep_common(a, u11)
# 2012
bb = _keep_common(b, u12)
# 2013
cc = _keep_common(c, u13)
# 2014
dd = _keep_common(d, u14)
# 2015
ee = _keep_common(e, u15)
# 2016
ff = _keep_common(f, u16)
# 2017
gg = _keep_common(g, u17)

In [None]:
# trimmed frames, 2011..2017
t = [aa, bb, cc, dd, ee, ff, gg]
for i in t:
    # number of columns left after trimming
    n_columns = len(i.columns)
    print(n_columns)

In [None]:
# column labels of every trimmed frame, 2011..2017 in order
acol, bcol, ccol, dcol, ecol, fcol, gcol = (
    trimmed.columns for trimmed in (aa, bb, cc, dd, ee, ff, gg)
)

def test_non_unique(column_names):
    first_occour = []
    non_unique = []
    for i,_ in enumerate(column_names):
        if _ not in first_occour:
            first_occour.append(_)
        else:
            non_unique.append([i,_])
    return non_unique

# NOTE: a..g are rebound here from dataframes to lists of [position, label] pairs.
a, b, c, d, e, f, g = (
    test_non_unique(labels)
    for labels in (acol, bcol, ccol, dcol, ecol, fcol, gcol)
)

# Start from the 2017 repeats also found in 2011, then keep narrowing: each pass
# takes the pairs of the next year that survived so far.  The last pass runs over
# 2011 again, so the result is in 2011 order.
common_repeats = [pair for pair in g if pair in a]
for year_repeats in (b, c, d, e, f, a):
    common_repeats = [pair for pair in year_repeats if pair in common_repeats]

# common_repeats 
# pairs repeated in 2011 and 2012 but in none of 2013..2017
for i in a:
    if i in b and not any(i in later for later in (c, d, e, f, g)):
        print(i)
common_repeats

- ***notes***:
    - the following occur as repeats in all years (2011-2017)
        - [[80, 'Estimate; SEX AND AGE - 18 years and over'],
        - [81, 'Margin of Error; SEX AND AGE - 18 years and over'],
        - [82, 'Percent; SEX AND AGE - 18 years and over'],
        - [83, 'Percent Margin of Error; SEX AND AGE - 18 years and over'],
        - [84, 'Estimate; SEX AND AGE - 65 years and over'],
        - [85, 'Margin of Error; SEX AND AGE - 65 years and over'],
        - [86, 'Percent; SEX AND AGE - 65 years and over'],
        - [87, 'Percent Margin of Error; SEX AND AGE - 65 years and over']]
    - the following occur as repeats in 2011 and 2012 but no other years
        - [100, 'Estimate; RACE - One race']
        - [101, 'Margin of Error; RACE - One race']
        - [102, 'Percent; RACE - One race']
        - [103, 'Percent Margin of Error; RACE - One race']
        - [188, 'Estimate; RACE - Two or more races']
        - [189, 'Margin of Error; RACE - Two or more races']
        - [190, 'Percent; RACE - Two or more races']
        - [191, 'Percent Margin of Error; RACE - Two or more races']
- ***actions***:
    - remove the occurrences only seen in 2011 and 2012
        - ensure they lead to columns being equal (as assumed)

- ***notes***:
    - while different len, the column names are the same across the board

In [None]:
# Trimmed frames gathered in chronological order (2011 first, 2017 last).
out = [aa,bb,cc,dd,ee,ff,gg]

In [None]:
# 2011 trimmed frame: total columns, distinct labels via set(), distinct labels via Index.unique().
len(aa.columns), len(set(aa.columns)),len(aa.columns.unique())
# pre set(common) on non-common lists (220, 204, 204)
# post set(common) on non-common lists (220, 204, 204)

In [None]:
# x: dropped labels per year; y: trimmed frames per year (both 2011..2017)
x = [u11, u12, u13, u14, u15, u16, u17]
y = out

# dropped labels: total and distinct
o = list(map(len, x))
so = [len(set(labels)) for labels in x]

# kept columns: total and distinct
co = [len(frame.columns) for frame in y]
cso = [len(set(frame.columns)) for frame in y]

years = ['2011','2012','2013','2014','2015','2016','2017']
cols = ['list','set','diff']

# one [total, distinct, total - distinct] row per year
os = [[total, distinct, total - distinct] for total, distinct in zip(o, so)]
cos = [[total, distinct, total - distinct] for total, distinct in zip(co, cso)]

# q: dropped labels, cq: kept columns
q = pd.DataFrame(os, index=years, columns=cols)
cq = pd.DataFrame(cos, index=years, columns=cols)

In [None]:
# reallycommon = [_.columns for _ in y]
# unicommon = [_.columns.unique() for _ in y]
# # len(reallycommon[0]),len(unicommon[0])
# r=reallycommon[0].drop(reallycommon.index[[79,80]])
# ur=[unicommon][0][0]
# for u,i in enumerate(r):
#     if r[u]!=ur[u]:
#         print(f'{u}\n{r[u]}\n{ur[u]}\n')
# # len(r),len(ur)


In [None]:
# Display the per-year table built from the dropped-label lists (u11..u17).
q

In [None]:
# Display the per-year table built from the columns of the trimmed frames.
cq


In [None]:
# 2017 trimmed frame: total columns, distinct labels via set(), distinct labels via Index.unique().
len(gg.columns), len(set(gg.columns)),len(gg.columns.unique())
# pre set(common) on non-common lists (212, 204, 204)
# post set(common) on non-common lists (212, 204, 204)

In [None]:
# Size of `common`, before and after another set() pass.
# NOTE(review): the first and third entries are the same expression.
len(common), len(set(common)), len(common)
# pre set(common) on non-common lists (211, 203, 203)
# post set(common) on non-common lists (211, 203, 203)

In [None]:
# Number of rows vs. number of distinct 'Geography' values in the 2011 trimmed frame.
len(aa['Geography']) , len(aa['Geography'].unique()) 


In [None]:
# X, y = make_classification(n_classes=3, n_features=2, n_redundant=0,
#                            n_informative=2, n_clusters_per_class=1,
#                            class_sep=1, random_state=5)
# print(y.shape)
# _knn = KNearestNeighbors(4, cosine_distance)
# _knn.fit(X, y)

In [None]:
# # load 2011 
# y2k11 = clean_census_frame('../data/acs/aff_download/ACS_11_5YR_DP05_with_ann.csv',reset=False)
# # copy
# y11 = (y2k11.copy())
# # 2012
# y2k12 = clean_census_frame('../data/acs/aff_download/ACS_12_5YR_DP05_with_ann.csv',reset=False)
# y12 = y2k12.copy()
# #2013
# y2k13 = clean_census_frame('../data/acs/aff_download/ACS_13_5YR_DP05_with_ann.csv',reset=False)
# y13 = y2k13.copy()
# # 2014
# y2k14 = clean_census_frame('../data/acs/aff_download/ACS_14_5YR_DP05_with_ann.csv',reset=False)
# y14 = y2k14.copy()
# # 2015
# y2k15 = clean_census_frame('../data/acs/aff_download/ACS_15_5YR_DP05_with_ann.csv',reset=False)
# y15 = y2k15.copy()
# #2016
# y2k16 = clean_census_frame('../data/acs/aff_download/ACS_16_5YR_DP05_with_ann.csv',reset=False)
# y16 = y2k16.copy()
# #2017
# y2k17 = clean_census_frame('../data/acs/aff_download/ACS_17_5YR_DP05_with_ann.csv',reset=False)
# y17 = y2k17.copy()