In [None]:
import time
import numpy as np
import pandas as pd

In [None]:
# Census age-groups-and-sex extracts (DEC_00 / DEC_10, SF1 QTP1):
# one annotated data table and one metadata table per census year.

# 2000 census: data table, then its metadata
bush = pd.read_csv(
    '../data/2000/age-groups-and-sex-census-DEC_00_SF1_QTP1/DEC_00_SF1_QTP1_with_ann.csv',
    low_memory=False,
)
b_metadata = pd.read_csv(
    '../data/2000/age-groups-and-sex-census-DEC_00_SF1_QTP1/DEC_00_SF1_QTP1_metadata.csv'
)

# 2010 census: data table, then its metadata
obama = pd.read_csv(
    '../data/2010/age-groups-and-sex-census-DEC_10_SF1_QTP1/DEC_10_SF1_QTP1_with_ann.csv',
    low_memory=False,
)
o_metadata = pd.read_csv(
    '../data/2010/age-groups-and-sex-census-DEC_10_SF1_QTP1/DEC_10_SF1_QTP1_metadata.csv'
)

In [None]:
# same metadata , should make for good comps
# DataFrame.info() only prints a summary and returns None, so comparing two
# .info() calls is always True whatever the tables contain.
# Compare the metadata tables themselves instead.
b_metadata.equals( o_metadata )

In [None]:
# summary statistics for the raw 2000 table (row 0 still holds the readable column labels)
bush.describe()

In [None]:
# summary statistics for the raw 2010 table (row 0 still holds the readable column labels)
obama.describe()

In [None]:
# first rows of the raw 2000 table
bush.head()

In [None]:
# first rows of the raw 2010 table
obama.head()

In [None]:
# compare dataframes

def _membership_split( b_values , o_values ):
    '''Walk two equal-length lists position by position and sort their values.

    At each position:
      - a 2000 value that is missing from the 2010 list goes to `b_only`
      - a 2010 value that is missing from the 2000 list goes to `o_only`;
        otherwise the [ 2000 value , 2010 value ] pair found at that position
        goes to `both` (whether or not the 2000 value itself was found)

    Returns ( both , b_only , o_only ).
    '''
    both , b_only , o_only = [] , [] , []
    for b_value , o_value in zip( b_values , o_values ):
        if b_value not in o_values:
            b_only.append( b_value )
        if o_value not in b_values:
            o_only.append( o_value )
        else:
            both.append( [ b_value , o_value ] )
    return both , b_only , o_only


# --- coded column names , e.g. SUBHD0103_S01, HD03_S41 ---
# the header of each raw table
b_hi_column_names = list( bush.columns.values )
o_hi_column_names = list( obama.columns.values )

# only compared when both tables have the same number of columns
if len( b_hi_column_names ) == len( o_hi_column_names ):
    hi_both , hi_b_only , hi_o_only = _membership_split( b_hi_column_names , o_hi_column_names )
    print(f'hi\nboth = {len(hi_both)}\n2000 but not 2010 = {len(hi_b_only)}\n2010 but not 2000 = {len(hi_o_only)}')


# --- commonly understandable column names ,
#     e.g. Number - Female; Total population, Males per 100 females; Total population - Medi... ---
# these sit in row 0 of each raw table
b_lo_column_names = list( bush.iloc[ 0 ] )
o_lo_column_names = list( obama.iloc[ 0 ] )

if len( b_lo_column_names ) == len( o_lo_column_names ):
    lo_both , lo_b_only , lo_o_only = _membership_split( b_lo_column_names , o_lo_column_names )
    print(f'lo\nboth = {len(lo_both)}\n2000 but not 2010 = {len(lo_b_only)}\n2010 but not 2000 = {len(lo_o_only)}')


# --- first row of numbered data , e.g. 18570, 100.0 ---
# this is row 1 of each raw table
b_0th_real = list( bush.iloc[ 1 ] )
o_0th_real = list( obama.iloc[ 1 ] )

if len( b_0th_real ) == len( o_0th_real ):
    both_0th , b_only_0th , o_only_0th = _membership_split( b_0th_real , o_0th_real )
    print(f'data\nboth = {len(both_0th)}\n2000 but not 2010 = {len(b_only_0th)}\n2010 but not 2000 = {len(o_only_0th)}')

- ***notes***:
    - while the high level labels don't seem to coexist 
        - any actual column name in 2000 is seen in 2010 and vice versa
    - under 1/3 
        - of values seen in the first row of data for either the 2000 or the 2010 datasets
            - are seen in that same row on the other dataset 
- ***actions***:
    - repeat the tests but check for corresponding index instead of coexistence 

In [None]:
# compare on index 

def _positional_split( b_values , o_values ):
    '''Compare two equal-length lists position by position.

    Each position yields a [ position , 2000 value , 2010 value ] record, filed
    under `same` when the two values are equal and under `different` otherwise.

    Returns ( same , different ).
    '''
    same , different = [] , []
    for position , ( b_value , o_value ) in enumerate( zip( b_values , o_values ) ):
        record = [ position , b_value , o_value ]
        if b_value != o_value:
            different.append( record )
        else:
            same.append( record )
    return same , different


# --- coded column names , e.g. SUBHD0103_S01, HD03_S41 ---
# the header of each raw table
b_hi_column_names = list( bush.columns.values )
o_hi_column_names = list( obama.columns.values )

# only compared when both tables have the same number of columns
if len( b_hi_column_names ) == len( o_hi_column_names ):
    same_hi , not_the_hi = _positional_split( b_hi_column_names , o_hi_column_names )
    print(f'hi level\nboth the same = {len(same_hi)}\nnot the same = {len(not_the_hi)}')


# --- commonly understandable column names ,
#     e.g. Number - Female; Total population, Males per 100 females; Total population - Medi... ---
# these sit in row 0 of each raw table
b_lo_column_names = list( bush.iloc[ 0 ] )
o_lo_column_names = list( obama.iloc[ 0 ] )

if len( b_lo_column_names ) == len( o_lo_column_names ):
    same_lo , not_the_lo = _positional_split( b_lo_column_names , o_lo_column_names )
    print(f'common\nboth the same = {len(same_lo)}\nnot the same = {len(not_the_lo)}')


# --- first row of numbered data , e.g. 18570, 100.0 ---
# this is row 1 of each raw table
b_0th_real = list( bush.iloc[ 1 ] )
o_0th_real = list( obama.iloc[ 1 ] )

if len( b_0th_real ) == len( o_0th_real ):
    same_0th , not_the_0th = _positional_split( b_0th_real , o_0th_real )
    print(f'data\nboth the same = {len(same_0th)}\nnot the same = {len(not_the_0th)}')

- ***notes***:
    - hi level
        - first 3 are the same
            - as expected
    - commonly understandable column names
        - perfect match
            - as expected
    - first row of data
        - less than 4.5% the same
            - about 85% fewer occurrences than when testing existence of same instance in other df
        - good
- ***actions***:
    - set column names to commonly understandable column names
    - visualize change from 2000 to 2010

In [None]:
# work on copies so the raw tables stay untouched

# 2000 Census: every row after row 0 , with row 0 promoted to the column names
b = bush.iloc[1:].copy()
b.columns = bush.iloc[0]

# 2010 Census: same treatment
o = obama.iloc[1:].copy()
o.columns = obama.iloc[0]

- ***actions***:
    - find commonalities to compare by
        - Id , Id2 , Geography
            - seem to be good possibilities

In [None]:
try_these = [ 'Id' , 'Id2' , 'Geography' ]

# Both frames lost row 0 to the header, so their row labels start at 1.
# The labels present in both frames therefore run from 1 to the length of the
# shorter frame, inclusive. (The previous bound hard-coded the 58-row length
# difference and stopped one label short of the last shared row.)
shared_row_count = min( len( b ) , len( o ) )

for column_name in try_these:
    print(column_name)
    b_column = b[ column_name ]
    o_column = o[ column_name ]
    # [ row label , 2000 value , 2010 value ] records
    same = []
    diff = []
    for _ in range( 1 , shared_row_count + 1 ):
        record = [ _ , b_column[ _ ] , o_column[ _ ] ]
        if record[ 1 ] == record[ 2 ]:
            same.append( record )
        else:
            diff.append( record )
    print(f'same = {len(same)}\ndiff = {len(diff)}')
    # row labels where both years agree; read without mutating the records in `same`
    i_same = [ instance[ 0 ] for instance in same ] 
    # print(i_same)

In [None]:
# 3 lists just output
# Hard-coded row labels, one list per candidate column (Id, Id2, Geography).
# NOTE(review): presumably pasted from the previous cell's commented-out
# print(i_same); they will not refresh if the data or that cell changes.
by_id = [1, 2, 3, 44, 88, 89, 90, 19635, 19637, 19638, 19639, 19640, 19641, 19642, 19643, 19644, 19645, 19646, 19647, 19648, 19649, 19650, 19651, 19652, 19653, 19743, 19744, 20642, 20643, 20644, 20645, 20646, 20647, 20648, 20649, 20650, 20651, 20652, 20653, 20654, 20655, 20656, 20657, 20658, 20659, 20812, 20813, 20814, 20815, 20816, 20817, 20818, 20819, 20820, 20821, 20822, 20823, 20824, 20825, 20826, 20827, 20828, 20893, 20894, 20895, 20896, 20897, 20898, 20899, 20900, 20901, 20902, 20903, 20904, 20905, 20906, 20907, 20908, 20992, 20993, 20994, 20995, 20996, 20997, 20998, 20999, 21000, 21001, 21002, 21003, 21004, 21005, 21006, 21007, 21008, 21009, 21010, 21011, 21012, 21013, 21014, 21064, 21065, 21066, 21067, 21068, 21069, 21070, 21071, 21072, 21073, 21074, 21075, 21076, 21077, 21078, 21079, 21080, 21081, 21082, 21083, 21084, 21085, 21086, 21087, 21088, 21089, 21090, 21091, 21092, 21093, 21094, 21523, 21524, 21778, 21779, 21780, 21781, 21782, 21783, 21784, 21785, 21786, 21787, 21788, 21789, 21790, 21791, 21792, 21793, 21794, 21839, 21840, 22138, 22146, 22147, 22148, 22149, 22150, 22151, 22152, 22225, 22226, 22227, 22228]
by_id2 = [1, 2, 3, 44, 88, 89, 90, 19635, 19637, 19638, 19639, 19640, 19641, 19642, 19643, 19644, 19645, 19646, 19647, 19648, 19649, 19650, 19651, 19652, 19653, 19743, 19744, 20642, 20643, 20644, 20645, 20646, 20647, 20648, 20649, 20650, 20651, 20652, 20653, 20654, 20655, 20656, 20657, 20658, 20659, 20812, 20813, 20814, 20815, 20816, 20817, 20818, 20819, 20820, 20821, 20822, 20823, 20824, 20825, 20826, 20827, 20828, 20893, 20894, 20895, 20896, 20897, 20898, 20899, 20900, 20901, 20902, 20903, 20904, 20905, 20906, 20907, 20908, 20992, 20993, 20994, 20995, 20996, 20997, 20998, 20999, 21000, 21001, 21002, 21003, 21004, 21005, 21006, 21007, 21008, 21009, 21010, 21011, 21012, 21013, 21014, 21064, 21065, 21066, 21067, 21068, 21069, 21070, 21071, 21072, 21073, 21074, 21075, 21076, 21077, 21078, 21079, 21080, 21081, 21082, 21083, 21084, 21085, 21086, 21087, 21088, 21089, 21090, 21091, 21092, 21093, 21094, 21523, 21524, 21778, 21779, 21780, 21781, 21782, 21783, 21784, 21785, 21786, 21787, 21788, 21789, 21790, 21791, 21792, 21793, 21794, 21839, 21840, 22138, 22146, 22147, 22148, 22149, 22150, 22151, 22152, 22225, 22226, 22227, 22228]
by_geo = [1, 2, 3, 44, 88, 89, 90, 19635, 19637, 19638, 19639, 19640, 19641, 19642, 19643, 19644, 19645, 19646, 19647, 19648, 19649, 19650, 19651, 19652, 19653, 19743, 19744, 20642, 20643, 20644, 20645, 20646, 20647, 20648, 20649, 20650, 20651, 20652, 20653, 20654, 20655, 20656, 20657, 20658, 20659, 20812, 20813, 20814, 20815, 20816, 20817, 20818, 20819, 20820, 20821, 20822, 20823, 20824, 20825, 20826, 20827, 20828, 20893, 20894, 20895, 20896, 20897, 20898, 20899, 20900, 20901, 20902, 20903, 20904, 20905, 20906, 20907, 20908, 20992, 20993, 20994, 20995, 20996, 20997, 20998, 20999, 21000, 21001, 21002, 21003, 21004, 21005, 21006, 21007, 21008, 21009, 21010, 21011, 21012, 21013, 21014, 21064, 21065, 21066, 21067, 21068, 21069, 21070, 21071, 21072, 21073, 21074, 21075, 21076, 21077, 21078, 21079, 21080, 21081, 21082, 21083, 21084, 21085, 21086, 21087, 21088, 21089, 21090, 21091, 21092, 21093, 21094, 21523, 21524, 21778, 21779, 21780, 21781, 21782, 21783, 21784, 21785, 21786, 21787, 21788, 21789, 21790, 21791, 21792, 21793, 21794, 21839, 21840, 22138, 22146, 22147, 22148, 22149, 22150, 22151, 22152, 22225, 22226, 22227, 22228]

# the three label lists must be identical
if not ( by_id == by_id2 and by_id2 == by_geo ):
    raise Exception('FALSE == by_id == by_id2 == by_geo == False')

# and at every listed row label, each of the three columns (Id, Id2, Geography)
# must hold the same value in the 2000 frame as in the 2010 frame
for row_labels in ( by_id , by_id2 , by_geo ):
    for row_label in row_labels:
        for column in ( 'Id' , 'Id2' , 'Geography' ):
            b_value = b[ column ][ row_label ]
            o_value = o[ column ][ row_label ]
            # stop at the first 2000 vs 2010 mismatch
            if b_value != o_value:
                raise Exception(f'b.{column}[_] != o.{column}[_]\n{b_value} != {o_value}')


- ***notes***:
    - why are these values the same?
    - what do they mean/represent?
- ***actions***:
    - answer above

In [None]:
# number of rows in the 2010 frame
len(o.Geography)

In [None]:
# last 5 characters of every 'Geography' value, looked up by row label 1 .. len
last_5_2000_geo = [ b.Geography[ row_label ][-5:] for row_label in range( 1 , len( b.Geography ) + 1 ) ]
last_5_2010_geo = [ o.Geography[ row_label ][-5:] for row_label in range( 1 , len( o.Geography ) + 1 ) ]

# note the index patterns:
# a frame's row label is one more than the matching list position, and the same
# code sits at different row labels in the 2000 and 2010 frames
# ( 2000 row label , 2010 row label , error raised when the four values disagree )
spot_checks = (
    ( 33113 , 33062 , 'ERROR' ) ,
    ( 33114 , 33063 , 'ERROR1' ) ,
    ( 33110 , 33059 , 'ERROR2' ) ,
)
for b_label , o_label , message in spot_checks:
    if not ( b.Geography[ b_label ][-5:] == last_5_2000_geo[ b_label - 1 ] == o.Geography[ o_label ][-5:] == last_5_2010_geo[ o_label - 1 ] ):
        raise Exception( message )

In [None]:
# each 5-character code from the end of each row in 2000 'Geography'
bgeo = [ _[-5:] for _ in b.Geography ]
# same codes as a set, so each membership test is a hash lookup, not a list scan
bgeo_lookup = set( bgeo )

# collect coexisting instances: 2010 codes that also end a 2000 'Geography' value.
# Iterating the values visits every 2010 row; the previous
# range( 1 , len( o.Geography ) ) loop stopped one row label short of the last row.
collect_locs = [ geo[-5:] for geo in o.Geography if geo[-5:] in bgeo_lookup ]
# number of coexisting instances
x = len( collect_locs )
# collect unique coexisting instances
uni_col_locs = set( collect_locs )

# number of coexisting , size of that list , number of unique , len 2000 vs 2010
x , len(collect_locs) , len(list(uni_col_locs)) , len(b.Geography) - len(o.Geography)

- ***notes***:
    - ok, this is large
        - readings credit
            - did a few pages of this https://www.census.gov/prod/cen2000/doc/sf1.pdf 
                - 494-497 + skim 
            - followed by https://www.quora.com/U-S-Census-Bureau-How-does-Zip-Code-Tabulation-Area-is-created 
    - in short,
        - the US Census Bureau doesn't strictly use postal zip code when determining 5-digit zip
            - leave out some very small zips
            - made zips for large bodies of water and land which did not have zip codes
                - this was done in 2000 and not done in 2010
    - what we've found:
        - all 5 digit codes (found at end of each object) in 2010 'Geography' column 
            - are seen at the end of an object in the 2000 'Geography' column 
- ***actions***:
    - drop rows whose 5-digit code is not seen in the other year's data
        - see if df is then ready for straight compare

In [None]:
# identify zip codes from 2000 .Geography (last 5 chars of string)
# kept as strings: some codes are not numeric, e.g. int('006HH') raises ValueError
zip_2000_codes = [q[-5:] for q in b.Geography]
# identify zip codes from 2010 .Geography (last 5 chars of string)
zip_2010_codes = [q[-5:] for q in o.Geography]

# set views of the same codes: membership tests below become hash lookups
# instead of a full list scan for every row
codes_2000 = set(zip_2000_codes)
codes_2010 = set(zip_2010_codes)

# every entry below is a (position in that year's code list, code) pair -- sample: (2, 'c')

# from 2000.Geography , instance is not seen in 2010.Geography
in_2000_but_not_2010_from_2000 = [zip_code for zip_code in enumerate(zip_2000_codes) if zip_code[1] not in codes_2010]
# from 2010.Geography , instance is not seen in 2000.Geography
in_2010_but_not_2000_from_2010 = [zip_code for zip_code in enumerate(zip_2010_codes) if zip_code[1] not in codes_2000]

# from 2000.Geography , instance is seen in 2010.Geography
in_2000_and_2010_from_2000 = [zip_code for zip_code in enumerate(zip_2000_codes) if zip_code[1] in codes_2010]
# from 2010.Geography , instance is seen in 2000.Geography
in_2010_and_2000_from_2010 = [zip_code for zip_code in enumerate(zip_2010_codes) if zip_code[1] in codes_2000]

In [None]:
# number of 2010 rows whose code never appears in the 2000 data
len(in_2010_but_not_2000_from_2010)

In [None]:
# output recorded from an earlier run:
#   len(in_2000_but_not_2010_from_2000) = 1800
#   len(in_2010_but_not_2000_from_2010) = 1742
#   len(in_2000_and_2010_from_2000) = 31378
#   len(in_2010_and_2000_from_2010) = 31378
print(
    f'len(in_2000_but_not_2010_from_2000) = {len(in_2000_but_not_2010_from_2000)}\n'
    f'len(in_2010_but_not_2000_from_2010) = {len(in_2010_but_not_2000_from_2010)}\n'
    f'len(in_2000_and_2010_from_2000) = {len(in_2000_and_2010_from_2000)}\n'
    f'len(in_2010_and_2000_from_2010) = {len(in_2010_and_2000_from_2010)}'
)

- ***notes***:
    - len(in_2000_and_2010_from_2000) == len(in_2010_and_2000_from_2010) == 31378
        - big relief 
- ***actions***:
    - convert 2010 and 2000 dataframes based on shared values in .Geography 
        - and
            - drop 'ZCTA5' from 'Geography' rows
            - check that in_2000_but_not_2010_from_2000 doesn't share with in_2010_but_not_2000_from_2010
        - potential issues
            - same 5-digit zip code is not actually same place
        - solutions
            - any changes of this sort have been documented
                - review prior readings on overview
    - the cells below, run before editing the df, confirm assumptions and provide evidence for the df editing strategies

In [None]:
# The two "only" lists must not share an entry. Entries are (position, code)
# pairs, so two entries are equal only when position and code both match.

# first 2000-only entry that also appears in the 2010-only list, if any
clash = next(
    ( entry for entry in in_2000_but_not_2010_from_2000 if entry in in_2010_but_not_2000_from_2010 ) ,
    None ,
)
if clash is not None:
    raise Exception(f'{clash} in in_2010_but_not_2000_from_2010')

# and the same check in the other direction
clash = next(
    ( entry for entry in in_2010_but_not_2000_from_2010 if entry in in_2000_but_not_2010_from_2000 ) ,
    None ,
)
if clash is not None:
    raise Exception(f'{clash} in in_2000_but_not_2010_from_2000')

In [None]:
# Split the 2000-only entries by whether the code contains an 'X' or an 'H'.
# lettered   : (position, code) pairs whose code contains 'X' or 'H'
# no_letters : the remaining codes converted to int
lettered = []
no_letters = []
for entry in in_2000_but_not_2010_from_2000:
    code = entry[1]
    if not isinstance( code , str ):
        raise Exception(f'NON STR INSTANCE , {type(code)}')
    # the two cases are exhaustive, so no third branch is needed
    if 'X' in code or 'H' in code:
        lettered.append( entry )
    else:
        no_letters.append( int( code ) )

# None of the 2000-only codes may appear among the 2010-only codes.
# Compare code with code: the earlier check compared ints and 2000-positioned
# pairs against 2010 (position, code) pairs, which could never match on an int
# and only matched a pair when the positions happened to coincide.
codes_only_in_2010 = { code for position , code in in_2010_but_not_2000_from_2010 }
for entry in in_2000_but_not_2010_from_2000:
    if entry[1] in codes_only_in_2010:
        raise Exception( entry )

In [None]:
# sizes of the two groups built from the 2000-only codes
print(f'lettered = {len(lettered)}\nno_letters = {len(no_letters)}')

In [None]:
# Split the 2010-only entries by whether the code contains an 'X' or an 'H'.
# lettered2   : (position, code) pairs whose code contains 'X' or 'H'
# no_letters2 : the remaining codes converted to int
lettered2 = []
no_letters2 = []
for entry in in_2010_but_not_2000_from_2010:
    code = entry[1]
    if not isinstance( code , str ):
        raise Exception(f'NON STR INSTANCE , {type(code)}')
    # the two cases are exhaustive; the old third branch could never run
    if 'X' in code or 'H' in code:
        lettered2.append( entry )
    else:
        no_letters2.append( int( code ) )

In [None]:
# group sizes side by side: 2000-only (lettered, no_letters) vs 2010-only (lettered2, no_letters2)
print(f'lettered = {len(lettered)}\nlettered2 = {len(lettered2)}\nno_letters = {len(no_letters)}\nno_letters2 = {len(no_letters2)}')

In [None]:
# in_2010_but_not_2000_from_2010 and in_2000_but_not_2010_from_2000 are completely different
# (entries are (position, code) pairs, compared as whole pairs)

# 2010_not_2000 vs 2000_not_2010
clash = next(
    ( entry for entry in in_2010_but_not_2000_from_2010 if entry in in_2000_but_not_2010_from_2000 ) ,
    None ,
)
if clash is not None:
    raise Exception(f'in_2010_but_not_2000_from_2010 {clash}')

# 2000_not_2010 vs 2010_not_2000
clash = next(
    ( entry for entry in in_2000_but_not_2010_from_2000 if entry in in_2010_but_not_2000_from_2010 ) ,
    None ,
)
if clash is not None:
    raise Exception(f'in_2010_but_not_2000_from_2010 {clash}')

In [None]:
# the shared codes must come in the same order in both years
for position , entry_2010 in enumerate( in_2010_and_2000_from_2010 ):
    entry_2000 = in_2000_and_2010_from_2000[ position ]
    if entry_2010[1] == entry_2000[1]:
        continue
    # note: only the codes (j) are compared; the positions (i) are not the same
    # in_2010_and_2000_from_2010[3] = (3, '00606')
    # in_2000_and_2010_from_2000[3] = (4, '00606')
    # in_2010_and_2000_from_2010[0] = (0, '00601')
    # in_2000_and_2010_from_2000[0] = (0, '00601')
    raise Exception(f'{entry_2010}\n!= ERROR != ERROR !=\n{entry_2000}')

- ***note***:
    - the index mismatch is explained in the comments of the cell above
- ***action***:
    - when selecting rows by index, use the i (from each (i, j) pair) that comes from that year's own df 

In [None]:
# positions, within each year's own code list, of the codes present in both years
of_2000_indexes = [ entry[0] for entry in in_2000_and_2010_from_2000 ]
of_2010_indexes = [ entry[0] for entry in in_2010_and_2000_from_2010 ]

# the codes themselves (j) must form the same list in both years ...
shared_codes_2000 = [ entry[1] for entry in in_2000_and_2010_from_2000 ]
shared_codes_2010 = [ entry[1] for entry in in_2010_and_2000_from_2010 ]
if shared_codes_2000 != shared_codes_2010:
    raise Exception('FLAWED ASSUMPTION')

# ... while the positions (i) are expected to differ between the years
if of_2000_indexes == of_2010_indexes:
    raise Exception('FLAWED ASSUMPTION')    

In [None]:
# keep independent copies of the current 2000 and 2010 frames
__b__ , __o__ = b.copy() , o.copy()

In [None]:
# the codes shared by both years, in 2010 row order (second item of each pair)
common_geo_values = [ entry[1] for entry in in_2010_and_2000_from_2010 ]

- ***action***:
    - using common_geo_values
        - compare values from one table to the next
            - works as table is set time
            - leaves 
                - 1742 unused 2010 rows
                - 1800 unused 2000 rows

In [None]:
# rebuild both frames from the raw tables

# 2000 Census: every row after row 0 , with row 0 promoted to the column names
b = bush.iloc[1:].copy()
b.columns = bush.iloc[0]

# 2010 Census: same treatment
o = obama.iloc[1:].copy()
o.columns = obama.iloc[0]

In [None]:
'''but of course, the whole time we should have just called the list'''

# for geo_value in common_geo_values:
# NOTE(review): twenty_ten is built from b (the 2000 frame) despite its name -- confirm which year was intended
twenty_ten = b.copy().set_index(b.Geography)
# b.iloc[0:3]
# keep only the rows whose code exists in both years, using each year's own positions;
# this rebinds b and o, so re-running the cell filters the already-filtered frames again
b = b.iloc[of_2000_indexes]
o = o.iloc[of_2010_indexes]

In [None]:
# renumber rows from 0; without drop=True the previous row labels are kept as a new 'index' column
b = b.reset_index()
o = o.reset_index()

In [None]:
# first rows of the thinned 2000 frame
b.head()

In [None]:
# first rows of the thinned 2010 frame
o.head()

In [None]:
# print every row position where Id2 differs between the two thinned frames.
# Pairing the values positionally covers every row; after reset_index() the
# labels start at 0, so the previous range(1, len(b)) loop skipped the first row.
for b_id2 , o_id2 in zip( b.Id2 , o.Id2 ):
    if b_id2 != o_id2:
        print(b_id2,o_id2)


In [None]:
# row counts of the snapshot frames (2000 , 2010) and their difference
len(__b__.Geography), len(__o__.Geography),len(__b__.Geography)- len(__o__.Geography)

In [None]:
# row counts of the current b and o frames
len(b.Geography), len(o.Geography)

In [None]:
'''split'''
# Abandoned draft of a drop-by-label approach; everything below is commented
# out and the cell only executes `pass`.
# NOTE(review): the draft assigns nume_b_geo twice (the second one enumerates
# o.Geography) and ends on an argument-less drop(), so it would not run as written.
# # label objects in 2000.Geography with their index
# nume_b_geo = enumerate(b.Geography)
# # label objects in 2010.Geography with their index
# nume_b_geo = enumerate(o.Geography)

# # identify codes occurring in 2000 and not 2010
# [i for (i,_) in nume_b_geo if _[-5:] not in zip_2010_codes]

# # index and instance for all 2000 Census 'Geography' rows 
# # if the zip code (last 5 digits) is NOT in the collection of 2010 Census zip codes 
# # drop row index from the 2000 Census 
# # ready_b = b.copy().Geography.drop([i for (i,_) in nume_b_geo if _[-5:] not in zip_2010_codes])
# ready_b = b.copy().drop()
pass

In [None]:


# reminder of what enumerate yields: (position, item) pairs
for i in enumerate('abcdef'):
    print(i)

In [1]:
# master py file
import time
import numpy as np
import pandas as pd

def convert(a='../data/2000/age-groups-and-sex-census-DEC_00_SF1_QTP1/DEC_00_SF1_QTP1_with_ann.csv',
           b='../data/2010/age-groups-and-sex-census-DEC_10_SF1_QTP1/DEC_10_SF1_QTP1_with_ann.csv'):
    """Load the 2000 and 2010 census tables and keep only the rows they share.

    Each CSV is read with its first data row promoted to the column names, and
    the remaining rows renumbered from 0 (the previous row labels are kept in
    an 'index' column). Rows are matched on the last 5 characters of their
    'Geography' value; a row is kept when that code also occurs in the other
    year's table.

    Parameters
    ----------
    a : str
        Path of the 2000 CSV.
    b : str
        Path of the 2010 CSV.

    Returns
    -------
    tuple
        (2000 frame, 2010 frame), each thinned to the shared codes. Row i of
        one frame carries the same code as row i of the other.

    Raises
    ------
    Exception
        If the shared codes do not form the same sequence in both tables, in
        which case the two results could not be compared row by row.
    """
    def load(path):
        # read one census CSV and promote its first data row to the header
        raw = pd.read_csv( path , low_memory=False )
        frame = raw.iloc[1:].copy()
        frame.columns = raw.iloc[0]
        # renumber from 0 so list positions below equal iloc positions
        return frame.reset_index()

    y2k = load( a )
    y2k10 = load( b )

    # codes are kept as strings: some are not numeric (e.g. '006HH')
    zip_2000_codes = [ geo[-5:] for geo in y2k['Geography'] ]
    zip_2010_codes = [ geo[-5:] for geo in y2k10['Geography'] ]
    # set views so every membership test is a hash lookup rather than a list scan
    codes_2000 = set( zip_2000_codes )
    codes_2010 = set( zip_2010_codes )

    # positions, within each year's own table, of the rows whose code is in both years
    of_2000_indexes = [ i for i , code in enumerate( zip_2000_codes ) if code in codes_2010 ]
    of_2010_indexes = [ i for i , code in enumerate( zip_2010_codes ) if code in codes_2000 ]

    # the shared codes must line up one to one, or the thinned frames are not comparable
    shared_2000 = [ zip_2000_codes[i] for i in of_2000_indexes ]
    shared_2010 = [ zip_2010_codes[i] for i in of_2010_indexes ]
    if shared_2000 != shared_2010:
        raise Exception(f'FLAWED ASSUMPTION , [j for i,j in 2000] != [j for i,j in 2010]\n'
                        f'len {len(shared_2000)} {len(shared_2010)}')

    # Positions are allowed to coincide between the two years. The earlier
    # version raised in that case, which rejected valid input such as two
    # tables with identical row order.

    # thin each year to the shared geo rows
    return y2k.iloc[of_2000_indexes] , y2k10.iloc[of_2010_indexes]

In [None]:
# run the pipeline with its default paths; convert() returns a (2000 frame, 2010 frame) pair
a = convert()

In [None]:
# a is the pair returned by convert(), so this is the number of frames (2), not a row count
len(a)