In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

In [14]:
# default cleaning method until proven otherwise

def clean_census_frame(csv_path , head=False , reset=True , set_index=False ):
    '''
    inputs) 
        >> csv_path
            > path to csv
        >> head
            > default=False
                >> if != False
                    > integer
                        >> returns the first {head} rows (using .head() method) 
                            > instead of enitre dataframe
        >> reset
            > default=True
                >> resets index after taking out rows
            > if set to False
                >> will not reset index
        >> set_index
            > default=False
            > if != False
                >> will set_index of new df to set_index
    output)
        >> dataframe cleaned like 2000 Census age&sex by 5-digit Zip Code (also how 2010 for same is cleaned)
    how)
        1. reads in csv , assumes it's large
        2. makes a copy for editing 
            > and potential future use
        3. locates readable column names  and non-readable names 
            > readable
                    > e.g. Estimate; SEX AND AGE - Total population
                >> assumes they are currently in row 0
            > non-readable
                    > e.g. HC01_VC03
                >> assumes they are currently == dataframe.columns
        4. replaces dataframe.columns (non-readable) with readable column names
            > and drops the old 0th column (column where readable names were stored)
        
    '''
    # load data
    df = pd.read_csv( csv_path , low_memory=False )

    # and copy
    _df = df.copy()

    # reset column names to current 0th row values
    _df.columns = _df.iloc[0]
    # new 2000 dataframe without row where values are from
    clean_df = _df[1:]
    
    # default
    if reset==True:
        # reset index
        clean_df = clean_df.reset_index()
        
    # set_index
    if set_index:
        clean_df = clean_df.set_index(set_index)
    
    if head:
        # return first {head} rows of dataframe
        return clean_df.head(head)
    else:
        # return dataframe
        return clean_df
    
    
def show_lengths(collection):
    '''
    shows lengths of collection of items
    '''
    # collect
    lengths = []
    # iterate
    for i in collection:
        # add len to collect
        lengths.append(len(i))
    # return all lens
    return lengths

In [9]:
# test = pd.read_csv('../data/acs/aff_download/ACS_14_5YR_DP05_with_ann.csv',low_memory=False)
# ID=test.copy()
# for _ in range(len(ID.columns)):
#     for i in range(len(ID[ID.columns[_]])):
#         if isinstance(ID[ID.columns[_]][i],str):
#             pass
#         else:
#             print(f'column {_}\nrow {i}\ntype {type(ID[ID.columns[_]][i])}\n{ID[ID.columns[_]][i]}\n\n') 

- ***notes***:
    - every instance is string (was no output)
    - merged & hashed as to avoid running again on accident or restart^

In [4]:
# reload a year for base comps
t000 = pd.read_csv('../data/acs/aff_download/ACS_14_5YR_DP05_with_ann.csv',low_memory=False)
# limit size
# test = test.head(1000)

In [25]:
test = clean_census_frame('../data/acs/aff_download/ACS_14_5YR_DP05_with_ann.csv',reset=False)

In [31]:
t000.columns  #.describe()

Index(['GEO.id', 'GEO.id2', 'GEO.display-label', 'HC01_VC03', 'HC02_VC03',
       'HC03_VC03', 'HC04_VC03', 'HC01_VC04', 'HC02_VC04', 'HC03_VC04',
       ...
       'HC03_VC101', 'HC04_VC101', 'HC01_VC102', 'HC02_VC102', 'HC03_VC102',
       'HC04_VC102', 'HC01_VC104', 'HC02_VC104', 'HC03_VC104', 'HC04_VC104'],
      dtype='object', length=327)

In [29]:
test.columns  #describe()

Index(['Id', 'Id2', 'Geography', 'Estimate; SEX AND AGE - Total population',
       'Margin of Error; SEX AND AGE - Total population',
       'Percent; SEX AND AGE - Total population',
       'Percent Margin of Error; SEX AND AGE - Total population',
       'Estimate; SEX AND AGE - Total population - Male',
       'Margin of Error; SEX AND AGE - Total population - Male',
       'Percent; SEX AND AGE - Total population - Male',
       ...
       'Percent; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races including Some other race',
       'Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races including Some other race',
       'Estimate; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races',
       'Margin of Error; HISPANIC OR LATINO AND RACE - Total populati

In [30]:
# 2012
y2k12 = clean_census_frame('../data/acs/aff_download/ACS_12_5YR_DP05_with_ann.csv',reset=False)
#2013
y2k13 = clean_census_frame('../data/acs/aff_download/ACS_13_5YR_DP05_with_ann.csv',reset=False)

In [83]:
# 2011
t11 = test.copy()
# 2012
t12 = y2k12.copy()
# 2013
t13 = y2k12.copy()

# tag columns
tags11 = t11.columns  # 2011
tags12 = t12.columns  # 2012
tags13 = t13.columns  #2013

In [84]:
# common columns
common_tags = [tag for tag in tags13 if tag in tags11 & tags12]
# chk chg
len(common_tags),len(tags13)

(219, 327)

In [85]:
plz = t11.copy().drop(common_tags,axis=1)
plz

Unnamed: 0,Estimate; SEX AND AGE - Total population - Male,Margin of Error; SEX AND AGE - Total population - Male,Percent; SEX AND AGE - Total population - Male,Percent Margin of Error; SEX AND AGE - Total population - Male,Estimate; SEX AND AGE - Total population - Female,Margin of Error; SEX AND AGE - Total population - Female,Percent; SEX AND AGE - Total population - Female,Percent Margin of Error; SEX AND AGE - Total population - Female,Estimate; SEX AND AGE - 18 years and over - Male,Margin of Error; SEX AND AGE - 18 years and over - Male,...,Percent; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races,Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races,Estimate; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races including Some other race,Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races including Some other race,Percent; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races including Some other race,Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races including Some other race,"Estimate; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Percent; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races"
1,8824,174,48.8,0.4,9264,159,51.2,0.4,6609,123,...,0.0,0.2,0,20,0.0,0.2,0,20,0.0,0.2
2,20079,83,49.1,0.1,20780,99,50.9,0.1,15502,51,...,2.7,0.7,14,23,0.0,0.1,1099,284,2.7,0.7
3,25971,390,48.9,0.4,27191,395,51.1,0.4,19752,280,...,1.0,0.3,0,30,0.0,0.1,553,183,1.0,0.3
4,3188,179,49.7,1.6,3227,151,50.3,1.6,2350,148,...,0.0,0.6,0,18,0.0,0.6,0,18,0.0,0.6
5,13958,102,48.5,0.2,14847,91,51.5,0.2,10594,99,...,0.1,0.1,0,24,0.0,0.1,32,35,0.1,0.1
6,31226,725,47.1,0.5,35025,741,52.9,0.5,23393,674,...,0.0,0.1,0,30,0.0,0.1,22,34,0.0,0.1
7,5205,530,49.7,2.5,5261,519,50.3,2.5,4025,477,...,0.0,0.1,0,20,0.0,0.4,3,8,0.0,0.1
8,11927,72,47.9,0.2,12990,81,52.1,0.2,8857,48,...,0.1,0.1,0,24,0.0,0.2,15,24,0.1,0.1
9,2915,478,44.2,3.9,3674,648,55.8,3.9,2133,347,...,0.0,0.6,0,18,0.0,0.6,0,18,0.0,0.6
10,21308,478,48.3,0.6,22809,649,51.7,0.6,16339,347,...,0.0,0.1,0,27,0.0,0.1,0,27,0.0,0.1


In [60]:
test11_to_common = t11.copy()

# for u in test11_to_common.columns:
#     if u not in common_tags:
#         test11_to_common = test11_to_common[[u]]   # test11_to_common[u]
# test11_to_common.drop([common_tags],axis=1)
# type(common_tags[1])
# test11_to_common['Id']
type(tuple(common_tags))

tuple

In [61]:
len(tuple(common_tags))

219

In [70]:
common_tags[:3]

['Id', 'Id2', 'Geography']

In [81]:
# t = t11.copy().drop(['Id', 'Id2', 'Geography'],axis=1)
# t


# for _ in common_tags[1]:
#     tags11 = tags11[_]

Unnamed: 0,Estimate; SEX AND AGE - Total population,Margin of Error; SEX AND AGE - Total population,Percent; SEX AND AGE - Total population,Percent Margin of Error; SEX AND AGE - Total population,Estimate; SEX AND AGE - Total population - Male,Margin of Error; SEX AND AGE - Total population - Male,Percent; SEX AND AGE - Total population - Male,Percent Margin of Error; SEX AND AGE - Total population - Male,Estimate; SEX AND AGE - Total population - Female,Margin of Error; SEX AND AGE - Total population - Female,...,Percent; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races including Some other race,Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races including Some other race,"Estimate; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Percent; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races","Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total population - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races",Estimate; HISPANIC OR LATINO AND RACE - Total housing units,Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units,Percent; HISPANIC OR LATINO AND RACE - Total housing units,Percent Margin of Error; HISPANIC OR LATINO AND RACE - Total housing units
1,18088,295,18088,(X),8824,174,48.8,0.4,9264,159,...,0.0,0.2,0,20,0.0,0.2,7156,164,(X),(X)
2,40859,154,40859,(X),20079,83,49.1,0.1,20780,99,...,0.0,0.1,1099,284,2.7,0.7,16843,216,(X),(X)
3,53162,657,53162,(X),25971,390,48.9,0.4,27191,395,...,0.0,0.1,553,183,1.0,0.3,24326,395,(X),(X)
4,6415,264,6415,(X),3188,179,49.7,1.6,3227,151,...,0.0,0.6,0,18,0.0,0.6,2557,121,(X),(X)
5,28805,163,28805,(X),13958,102,48.5,0.2,14847,91,...,0.0,0.1,32,35,0.1,0.1,11987,184,(X),(X)
6,66251,1289,66251,(X),31226,725,47.1,0.5,35025,741,...,0.0,0.1,22,34,0.0,0.1,29752,461,(X),(X)
7,10466,906,10466,(X),5205,530,49.7,2.5,5261,519,...,0.0,0.4,3,8,0.0,0.1,4567,278,(X),(X)
8,24917,134,24917,(X),11927,72,47.9,0.2,12990,81,...,0.0,0.2,15,24,0.1,0.1,9733,252,(X),(X)
9,6589,1001,6589,(X),2915,478,44.2,3.9,3674,648,...,0.0,0.6,0,18,0.0,0.6,8218,348,(X),(X)
10,44117,1001,44117,(X),21308,478,48.3,0.6,22809,649,...,0.0,0.1,0,27,0.0,0.1,20955,408,(X),(X)


In [63]:
t11_two19  # common_tags

NameError: name 't11_two19' is not defined

In [None]:
# len(ID[col_names[_]]) 
# ID[col_names[_]][33120] == ID[col_names[_]][33120]

In [None]:
# # reload a year for base comps
# test = pd.read_csv('../data/acs/aff_download/ACS_14_5YR_DP05_with_ann.csv',low_memory=False)
# # limit size
# # test = test.head(1000)

In [None]:
# confirm location of column names and current column names
test.iloc[0]

In [None]:
# examine metadata (2014 is same as 2011-2013 as same len(.columns))
meta_fourteen_ =  pd.read_csv('../data/acs/aff_download/ACS_14_5YR_DP05_metadata.csv')
for _ in range(1,len(meta_fourteen_)):
    print(str(meta_fourteen_.Id[_-1:_])+'\n')

- ***notes***:
    - looks nothing like the online American Fact Finder version 
        - always fun
    - online 2014
        - highest columns are Geography
            - ZCTA5 00601
            - which is broken into subcolumns
                - Total, Male, Female
                - each of which are broken into subcolumns
                    - Estimate, Margin of Error
        - effectively
            - each Geography is represented by 6 columns
        - seems I have more information regarding race and ethnicity per zip
- ***actions***:
    - represent each area by Geography in each table
    - compare evolution of that Geography
        - Clustering (KNN)
        - Regression

In [None]:
"""
# 2011
y2k11 = clean_census_frame('../data/acs/aff_download/ACS_11_5YR_DP05_with_ann.csv',reset=False)
# 2012
y2k12 = clean_census_frame('../data/acs/aff_download/ACS_12_5YR_DP05_with_ann.csv',reset=False)
#2013
y2k13 = clean_census_frame('../data/acs/aff_download/ACS_13_5YR_DP05_with_ann.csv',reset=False)
# 2014
y2k14 = clean_census_frame('../data/acs/aff_download/ACS_14_5YR_DP05_with_ann.csv')
# 2015
y2k15 = clean_census_frame('../data/acs/aff_download/ACS_15_5YR_DP05_with_ann.csv')
# 2016
y2k16 = clean_census_frame('../data/acs/aff_download/ACS_16_5YR_DP05_with_ann.csv')
# 2017
y2k17 = clean_census_frame('../data/acs/aff_download/ACS_17_5YR_DP05_with_ann.csv')
"""

In [None]:
# as it's always come with such joy, let's check length
full_yrs = [y2k11,y2k12,y2k13,y2k14,y2k15,y2k16,y2k17]
# we want these to all be the same
show_lengths(full_yrs)

In [None]:
# is Geography consistent?

# a = sum(y2k11.Geography != y2k13.Geography)  
# b = sum(y2k12.Geography != y2k14.Geography) 
# c = sum(y2k13.Geography != y2k15.Geography)
# d = sum(y2k14.Geography != y2k16.Geography)
# e = sum(y2k15.Geography != y2k17.Geography)
# f = sum(y2k16.Geography != y2k11.Geography)

# print(sum([a,b,c,d,e,f]))


In [None]:
# find common columns across all frames
_11_cols = y2k11.columns
_12_cols = y2k12.columns
_13_cols = y2k13.columns
_14_cols = y2k14.columns
_15_cols = y2k15.columns
_16_cols = y2k16.columns
_17_cols = y2k17.columns

collection = [_11_cols,_12_cols,_13_cols,_14_cols,_15_cols,_16_cols,_17_cols]

for _ in range(len(collection)):
    a = len([col for col in collection[_] if col in y2k11])
    b = len([col for col in collection[_] if col in y2k12])
    c = len([col for col in collection[_] if col in y2k13])
    d = len([col for col in collection[_] if col in y2k14])
    e = len([col for col in collection[_] if col in y2k15])
    f = len([col for col in collection[_] if col in y2k16])
    g = len([col for col in collection[_] if col in y2k17])
    out = [_,a,b,c,d,e,f,g]
    for i in out:
        print(i)
    print('\n')

- ***notes***:
    - use 2011 or 2012 columns to find common
    - max len shared columns == 220
- ***action***:
    - find shared columns

In [None]:
# a = sum(y2k11.Geography != y2k13.Geography)  
# b = sum(y2k12.Geography != y2k14.Geography) 
# c = sum(y2k13.Geography != y2k15.Geography)
# d = sum(y2k14.Geography != y2k16.Geography)
# e = sum(y2k15.Geography != y2k17.Geography)
# f = sum(y2k16.Geography != y2k11.Geography)

# geo11 = y2k11.Geography
# geo12 = y2k12.Geography
# geo13 = y2k13.Geography
# geo14 = y2k14.Geography
# geo15 = y2k15.Geography
# geo16 = y2k16.Geography
# geo17 = y2k17.Geography

In [None]:
# # zip codes shared across all dataframes, index
# frames = []
# # 2011 zips
# # watch out for 0,1 flipflop due to adjusting sorts above
# for _ in range(len(geo11)): 
#     # set geo11
#     g = geo11[_]
#     # 2012 zips
#     if g == geo12[_]:
#         # 2013 zips
#         if g == geo13[_]:
#             # 2014 zips
#             if g == geo14[_]:
#                 # 2015 zips
#                 if g == geo15[_]:
#                     # 2016 zips
#                     if g == geo16[_]:
#                         # 2017 zips
#                         if g == geo17[_]:
#                             frames.append(_)
# # make sure frames are good to set_index
# if len(frames) != len(geo11) -1:
#     raise Exception(f'len(frames) != len(geo11)\n{len(frames)} != {len(geo11)}')
# determine shared columns across all dataframes, used to ensure codes across time/geo are comparable
columns = []
# 2011 columns
for _ in y2k11.columns:
    # 2012 columns
    if _ in y2k12.columns:
        # 2013 columns
        if _ in y2k13.columns:
            # 2014 columns
            if _ in y2k14.columns:
                # 2015 columns
                if _ in y2k15.columns:
                    # 2016 columns
                    if _ in y2k16.columns:
                        # 2017 columns
                        if _ in y2k17.columns:
                            columns.append(_)
# adjust dataframes to coexistance only
# for _ in columns:


In [None]:
# _11_cols = y2k11.copy().columns
# _12_cols = y2k12.copy().columns
# _13_cols = y2k13.copy().columns
# _14_cols = y2k14.copy().columns
# _15_cols = y2k15.copy().columns
# _16_cols = y2k16.copy().columns
_17_cols = y2k17.copy().columns

drop_17 = [col for col in _17_cols if col not in columns]

In [None]:
len(_17_cols)-len(drop_17), len(columns)

In [None]:
# y2k11[[columns]]

In [None]:
y11 = y2k11.copy()
y12 = y2k12.copy()
y13 = y2k13.copy()
y14 = y2k14.copy()
y15 = y2k15.copy()
y16 = y2k16.copy()
y17 = y2k17.copy()

In [None]:
frames = [y11,y12,y13,y14,y15,y16,y17]

In [None]:
coc = pd.concat(frames,sort=True,axis=1)

pd.to_numeric(coc)

In [None]:
"""we're going to make a bunch of small dataframes for each zip code
        this should proove useful as zip code is common across all these frames and many other Census datasets"""
# pandas.core.series.Series
# titles = y2k11.Geography

# set years index
idx = ['2011','2012','2013','2014','2015','2016','2017']

blank_frames = []

In [None]:
y2k14

In [None]:
'''MAKE IT FASTER >> done to speed up modeling process'''

# 2011
sy2k11 = y2k11.head(1000)
# 2012
sy2k12 = y2k12.head(1000)
#2013
sy2k13 = y2k13.head(1000)
# 2014
sy2k14 = y2k14.head(1000)
# 2015
sy2k15 = y2k15.head(1000)
# 2016
sy2k16 = y2k16.head(1000)
# 2017
sy2k17 = y2k17.head(1000)

In [None]:
sy2k14

In [None]:
# gather column names
column_names = [_ for _ in sy2k11[:0]]
column_names

In [None]:
for _ in [sy2k11,sy2k12,sy2k13,sy2k14,sy2k15,sy2k16,sy2k17]:
    print(_.info(),'\n\n')

In [None]:
sy2k14

In [None]:
# determine columns in 2015, 2016 and 2017 which are not seen in 2011-2014 
eleven_columns = [name for name in sy2k11.columns]
twelve_columns = [name for name in sy2k12.columns]
thirteen_columns = [name for name in sy2k12.columns]
fourteen_columns = [name for name in sy2k12.columns]
fifteen_columns = [name for name in sy2k15.columns]
sixteen_columns = [name for name in sy2k16.columns]
seventeen_columns = [name for name in sy2k17.columns]

In [None]:
# compare lengths of columns
show_lengths([eleven_columns,twelve_columns,thirteen_columns,fourteen_columns,fifteen_columns,sixteen_columns,seventeen_columns])

In [None]:
'''make sure those with 328 columns all have the same columns'''

# collect similar
w_328_cols = [eleven_columns,twelve_columns,thirteen_columns,fourteen_columns]

# count non coexist 
same_len_non_coexist = 0

# show lens
show_w_328 = show_lengths(w_328_cols)
# double check lengths
for i in range(len(show_w_328)):
    if show_w_328[i] != show_w_328[-i]:
        raise Exception(f'ERROR len != len\n{show_w_328[i]} != {show_w_328[-i]}')

# compare all to initial 
for name in eleven_columns: 
    if name not in twelve_columns:
        same_len_non_coexist+=1
        # raise Exception(f'FLAWED ASSUMPTION if name in y2k12.columns name = {name}')
    elif name not in thirteen_columns:
        same_len_non_coexist+=1
        #raise Exception(f'FLAWED ASSUMPTION if name in y2k13.columns name = {name}')
    elif name not in fourteen_columns:
        same_len_non_coexist+=1
        #raise Exception(f'FLAWED ASSUMPTION if name in y2k14.columns name = {name}')

# final hurdle, check for non coexisting (based on prior hoops this should be a given)
if same_len_non_coexist > 0:
    raise Exception(f'FLAWED ASSUMPTION same_len_non_coexist ({same_len_non_coexist}) > 0')

In [None]:
'''make sure those with 340 columns all have the same columns'''

for _ in range(len(fifteen_columns)): 
    if fifteen_columns[_] != sixteen_columns[_]:
        raise Exception(f'FLAWED ASSUMPTION if name in y2k12.columns name = {name}')

In [None]:
def col_name_eda():
    '''see if those 328 column names are seen in 2015, 2016, and 2017'''
    # store non coexist 2011 vs 2015
    non_coexist_2015 = []
    # store non coexist 2011 vs 2016
    non_coexist_2016 = []
    # store non coexist 2011 vs 2017
    non_coexist_2017 = []

    # compare to initial 
    for _ in range(len(eleven_columns)): 
        # 2017
        if eleven_columns[_] not in seventeen_columns:
            # update 2017 bag w/ index & instance
            non_coexist_2017.append((_,eleven_columns[_]))
        # 2016
        if eleven_columns[_] not in sixteen_columns:
            # update 2016 bag w/ index & instance
            non_coexist_2016.append((_,eleven_columns[_]))
        # 2015
        if eleven_columns[_] not in fifteen_columns:
            # update 2015 bag w/ index & instance
            non_coexist_2015.append((_,eleven_columns[_]))


    '''reverse vs 2016 and then vs 2017 if 2015 == 2016'''
    if fifteen_columns == sixteen_columns:

        # store non coexist 2016 vs 2011
        _from_2016_non_coexist_2011 = []
        # store non coexist 2017 vs 2011
        _from_2017_non_coexist_2011 = []

        # 2017
        for _ in range(len(seventeen_columns)):
            if seventeen_columns[_] not in eleven_columns:
                # update 2017 bag w/ index & instance
                _from_2017_non_coexist_2011.append((_,seventeen_columns[_]))
        # 2016
        for _ in range(len(sixteen_columns)):
            if sixteen_columns[_] not in eleven_columns:
                # update 2016 bag w/ index & instance
                _from_2016_non_coexist_2011.append((_,sixteen_columns[_]))

    # let us know if 2015 != 2016
    else:
        raise Exception('ERROR\nfifteen_columns != sixteen_columns\nERROR')


    '''2016 vs 2017 and reverse'''
    # store non coexist 2016 vs 2017
    _from_2016_non_coexist_2017 = []
    # store non coexist 2017 vs 2016
    _from_2017_non_coexist_2016 = []

    # from 2017 to 2016
    for _ in range(len(seventeen_columns)):
        if seventeen_columns[_] not in eleven_columns:
            # update 2017 bag w/ index & instance
            _from_2017_non_coexist_2016.append((_,seventeen_columns[_]))
    # from 2016 to 2017
    for _ in range(len(sixteen_columns)):
        if sixteen_columns[_] not in seventeen_columns:
            # update 2016 bag w/ index & instance
            _from_2016_non_coexist_2017.append((_,sixteen_columns[_]))

    '''2011 vs 2015/2016/2017'''
    # count non coexist 2011 vs 2015
    non_coexist_2015_count = len(non_coexist_2015)
    # count non coexist 2011 vs 2016
    non_coexist_2016_count = len(non_coexist_2016)
    # count non coexist 2011 vs 2017
    non_coexist_2017_count = len(non_coexist_2017)
    # count non coexist 2011 vs each in ALL
    non_coexist_count = non_coexist_2015_count + non_coexist_2016_count + non_coexist_2017_count
    # identify unique non coexistances
    u_non_coexist = set(non_coexist_2015 + non_coexist_2016 + non_coexist_2017)
    # count the number of unique non coexist 2011 vs each in All
    u_non_coexist_count = len(u_non_coexist)

    '''2016/2017 vs 2011'''
    # count non coexist 2016 vs 2011
    _from_2016_non_coexist_2011_count = len(_from_2016_non_coexist_2011)
    # count non coexist 2017 vs 2011
    _from_2017_non_coexist_2011_count = len(_from_2017_non_coexist_2011)
    # identify unique non coexistances (exist in 2016 or 2017 but not in 2011)
    r_u_non_coexist = set(_from_2016_non_coexist_2011 + _from_2017_non_coexist_2011)
    # count the number of unique non coexist 2011 vs each in All
    r_u_non_coexist_count = len(r_u_non_coexist)

    '''2016 & 2017'''
    # count non coexist 2016 vs 2017
    _from_2016_non_coexist_2017_count = len(_from_2016_non_coexist_2017)
    # count non coexist 2017 vs 2016
    _from_2017_non_coexist_2016_count = len(_from_2017_non_coexist_2016)

    # 2011 vs
    print(f'forward\n2011 vs 2015\nnon_coexist_2015_count = {non_coexist_2015_count} /{len(fifteen_columns)}\n'
            f'2011 vs 2016\nnon_coexist_2016_count = {non_coexist_2016_count} /{len(sixteen_columns)}\n'
            f'2011 vs 2017\nnon_coexist_2017_count = {non_coexist_2017_count} /{len(seventeen_columns)}\n'
            f'2011 vs all\ninstances not existing in all  = {u_non_coexist_count} / {len(eleven_columns)} possible\n')
    # vs 2011
    print(f'backward\n2016 vs 2011\n_from_2016_non_coexist_2011_count = {_from_2016_non_coexist_2011_count} /{len(sixteen_columns)}\n'
            f'2017 vs 2011\n_from_2017_non_coexist_2011_count = {_from_2017_non_coexist_2011_count} /{len(sixteen_columns)}\n'
            f'ALL vs 2011\nunique non coexist 2011 vs each in All = {r_u_non_coexist_count}\n')
    # 2016/2017 & 2017/2016
    print(f'2016 & 2016\n2016 vs 2017\n_from_2016_non_coexist_2017_count = {_from_2016_non_coexist_2017_count} /{len(sixteen_columns)}\n'
            f'2017 vs 2016\n_from_2017_non_coexist_2016_count = {_from_2017_non_coexist_2016_count} /{len(seventeen_columns)}\n')
# col_name_eda

- ***notes***:
    - 108 columns (distinctions) of 328
- ***possible***:
    - take out non-coexisting columns from all dataframes
        - then compare
            - would result in loss of ~1/3+ of columns
    - try to adjust
        - find what would be
            - seems possible for columns like the following
                - 10, 'Percent; SEX AND AGE - Male'
                - 96, 'Estimate; SEX AND AGE - Female'
                - 248, 'Estimate; RACE - Asian'
            - but more difficult for columns like
                - 11, 'Percent Margin of Error; SEX AND AGE - Male'
                - 97, 'Margin of Error; SEX AND AGE - Female'
                - 323, 'Percent Margin of Error; HISPANIC OR LATINO AND RACE - Not Hispanic or Latino - Two or more races - Two races excluding Some other race, and Three or more races')
                    - though ?could exclue this when calculating change?
        
- ***actions***:
    - time is becoming a more relevant an issue, MVP due today

In [None]:
# make dataframe for each zip code, should contain row from each year
def frame_per_zip(dataframes):
    '''
    take dataframe for each year set is available
    find common zipcode for each year
    make dataframe for that zipcode
        containing each year's measurements
    note:
        dataframes must be same length
    '''
    # check length of dataframes is same
    for _ in range(len(dataframes)):
        # done to ensure each zip code has same represnetation 
        if len(dataframes[_-1]) != len(dataframes[_]):
            # stop if we have different lengths
            raise Exception(f'len(dataframe[{-_}]) != len(dataframe[{_}])')
        # also check that dataframes have same Id
        random_samples = [random.randint(1,int(len(dataframes[-_])/2)),
                          random.randint(1,int(len(dataframes[-_]))),
                          random.randint(int(len(dataframes[-_])/2),len(dataframes[-_]))]
        for sample in random_samples:
            # pull Id from df to compare
            if dataframes[_]['Id'][sample] != dataframes[_-2]['Id'][sample]:
                # stop if they don't match
                raise Exception(f"NON MATCHING Id\n{dataframes[_].Id[sample]}\nERROR\n{dataframes[_-2].Id[sample]}\n")
    
    # now we can get to work
    mini_dfs = []
    for i in range(len(dataframes[0])):
        mini_df = pd.DataFrame(index=['2011','2012','2013','2014','2015','2016','2017'], columns=dataframes[0].columns)
        for _ in range(len(dataframes)):
            q = dataframes[_]
            print(q)
            # mini_df[_] = q.index(i)
        mini_dfs.append(mini_df)
    
    print(len(mini_df))
        
    return mini_dfs
        
    

In [None]:
# yrs = [sy2k11,sy2k12,sy2k13,sy2k14,sy2k15,sy2k16,sy2k17]
# z = frame_per_zip(yrs)

In [None]:
for year in yrs:
    print (year.columns)

In [None]:
# mini_df = pd.DataFrame(index=['2011','2012','2013','2014','2015','2016','2017'], columns=['a','b','c','d','e'])
# mini_df[1:2]

In [None]:
# dataframes = [y2k11,y2k12,y2k13,y2k14,y2k15,y2k16,y2k17]
# mini_df = pd.DataFrame(index=['2011','2012','2013','2014','2015','2016','2017'], columns=dataframes[0].columns)
# mini_df.iloc[0] = 'lol'
# mini_df.iloc[0]
# mini_df

In [None]:
'''# load 2000 data
y2k = pd.read_csv( a , low_memory=False )
# load 2010 data
y2k10 = pd.read_csv( b , low_memory=False )

# 2000 Census
b = y2k.copy()
# 2010 Census
o = y2k10.copy()

# reset 2000 columns to current 0th row values
b.columns = b.iloc[0]
# new 2000 dataframe without row where values are from
b = b[1:]
# reset index
b = b.reset_index()

# reset 2010 columns to current 0th row values
o.columns = o.iloc[0]
# new 2010 dataframe without row where values are from
o = o[1:]
# reset index
o = o.reset_index()

# identify zip codes from 2000 .Geography (last 5 chars of string)
zip_2000_codes = [q[-5:] for q in b.Geography]  # ValueError: invalid literal for int() with base 10: '006HH'
# identify zip codes from 2010 .Geography (last 5 chars of string)
zip_2010_codes = [q[-5:] for q in o.Geography]

# from 2000.Geography , instance is not seen in 2010.Geography  -- sample: zip_code = (2, 'c')
in_2000_but_not_2010_from_2000 = [zip_code for zip_code in enumerate(zip_2000_codes) if zip_code[1] not in zip_2010_codes]
# from 2010.Geography , instance is not seen in 2000.Geography  -- sample: zip_code[1] = 'c'
in_2010_but_not_2000_from_2010 = [zip_code for zip_code in enumerate(zip_2010_codes) if zip_code[1] not in zip_2000_codes]

# from 2000.Geography , instance is seen in 2010.Geography
in_2000_and_2010_from_2000 = [zip_code for zip_code in enumerate(zip_2000_codes) if zip_code[1] in zip_2010_codes]
# from 2010.Geography , instance is seen in 2000.Geography
in_2010_and_2000_from_2010 = [zip_code for zip_code in enumerate(zip_2010_codes) if zip_code[1] in zip_2000_codes]

# index of objects coexisting in 2000 and 2010
of_2000_indexes = [i for i,j in in_2000_and_2010_from_2000]
# index of objects coexisting in 2010 and 2000 
of_2010_indexes = [i for i,j in in_2010_and_2000_from_2010]
# ^note: these are different lists, if took j instead of i, then would be same list
if [j for i,j in in_2000_and_2010_from_2000] != [j for i,j in in_2010_and_2000_from_2010]:
    # like is seen here, j for j == True
    raise Exception(f'FLAWED ASSUMPTION , [j for i,j in 2000] != [j for i,j in 2010]\n'
                    f'len {len(in_2000_and_2010_from_2000)} {len(in_2010_and_2000_from_2010)}')
# however i for i == False
if of_2000_indexes == of_2010_indexes:
    # cheers
    raise Exception('FLAWED ASSUMPTION , of_2000_indexes != of_2010_indexes\n'
                    f'len y2k {len(of_2000_indexes)} 2k10 {len(of_2010_indexes)}')  

# thin 2000 to shared geo
b = b.iloc[of_2000_indexes]
# thin 2010 to shared geo
o = o.iloc[of_2010_indexes]'''
pass

In [None]:
"""play df"""
data = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42], 'Weight':[128,134,129,142]}
df = pd.DataFrame(data)
df

In [None]:
"""play list"""
a = [1,2,3,3,5,69,1,2,7,9,2]
b = [1,2,3]
c = [10,20,50,69]
d = set(a+b+c)
# d = set(d)
d

In [None]:
from pandas.plotting import scatter_matrix
scatter_matrix(y2k11.set_index('Geography'),  figsize=(10, 10), diagonal='kde')

In [None]:
q = [1,2,3,'4',5,6,7,2,6,1,9,1,4,1]
z = 0
while z ==0:
    for x in q:
        if isinstance(x,str):
            z+=1
        print(f'z={z}\nx={x}')

In [None]:
old = pd.DataFrame({'A' : [4,5], 'B' : [10,20], 'C' : [100,50], 'D' : [-30,-50]})
old

In [None]:
new = old[['A', 'C', 'D']].copy()
new

In [None]:
'''
12-words
    combining census datasets to predict long term population trends (scalable)
yesterday: 
    mvp
today: 
    get a score
    enlarge dataset for long term trends
        e.g. historical migration trends vs census vs acs 5-yr /vs& acs 1-yr 
    explore other ml and sexy 
        if no time, set up for tomorrow
blocks:
    extracting data, usual
'''