# Consistency Checks for different source files/encodings

### Author: James Sharpnack @jsharpna

This notebook contains consistency checks between the different data sources used for geomapping.

In [3]:
import pandas as pd
# Load the uszips table kept under old_source_files/ into `df`.
df = pd.read_csv("old_source_files/02_20_uszips.csv")
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,zip,fips,county_name,hrrnum,hsanum,population,city,state_id,state_name,zcta,...,county_weights,county_names_all,county_fips_all,imprecise,military,timezone,dma_code,dma_name,cbsa_id,msa_name
0,601,72001,Adjuntas,,,17242,Adjuntas,PR,Puerto Rico,True,...,"{'72001':99.43,'72141':0.57}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico,,,,
1,602,72003,Aguada,,,38442,Aguada,PR,Puerto Rico,True,...,{'72003':100},Aguada,72003,False,False,America/Puerto_Rico,,,,
2,603,72005,Aguadilla,,,48814,Aguadilla,PR,Puerto Rico,True,...,{'72005':100},Aguadilla,72005,False,False,America/Puerto_Rico,,,,
3,606,72093,Maricao,,,6437,Maricao,PR,Puerto Rico,True,...,"{'72093':94.88,'72121':1.35,'72153':3.78}",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico,,,,
4,610,72011,Añasco,,,27073,Anasco,PR,Puerto Rico,True,...,"{'72003':0.55,'72011':99.45}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33094,99923,2198,Prince of Wales-Hyder,10.0,2006.0,15,Hyder,AK,Alaska,True,...,{'02198':100},Prince of Wales-Hyder,2198,False,False,America/Sitka,0.0,(NON-DMA COUNTIES),,
33095,99925,2198,Prince of Wales-Hyder,10.0,2006.0,927,Klawock,AK,Alaska,True,...,{'02198':100},Prince of Wales-Hyder,2198,False,False,America/Sitka,0.0,(NON-DMA COUNTIES),,
33096,99926,2198,Prince of Wales-Hyder,10.0,2006.0,1635,Metlakatla,AK,Alaska,True,...,{'02198':100},Prince of Wales-Hyder,2198,False,False,America/Metlakatla,0.0,(NON-DMA COUNTIES),,
33097,99927,2198,Prince of Wales-Hyder,10.0,2006.0,38,Point Baker,AK,Alaska,True,...,{'02198':100},Prince of Wales-Hyder,2198,False,False,America/Sitka,0.0,(NON-DMA COUNTIES),,


In [41]:
# Inspect the county_weights column (object dtype); entries look like
# "{'72001':99.43,'72141':0.57}", i.e. presumably county FIPS -> percentage
# weight for each zip -- TODO confirm the units against the data source.
df['county_weights']

0                     {'72001':99.43,'72141':0.57}
1                                    {'72003':100}
2                                    {'72005':100}
3        {'72093':94.88,'72121':1.35,'72153':3.78}
4                     {'72003':0.55,'72011':99.45}
                           ...                    
33094                                {'02198':100}
33095                                {'02198':100}
33096                                {'02198':100}
33097                                {'02198':100}
33098                                {'02275':100}
Name: county_weights, Length: 33099, dtype: object

##  msa cross table

### Compare against cbsatocountycrosswalk.csv

The old MSA data is from https://data.nber.org/data/cbsa-fips-county-crosswalk.html; the new data is from the Census Bureau.

In [5]:
# NBER CBSA-to-county crosswalk: the "old" side of the first comparison.
old_msa_filename = "old_source_files/cbsatocountycrosswalk.csv"
old_msa = pd.read_csv(filepath_or_buffer=old_msa_filename)
# Preview the first five rows only.
old_msa.head(n=5)

Unnamed: 0,countyname,state,ssacounty,fips,msa,l,msaname,cbsa,cbsaname,cbsaold,...,ssast,fipst,y2005,y2011,y2012,y2013,y2014,y2015,y2016,y2017
0,AUTAUGA,AL,1000,1001,5240.0,,"MONTGOMERY, AL",33860.0,"Montgomery, AL",33860.0,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0
1,BALDWIN,AL,1010,1003,5160.0,,"MOBILE, AL",,,,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0
2,BARBOUR,AL,1020,1005,1.0,,ALABAMA,,,,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0
3,BIBB,AL,1030,1007,1.0,,ALABAMA,13820.0,"Birmingham-Hoover, AL",13820.0,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0
4,BLOUNT,AL,1040,1009,1000.0,,"BIRMINGHAM, AL",13820.0,"Birmingham-Hoover, AL",13820.0,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0


In [7]:
# fips -> msa table shipped with delphi_utils: the "new" side of every
# comparison in this notebook.
new_msa_file = "../../delphi_utils/data/fips_msa_table.csv"
new_msa = pd.read_csv(filepath_or_buffer=new_msa_file)
# Preview the first five rows only.
new_msa.head(n=5)

Unnamed: 0,fips,msa
0,48059,10180
1,48253,10180
2,48441,10180
3,72003,10380
4,72005,10380


In [8]:
# Keep only the NBER rows that carry a CBSA code, reduce them to the
# (fips, cbsa) pair as integers, and key the table by county FIPS.
old_msa = (
    old_msa.loc[old_msa['cbsa'].notna(), ['fips', 'cbsa']]
    .astype(int)
    .set_index('fips')
)
# Key the new table by county FIPS as well so the two can be joined on it.
new_msa = new_msa.set_index('fips')
# Outer join: FIPS codes found in only one source stay in the result, with
# NaN in the column belonging to the other source.
msa_comp = new_msa.join(old_msa, how="outer")
msa_comp

Unnamed: 0_level_0,msa,cbsa
fips,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,33860.0,33860.0
1003,19300.0,
1007,13820.0,13820.0
1009,13820.0,13820.0
1015,11500.0,11500.0
...,...,...
72143,41980.0,41980.0
72145,41980.0,41980.0
72149,38660.0,38660.0
72151,41980.0,41980.0


In [9]:
# Distinct CBSA/MSA codes used by each source.
old_val = set(pd.unique(old_msa['cbsa']))
new_val = set(pd.unique(new_msa['msa']))
# A cell only displays its last expression, so this count was previously
# computed and silently thrown away; print it so the check is actually visible.
print("codes present in only one source:", len(old_val.symmetric_difference(new_val)))
# Non-null counts per column of the joined table.
msa_comp.info()
# Rows: is the new `msa` code missing?  Columns: is the old `cbsa` code missing?
pd.crosstab(msa_comp['msa'].isna(),msa_comp['cbsa'].isna())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1293 entries, 1001 to 72153
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   msa     1252 non-null   float64
 1   cbsa    1165 non-null   float64
dtypes: float64(2)
memory usage: 30.3 KB


cbsa,False,True
msa,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1124,128
True,41,0


- 128 fips are in the Census data that are not in the NBER data
- 41 fips are in the NBER data that are not in the Census data

In [10]:
# Among FIPS codes present in both sources, count how many are assigned a
# different code by each.
msa_comp.dropna(how='any').pipe(lambda both: both['msa'].ne(both['cbsa'])).sum()

136

### Compare against the msa_list file

In [11]:
# msa_list.csv: the "old" side of the second comparison. Note that this
# rebinds old_msa_filename and old_msa, replacing the NBER table.
old_msa_filename = "old_source_files/msa_list.csv"
old_msa = pd.read_csv(filepath_or_buffer=old_msa_filename)
# Preview the first five rows only.
old_msa.head(n=5)

Unnamed: 0,CBSA ID,MSA Name,FIPS ID,County Name
0,10180,"Abilene, TX (Metropolitan Statistical Area)",48059,"Callahan, TX"
1,10180,"Abilene, TX (Metropolitan Statistical Area)",48253,"Jones, TX"
2,10180,"Abilene, TX (Metropolitan Statistical Area)",48441,"Taylor, TX"
3,10380,"Aguadilla-Isabela, PR (Metropolitan Statistica...",72003,"Aguada Municipio, PR"
4,10380,"Aguadilla-Isabela, PR (Metropolitan Statistica...",72005,"Aguadilla Municipio, PR"


In [12]:
# Keep rows that have a CBSA ID, reduce to the (FIPS ID, CBSA ID) pair as
# integers, and key the table by county FIPS.
old_msa = (
    old_msa.loc[old_msa['CBSA ID'].notna(), ['FIPS ID', 'CBSA ID']]
    .astype(int)
    .set_index('FIPS ID')
)

# Summarise the resulting single-column table.
old_msa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1231 entries, 48059 to 4027
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   CBSA ID  1231 non-null   int64
dtypes: int64(1)
memory usage: 19.2 KB


In [13]:
# Outer join on the FIPS index: codes found in only one of the two tables are
# kept, with NaN in the column that belongs to the other table.
msa_comp = new_msa.join(old_msa,how="outer")

In [14]:
# Display the joined table (new `msa` next to old `CBSA ID`, indexed by FIPS).
msa_comp

Unnamed: 0,msa,CBSA ID
1001,33860.0,33860.0
1003,19300.0,19300.0
1007,13820.0,13820.0
1009,13820.0,13820.0
1015,11500.0,11500.0
...,...,...
72143,41980.0,41980.0
72145,41980.0,41980.0
72149,38660.0,38660.0
72151,41980.0,41980.0


In [15]:
# Distinct MSA/CBSA codes appearing in each source.
old_val = set(old_msa['CBSA ID'].unique())
new_val = set(new_msa['msa'].unique())

In [16]:
# Number of codes that appear in exactly one of the two sources.
len(old_val ^ new_val)

0

In [17]:
# Non-null counts per column show how many FIPS codes each source covers.
msa_comp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1268 entries, 1001 to 72153
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   msa      1251 non-null   float64
 1   CBSA ID  1231 non-null   float64
dtypes: float64(2)
memory usage: 29.7 KB


In [18]:
# Check that the two sources agree wherever both provide a code. Absolute
# differences are summed because signed differences can cancel each other out
# and report 0 even when individual codes disagree. Rows with a NaN on either
# side (FIPS present in only one source) are skipped by sum().
(msa_comp['msa'] - msa_comp['CBSA ID']).abs().sum()

0.0

In [19]:
# Rows: is the new `msa` code missing?  Columns: is the old `CBSA ID` missing?
pd.crosstab(
    index=msa_comp['msa'].isnull(),
    columns=msa_comp['CBSA ID'].isnull(),
)

CBSA ID,False,True
msa,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1214,37
True,17,0


- 37 fips are in the March Census data that are not in the old census data
- 17 fips are in the old census data that are not in the March Census data

In [20]:
# Same summary call as the earlier info cell for this join; msa_comp has not
# been modified in between.
msa_comp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1268 entries, 1001 to 72153
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   msa      1251 non-null   float64
 1   CBSA ID  1231 non-null   float64
dtypes: float64(2)
memory usage: 29.7 KB


In [21]:
# Show the old-source column of the join; it is float64 because the outer
# join introduced NaN for FIPS codes missing from msa_list.csv.
msa_comp['CBSA ID']

1001     33860.0
1003     19300.0
1007     13820.0
1009     13820.0
1015     11500.0
          ...   
72143    41980.0
72145    41980.0
72149    38660.0
72151    41980.0
72153    49500.0
Name: CBSA ID, Length: 1268, dtype: float64

In [72]:
# FIPS codes that have a new `msa` code but no `CBSA ID` in the old list.
msa_comp.loc[msa_comp['CBSA ID'].isnull()]

Unnamed: 0,msa,CBSA ID
15009,27980.0,
51003,16820.0,
51015,44420.0,
51031,31340.0,
51053,40060.0,
51059,47900.0,
51069,49020.0,
51095,47260.0,
51121,13980.0,
51149,40060.0,


- 15009 is Maui
- 51XXX is in VA

In [73]:
# FIPS codes that have a `CBSA ID` in the old list but no new `msa` code.
msa_comp.loc[msa_comp['msa'].isnull()]

Unnamed: 0,msa,CBSA ID
15901,,27980.0
51901,,16820.0
51907,,44420.0
51911,,31340.0
51918,,40060.0
51919,,47900.0
51921,,49020.0
51931,,47260.0
51933,,13980.0
51941,,40060.0


- 15901 is another Hawaii FIPS code
- The rest (51XXX) are in VA

## State encoding

In [22]:
# Smallest MSA code present in the joined table (NaN rows excluded first).
msa_comp['msa'].dropna().unique().min()

10180.0

In [23]:
# Shape of the array of distinct MSA codes in the new table.
new_msa['msa'].unique().shape

(392,)