# Consistency Checks for different source files/encodings

### Author: James Sharpnack @jsharpna

This notebook contains consistency checks between the different data sources used for geomapping.

##  msa cross table

### Compare against cbsatocountycrosswalk.csv

The old MSA data is from the NBER (https://data.nber.org/data/cbsa-fips-county-crosswalk.html); the new data is from the Census Bureau.

In [1]:
import pandas as pd

In [2]:
# Load the old crosswalk; the path is relative to the notebook's working directory.
old_msa_filename = "cbsatocountycrosswalk.csv"

old_msa = pd.read_csv(old_msa_filename)

In [3]:
# Preview the first rows of the raw old crosswalk.
old_msa.head()

Unnamed: 0,countyname,state,ssacounty,fips,msa,l,msaname,cbsa,cbsaname,cbsaold,...,ssast,fipst,y2005,y2011,y2012,y2013,y2014,y2015,y2016,y2017
0,AUTAUGA,AL,1000,1001,5240.0,,"MONTGOMERY, AL",33860.0,"Montgomery, AL",33860.0,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0
1,BALDWIN,AL,1010,1003,5160.0,,"MOBILE, AL",,,,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0
2,BARBOUR,AL,1020,1005,1.0,,ALABAMA,,,,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0
3,BIBB,AL,1030,1007,1.0,,ALABAMA,13820.0,"Birmingham-Hoover, AL",13820.0,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0
4,BLOUNT,AL,1040,1009,1000.0,,"BIRMINGHAM, AL",13820.0,"Birmingham-Hoover, AL",13820.0,...,1,1,2005.0,2011.0,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0


In [4]:
# Column dtypes and non-null counts of the raw old crosswalk.
old_msa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3293 entries, 0 to 3292
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   countyname   3293 non-null   object 
 1   state        3293 non-null   object 
 2   ssacounty    3293 non-null   int64  
 3   fips         3293 non-null   int64  
 4   msa          3289 non-null   float64
 5   l            387 non-null    object 
 6   msaname      3288 non-null   object 
 7   cbsa         1165 non-null   float64
 8   cbsaname     1163 non-null   object 
 9   cbsaold      1171 non-null   float64
 10  cbsanameold  1171 non-null   object 
 11  ssast        3293 non-null   int64  
 12  fipst        3293 non-null   int64  
 13  y2005        3289 non-null   float64
 14  y2011        3289 non-null   float64
 15  y2012        3273 non-null   float64
 16  y2013        3273 non-null   float64
 17  y2014        3273 non-null   float64
 18  y2015        3273 non-null   float64
 19  y2016 

In [5]:
# Load the new fips -> msa crosswalk from the delphi_utils data directory
# (path is relative to the notebook's working directory).
new_msa_file = "../../delphi_utils/data/fips_msa_cross.csv"

new_msa = pd.read_csv(new_msa_file)

In [6]:
# Preview the first rows of the new crosswalk.
new_msa.head()

Unnamed: 0,fips,msa
0,48059,10180
1,48253,10180
2,48441,10180
3,72003,10380
4,72005,10380


In [7]:
# Column dtypes and non-null counts of the new crosswalk.
new_msa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251 entries, 0 to 1250
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   fips    1251 non-null   int64
 1   msa     1251 non-null   int64
dtypes: int64(2)
memory usage: 19.7 KB


In [25]:
# Reduce the old crosswalk to an integer fips -> cbsa lookup: drop counties
# with no CBSA assigned, keep only the two key columns, and index by fips.
# NOTE: this rebinds `old_msa`; reload the CSV before re-running this cell,
# because 'fips' is no longer a column afterwards.
old_msa = (
    old_msa
    .dropna(subset=['cbsa'])
    .loc[:, ['fips', 'cbsa']]
    .astype(int)
    .set_index('fips')
)

old_msa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1165 entries, 1001 to 72153
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   cbsa    1165 non-null   int64
dtypes: int64(1)
memory usage: 18.2 KB


In [26]:
# Index the new crosswalk by fips so the two tables can be joined on the index.
# Guarded so the cell can be re-run: once 'fips' is the index it is no longer
# a column, and an unconditional set_index('fips') would raise KeyError.
if new_msa.index.name != 'fips':
    new_msa = new_msa.set_index('fips')

# Outer join keeps fips that appear in only one source (the other side is NaN).
msa_comp = new_msa.join(old_msa, how="outer")

In [27]:
# Display the joined table: `msa` comes from the new crosswalk, `cbsa` from the old one.
msa_comp

Unnamed: 0_level_0,msa,cbsa
fips,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,33860.0,33860.0
1003,19300.0,
1007,13820.0,13820.0
1009,13820.0,13820.0
1015,11500.0,11500.0
...,...,...
72143,41980.0,41980.0
72145,41980.0,41980.0
72149,38660.0,38660.0
72151,41980.0,41980.0


In [44]:
# Distinct area codes present in each source, as sets for comparison.
old_val = set(old_msa['cbsa'].unique())
new_val = set(new_msa['msa'].unique())

In [46]:
# Number of area codes that appear in exactly one of the two sources.
len(old_val ^ new_val)

95

In [28]:
# Non-null counts per source after the outer join.
# NOTE(review): new_msa has 1251 rows but the output below reports 1252
# non-null `msa` values, which may indicate a duplicated fips key in the old
# crosswalk -- confirm.
msa_comp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1293 entries, 1001 to 72153
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   msa     1252 non-null   float64
 1   cbsa    1165 non-null   float64
dtypes: float64(2)
memory usage: 30.3 KB


In [42]:
# Cross-tabulate missingness: rows are "msa is null", columns are "cbsa is null".
pd.crosstab(msa_comp['msa'].isna(),msa_comp['cbsa'].isna())

cbsa,False,True
msa,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1124,128
True,41,0


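- 128 fips have an MSA in the Census data but no CBSA in the NBER data (rows with a missing `cbsa` were dropped before the join, so a fips such as 1003 can be present in the NBER file and still be counted here)
- 41 fips have a CBSA in the NBER data but are not in the Census data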

In [32]:
# Among fips that have a code in both sources, count how many codes disagree.
(
    msa_comp
    .dropna(axis=0)
    .pipe(lambda both: both['msa'].ne(both['cbsa']))
    .sum()
)

136

### Compare against the msa_list file

In [52]:
# Switch the "old" source to msa_list.csv. This rebinds old_msa_filename and
# old_msa, replacing the values used in the previous section.
old_msa_filename = "msa_list.csv"

old_msa = pd.read_csv(old_msa_filename)

In [53]:
# Preview the first rows of msa_list.csv.
old_msa.head()

Unnamed: 0,CBSA ID,MSA Name,FIPS ID,County Name
0,10180,"Abilene, TX (Metropolitan Statistical Area)",48059,"Callahan, TX"
1,10180,"Abilene, TX (Metropolitan Statistical Area)",48253,"Jones, TX"
2,10180,"Abilene, TX (Metropolitan Statistical Area)",48441,"Taylor, TX"
3,10380,"Aguadilla-Isabela, PR (Metropolitan Statistica...",72003,"Aguada Municipio, PR"
4,10380,"Aguadilla-Isabela, PR (Metropolitan Statistica...",72005,"Aguadilla Municipio, PR"


In [54]:
# Column dtypes and non-null counts of msa_list.csv.
old_msa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1231 entries, 0 to 1230
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CBSA ID      1231 non-null   int64 
 1   MSA Name     1231 non-null   object
 2   FIPS ID      1231 non-null   int64 
 3   County Name  1231 non-null   object
dtypes: int64(2), object(2)
memory usage: 38.6+ KB


In [55]:
# Reduce msa_list to an integer 'FIPS ID' -> 'CBSA ID' lookup indexed by FIPS.
# NOTE: this rebinds `old_msa`; reload the CSV before re-running this cell,
# because 'FIPS ID' is no longer a column afterwards.
old_msa = (
    old_msa
    .dropna(subset=['CBSA ID'])
    .loc[:, ['FIPS ID', 'CBSA ID']]
    .astype(int)
    .set_index('FIPS ID')
)

old_msa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1231 entries, 48059 to 4027
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   CBSA ID  1231 non-null   int64
dtypes: int64(1)
memory usage: 19.2 KB


In [56]:
# Outer join on the index; relies on new_msa already being indexed by fips
# from the earlier comparison. Rebinds msa_comp.
msa_comp = new_msa.join(old_msa,how="outer")

In [57]:
# Display the joined table: `msa` from the new crosswalk, `CBSA ID` from msa_list.csv.
msa_comp

Unnamed: 0,msa,CBSA ID
1001,33860.0,33860.0
1003,19300.0,19300.0
1007,13820.0,13820.0
1009,13820.0,13820.0
1015,11500.0,11500.0
...,...,...
72143,41980.0,41980.0
72145,41980.0,41980.0
72149,38660.0,38660.0
72151,41980.0,41980.0


In [59]:
# Distinct area codes present in each source, as sets for comparison.
old_val = set(old_msa['CBSA ID'].unique())
new_val = set(new_msa['msa'].unique())

In [60]:
# Number of area codes that appear in exactly one of the two sources.
len(old_val ^ new_val)

0

In [61]:
# Non-null counts per source after the outer join.
msa_comp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1268 entries, 1001 to 72153
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   msa      1251 non-null   float64
 1   CBSA ID  1231 non-null   float64
dtypes: float64(2)
memory usage: 29.7 KB


In [70]:
# Total absolute disagreement between the two code columns. The absolute value
# matters: a plain signed sum can be 0 when positive and negative differences
# cancel, which would hide real mismatches. Rows where either side is NaN give
# a NaN difference and are skipped by sum().
(msa_comp['msa'] - msa_comp['CBSA ID']).abs().sum()

0.0

In [63]:
# Cross-tabulate missingness: rows are "msa is null", columns are "CBSA ID is null".
pd.crosstab(msa_comp['msa'].isna(),msa_comp['CBSA ID'].isna())

CBSA ID,False,True
msa,Unnamed: 1_level_1,Unnamed: 2_level_1
False,1214,37
True,17,0


- 37 fips are in the March Census data that are not in the old census data
- 17 fips are in the old census data that are not in the March Census data

In [67]:
# NOTE(review): repeats the info() summary already shown for this join a few
# cells earlier; candidate for removal.
msa_comp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1268 entries, 1001 to 72153
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   msa      1251 non-null   float64
 1   CBSA ID  1231 non-null   float64
dtypes: float64(2)
memory usage: 29.7 KB


In [71]:
# Show the msa_list.csv side of the joined table.
msa_comp['CBSA ID']

1001     33860.0
1003     19300.0
1007     13820.0
1009     13820.0
1015     11500.0
          ...   
72143    41980.0
72145    41980.0
72149    38660.0
72151    41980.0
72153    49500.0
Name: CBSA ID, Length: 1268, dtype: float64

In [72]:
# fips that have an msa in the new crosswalk but no CBSA ID in msa_list.csv.
msa_comp[msa_comp['CBSA ID'].isna()]

Unnamed: 0,msa,CBSA ID
15009,27980.0,
51003,16820.0,
51015,44420.0,
51031,31340.0,
51053,40060.0,
51059,47900.0,
51069,49020.0,
51095,47260.0,
51121,13980.0,
51149,40060.0,


- 15009 is Maui County, HI
- The 51XXX codes are in Virginia

In [73]:
# fips that have a CBSA ID in msa_list.csv but no msa in the new crosswalk.
msa_comp[msa_comp['msa'].isna()]

Unnamed: 0,msa,CBSA ID
15901,,27980.0
51901,,16820.0
51907,,44420.0
51911,,31340.0
51918,,40060.0
51919,,47900.0
51921,,49020.0
51931,,47260.0
51933,,13980.0
51941,,40060.0


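- 15901 is another Hawaii code
- The rest (51XXX) are Virginia codes
- Each row shown here maps to the same CBSA ID as a row in the previous table (e.g. 15901 and 15009 both map to 27980), which suggests the two files encode the same areas under different FIPS codes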

## State encoding

In [84]:
# Smallest MSA code present in the joined table (missing values excluded).
msa_comp['msa'].dropna().unique().min()

10180.0

In [10]:
# Shape of the array of distinct MSA codes in the new crosswalk.
new_msa['msa'].unique().shape

(392,)