### This notebook uses the old ACS data dictionary provided during the DataDive event on Sep 18 
### Please check the _'acs_dictionary_updated_eda'_ notebook for EDA on recently scraped ACS dictionaries (scraping code in _update_acs_dictionaries_)

<font color = Blue>
Research Question 3 (ACS): What are the differences between the 2014 and 2019 5-year American Community Survey datasets? Which variables or labels are missing from the 2014 data but in the 2019 data, and vice versa? Which 2019 variables have changed significantly from the 2014 estimates?

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np
from functools import reduce

In [2]:
# Read the dictionary csv
acs_dic = pd.read_csv('data/acs5_variable_dict_2014_2019.csv')

# Change label field to upper case
acs_dic['label'] = acs_dic['label'].str.upper()

# Split the labels and store them in a list
acs_dic['label'] = acs_dic['label'].str.split('!!')

In [3]:
# Read other data files for parallel EDA
hill_acs_2014 = pd.read_csv("data/hillsborough_acs5-2014_census.csv")
miami_dade_acs_2014 = pd.read_csv("data/miami_dade_acs5-2014_census.csv")
orange_acs_2014 = pd.read_csv("data/orange_acs5-2014_census.csv")

hill_acs_2019 = pd.read_csv("data/hillsborough_acs5-2019_census.csv")
miami_dade_acs_2019 = pd.read_csv("data/miami_dade_acs5-2019_census.csv")
orange_acs_2019 = pd.read_csv("data/orange_acs5-2019_census.csv")

In [1]:
hill_acs_2014.head(5)

NameError: name 'hill_acs_2014' is not defined

In [53]:
hill_acs_2019.head(5)

Unnamed: 0,index,DP02_0001E,DP02_0001PE,DP02_0002E,DP02_0002PE,DP02_0003E,DP02_0003PE,DP02_0004E,DP02_0004PE,DP02_0005E,...,B25087_037E,B25087_038E,B25087_039E,B25088_001E,B25088_002E,B25088_003E,B25092_001E,B25092_002E,B25092_003E,GEOID
0,"Census Tract 4.01, Hillsborough County, Florid...",926,926,294,31.7,124,13.4,96,10.4,18,...,0,0,0,991,1172,359,19.9,26.3,9.0,12057000401
1,"Census Tract 101.05, Hillsborough County, Flor...",1496,1496,921,61.6,325,21.7,78,5.2,52,...,0,0,10,866,1549,391,17.3,23.3,9.0,12057010105
2,"Census Tract 102.03, Hillsborough County, Flor...",766,766,222,29.0,40,5.2,56,7.3,39,...,0,0,0,608,1155,263,18.6,21.9,15.7,12057010203
3,"Census Tract 106, Hillsborough County, Florida...",1108,1108,598,54.0,244,22.0,48,4.3,9,...,0,0,15,1241,1584,566,14.2,16.9,9.0,12057010600
4,"Census Tract 23, Hillsborough County, Florida:...",1419,1419,606,42.7,213,15.0,131,9.2,38,...,0,0,10,1174,1400,408,17.8,18.7,14.2,12057002300
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,"Census Tract 1.02, Hillsborough County, Florid...",2484,2484,478,19.2,172,6.9,70,2.8,56,...,0,0,0,779,1186,300,18.1,26.8,9.0,12057000102
317,"Census Tract 115.04, Hillsborough County, Flor...",797,797,519,65.1,153,19.2,19,2.4,6,...,25,0,20,1534,2325,708,17.2,20.8,9.0,12057011504
318,"Census Tract 141.06, Hillsborough County, Flor...",1311,1311,602,45.9,239,18.2,150,11.4,86,...,0,0,6,941,1352,352,17.3,21.3,11.9,12057014106
319,"Census Tract 128, Hillsborough County, Florida...",1500,1500,773,51.5,209,13.9,92,6.1,74,...,0,0,0,850,987,337,14.9,19.0,9.0,12057012800


In [4]:
# Create a df with a list of field names and datasets they're present in
df1 = pd.DataFrame(columns = ['variable_code'], data = hill_acs_2014.columns)
df1['dataset1'] = 'hill_14'
df2 = pd.DataFrame(columns = ['variable_code'], data = miami_dade_acs_2014.columns)
df2['dataset2'] = 'miami_dade_14'
df3 = pd.DataFrame(columns = ['variable_code'], data = orange_acs_2014.columns)
df3['dataset3'] = 'orange_14'
df4 = pd.DataFrame(columns = ['variable_code'], data = hill_acs_2019.columns)
df4['dataset4'] = 'hill_19'
df5 = pd.DataFrame(columns = ['variable_code'], data = miami_dade_acs_2019.columns)
df5['dataset5'] = 'miami_dade_19'
df6 = pd.DataFrame(columns = ['variable_code'], data = orange_acs_2019.columns)
df6['dataset6'] = 'orange_19'

df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['variable_code'], how='outer'), 
                   [df1, df2, df3, df4, df5, df6])

df_merged['datasets'] = df_merged[['dataset1', 'dataset2', 'dataset3', 'dataset4', 'dataset5', 'dataset6',]]\
                        .apply(lambda x: ', '.join(x.dropna()), axis=1)
df_merged = df_merged[['variable_code', 'datasets']]
df_merged = df_merged.loc[df_merged['variable_code'] != 'index']

In [5]:
df_merged.head(5)

Unnamed: 0,variable_code,datasets
1,DP02_0001E,"hill_14, miami_dade_14, orange_14, hill_19, mi..."
2,DP02_0001PE,"hill_14, miami_dade_14, orange_14, hill_19, mi..."
3,DP02_0002E,"hill_14, miami_dade_14, orange_14, hill_19, mi..."
4,DP02_0002PE,"hill_14, miami_dade_14, orange_14, hill_19, mi..."
5,DP02_0003E,"hill_14, miami_dade_14, orange_14, hill_19, mi..."


In [6]:
# Separate 2014 variable codes from 2019 variable codes
acs_dic_14 = acs_dic.loc[acs_dic['acs_year'] == 2014]
acs_dic_19 = acs_dic.loc[acs_dic['acs_year'] == 2019]

In [7]:
print('Number of variable codes in 2014 dataset:', len(pd.unique(acs_dic_14['variable_code'])))
print('Number of variable codes in 2019 dataset:', len(pd.unique(acs_dic_19['variable_code'])))

Number of variable codes in 2014 dataset: 4088
Number of variable codes in 2019 dataset: 4176


In [8]:
# Format the field names add dataset year as a suffix 
# (except for 'variable_code' field since it'll be used to join the 2 tables)
acs_dic_14.columns = ['{}{}'.format(c, '' if c == 'variable_code' else '_14') for c in acs_dic_14.columns]
acs_dic_19.columns = ['{}{}'.format(c, '' if c == 'variable_code' else '_19') for c in acs_dic_19.columns]

In [9]:
# Merge 2014 dataset, 2019 dataset and df_merged (field names from each file) on 'variable_code'
acs_dic_compare = reduce(lambda  left,right: pd.merge(left,right,on=['variable_code'], how='outer'), 
                   [acs_dic_14, acs_dic_19, df_merged])

# Compare 'concept', 'predicateType', 'predicateOnly' and 'label' fields in both of the datasets
acs_dic_compare['concept_match'] = np.where(acs_dic_compare['concept_14'] == acs_dic_compare['concept_19'], True, False)
acs_dic_compare['predicateType_match'] = np.where(acs_dic_compare['predicateType_14'] == acs_dic_compare['predicateType_19'], True, False)
acs_dic_compare['predicateOnly_match'] = np.where(acs_dic_compare['predicateOnly_14'] == acs_dic_compare['predicateOnly_19'], True, False)
acs_dic_compare['label_match'] = np.where(acs_dic_compare['label_14'] == acs_dic_compare['label_19'], True, False)

In [34]:
# Replace NaN values in 'label_14' and 'label_19' columns with an empty list
acs_dic_compare['label_14'] = acs_dic_compare['label_14'].apply(lambda d: d if isinstance(d, list) else [])
acs_dic_compare['label_19'] = acs_dic_compare['label_19'].apply(lambda d: d if isinstance(d, list) else [])
acs_dic_compare['datasets'].fillna('', inplace = True)
acs_dic_compare.head(5)

Unnamed: 0,variable_code,label_14,concept_14,predicateType_14,group_14,limit_14,predicateOnly_14,acs_year_14,label_19,concept_19,...,predicateOnly_19,acs_year_19,datasets,concept_match,predicateType_match,predicateOnly_match,label_match,label_same,label_14_diff,label_19_diff
0,DP02_0019EA,"[ANNOTATION OF ESTIMATE, RELATIONSHIP, POPULAT...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF ESTIMATE, RELATIONSHIP, POPULAT...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,,True,True,True,False,"[RELATIONSHIP, ANNOTATION OF ESTIMATE, POPULAT...",[SPOUSE],[HOUSEHOLDER]
1,DP02_0126E,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, DANISH]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, CZECH]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[ESTIMATE, ANCESTRY, TOTAL POPULATION]",[DANISH],[CZECH]
2,DP02_0072EA,"[ANNOTATION OF ESTIMATE, DISABILITY STATUS OF ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF ESTIMATE, DISABILITY STATUS OF ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,,True,True,True,False,[DISABILITY STATUS OF THE CIVILIAN NONINSTITUT...,[UNDER 18 YEARS],[TOTAL CIVILIAN NONINSTITUTIONALIZED POPULATIO...
3,DP02_0069PMA,"[ANNOTATION OF PERCENT MARGIN OF ERROR, VETERA...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF PERCENT MARGIN OF ERROR, VETERA...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,,True,True,True,False,"[VETERAN STATUS, CIVILIAN POPULATION 18 YEARS ...",[CIVILIAN VETERANS],[]
4,DP02_0126M,"[MARGIN OF ERROR, ANCESTRY, TOTAL POPULATION, ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[MARGIN OF ERROR, ANCESTRY, TOTAL POPULATION, ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,,True,True,True,False,"[ANCESTRY, TOTAL POPULATION, MARGIN OF ERROR]",[DANISH],[CZECH]


In [None]:
# Identify the difference between labels from 2014 and 2019 dataset

# Common labels for each row:
acs_dic_compare['label_same'] = acs_dic_compare.apply(lambda row: list(set(row['label_14']).intersection(set(row['label_19']))), axis = 1)

# Labels present in 2014's dataset but not in 2019's
acs_dic_compare['label_14_diff'] = acs_dic_compare.apply(lambda row: list(set(row['label_14']).difference(set(row['label_same']))), axis = 1)

# Labels present in 2019's datatset but not in 2014's
acs_dic_compare['label_19_diff'] = acs_dic_compare.apply(lambda row: list(set(row['label_19']).difference(set(row['label_same']))), axis = 1)

## Data dictionary for the new fields:
- *label_14, concept_14, predicateType_14, group_14, limit_14*: labels, concepts, predicateTypes, groups, limits for __2014's__ variable codes

- *label_19, concept_19, predicateType_19, group_19, limit_19*: labels, concepts, predicateTypes, groups, limits for __2019's__ variable codes

<font color = Blue>
    
- *datasets*: variable code is present in these datafiles
    
- *concept_match, predicateType_match, predicateOnly_match, label_match*:(T/F) if 2014's fields match with 2019's fields
    
- *label_same*: common labels in 2014's and 2019's dataset
    
- *label_14_diff*: labels present in 2014's dataset but NOT in 2019's
    
- *label_19_diff*: labels present in 2019's dataset but NOT in 2014's

In [10]:
acs_dic_compare.head(5)

Unnamed: 0,variable_code,label_14,concept_14,predicateType_14,group_14,limit_14,predicateOnly_14,acs_year_14,label_19,concept_19,predicateType_19,group_19,limit_19,predicateOnly_19,acs_year_19,datasets,concept_match,predicateType_match,predicateOnly_match,label_match
0,DP02_0019EA,"[ANNOTATION OF ESTIMATE, RELATIONSHIP, POPULAT...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF ESTIMATE, RELATIONSHIP, POPULAT...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2019.0,,True,True,True,False
1,DP02_0126E,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, DANISH]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, CZECH]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False
2,DP02_0072EA,"[ANNOTATION OF ESTIMATE, DISABILITY STATUS OF ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF ESTIMATE, DISABILITY STATUS OF ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2019.0,,True,True,True,False
3,DP02_0069PMA,"[ANNOTATION OF PERCENT MARGIN OF ERROR, VETERA...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF PERCENT MARGIN OF ERROR, VETERA...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2019.0,,True,True,True,False
4,DP02_0126M,"[MARGIN OF ERROR, ANCESTRY, TOTAL POPULATION, ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[MARGIN OF ERROR, ANCESTRY, TOTAL POPULATION, ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2019.0,,True,True,True,False


In [11]:
# Number of matches
for c in ['concept_match', 'predicateType_match', 'predicateOnly_match', 'label_match']:
    print('\n')
    print(acs_dic_compare[c].value_counts())



True     4088
False     659
Name: concept_match, dtype: int64


True     3872
False     875
Name: predicateType_match, dtype: int64


True     4088
False     659
Name: predicateOnly_match, dtype: int64


False    4203
True      544
Name: label_match, dtype: int64


## How many fields from data dictionary are not present in any of the files?

In [12]:
print('Number of fields from data dictinary that are not present in any of the files:', acs_dic_compare['datasets'].isna().sum())
acs_dic_compare.loc[acs_dic_compare['datasets'].isna()].head(5)

Number of fields from data dictinary that are not present in any of the files: 3132


Unnamed: 0,variable_code,label_14,concept_14,predicateType_14,group_14,limit_14,predicateOnly_14,acs_year_14,label_19,concept_19,predicateType_19,group_19,limit_19,predicateOnly_19,acs_year_19,datasets,concept_match,predicateType_match,predicateOnly_match,label_match
0,DP02_0019EA,"[ANNOTATION OF ESTIMATE, RELATIONSHIP, POPULAT...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF ESTIMATE, RELATIONSHIP, POPULAT...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2019.0,,True,True,True,False
2,DP02_0072EA,"[ANNOTATION OF ESTIMATE, DISABILITY STATUS OF ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF ESTIMATE, DISABILITY STATUS OF ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2019.0,,True,True,True,False
3,DP02_0069PMA,"[ANNOTATION OF PERCENT MARGIN OF ERROR, VETERA...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF PERCENT MARGIN OF ERROR, VETERA...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2019.0,,True,True,True,False
4,DP02_0126M,"[MARGIN OF ERROR, ANCESTRY, TOTAL POPULATION, ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[MARGIN OF ERROR, ANCESTRY, TOTAL POPULATION, ...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2019.0,,True,True,True,False
5,DP02_0057PMA,"[ANNOTATION OF PERCENT MARGIN OF ERROR, SCHOOL...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2014.0,"[ANNOTATION OF PERCENT MARGIN OF ERROR, SCHOOL...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,string,DP02,0.0,True,2019.0,,True,True,True,False


## How many fields from the files are not present in the data dictionary?

In [13]:
print('Number of fields from data dictinary that are not present in any of the files:', \
      len(acs_dic_compare.loc[(acs_dic_compare['concept_14'].isna()) & (acs_dic_compare['concept_19'].isna())].index))
acs_dic_compare.loc[(acs_dic_compare['concept_14'].isna()) & (acs_dic_compare['concept_19'].isna())].reset_index(drop = True)[['variable_code', 'datasets']].head(5)

Number of fields from data dictinary that are not present in any of the files: 571


Unnamed: 0,variable_code,datasets
0,S1701_C01_001E,"hill_14, miami_dade_14, orange_14, hill_19, mi..."
1,S1701_C01_002E,"hill_14, miami_dade_14, orange_14, hill_19, mi..."
2,S1701_C01_003E,"hill_14, miami_dade_14, orange_14, hill_19, mi..."
3,S1701_C01_004E,"hill_14, miami_dade_14, orange_14, hill_19, mi..."
4,S1701_C01_005E,"hill_14, miami_dade_14, orange_14, hill_19, mi..."


In [14]:
#df8 = acs_dic_compare.loc[(acs_dic_compare['concept_14'].isna()) & (acs_dic_compare['concept_19'].isna())].reset_index(drop = True)[['variable_code', 'datasets']]
#df8.to_csv('acs_dictionary_missing_variable_codes.csv')

In [16]:
acs_dic_compare.to_csv('acs_dictinary_transformed.csv')

In [18]:
# Find number of variable codes associated with each label 
acs_dic_14['label_14'] = acs_dic_14.apply(lambda row: str(row['label_14']), axis = 1)
acs_dic_14_labels = acs_dic_14.groupby('label_14')['variable_code'].apply(list).reset_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acs_dic_14['label_14'] = acs_dic_14.apply(lambda row: str(row['label_14']), axis = 1)


In [19]:
# Are there labels with more than 1 variable codes?
acs_dic_14_labels['ct_variable_code'] = acs_dic_14_labels['variable_code'].apply(lambda x: len(x))
acs_dic_14_labels.loc[acs_dic_14_labels['ct_variable_code']>1]

Unnamed: 0,label_14,variable_code,ct_variable_code
420,"['ANNOTATION OF ESTIMATE', 'SELECTED MONTHLY O...","[DP04_0114EA, DP04_0123EA]",2
425,"['ANNOTATION OF ESTIMATE', 'SEX AND AGE', '18 ...","[DP05_0022EA, DP05_0018EA]",2
438,"['ANNOTATION OF ESTIMATE', 'SEX AND AGE', '65 ...","[DP05_0021EA, DP05_0025EA]",2
928,"['ANNOTATION OF MARGIN OF ERROR', 'SELECTED MO...","[DP04_0123MA, DP04_0114MA]",2
933,"['ANNOTATION OF MARGIN OF ERROR', 'SEX AND AGE...","[DP05_0022MA, DP05_0018MA]",2
946,"['ANNOTATION OF MARGIN OF ERROR', 'SEX AND AGE...","[DP05_0021MA, DP05_0025MA]",2
1436,"['ANNOTATION OF PERCENT MARGIN OF ERROR', 'SEL...","[DP04_0123PMA, DP04_0114PMA]",2
1441,"['ANNOTATION OF PERCENT MARGIN OF ERROR', 'SEX...","[DP05_0018PMA, DP05_0022PMA]",2
1454,"['ANNOTATION OF PERCENT MARGIN OF ERROR', 'SEX...","[DP05_0021PMA, DP05_0025PMA]",2
1944,"['ANNOTATION OF PERCENT', 'SELECTED MONTHLY OW...","[DP04_0114PEA, DP04_0123PEA]",2


In [20]:
acs_dic_14[acs_dic_14['variable_code'].isin(['DP04_0114EA', 'DP04_0123EA'])]

Unnamed: 0,variable_code,label_14,concept_14,predicateType_14,group_14,limit_14,predicateOnly_14,acs_year_14
2844,DP04_0114EA,"['ANNOTATION OF ESTIMATE', 'SELECTED MONTHLY O...",SELECTED HOUSING CHARACTERISTICS,string,DP04,0,True,2014
3433,DP04_0123EA,"['ANNOTATION OF ESTIMATE', 'SELECTED MONTHLY O...",SELECTED HOUSING CHARACTERISTICS,string,DP04,0,True,2014


## QC in the breakout

In [21]:
acs_dic_compare.loc[acs_dic_compare['variable_code'] == 'DP04_0037E']

Unnamed: 0,variable_code,label_14,concept_14,predicateType_14,group_14,limit_14,predicateOnly_14,acs_year_14,label_19,concept_19,...,predicateOnly_19,acs_year_19,datasets,concept_match,predicateType_match,predicateOnly_match,label_match,label_same,label_14_diff,label_19_diff
2456,DP04_0037E,"[ESTIMATE, BEDROOMS, TOTAL HOUSING UNITS]",SELECTED HOUSING CHARACTERISTICS,int,DP04,0.0,True,2014.0,"[ESTIMATE, ROOMS, TOTAL HOUSING UNITS, MEDIAN ...",SELECTED HOUSING CHARACTERISTICS,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,False,True,False,"[ESTIMATE, TOTAL HOUSING UNITS]",[BEDROOMS],"[MEDIAN ROOMS, ROOMS]"


In [22]:
acs_dic_compare.loc[acs_dic_compare['datasets'].apply(lambda x: ('miami_dade_19' in x) & ('miami_dade_14' not in x))]


Unnamed: 0,variable_code,label_14,concept_14,predicateType_14,group_14,limit_14,predicateOnly_14,acs_year_14,label_19,concept_19,...,predicateOnly_19,acs_year_19,datasets,concept_match,predicateType_match,predicateOnly_match,label_match,label_same,label_14_diff,label_19_diff
4092,DP02_0153PE,[],,,,,,,"[PERCENT, COMPUTERS AND INTERNET USE, TOTAL HO...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],"[COMPUTERS AND INTERNET USE, TOTAL HOUSEHOLDS,..."
4095,DP02_0153E,[],,,,,,,"[ESTIMATE, COMPUTERS AND INTERNET USE, TOTAL H...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],"[COMPUTERS AND INTERNET USE, TOTAL HOUSEHOLDS,..."
4101,DP04_0142PE,[],,,,,,,"[PERCENT, GROSS RENT AS A PERCENTAGE OF HOUSEH...",SELECTED HOUSING CHARACTERISTICS,...,True,2019.0,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],[OCCUPIED UNITS PAYING RENT (EXCLUDING UNITS W...
4105,DP04_0143PE,[],,,,,,,"[PERCENT, GROSS RENT AS A PERCENTAGE OF HOUSEH...",SELECTED HOUSING CHARACTERISTICS,...,True,2019.0,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],[OCCUPIED UNITS PAYING RENT (EXCLUDING UNITS W...
4109,DP04_0143E,[],,,,,,,"[ESTIMATE, GROSS RENT AS A PERCENTAGE OF HOUSE...",SELECTED HOUSING CHARACTERISTICS,...,True,2019.0,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],[OCCUPIED UNITS PAYING RENT (EXCLUDING UNITS W...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4742,B25087_035E,[],,,,,,,[],,...,,,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],[]
4743,B25087_036E,[],,,,,,,[],,...,,,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],[]
4744,B25087_037E,[],,,,,,,[],,...,,,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],[]
4745,B25087_038E,[],,,,,,,[],,...,,,"hill_19, miami_dade_19, orange_19",False,False,False,False,[],[],[]


In [23]:
acs_dic_compare.loc[acs_dic_compare['datasets'].apply(lambda x: ('miami_dade_14' in x) & ('miami_dade_19' not in x))]

Unnamed: 0,variable_code,label_14,concept_14,predicateType_14,group_14,limit_14,predicateOnly_14,acs_year_14,label_19,concept_19,...,predicateOnly_19,acs_year_19,datasets,concept_match,predicateType_match,predicateOnly_match,label_match,label_same,label_14_diff,label_19_diff
4479,S2506_C01_066E,[],,,,,,,[],,...,,,"hill_14, miami_dade_14, orange_14",False,False,False,False,[],[],[]
4480,S2506_C01_067E,[],,,,,,,[],,...,,,"hill_14, miami_dade_14, orange_14",False,False,False,False,[],[],[]
4481,S2506_C01_068E,[],,,,,,,[],,...,,,"hill_14, miami_dade_14, orange_14",False,False,False,False,[],[],[]


## Work with Dulakshi on Task 1

In [43]:
acs_dic_compare[(acs_dic_compare['label_14'].apply(lambda x: len(x)) != 0)]['label_match'].value_counts() 

False    3544
True      544
Name: label_match, dtype: int64

In [47]:
q1 = acs_dic_compare[(acs_dic_compare['label_14'].apply(lambda x: len(x)) != 0)]

In [48]:
q1.loc[q1['datasets']!='']

Unnamed: 0,variable_code,label_14,concept_14,predicateType_14,group_14,limit_14,predicateOnly_14,acs_year_14,label_19,concept_19,...,predicateOnly_19,acs_year_19,datasets,concept_match,predicateType_match,predicateOnly_match,label_match,label_same,label_14_diff,label_19_diff
1,DP02_0126E,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, DANISH]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, CZECH]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[ESTIMATE, ANCESTRY, TOTAL POPULATION]",[DANISH],[CZECH]
10,DP02_0125E,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, CZECH]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, ARAB]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[ESTIMATE, ANCESTRY, TOTAL POPULATION]",[CZECH],[ARAB]
11,DP02_0054PE,"[PERCENT, SCHOOL ENROLLMENT, POPULATION 3 YEAR...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,float,DP02,0.0,True,2014.0,"[PERCENT, SCHOOL ENROLLMENT, POPULATION 3 YEAR...",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[SCHOOL ENROLLMENT, PERCENT, POPULATION 3 YEAR...",[KINDERGARTEN],"[NURSERY SCHOOL, PRESCHOOL]"
14,DP02_0124E,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, ARAB]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, AMERICAN]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[ESTIMATE, ANCESTRY, TOTAL POPULATION]",[ARAB],[AMERICAN]
21,DP02_0123E,"[ESTIMATE, ANCESTRY, TOTAL POPULATION, AMERICAN]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,int,DP02,0.0,True,2014.0,"[ESTIMATE, ANCESTRY, TOTAL POPULATION]",SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[ESTIMATE, ANCESTRY, TOTAL POPULATION]",[AMERICAN],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4067,DP05_0010PE,"[PERCENT, SEX AND AGE, 35 TO 44 YEARS]",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,float,DP05,0.0,True,2014.0,"[PERCENT, SEX AND AGE, TOTAL POPULATION, 25 TO...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[SEX AND AGE, PERCENT]",[35 TO 44 YEARS],"[25 TO 34 YEARS, TOTAL POPULATION]"
4074,DP05_0050E,"[ESTIMATE, RACE, ONE RACE, NATIVE HAWAIIAN AND...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,int,DP05,0.0,True,2014.0,"[ESTIMATE, RACE, TOTAL POPULATION, ONE RACE, A...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[ESTIMATE, RACE, ONE RACE]","[NATIVE HAWAIIAN AND OTHER PACIFIC ISLANDER, S...","[ASIAN, TOTAL POPULATION, VIETNAMESE]"
4076,DP05_0051E,"[ESTIMATE, RACE, ONE RACE, NATIVE HAWAIIAN AND...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,int,DP05,0.0,True,2014.0,"[ESTIMATE, RACE, TOTAL POPULATION, ONE RACE, A...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[ESTIMATE, RACE, ONE RACE]","[OTHER PACIFIC ISLANDER, NATIVE HAWAIIAN AND O...","[ASIAN, OTHER ASIAN, TOTAL POPULATION]"
4080,DP05_0054PE,"[PERCENT, RACE, TWO OR MORE RACES, WHITE AND B...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,float,DP05,0.0,True,2014.0,"[PERCENT, RACE, TOTAL POPULATION, ONE RACE, NA...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,...,True,2019.0,"hill_14, miami_dade_14, orange_14, hill_19, mi...",True,True,True,False,"[RACE, PERCENT]","[TWO OR MORE RACES, WHITE AND BLACK OR AFRICAN...","[ONE RACE, NATIVE HAWAIIAN AND OTHER PACIFIC I..."


In [51]:
q1[['variable_code', 'label_14', 'label_19', 'label_same', 'label_14_diff', 'label_19_diff']].to_csv('label_mismatch_just_labels.csv')