This notebook is for Data Wrangling and Cleaning

In [1]:
# Imports
import pandas as pd
import numpy as np

This is the main dataset that contains information for each Food Environment Atlas category.

In [2]:
# Importing Dataset
file_food_atlas = 'StateAndCountyData.csv'
data_state_county = pd.read_csv(file_food_atlas)
data_state_county.head()

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga,LACCESS_POP10,18428.43969
1,1001,AL,Autauga,LACCESS_POP15,17496.69304
2,1001,AL,Autauga,PCH_LACCESS_POP_10_15,-5.056026
3,1001,AL,Autauga,PCT_LACCESS_POP10,33.769657
4,1001,AL,Autauga,PCT_LACCESS_POP15,32.062255


In [3]:
data_state_county.dtypes

FIPS               int64
State             object
County            object
Variable_Code     object
Value            float64
dtype: object

In [4]:
data_state_county.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 852810 entries, 0 to 852809
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   FIPS           852810 non-null  int64  
 1   State          852810 non-null  object 
 2   County         852810 non-null  object 
 3   Variable_Code  852810 non-null  object 
 4   Value          852810 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 32.5+ MB


In [5]:
data_state_county.describe()

Unnamed: 0,FIPS,Value
count,852810.0,852810.0
mean,30336.233835,10539.19
std,15188.596259,214819.9
min,1.0,-100.0
25%,18171.0,0.0
50%,29171.0,3.0
75%,45069.0,29.1
max,56045.0,39557040.0


Importing State Dataset -- This dataset contains supplemental information at the state level

In [6]:
# Importing State Dataset
file_state_data = 'SupplementalDataState.csv'
data_state = pd.read_csv(file_state_data)
data_state.head()

Unnamed: 0,State_FIPS,State,Variable_Code,Value
0,1,AL,WIC_PART_2012,141899.8333
1,1,AL,WIC_PART_2013,139000.5
2,1,AL,WIC_PART_2014,131046.25
3,1,AL,WIC_PART_2015,132132.75
4,1,AL,WIC_PART_2016,129159.9167


Importing County Dataset -- This dataset contains supplemental information

In [7]:
# Importing County Dataset
file_county_data = 'SupplementalDataCounty.csv'
data_county = pd.read_csv(file_county_data)
data_county.head()

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga County,2010_Census_Population,54571
1,1001,AL,Autauga County,Population_Estimate_2011,55208
2,1001,AL,Autauga County,Population_Estimate_2012,54936
3,1001,AL,Autauga County,Population_Estimate_2013,54713
4,1001,AL,Autauga County,Population_Estimate_2014,54876


In [8]:
data_county.dtypes

FIPS              int64
State            object
County           object
Variable_Code    object
Value             int64
dtype: object

Importing Variable List -- Might be useful later

In [9]:
file_variables = 'VariableList.csv'
variable_list = pd.read_csv(file_variables)
variable_list.head()

Unnamed: 0,Variable_Name,Category_Name,Category_Code,Subcategory_Name,Variable_Code,Geography,Units
0,"Population, low access to store, 2010",Access and Proximity to Grocery Store,ACCESS,Overall,LACCESS_POP10,CNTY10,Count
1,"Population, low access to store, 2015",Access and Proximity to Grocery Store,ACCESS,Overall,LACCESS_POP15,CNTY10,Count
2,"Population, low access to store (% change), 20...",Access and Proximity to Grocery Store,ACCESS,Overall,PCH_LACCESS_POP_10_15,CNTY10,% change
3,"Population, low access to store (%), 2010",Access and Proximity to Grocery Store,ACCESS,Overall,PCT_LACCESS_POP10,CNTY10,Percent
4,"Population, low access to store (%), 2015",Access and Proximity to Grocery Store,ACCESS,Overall,PCT_LACCESS_POP15,CNTY10,Percent


In [10]:
variable_list.dtypes

Variable_Name       object
Category_Name       object
Category_Code       object
Subcategory_Name    object
Variable_Code       object
Geography           object
Units               object
dtype: object

Reviewing the data_state_county data more closely

In [11]:
# The different variables are contained in the Variable_Code column
# Each Variable_Code is available for each state
data_state_county[data_state_county['State']=='AL']

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga,LACCESS_POP10,1.842844e+04
1,1001,AL,Autauga,LACCESS_POP15,1.749669e+04
2,1001,AL,Autauga,PCH_LACCESS_POP_10_15,-5.056026e+00
3,1001,AL,Autauga,PCT_LACCESS_POP10,3.376966e+01
4,1001,AL,Autauga,PCT_LACCESS_POP15,3.206225e+01
...,...,...,...,...,...
850705,1,AL,Total,State_Population_2014,4.842481e+06
850706,1,AL,Total,State_Population_2015,4.853160e+06
850707,1,AL,Total,State_Population_2016,4.864745e+06
850708,1,AL,Total,State_Population_2017,4.875120e+06


In [12]:
data_state_county.dtypes

FIPS               int64
State             object
County            object
Variable_Code     object
Value            float64
dtype: object

In [13]:
# Changing State from object to string
data_state_county['State'] = data_state_county['State'].astype('string')

In [14]:
# Changing County from object to string
data_state_county['County'] = data_state_county['County'].astype('string')

In [15]:
assert data_state_county['County'].dtype == 'string'

In [16]:
assert data_state_county['State'].dtype == 'string'

Finding missing values

In [17]:
#There are no missing values
data_state_county.isna().sum()

FIPS             0
State            0
County           0
Variable_Code    0
Value            0
dtype: int64

Subsetting and grouping

In [18]:
is_mo = data_state_county['State'] == 'MO'
is_stl = data_state_county['County'] == 'St. Louis'
data_state_county[is_mo & is_stl]

Unnamed: 0,FIPS,State,County,Variable_Code,Value
64423,29189,MO,St. Louis,LACCESS_POP10,264849.564300
64424,29189,MO,St. Louis,LACCESS_POP15,267303.233300
64425,29189,MO,St. Louis,PCH_LACCESS_POP_10_15,0.926439
64426,29189,MO,St. Louis,PCT_LACCESS_POP10,26.512689
64427,29189,MO,St. Louis,PCT_LACCESS_POP15,26.758313
...,...,...,...,...,...
799217,29510,MO,St. Louis,PERPOV10,1.000000
799218,29510,MO,St. Louis,CHILDPOVRATE15,39.300000
799219,29510,MO,St. Louis,PERCHLDPOV10,1.000000
799220,29510,MO,St. Louis,METRO13,1.000000


In [19]:
# data_state_county.pivot_tables(values=

In [20]:
LACCESS_POP10 = data_state_county[data_state_county['Variable_Code'] == 'LACCESS_POP10']

In [21]:
print(LACCESS_POP10)

         FIPS State      County  Variable_Code         Value
0        1001    AL     Autauga  LACCESS_POP10  18428.439690
41       1003    AL     Baldwin  LACCESS_POP10  35210.814080
82       1005    AL     Barbour  LACCESS_POP10   5722.305602
123      1007    AL        Bibb  LACCESS_POP10   1044.867327
164      1009    AL      Blount  LACCESS_POP10   1548.175559
...       ...   ...         ...            ...           ...
128070  56037    WY  Sweetwater  LACCESS_POP10  13391.715500
128111  56039    WY       Teton  LACCESS_POP10   6212.423697
128152  56041    WY       Uinta  LACCESS_POP10   4270.147119
128193  56043    WY    Washakie  LACCESS_POP10    931.411647
128234  56045    WY      Weston  LACCESS_POP10   1240.493102

[3143 rows x 5 columns]


In [22]:
data_state_county.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,852800,852801,852802,852803,852804,852805,852806,852807,852808,852809
FIPS,1001,1001,1001,1001,1001,1001,1001,1001,1001,1001,...,56,56,56,56,56,56,56,56,56,56
State,AL,AL,AL,AL,AL,AL,AL,AL,AL,AL,...,WY,WY,WY,WY,WY,WY,WY,WY,WY,WY
County,Autauga,Autauga,Autauga,Autauga,Autauga,Autauga,Autauga,Autauga,Autauga,Autauga,...,Total,Total,Total,Total,Total,Total,Total,Total,Total,Total
Variable_Code,LACCESS_POP10,LACCESS_POP15,PCH_LACCESS_POP_10_15,PCT_LACCESS_POP10,PCT_LACCESS_POP15,LACCESS_LOWI10,LACCESS_LOWI15,PCH_LACCESS_LOWI_10_15,PCT_LACCESS_LOWI10,PCT_LACCESS_LOWI15,...,SFSP_PART_2016,SFSP_PART_2017,SFSP_PART_2018,State_Population_2012,State_Population_2013,State_Population_2014,State_Population_2015,State_Population_2016,State_Population_2017,State_Population_2018
Value,18428.43969,17496.69304,-5.056026,33.769657,32.062255,5344.427472,6543.676824,22.439248,9.79353,11.991125,...,3734.0,4620.0,4671.0,576270.0,582123.0,582548.0,585668.0,584290.0,578934.0,577737.0


In [23]:
# Changing Variable_Code to the index
data_s_c = data_state_county.set_index("Variable_Code")
#data_s_c = data_s_c.T

In [24]:
data_s_c.head()

Unnamed: 0_level_0,FIPS,State,County,Value
Variable_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LACCESS_POP10,1001,AL,Autauga,18428.43969
LACCESS_POP15,1001,AL,Autauga,17496.69304
PCH_LACCESS_POP_10_15,1001,AL,Autauga,-5.056026
PCT_LACCESS_POP10,1001,AL,Autauga,33.769657
PCT_LACCESS_POP15,1001,AL,Autauga,32.062255


In [25]:
#data_s_c[data_s_c['State']=='AL']

In [26]:
#data_s_c[data_s_c['State']=='MO']

In [27]:
data_state_county.head()

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga,LACCESS_POP10,18428.43969
1,1001,AL,Autauga,LACCESS_POP15,17496.69304
2,1001,AL,Autauga,PCH_LACCESS_POP_10_15,-5.056026
3,1001,AL,Autauga,PCT_LACCESS_POP10,33.769657
4,1001,AL,Autauga,PCT_LACCESS_POP15,32.062255


In [33]:
al = data_state_county[data_state_county['State']=='AL']

In [34]:
al.head()

Unnamed: 0,FIPS,State,County,Variable_Code,Value
0,1001,AL,Autauga,LACCESS_POP10,18428.43969
1,1001,AL,Autauga,LACCESS_POP15,17496.69304
2,1001,AL,Autauga,PCH_LACCESS_POP_10_15,-5.056026
3,1001,AL,Autauga,PCT_LACCESS_POP10,33.769657
4,1001,AL,Autauga,PCT_LACCESS_POP15,32.062255


In [35]:
al_pivot = al.pivot_table(values="Value", index="County", columns="Variable_Code")

In [36]:
al_pivot.head()

Variable_Code,2010_Census_Population,AGRITRSM_OPS07,AGRITRSM_OPS12,AGRITRSM_RCT07,AGRITRSM_RCT12,BERRY_ACRES07,BERRY_ACRES12,BERRY_ACRESPTH07,BERRY_ACRESPTH12,BERRY_FARMS07,...,WICS16,WICSPTH11,WICSPTH16,WIC_PART_2012,WIC_PART_2013,WIC_PART_2014,WIC_PART_2015,WIC_PART_2016,WIC_PART_2017,WIC_PART_2018
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Autauga,,7.0,10.0,228000.0,146000.0,,5.0,,0.090621,3.0,...,5.0,0.090567,0.090511,,,,,,,
Autauga County,54571.0,,,,,,,,,,...,,,,,,,,,,
Baldwin,,18.0,16.0,124000.0,204000.0,79.0,93.0,0.458226,0.488456,36.0,...,28.0,0.13938,0.134802,,,,,,,
Baldwin County,182265.0,,,,,,,,,,...,,,,,,,,,,
Barbour,,27.0,32.0,163000.0,304000.0,,42.0,,1.546449,3.0,...,6.0,0.255942,0.232387,,,,,,,


In [37]:
al_pivot['LACCESS_POP10']

County
Autauga              18428.439690
Autauga County                NaN
Baldwin              35210.814080
Baldwin County                NaN
Barbour               5722.305602
                         ...     
Washington County             NaN
Wilcox                5173.220579
Wilcox County                 NaN
Winston               1011.264406
Winston County                NaN
Name: LACCESS_POP10, Length: 135, dtype: float64