**Data Examples**

In [1]:
import pandas as pd

In [17]:
## Only run if data is not already downloaded
!mkdir -p data
## Download relevant SEDA data
!curl --output ./data/seda_codebook_county_5.0.xlsx "https://stacks.stanford.edu/file/druid:cs829jn7849/seda_codebook_county_5.0.xlsx"
!curl --output ./data/seda_county_long_cs_5.0.csv   "https://stacks.stanford.edu/file/druid:cs829jn7849/seda_county_long_cs_5.0.csv"
!curl --output ./data/seda_county_long_gcs_5.0.csv  "https://stacks.stanford.edu/file/druid:cs829jn7849/seda_county_long_gcs_5.0.csv"

## Download and extract relevant food desert data
!curl --output ./data/food_atlas.zip "https://ers.usda.gov/sites/default/files/_laserfiche/DataFiles/80591/2019%20Food%20Access%20Research%20Atlas%20Data.zip?v=65647"
!unzip ./data/food_atlas.zip -d ./data


**Dowload CDC Data**

In [None]:
!curl --output ./data/cdc_health_data.csv "https://data.cdc.gov/api/views/mb5y-ytti/rows.csv?accessType=DOWNLOAD"

**Food Atlas and CDC Data California DB Download**

In [22]:
!curl -L -o ./data/health_food_access_ca.csv  "https://drive.google.com/uc?export=download&id=1NSa26vi0gFuOdWejolhQw9AV_qk6LlKq"


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  559k  100  559k    0     0   249k      0  0:00:02  0:00:02 --:--:--  579k


**Food Atlas Data Example**

In [18]:
import json
import statistics
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [7]:
food_atlas = pd.read_csv("./data/Food Access Research Atlas.csv")
# food_atlas.head(100)

**CDC Health Data Example**

In [16]:
health_cdc = pd.read_csv("./data/cdc_health_data.csv")
# health_cdc.head(100)

**Food Atlas and CDC Data California Example**

In [25]:
health_cdc_and_food_atlas = pd.read_csv("./data/health_food_access_ca.csv")
# health_cdc_and_food_atlas.head(100)

**SEDA County Data Example**
- gcs: grade cohort scale, accounts for differences in state assessments to  allow cross state comparison analysis
- cs: cohort scale, standardized for a particular point in time and used for local analysis across grade levels.

In [8]:
seda_gcs = pd.read_csv("./data/seda_county_long_gcs_5.0.csv")
# seda_gcs.head(100)

In [9]:
len(seda_gcs)

354949

In [None]:
## Plot histogram for a categorical column
# state_counts = seda_gcs['year'].value_counts()
# state_counts = state_counts.sort_index()
# plt.figure(figsize=(10, 5))
# state_counts.plot(kind='bar', color='skyblue', edgecolor='black')
# plt.xlabel('Year')
# plt.ylabel('Count')
# plt.title('Histogram of Year counts')
# plt.xticks(rotation=45) 
# plt.show()

In [44]:
## Assess rought distributional statistics for a continuous column
# agg_min, agg_max, agg_mean, agg_stedv = [], [], [], []
# for col in seda_gcs.columns:
#     if col.startswith("tot_asmt") and len(col.split("_")) == 3:
#         values = seda_gcs[seda_gcs[col].notna()][col]
#         agg_min.append(values.min())
#         agg_max.append(values.max())
#         agg_mean.append(values.mean())
#         agg_stedv.append(values.std())
# agg_min.sort()
# agg_max.sort()
# agg_mean.sort()
# agg_stedv.sort()
# print(f"{agg_min},\n{agg_max},\n{agg_mean},\n{agg_stedv}")
# print(statistics.mean(agg_min), statistics.mean(agg_max), statistics.mean(agg_mean), statistics.mean(agg_stedv))


**County Coverage**

In [10]:
with open("state_abbv_mapping.json") as f:
    state_abbv_mapping = json.load(f)

In [11]:
food_atlas_counties = set()
for state, county in zip(food_atlas["State"], food_atlas["County"]):
    food_atlas_counties.add(f"{state_abbv_mapping[state]}_{' '.join(county.split(' ')[:-1])}".lower())
len(food_atlas_counties)

3136

In [12]:
seda_counties = set()
for state, county in zip(seda_gcs["stateabb"], seda_gcs["sedacountyname"]):
    seda_counties.add(f"{state}_{' '.join(county.split(' ')[:-1])}".lower())
len(seda_counties)

3098

**Missing counties**

In [13]:
missing = []
for pair in seda_counties:
    if pair not in food_atlas_counties:
        state, county = pair.split("_")
        missing.append(f"{county}, {state}")
print(f"In SEDA but not in Food Atlas:")
print(missing)

missing = []
for pair in food_atlas_counties:
    if pair not in seda_counties:
        state, county = pair.split("_")
        missing.append(f"{county}, {state}")
print(f"In Food Atlas but not in SEDA:")
print(missing)


In SEDA but not in Food Atlas:
[]
In Food Atlas but not in SEDA:
['keweenaw, mi', 'esmeralda, nv', 'loup, ne', 'harding, nm', 'prairie, mt', 'alpine, ca', 'buffalo, sd', 'mcpherson, ne', 'clark, id', 'billings, nd', 'carter, mt', 'sheridan, nd', 'wheeler, or', 'yakutat city and, ak', 'hinsdale, co', 'skagway, ak', 'campbell, sd', 'wheeler, ne', 'terrell, tx', 'hayes, ne', 'treasure, mt', 'slope, nd', 'san juan, co', 'loving, tx', 'emporia, va', 'keya paha, ne', 'golden valley, mt', 'arthur, ne', 'kenedy, tx', 'kalawao, hi', 'mineral, co', 'banner, ne', 'kent, tx', 'sioux, ne', 'petroleum, mt', 'king, tx', 'issaquena, ms', 'blaine, ne']


**Missing Data in SEDA dataset**

In [8]:
race_tags = {
    "Asian": "asn",
    "Black": "blk",
    "Hispanic": "hsp",
    "Native American": "nam",
    # "White-Asian Gap": "wag",
    # "White-Black Gap": "wbg",
    # "White-Hispanic Gap": "whg",
    # "White": "wht",
    # "White-Native American Gap": "whg",
}
race_tags_inv = {v: k for k, v in race_tags.items()}

gender_tags = {
    "Female": "fem",
    "Male": "mal",
    # "Male-Female Gap": "mfg"
}
gender_tags_inv = {v: k for k, v in gender_tags.items()}

additional_tags = {
    "Early Childhood Development": "ecd",
    "Non-Early Childhood Development": "nec",
    "Non-ECD ECD Gap": "neg"
}
additional_tags_inv = {v: k for k, v in additional_tags.items()}

# Add g before cs (i.e. "gcs") for the grade comparable scale table
columns = {
    "County Mean": "cs_mn",
    "County STDEV": "cs_mn_se",
    "Total Number of Students": "tot_asmt",
}
columns_inv = {v: k for k, v in columns.items()}

In [9]:

for state, group in seda_gcs.groupby("stateabb"):
    max_missing = 0
    max_missing_col = ""
    for col, tot in zip(group.isna().sum().keys(), group.isna().sum().values):
        if tot > max_missing:
            if any([x in col for x in ["asn", "blk", "hsp", "nam", "fem", "mal"]]):
                max_missing = tot
                max_missing_col = col
    print(f"{state}'s most sparse column is {max_missing_col} with {max_missing / len(group)} percent missing")

AK's most sparse column is gcs_mn_blk with 0.8925576519916143 percent missing
AL's most sparse column is gcs_mn_nam with 0.9030344512890714 percent missing
AR's most sparse column is gcs_mn_nam with 0.9624605678233439 percent missing
AZ's most sparse column is gcs_mn_blk with 0.6272054638588503 percent missing
CA's most sparse column is gcs_mn_nam with 0.4873345317862396 percent missing
CO's most sparse column is gcs_mn_nam with 0.8560961313012896 percent missing
CT's most sparse column is gcs_mn_nam with 0.6638743455497382 percent missing
DC's most sparse column is gcs_mn_nam with 1.0 percent missing
DE's most sparse column is gcs_mn_nam with 0.9111675126903553 percent missing
FL's most sparse column is gcs_mn_nam with 0.874119367273694 percent missing
GA's most sparse column is gcs_mn_nam with 0.9898800057828538 percent missing
HI's most sparse column is gcs_mn_blk with 0.6660341555977229 percent missing
IA's most sparse column is gcs_mn_nam with 0.9825442187379315 percent missing
ID

In [10]:
## Investigate missing values

# State
for state, group in seda_gcs.groupby("stateabb"):
    print(f"Percent of rows with missing values for {state}: {max(group.isna().sum()) / len(group)}")


# Subject
print()
for state, group in seda_gcs.groupby("subject"):
    print(f"Percent of rows with missing values for {state}: {max(group.isna().sum()) / len(group)}")

# Grade
print()
for state, group in seda_gcs.groupby("grade"):
    print(f"Percent of rows with missing values for {state}: {max(group.isna().sum()) / len(group)}")

# Year
print()
for state, group in seda_gcs.groupby("year"):
    print(f"Percent of rows with missing values for {state}: {max(group.isna().sum()) / len(group)}")

# Per Column

Percent of rows with missing values for AK: 0.8936058700209644
Percent of rows with missing values for AL: 0.9032626055213324
Percent of rows with missing values for AR: 0.9627760252365931
Percent of rows with missing values for AZ: 0.6306203756402959
Percent of rows with missing values for CA: 0.49485210001634256
Percent of rows with missing values for CO: 0.8648886283704572
Percent of rows with missing values for CT: 0.6638743455497382
Percent of rows with missing values for DC: 1.0
Percent of rows with missing values for DE: 0.9111675126903553
Percent of rows with missing values for FL: 0.8746510700518411
Percent of rows with missing values for GA: 0.9898800057828538
Percent of rows with missing values for HI: 0.6660341555977229
Percent of rows with missing values for IA: 0.9825442187379315
Percent of rows with missing values for ID: 0.9560866013071896
Percent of rows with missing values for IL: 0.9599067248382729
Percent of rows with missing values for IN: 0.9966059602649007
Percen