# Child Care Data Prep

In [1]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.geocoders import GoogleV3
from geopy.extra.rate_limiter import RateLimiter

In [None]:
# read in original data
cc_df = pd.read_csv('data/original/ChildcareCenters.csv')
fc_df = pd.read_csv('data/original/FamilyChildCare.csv')

In [None]:
# append datasets together
df = pd.concat([cc_df, fc_df], ignore_index=True, axis=0)

In [None]:
# create flag for centers that care for under 5 (infant, toddler, pre-school)
center_service = 'infants|toddlers|preschool'
# filter out centers that don't care for under 5
df = df[df['Type Of License'].str.contains(center_service, case=False, na=False)]

In [None]:
# filter out centers with non-active licenses
df = df[df['License Status'].str.contains('Active')]

In [None]:
# save stage out
df.to_csv('data/modified/All_MN_Childcare.csv')

# Geocoding

In [None]:
# strip off anything longer than 5 for zipcode
df['Zip'] = df['Zip'].astype(str).str.slice(0,5)

In [None]:
df['Zip'].head()

In [None]:
# concatenate address into a single string to pass to geocoder
df["full_address"] = df['AddressLine1'] + ' ' + df['City'] + ' ' + df['State'] + ' ' + df['Zip']

In [None]:
# geocode
key = 'https://youtu.be/RfiQYRn7fBg?t=17'
locator = GoogleV3(api_key=key)

# delay between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=2)

# create location column
df['location'] = df['full_address'].apply(geocode)

# create longitude, laatitude from location
df['latitude'] = [g.latitude for g in df['location']]
df['longitude'] = [g.longitude for g in df['location']]

In [None]:
# write stage out
df.to_csv('data/modified/geocoded_childcare_centers.csv')

# ACS data prep

In [1]:
import geopandas as gpd

shp_df = gpd.read_file('data/original/tl_2021_27_bg/tl_2021_27_bg.shp')
shp_df['GEOG_UNIT'] = shp_df['GEOID']
shp_df = shp_df.astype({'GEOG_UNIT':'int64'})
shp_df = shp_df.set_index(['GEOG_UNIT']).sort_index()
shp_df.crs

<Geographic 2D CRS: EPSG:4269>
Name: NAD83
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: North America - onshore and offshore: Canada - Alberta; British Columbia; Manitoba; New Brunswick; Newfoundland and Labrador; Northwest Territories; Nova Scotia; Nunavut; Ontario; Prince Edward Island; Quebec; Saskatchewan; Yukon. Puerto Rico. United States (USA) - Alabama; Alaska; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Hawaii; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming. US Virgin Islands. British Virgin Islands

In [2]:
# set datum
shp_df = shp_df.to_crs('EPSG:4326')

In [3]:
shp_df.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [4]:
# get centroids
shp_df['coords'] = shp_df.centroid

shp_df['coords'].head()


  shp_df['coords'] = shp_df.centroid


GEOG_UNIT
270017701001    POINT (-93.22335 46.87148)
270017701002    POINT (-93.42318 46.66757)
270017701003    POINT (-93.57031 46.58155)
270017702001    POINT (-93.57060 46.89993)
270017702002    POINT (-93.64449 46.69889)
Name: coords, dtype: geometry

In [5]:
# convert geopandas point to tuple
shp_df['coords'] = shp_df['coords'].apply(lambda p: tuple([p.y, p.x]))
shp_df = shp_df[['coords']]
shp_df.head()

Unnamed: 0_level_0,coords
GEOG_UNIT,Unnamed: 1_level_1
270017701001,"(46.87147950696038, -93.22335054334215)"
270017701002,"(46.667572843779034, -93.42317606004025)"
270017701003,"(46.5815538920114, -93.57031258714842)"
270017702001,"(46.89992976026988, -93.57059554010407)"
270017702002,"(46.69888633010108, -93.6444851979997)"


In [6]:
import pandas as pd 

# read in ACS data and join to shp coords
acs_df = pd.read_excel('data/modified/ACS.xlsx')

acs_df = acs_df.astype({'GEOG_UNIT':'int64'})
# set index
acs_df = acs_df.set_index(['GEOG_UNIT']).sort_index()

acs_df['weight'] = acs_df['POPUNDER5'] * -1
acs_df['point_type'] = 'Census_Blocks'

In [7]:
# join shp coords
acs_merge_df = pd.merge(acs_df, shp_df, left_index=True, right_index=True)
acs_merge_df.shape

(4706, 30)

In [8]:
# remove all extra columns
acs_merge_df = acs_merge_df[['coords', 'weight', 'point_type']]
acs_merge_df.shape

(4706, 3)

In [9]:
# read in child care data
cc_df = pd.read_csv('data/modified/geocoded_childcare_centers.csv')
# set index 
cc_df = cc_df.set_index(['License Number']).sort_index()
cc_df['point_type'] = 'Child Care'

In [10]:
from shapely.geometry import Point

cc_df['coords'] = tuple(zip(cc_df['latitude'], cc_df['longitude']))
# remove the old columns
drop_cols = ['latitude', 'longitude']
cc_df = cc_df.drop(drop_cols, axis=1)

cc_df['coords'].head()

License Number
800003    (44.2917947, -93.3045992)
800020      (44.8787181, -93.38103)
800025    (45.1170803, -93.2164146)
800056    (43.6435327, -93.3647928)
800058    (44.0689429, -92.4763802)
Name: coords, dtype: object

In [11]:
# rename columns and organize
cc_df['weight'] = cc_df['Capacity']
cc_df = cc_df[['coords', 'weight', 'point_type']]

In [12]:
# append datasets together
final_df = pd.concat([cc_df, acs_merge_df], ignore_index=True, axis=0)
# write final data set out
final_df.to_csv('data/modified/data_to_map.csv')

In [13]:
final_df.head()

Unnamed: 0,coords,weight,point_type
0,"(44.2917947, -93.3045992)",94,Child Care
1,"(44.8787181, -93.38103)",91,Child Care
2,"(45.1170803, -93.2164146)",35,Child Care
3,"(43.6435327, -93.3647928)",157,Child Care
4,"(44.0689429, -92.4763802)",168,Child Care
