# Child Care Data Prep

In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.geocoders import GoogleV3
from geopy.extra.rate_limiter import RateLimiter

In [None]:
# load the two raw licensing exports: child care centers (cc_df) and
# family child care providers (fc_df)
cc_df, fc_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/original/ChildcareCenters.csv',
        'data/original/FamilyChildCare.csv',
    )
)

In [None]:
# stack the two provider tables row-wise into one frame, renumbering the
# index from 0 so rows from the second table do not repeat index labels
df = pd.concat(
    [cc_df, fc_df],
    axis='index',
    ignore_index=True,
)

In [None]:
# regex alternation of the age groups that count as "under 5"
center_service = 'infants|toddlers|preschool'
# keep only rows whose 'Type Of License' mentions one of those groups
# (case-insensitive); rows with a missing license type are dropped
df = df.loc[
    df['Type Of License'].str.contains(center_service, case=False, na=False)
]

In [None]:
# filter out centers with non-active licenses
# na=False treats a missing 'License Status' as "not active"; without it the
# mask contains NaN and boolean indexing raises.
# .copy() makes df an independent frame so later column assignments do not
# operate on a slice of the unfiltered data (SettingWithCopyWarning).
df = df[df['License Status'].str.contains('Active', na=False)].copy()

In [None]:
# checkpoint: persist the filtered provider table before geocoding
df.to_csv(path_or_buf='data/modified/All_MN_Childcare.csv')

# Geocoding

In [None]:
# cast Zip to text and keep only its first five characters
df['Zip'] = df['Zip'].astype(str).str[:5]

In [None]:
# preview the first five Zip values
df['Zip'].head(n=5)

In [None]:
# build the single-line address string handed to the geocoder:
# "<AddressLine1> <City> <State> <Zip>"; a row with any missing part
# ends up with a missing full_address
df["full_address"] = df['AddressLine1'].str.cat(
    [df['City'], df['State'], df['Zip']],
    sep=' ',
)

In [None]:
# geocode
# NOTE(review): this value is a URL, not a usable API key -- presumably a
# redacted placeholder. Supply a real Google Maps key before running, and
# prefer loading it from outside the notebook rather than committing it.
key = 'https://youtu.be/RfiQYRn7fBg?t=17'
locator = GoogleV3(api_key=key)

# delay between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=2)

# create location column; rows without an address are not sent to the API
# and get None directly
df['location'] = df['full_address'].apply(
    lambda address: geocode(address) if pd.notna(address) else None
)

# create longitude, latitude from location
# a lookup yields None when nothing matched (or when RateLimiter swallowed an
# error), so guard the attribute access; those rows get a missing coordinate
df['latitude'] = [g.latitude if g is not None else None for g in df['location']]
df['longitude'] = [g.longitude if g is not None else None for g in df['location']]

In [None]:
# checkpoint: persist the geocoded table so the geocoding step does not
# have to be re-run
df.to_csv(path_or_buf='data/modified/geocoded_childcare_centers.csv')

# Final data prep

In [None]:
# weight for the weighted KDE: each center's Capacity with its sign flipped
df['weight'] = df['Capacity'].mul(-1)

In [None]:
# save out stage
# keep only the columns the weighted KDE needs
final_df = df[['weight', 'latitude', 'longitude']]
# write the trimmed frame; the original wrote the full df here and left
# final_df unused
final_df.to_csv('data/modified/cc_weights.csv')