<a href="https://colab.research.google.com/github/pradh/mixer/blob/countufix2/internal/store/files/Safely_adding_names_without_county_to_NL_Place_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Install

!pip install datacommons

import datacommons as dc
import requests

Collecting datacommons
  Downloading datacommons-1.4.3-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.5/46.5 kB[0m [31m700.8 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: datacommons
Successfully installed datacommons-1.4.3


In [None]:
# @title Get all counties

# DCIDs of all places of type `County` that Data Commons returns for country/USA.
counties = dc.get_places_in(['country/USA'], 'County')['country/USA']

# County DCID -> first value of its `name` property.
county2names = {
  county_dcid: names[0]
  for county_dcid, names in dc.get_property_values(counties, 'name').items()
}
len(county2names)

3238

In [None]:
# @title Get all states

# A county DCID starts with its state's DCID: 'geoId/xx' is 8 characters long.
state_dcids = sorted({county_dcid[:8] for county_dcid in counties})

# State DCID -> first value of its `name` property.
state2names = {
  state_dcid: names[0]
  for state_dcid, names in dc.get_property_values(state_dcids, 'name').items()
}
len(state2names)

53

In [None]:
# @title Prepare `recognize/places` queries
# For every county whose name ends in " County", build the two suffix-less
# queries to probe: "<short name> <state name>" and "<short name> US".
COUNTY_SUFFIX = ' county'

county2queries = {}
for cid, cname in county2names.items():
  # Names without a trailing " County" (any casing) are out of scope here.
  if not cname.lower().endswith(COUNTY_SUFFIX):
    continue

  # Slice off exactly the suffix the check above matched. A plain
  # `.replace(' County', '')` would also delete interior occurrences and would
  # miss casings that the case-insensitive check accepts.
  cname_short = cname[:-len(COUNTY_SUFFIX)]

  # 'geoId/xx' => 8: the state DCID is the first 8 characters of the county DCID.
  sname = state2names[cid[:8]]

  county2queries[cid] = [
    f'{cname_short} {sname}',
    f'{cname_short} US',
  ]

# De-duplicated union of all queries; sorted so the request payload is the same
# on every run instead of following set iteration order.
queries = sorted({q for pair in county2queries.values() for q in pair})
print(county2queries['geoId/20145'])
len(county2queries)

['Pawnee Kansas', 'Pawnee US']


3003

In [None]:
# @title Make the `recognize/places` call
def recognize(queries, api_key=None, timeout=300):
  """Resolve each query string to a place DCID via `v1/recognize/places`.

  A query counts as recognized only when the first item returned for it has a
  `span` equal to the entire query string; the DCID is then taken from the
  first entry of that item's `places`.

  Args:
    queries: list of query strings, sent together in a single request.
    api_key: API key for the `X-API-Key` header. When omitted, the `DC_API_KEY`
      environment variable is used, falling back to the key this notebook has
      always used.
    timeout: seconds to wait for the HTTP response before giving up.

  Returns:
    Dict with one entry per key of the response's `queryItems`: the query
    mapped to its DCID, or to '' when it was not recognized as a whole.
    An empty `queries` yields {} without any network call.

  Raises:
    requests.HTTPError: the API answered with a 4xx/5xx status.
    KeyError: the response body has no `queryItems`.
  """
  import os

  if not queries:
    return {}

  # NOTE(review): a key committed in a notebook is public. Prefer `api_key` or
  # DC_API_KEY; the literal fallback only keeps existing runs working and
  # should be rotated and removed.
  key = (api_key
         or os.environ.get('DC_API_KEY')
         or 'AIzaSyCTI4Xz-UW_G2Q2RfknhcfdAnTHq5X5XuI')

  resp = requests.post('https://api.datacommons.org/v1/recognize/places',
                       headers={'X-API-Key': key},
                       json={'queries': queries},
                       timeout=timeout)
  # Fail loudly on an HTTP error instead of with an opaque KeyError below.
  resp.raise_for_status()
  payload = resp.json()

  ans = {}
  for q, v in payload['queryItems'].items():
    # `or [{}]` also covers a key that is present but holds an empty list,
    # which `.get(key, [{}])[0]` would turn into an IndexError.
    first_item = (v.get('items') or [{}])[0]
    first_place = (first_item.get('places') or [{}])[0]
    ans[q] = first_place.get('dcid', '') if first_item.get('span') == q else ''
  return ans

# One request for every query; `ans` maps query -> DCID ('' when not recognized as a whole).
ans = recognize(queries=queries)

In [None]:
# @title Classify safe / maybe-unsafe counties

# Split counties by whether either suffix-less query resolved to a place:
#   - neither query resolved  => safe
#   - at least one resolved   => maybe-unsafe, remembering the resolved DCIDs
safe_counties = []
maybe_unsafe_counties = {}
for cid, (state_query, us_query) in county2queries.items():
  resolved = {dcid for dcid in (ans[state_query], ans[us_query]) if dcid}
  if resolved:
    maybe_unsafe_counties[cid] = resolved
  else:
    safe_counties.append(cid)

len(safe_counties), len(maybe_unsafe_counties)

(578, 2425)

In [None]:
# @title Get population of maybe-unsafe counties and suppressed places

# Population is needed for every maybe-unsafe county and for every place its
# queries resolved to.
places_for_pop = set()
places_for_pop.update(maybe_unsafe_counties.keys())
places_for_pop.update(
  dcid for resolved in maybe_unsafe_counties.values() for dcid in resolved
)
places_for_pop = list(places_for_pop)
print(places_for_pop[0])

# Keep the first value of each place's `data` mapping
# (presumably the single latest observation - confirm against get_stats defaults).
population_stats = dc.get_stats(places_for_pop, 'Count_Person')
place2stat = {
  dcid: list(stat.get('data', {}).values())[0]
  for dcid, stat in population_stats.items()
}
print(len(place2stat))

geoId/2167458
4157


In [None]:
# @title Classify the maybe-unsafe counties as def-unsafe (if suppressing sizeable places), or safe

# A maybe-unsafe county is definitely unsafe when at least one place its
# queries resolved to is both less populous than the county and above this
# population threshold; every other maybe-unsafe county is promoted to safe.
MIN_SUPPRESSED_POPULATION = 10000

def_unsafe_counties = {}
for k, vals in maybe_unsafe_counties.items():
  kpop = place2stat[k]

  # Each `v` kept here is a sizeable place that `k` is suppressing.
  suppressed = {v for v in vals if kpop > place2stat[v] > MIN_SUPPRESSED_POPULATION}
  if suppressed:
    def_unsafe_counties[k] = suppressed

# `safe_counties` is created by the earlier classification cell. Skip counties
# it already holds so that re-running this cell does not append them again.
already_safe = set(safe_counties)
safe_counties.extend(
  k for k in maybe_unsafe_counties
  if k not in def_unsafe_counties and k not in already_safe
)

len(safe_counties), len(def_unsafe_counties)

(2437, 566)

In [None]:
# @title Final list of unsafe counties

# One line per unsafe county. `vals` is a set of DCID strings, so sort it:
# joining it directly gives an order that can change between kernel sessions.
for k, vals in def_unsafe_counties.items():
  print(f'{k} ({county2names[k]}) will suppress {", ".join(sorted(vals))}')

geoId/01003 (Baldwin County) will suppress geoId/3604143
geoId/01013 (Butler County) will suppress geoId/4210464
geoId/01015 (Calhoun County) will suppress geoId/1312456
geoId/01023 (Choctaw County) will suppress geoId/4014200
geoId/01027 (Clay County) will suppress geoId/0115256
geoId/01043 (Cullman County) will suppress geoId/0118976
geoId/01049 (DeKalb County) will suppress geoId/1719161
geoId/01061 (Geneva County) will suppress geoId/1728872
geoId/01073 (Jefferson County) will suppress geoId/1341988
geoId/01089 (Madison County) will suppress geoId/0145784, geoId/5548000
geoId/01095 (Marshall County) will suppress geoId/4846776
geoId/01097 (Mobile County) will suppress geoId/0150000
geoId/01101 (Montgomery County) will suppress geoId/0151000
geoId/01117 (Shelby County) will suppress geoId/3761200
geoId/01121 (Talladega County) will suppress geoId/0174592
geoId/01125 (Tuscaloosa County) will suppress geoId/0177256
geoId/01127 (Walker County) will suppress geoId/2682960
geoId/04009 (G

In [None]:
# @title List of safe counties CSV for copy/paste into Mixer store
# Emit `<dcid>,<county name without " County">` rows in DCID order.
# `set(...)` drops repeated entries so a county never yields two CSV rows.
county_suffix = ' county'
for cid in sorted(set(safe_counties)):
  cname = county2names[cid]
  # Remove only a trailing " County" (any casing); leave the rest of the name
  # untouched rather than deleting every occurrence of the word.
  if cname.lower().endswith(county_suffix):
    cname_short = cname[:-len(county_suffix)]
  else:
    cname_short = cname
  print(f'{cid},{cname_short}')

geoId/01001,Autauga
geoId/01005,Barbour
geoId/01007,Bibb
geoId/01009,Blount
geoId/01011,Bullock
geoId/01017,Chambers
geoId/01019,Cherokee
geoId/01021,Chilton
geoId/01025,Clarke
geoId/01029,Cleburne
geoId/01031,Coffee
geoId/01033,Colbert
geoId/01035,Conecuh
geoId/01037,Coosa
geoId/01039,Covington
geoId/01041,Crenshaw
geoId/01045,Dale
geoId/01047,Dallas
geoId/01051,Elmore
geoId/01053,Escambia
geoId/01055,Etowah
geoId/01057,Fayette
geoId/01059,Franklin
geoId/01063,Greene
geoId/01065,Hale
geoId/01067,Henry
geoId/01069,Houston
geoId/01071,Jackson
geoId/01075,Lamar
geoId/01077,Lauderdale
geoId/01079,Lawrence
geoId/01081,Lee
geoId/01083,Limestone
geoId/01085,Lowndes
geoId/01087,Macon
geoId/01091,Marengo
geoId/01093,Marion
geoId/01099,Monroe
geoId/01103,Morgan
geoId/01105,Perry
geoId/01107,Pickens
geoId/01109,Pike
geoId/01111,Randolph
geoId/01113,Russell
geoId/01115,St. Clair
geoId/01119,Sumter
geoId/01123,Tallapoosa
geoId/01129,Washington
geoId/01131,Wilcox
geoId/01133,Winston
geoId/04001,Apa