In [2]:
import pickle
import glob
import os
import pandas as pd

In [3]:
# Work from inside the directory that holds the pickled results, so the
# relative "*.pkl" glob and the relative output path used further down
# resolve from there.
# The guard makes this cell safe to re-run: without it, a second execution
# would look for pickles/res80 *inside* pickles/res80 and raise.
PICKLE_DIR = "pickles/res80"
if not os.getcwd().endswith(os.sep + os.path.normpath(PICKLE_DIR)):
    os.chdir(PICKLE_DIR)

In [4]:
# Names of all pickle files in the current working directory.
pkl_files = list(glob.iglob("*.pkl"))

In [5]:
# Sanity check: how many pickle files the glob matched.
len(pkl_files)

6400

In [6]:
# Load every pickled result into memory, keyed by the name of its file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this against pickle files from a trusted source.
map_data = {}
for pkl_file in pkl_files:
    # The handle is deliberately not called `input`, which would shadow the
    # built-in input() for the rest of the notebook session.
    with open(pkl_file, "rb") as pkl_handle:
        map_data[pkl_file] = pickle.load(pkl_handle)

In [7]:
def get_admin_level_short_names(result, lvl):
    """Return the short names of one administrative level found in a result.

    Args:
        result: A sequence of result entries. Only a sequence holding exactly
            one entry is inspected; that entry must provide an
            "address_components" list whose items carry "types" and
            "short_name" keys.
        lvl: The administrative level; it is formatted into the type tag
            "administrative_area_level_<lvl>".

    Returns:
        A list with the "short_name" of every address component tagged with
        the requested level, in their original order. An empty list when
        ``result`` does not contain exactly one entry.
    """
    if len(result) != 1:
        return []

    wanted_type = "administrative_area_level_{}".format(lvl)
    short_names = []
    for component in result[0]["address_components"]:
        if wanted_type in component["types"]:
            short_names.append(component["short_name"])
    return short_names

In [8]:
# Build one row per pickle file: the file name, the [lat, lng] pair parsed
# from that name, the type and length of the loaded result, and the level-1
# and level-2 administrative short names extracted from it.
map_rows = []
for pkl_file in pkl_files:
    # Drop the 4-character ".pkl" suffix, then split "<lat>_<lng>" into floats.
    lat_lng = [float(part) for part in pkl_file[:-4].split("_")]
    loaded_result = map_data[pkl_file]
    map_rows.append([
        pkl_file,
        lat_lng,
        type(loaded_result),
        len(loaded_result),
        get_admin_level_short_names(loaded_result, "1"),
        get_admin_level_short_names(loaded_result, "2"),
    ])

map_df = pd.DataFrame(
    map_rows,
    columns=[
        "filename",
        "latlng",
        "result_type",
        "result_length",
        "admin_lvl_1_short_names",
        "admin_lvl_2_short_names",
    ],
)

# For each administrative level, record how many short names were found.
for lvl in (1, 2):
    names_column = "admin_lvl_{}_short_names".format(lvl)
    count_column = "admin_lvl_{}_num".format(lvl)
    map_df[count_column] = map_df[names_column].map(len)

In [9]:
# Preview the first rows to eyeball the parsed columns.
map_df.head()

Unnamed: 0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num
0,41.40367088607595_-74.98493670886076.pkl,"[41.40367088607595, -74.98493670886076]",<class 'list'>,1,[PA],[Pike County],1,1
1,44.788227848101265_-79.05582278481013.pkl,"[44.788227848101265, -79.05582278481013]",<class 'list'>,1,[ON],[Kawartha Lakes Division],1,1
2,42.92088607594937_-74.47607594936709.pkl,"[42.92088607594937, -74.47607594936709]",<class 'list'>,1,[NY],[Montgomery County],1,1
3,42.162278481012656_-79.66645569620253.pkl,"[42.162278481012656, -79.66645569620253]",<class 'list'>,1,[NY],[Chautauqua County],1,1
4,42.86253164556962_-73.66189873417721.pkl,"[42.86253164556962, -73.66189873417721]",<class 'list'>,1,[NY],[Rensselaer County],1,1


Make sure every result is of type `list`

In [10]:
# Non-null counts of every other column, grouped by the type of the loaded result.
map_df.groupby("result_type").count()

Unnamed: 0_level_0,filename,latlng,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num
result_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
<class 'list'>,6400,6400,6400,6400,6400,6400,6400


Check lengths of results

In [11]:
# Non-null counts of every other column, grouped by how many entries each result holds.
map_df.groupby("result_length").count()

Unnamed: 0_level_0,filename,latlng,result_type,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num
result_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,706,706,706,706,706,706,706
1,5694,5694,5694,5694,5694,5694,5694


Check lengths of admin short names

In [12]:
# Non-null counts grouped by the number of level-1 short names found per row.
map_df.groupby("admin_lvl_1_num").count()

Unnamed: 0_level_0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_2_num
admin_lvl_1_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,706,706,706,706,706,706,706
1,5694,5694,5694,5694,5694,5694,5694


In [13]:
# Non-null counts grouped by the number of level-2 short names found per row.
map_df.groupby("admin_lvl_2_num").count()

Unnamed: 0_level_0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num
admin_lvl_2_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,706,706,706,706,706,706,706
1,5694,5694,5694,5694,5694,5694,5694


If all counts look good, add the lat, lng, and admin-level name columns

In [14]:
# Latitude is the first element of each [lat, lng] pair.
map_df["lat"] = map_df["latlng"].map(lambda lat_lng: lat_lng[0])

In [15]:
# Longitude is the second element of each [lat, lng] pair.
map_df["lng"] = map_df["latlng"].map(lambda lat_lng: lat_lng[1])

In [16]:
# First level-1 short name of each row, or "" when none was found.
map_df["state"] = map_df["admin_lvl_1_short_names"].map(
    lambda names: names[0] if names else ""
)

In [17]:
# First level-2 short name of each row, or "" when none was found.
map_df["county"] = map_df["admin_lvl_2_short_names"].map(
    lambda names: names[0] if names else ""
)

In [18]:
# Break each county string into its space-separated words and derive helper
# columns from them. Note that "".split(" ") is [""], so rows without a county
# get a one-element list holding an empty string.
county_words = map_df["county"].map(lambda county: county.split(" "))
map_df["county_name_list"] = county_words
map_df["county_num_names"] = county_words.map(len)
map_df["county_first_name"] = county_words.map(lambda words: words[0])
map_df["county_last_name"] = county_words.map(lambda words: words[-1])
# county_name is everything except the last word (e.g. the trailing "County").
# NOTE(review): a county made of a single word ends up with an empty
# county_name here - confirm no such values occur in the data.
map_df["county_name"] = county_words.map(lambda words: " ".join(words[:-1]))

In [19]:
# Non-null counts per state; rows without a level-1 name fall in the "" group.
map_df.groupby("state").count()

Unnamed: 0_level_0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num,lat,lng,county,county_name_list,county_num_names,county_first_name,county_last_name,county_name
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
,706,706,706,706,706,706,706,706,706,706,706,706,706,706,706,706
CT,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232,232
MA,190,190,190,190,190,190,190,190,190,190,190,190,190,190,190,190
NH,144,144,144,144,144,144,144,144,144,144,144,144,144,144,144,144
NJ,136,136,136,136,136,136,136,136,136,136,136,136,136,136,136,136
NY,2300,2300,2300,2300,2300,2300,2300,2300,2300,2300,2300,2300,2300,2300,2300,2300
ON,904,904,904,904,904,904,904,904,904,904,904,904,904,904,904,904
PA,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297
QC,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53,53
RI,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [20]:
# Non-null counts per county, showing only the first five groups.
map_df.groupby("county").count().head()

Unnamed: 0_level_0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num,lat,lng,state,county_name_list,county_num_names,county_first_name,county_last_name,county_name
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
,706,706,706,706,706,706,706,706,706,706,706,706,706,706,706,706
Addison County,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41
Albany County,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24
Allegany County,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46,46
Allegheny County,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7


In [21]:
# Export the coordinates together with the state and county name of every
# row as a JSON array of records. The path is relative to the current working
# directory.
export_columns = ["lat", "lng", "state", "county_name"]
with open("./../../ny_map_data_res80.json", "w") as output:
    export_frame = map_df[export_columns]
    output.write(export_frame.to_json(orient='records'))