In [1]:
import pickle
import glob
import os
import pandas as pd

In [2]:
os.chdir("pickles/res30")

In [3]:
pkl_files = glob.glob("*.pkl")

In [4]:
len(pkl_files)

900

In [5]:
map_data = {}
for pkl_file in pkl_files:
    # lat_lng = (float(i) for i in pkl_file[:-4].split("_"))
    with open(pkl_file, "rb") as input:
        map_data[pkl_file] = pickle.load(input)

In [6]:
def get_admin_level_short_names(result, lvl):
    if len(result) == 1:
        return [i["short_name"] for i in result[0]["address_components"] if "administrative_area_level_{}".format(lvl) in i["types"]]
    else:
        return []

In [7]:
map_df = pd.DataFrame([[
        pkl_file,
        [float(i) for i in pkl_file[:-4].split("_")],
        type(map_data[pkl_file]),
        len(map_data[pkl_file]),
        get_admin_level_short_names(map_data[pkl_file], "1"),
        get_admin_level_short_names(map_data[pkl_file], "2")
    ] for pkl_file in pkl_files],
    columns = ["filename", "latlng", "result_type", "result_length", "admin_lvl_1_short_names", "admin_lvl_2_short_names"]
)

for lvl in [1, 2]:
    map_df["admin_lvl_{}_num".format(lvl)] = map_df["admin_lvl_{}_short_names".format(lvl)].map(len)

In [8]:
map_df.head()

Unnamed: 0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num
0,41.423793103448276_-79.03827586206897.pkl,"[41.423793103448276, -79.03827586206897]",<class 'list'>,1,[PA],[Elk County],1,1
1,40.47_-79.03827586206897.pkl,"[40.47, -79.03827586206897]",<class 'list'>,1,[PA],[Indiana County],1,1
2,40.78793103448276_-79.87.pkl,"[40.78793103448276, -79.87]",<class 'list'>,1,[PA],[Butler County],1,1
3,41.900689655172414_-76.82034482758621.pkl,"[41.900689655172414, -76.82034482758621]",<class 'list'>,1,[PA],[Bradford County],1,1
4,41.58275862068965_-74.87965517241379.pkl,"[41.58275862068965, -74.87965517241379]",<class 'list'>,1,[NY],[Sullivan County],1,1


Make sure all result types are list

In [9]:
map_df.groupby("result_type").count()

Unnamed: 0_level_0,filename,latlng,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num
result_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
<class 'list'>,900,900,900,900,900,900,900


Check lengths of results

In [10]:
map_df.groupby("result_length").count()

Unnamed: 0_level_0,filename,latlng,result_type,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num
result_length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,100,100,100,100,100,100,100
1,800,800,800,800,800,800,800


Check lengths of admin short names

In [11]:
map_df.groupby("admin_lvl_1_num").count()

Unnamed: 0_level_0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_2_num
admin_lvl_1_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,100,100,100,100,100,100,100
1,800,800,800,800,800,800,800


In [12]:
map_df.groupby("admin_lvl_2_num").count()

Unnamed: 0_level_0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num
admin_lvl_2_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,100,100,100,100,100,100,100
1,800,800,800,800,800,800,800


If all counts look good, add lat, long, admin level names

In [13]:
map_df["lat"] = map_df.latlng.map(lambda x: x[0])

In [14]:
map_df["lng"] = map_df.latlng.map(lambda x: x[1])

In [15]:
map_df["state"] = map_df.admin_lvl_1_short_names.map(lambda x: x[0] if len(x) > 0 else "")

In [16]:
map_df["county"] = map_df.admin_lvl_2_short_names.map(lambda x: x[0] if len(x) > 0 else "")

In [17]:
map_df.groupby("state").count()

Unnamed: 0_level_0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num,lat,lng,county
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,288,288,288,288,288,288,288,288,288,288,288
CT,92,92,92,92,92,92,92,92,92,92,92
MA,75,75,75,75,75,75,75,75,75,75,75
NH,61,61,61,61,61,61,61,61,61,61,61
NJ,52,52,52,52,52,52,52,52,52,52,52
NY,878,878,878,878,878,878,878,878,878,878,878
ON,358,358,358,358,358,358,358,358,358,358,358
PA,510,510,510,510,510,510,510,510,510,510,510
QC,16,16,16,16,16,16,16,16,16,16,16
RI,1,1,1,1,1,1,1,1,1,1,1


In [18]:
map_df.groupby("county").count().head()

Unnamed: 0_level_0,filename,latlng,result_type,result_length,admin_lvl_1_short_names,admin_lvl_2_short_names,admin_lvl_1_num,admin_lvl_2_num,lat,lng,state
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,288,288,288,288,288,288,288,288,288,288,288
Addison County,18,18,18,18,18,18,18,18,18,18,18
Albany County,11,11,11,11,11,11,11,11,11,11,11
Allegany County,17,17,17,17,17,17,17,17,17,17,17
Allegheny County,5,5,5,5,5,5,5,5,5,5,5


In [19]:
with open("./../ny_map_data.json", "w") as output:
    output.write(map_df[["lat", "lng", "state", "county"]].to_json(orient='records'))