In [1]:
import pandas as pd
import numpy as np
import arcpy
from arcgis.features import GeoAccessor, GeoSeriesAccessor
from datetime import datetime, date
from sklearn.metrics.pairwise import haversine_distances
from math import radians

# Show every column when a DataFrame is displayed (geocode output is wide).
pd.set_option('display.max_columns', None)

# Read the clock once so TODAY and NOW always describe the same instant
# (two separate reads could straddle midnight and disagree on the date).
_RUN_STARTED = datetime.now()
TODAY = _RUN_STARTED.strftime("%Y-%m-%d")
NOW = _RUN_STARTED.strftime("%Y-%m-%dT%H:%M:%S")

# File geodatabase used both as the arcpy workspace and as the output location.
GDB_PATH = r".\MyProject\MyProject.gdb"
arcpy.env.workspace = GDB_PATH

# Locator path lives in a local config module so it is not hard-coded here.
from config import ADDRESS_LOCATOR_PATH

# Source workbook and worksheet; the geocoder addresses the sheet as
# "<workbook>\<sheet>", and the output feature class is named after the sheet.
SOURCE_WORKBOOK_PATH = r".\materials\EDGE_GEOCODE_POSTSECSCH_2223\EDGE_GEOCODE_POSTSECSCH_2223.xlsx"
SOURCE_SHEET_NAME = "IPEDS22_GEOLOAD_230217"
IN_TABLE_PATH = SOURCE_WORKBOOK_PATH + "\\" + SOURCE_SHEET_NAME
OUT_FEATURE_CLASS_PATH = GDB_PATH + "\\geocode_" + SOURCE_SHEET_NAME

# Field mapping passed to GeocodeAddresses: the locator's address, City, State
# and ZIP inputs are mapped to the table's Street, City, State and ZIP columns;
# every other locator input is left unmapped (<None>).
IN_ADDRESS_FIELDS_MAP = r"'Address or Place' Street VISIBLE NONE;Address2 <None> VISIBLE NONE;Address3 <None> VISIBLE NONE;Neighborhood <None> VISIBLE NONE;City City VISIBLE NONE;County <None> VISIBLE NONE;State State VISIBLE NONE;ZIP ZIP VISIBLE NONE;ZIP4 <None> VISIBLE NONE;Country <None> VISIBLE NONE"

In [None]:
# Load the same worksheet that GeocodeAddresses reads. IN_TABLE_PATH has the
# form "<workbook>\<sheet>", so split on the last backslash instead of
# repeating the workbook path and sheet name as separate literals.
source_workbook_path, source_sheet_name = IN_TABLE_PATH.rsplit("\\", 1)
df_source = pd.read_excel(source_workbook_path, sheet_name=source_sheet_name)

In [None]:
# Work on a copy so the loaded worksheet in df_source stays untouched.
df = df_source.copy()

# Row/column count of the source table.
print(df.shape)

# Spot check: display the source record(s) for a single institution by name.
df[df["NAME"].eq("University of Michigan-Ann Arbor")]

In [None]:
# https://pro.arcgis.com/en/pro-app/latest/tool-reference/data-management/delete.htm
# Clear the output of a previous run. Guarded with Exists so the cell also
# works on a first run, when there is no feature class to delete yet.
if arcpy.Exists(OUT_FEATURE_CLASS_PATH):
    arcpy.management.Delete(OUT_FEATURE_CLASS_PATH)

# https://pro.arcgis.com/en/pro-app/latest/tool-reference/geocoding/geocode-addresses.htm
# Geocode every row of the source worksheet into a new feature class.
arcpy.geocoding.GeocodeAddresses(IN_TABLE_PATH, ADDRESS_LOCATOR_PATH, IN_ADDRESS_FIELDS_MAP, OUT_FEATURE_CLASS_PATH)

# https://developers.arcgis.com/python/api-reference/arcgis.features.toc.html#arcgis.features.GeoAccessor.from_featureclass
# Read the geocoded feature class back as a spatially enabled DataFrame.
df = pd.DataFrame.spatial.from_featureclass(OUT_FEATURE_CLASS_PATH)

# Keep only rows with a non-zero Score
# (presumably a score of 0 marks an unmatched address -- TODO confirm).
df = df.loc[df['Score'] != 0]

# Persist the result so later sessions can reuse it without re-geocoding.
# NOTE: the ./results directory must already exist.
df.to_pickle("./results/geocode_IPEDS22_GEOLOAD_230217.pkl")

# Stable snapshot for downstream cells, which rebind `df` freely.
df_geocode = df.copy()

In [None]:
df = df_geocode.copy()

# Mean Earth radius in metres; scales the unit-sphere angle to a distance.
EARTH_RADIUS_M = 6371000


def _haversine_m(lat1, lon1, lat2, lon2, radius=EARTH_RADIUS_M):
    """Return the element-wise great-circle distance between two sets of points.

    All four inputs are arrays of angles in radians with the same length.
    Uses the same haversine formula as sklearn's ``haversine_distances``,
    but evaluated once over whole columns instead of once per row.
    The result is in the units of ``radius`` (metres by default).
    """
    sin_half_dlat = np.sin(0.5 * (lat1 - lat2))
    sin_half_dlon = np.sin(0.5 * (lon1 - lon2))
    central_angle = 2 * np.arcsin(
        np.sqrt(sin_half_dlat ** 2 + np.cos(lat1) * np.cos(lat2) * sin_half_dlon ** 2)
    )
    return central_angle * radius


# Convert the geocoded (Y, X) and source (USER_LAT, USER_LON) coordinates from
# degrees to radians. np.radians replaces the deprecated DataFrame.applymap.
# The *_radians columns are kept on the frame, as before.
df["Y_radians"] = np.radians(df["Y"].to_numpy(dtype=float))
df["X_radians"] = np.radians(df["X"].to_numpy(dtype=float))
df["USER_LAT_radians"] = np.radians(df["USER_LAT"].to_numpy(dtype=float))
df["USER_LON_radians"] = np.radians(df["USER_LON"].to_numpy(dtype=float))

# Distance in metres between each geocoded point and its source coordinate.
df["haversine_dist"] = _haversine_m(
    df["Y_radians"].to_numpy(),
    df["X_radians"].to_numpy(),
    df["USER_LAT_radians"].to_numpy(),
    df["USER_LON_radians"].to_numpy(),
)

df["haversine_dist"].describe().round(2)

In [None]:
# Display the row(s) whose geocoded location is farthest from the source coordinate.
df[df["haversine_dist"].eq(df["haversine_dist"].max())]

In [None]:
# Start again from an untouched copy of the source worksheet.
df = df_source.copy()

# Look up the source record(s) for a single institution by name.
df[df["NAME"].eq("Erie 2 Chautauqua Cattaraugus BOCES-Practical Nursing Program")]