In [1]:
import pandas as pd
import numpy as np

In [2]:
def haversine(p1_lon, p1_lat, p2_lon, p2_lat, radius_of_earth_km=6367):
    """Great-circle distance between two points using the haversine formula.

    All coordinates are in decimal degrees. Every argument is passed through
    ``np.radians`` and numpy ufuncs, so scalars and array-likes are handled
    element-wise.

    Parameters
    ----------
    p1_lon, p1_lat
        Longitude and latitude of the first point(s), in degrees.
    p2_lon, p2_lat
        Longitude and latitude of the second point(s), in degrees.
    radius_of_earth_km
        Radius of the sphere the distance is measured on. The Earth is not
        perfectly spherical, so there isn't one right number; 6367 is the
        value picked here.

    Returns
    -------
    The distance(s), in the same unit as ``radius_of_earth_km``.
    """
    p1_lon_r, p1_lat_r, p2_lon_r, p2_lat_r = map(
        np.radians, [p1_lon, p1_lat, p2_lon, p2_lat]
    )

    d_lon_r = p2_lon_r - p1_lon_r
    d_lat_r = p2_lat_r - p1_lat_r

    # Haversine of the central angle between the two points.
    partial = (
        np.sin(d_lat_r / 2) ** 2
        + np.cos(p1_lat_r) * np.cos(p2_lat_r) * np.sin(d_lon_r / 2) ** 2
    )

    # Floating-point rounding can push the term a hair outside [0, 1]
    # (e.g. for near-antipodal points), which would make arcsin return NaN.
    partial = np.minimum(np.maximum(partial, 0.0), 1.0)

    # Central angle in radians.
    d_r = 2 * np.arcsin(np.sqrt(partial))

    return radius_of_earth_km * d_r


def euclidean(p1_lon, p1_lat, p2_lon, p2_lat, radius_of_earth_km=6367):
    """Flat-plane (Pythagorean) distance between two lon/lat points.

    The longitude and latitude differences, converted to radians, are treated
    as the two legs of a right triangle and the hypotenuse is scaled by the
    radius. No cos(latitude) factor is applied to the longitude difference,
    so this is only a rough approximation of distance on a sphere.

    Parameters
    ----------
    p1_lon, p1_lat
        Longitude and latitude of the first point(s), in degrees.
    p2_lon, p2_lat
        Longitude and latitude of the second point(s), in degrees.
    radius_of_earth_km
        Radius used to turn the angular distance into a length. The Earth is
        not perfectly spherical, so there isn't one right number; 6367 is the
        value picked here.

    Returns
    -------
    The distance(s), in the same unit as ``radius_of_earth_km``.
    """
    p1_lon_r, p1_lat_r, p2_lon_r, p2_lat_r = map(
        np.radians, [p1_lon, p1_lat, p2_lon, p2_lat]
    )

    d_lon_r = p2_lon_r - p1_lon_r
    d_lat_r = p2_lat_r - p1_lat_r

    # Straight-line length in (lon, lat) radian space.
    d_r = np.sqrt(d_lon_r**2 + d_lat_r**2)

    return radius_of_earth_km * d_r

In [3]:
# Ids of the aid locations used in the demo (matched against "aid_id").
AID_IDS = [
    "p116170",
    "p125829",
]

# Ids of the polling stations used in the demo (matched against "poll_id").
POLL_IDS = [
    362725,
    583653,
]

In [12]:
# Unfiltered aid table, with columns renamed to the aid_* convention used
# in the rest of the notebook.
aid_df2 = (
    pd.read_csv("data/aid.csv")
    .rename(
        columns={
            "ids": "aid_id",
            "latitude": "aid_lat",
            "longitude": "aid_lon",
        }
    )
)

In [13]:
# Number of rows recorded for each aid id.
aid_df2.loc[:, "aid_id"].value_counts()

p114204    133
p120391     22
p118410     16
p121495      7
p099369      5
p108654      5
p125630      4
p116170      3
p125829      2
p104995      1
p111512      1
p111996      1
p118077      1
Name: aid_id, dtype: int64

In [4]:
# Aid locations restricted to AID_IDS. Selecting with .loc on the id index
# returns the rows in AID_IDS order and raises KeyError if an id is missing.
aid_df = (
    pd.read_csv("data/aid.csv")
    .rename(
        columns={
            "ids": "aid_id",
            "latitude": "aid_lat",
            "longitude": "aid_lon",
        }
    )
    .set_index("aid_id")
    .loc[AID_IDS]
    .reset_index()
)

In [5]:
# Show the selected aid locations keyed by id (display only; aid_df is unchanged).
aid_df.set_index(keys="aid_id")

Unnamed: 0_level_0,aid_lat,aid_lon
aid_id,Unnamed: 1_level_1,Unnamed: 2_level_1
p116170,-23.547501,-46.636108
p116170,-23.20657,-46.783199
p116170,-23.6,-46.633331
p125829,-23.73333,-46.73333
p125829,-23.7974,-46.605


In [6]:
# Polling stations restricted to POLL_IDS. Selecting with .loc on the id index
# returns the rows in POLL_IDS order and raises KeyError if an id is missing.
poll_df = (
    pd.read_csv("data/polling_stations.csv")
    .rename(
        columns={
            "local_id": "poll_id",
            "lat": "poll_lat",
            "lon": "poll_lon",
        }
    )
    .set_index("poll_id")
    .loc[POLL_IDS]
    .reset_index()
)

In [7]:
# Show the selected polling stations keyed by id (display only; poll_df is unchanged).
poll_df.set_index(keys="poll_id")

Unnamed: 0_level_0,poll_lat,poll_lon
poll_id,Unnamed: 1_level_1,Unnamed: 2_level_1
362725,-23.547403,-46.300659
583653,-23.547369,-46.358326


In [8]:
# Pair every selected polling station with every selected aid location
# (cross join), then compute both distance measures for each pair.
df = poll_df.merge(aid_df, how="cross").assign(
    haversine=lambda pairs: haversine(
        p1_lon=pairs["poll_lon"],
        p1_lat=pairs["poll_lat"],
        p2_lon=pairs["aid_lon"],
        p2_lat=pairs["aid_lat"],
    ),
    euclidean=lambda pairs: euclidean(
        p1_lon=pairs["poll_lon"],
        p1_lat=pairs["poll_lat"],
        p2_lon=pairs["aid_lon"],
        p2_lat=pairs["aid_lat"],
    ),
)

In [9]:
# Inspect every poll/aid pair with both distance columns.
df

Unnamed: 0,poll_id,poll_lat,poll_lon,aid_id,aid_lat,aid_lon,haversine,euclidean
0,362725,-23.547403,-46.300659,p116170,-23.547501,-46.636108,34.172744,37.27681
1,362725,-23.547403,-46.300659,p116170,-23.20657,-46.783199,62.10624,65.649646
2,362725,-23.547403,-46.300659,p116170,-23.6,-46.633331,34.383498,37.427411
3,362725,-23.547403,-46.300659,p125829,-23.73333,-46.73333,48.650837,52.331911
4,362725,-23.547403,-46.300659,p125829,-23.7974,-46.605,41.607432,43.767207
5,583653,-23.547369,-46.358326,p116170,-23.547501,-46.636108,28.29812,30.86856
6,583653,-23.547369,-46.358326,p116170,-23.20657,-46.783199,57.553881,60.526068
7,583653,-23.547369,-46.358326,p116170,-23.6,-46.633331,28.613725,31.11459
8,583653,-23.547369,-46.358326,p125829,-23.73333,-46.73333,43.409513,46.514787
9,583653,-23.547369,-46.358326,p125829,-23.7974,-46.605,37.446734,39.03064


In [10]:
# Write the pairwise distances to disk, leaving out the positional index.
df.to_csv(
    path_or_buf="data/demo_distance.csv",
    index=False,
)