In [2]:
import pandas as pd
import requests
import json
from pyproj import Transformer
from math import radians, cos, sin, asin, sqrt
import os.path
import time

datapath='../data/'
secret_file = '../secrets/gcloud.json'

#origin="Haifa Hof Hacarmel"
#origin="כחל"
origin_english = 'Kahal'
MODE='driving'
#MODE='transit'
MAX_ELEM = 25

filename = f'{datapath}dist_{MODE}_{origin_english}.csv'
assert not os.path.isfile(filename), "Data already exists, will not re-fetch it to reduce costs"

with open(secret_file) as json_file:
    secrets = json.load(json_file)
    api_key=secrets['api_key']

In [3]:
transformer = Transformer.from_crs('epsg:2039', 'epsg:4326')

def get_distances(origin, destinations, api_key):
    if isinstance(origin, list):
        origin = ('|').join(origin)
        
    params = {
        'origins' : origin,
        'destinations' : ('|').join(destinations),
        'units' : 'metric',
        'mode' : MODE,
        'key' :  api_key
    }

    response = requests.get(
                    url = 'https://maps.googleapis.com/maps/api/distancematrix/json?',
                    params = params,
                    )
    return response

def coords_to_lonlat(c):
    s = f'{c:.0f}'

    if s == 'nan':
        return None, None, None
    assert len(s) == 10, "Input string must be 10 characters long"
    X = int(s[:5] + '0')
    Y = int(s[5:] + '0')

    lat, lon = transformer.transform(X, Y)
    return lon, lat, f'{lat:.4f},{lon:.4f}'

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [4]:
# source: https://www.cbs.gov.il/he/publications/Pages/2019/יישובים-בישראל.aspx
places = pd.read_excel(datapath + 'places-cbs.xlsx')

places.rename(columns={'שם יישוב באנגלית' : 'name', 'סך הכל אוכלוסייה 2021' : 'pop', 'קואורדינטות' : 'coords'}, inplace=True)

places[['lon', 'lat', 'lonlat']] = places['coords'].apply(lambda x: pd.Series(coords_to_lonlat(x)))
origin_query = places.query('name == @origin_english')
if len(origin_query) == 0:
    raise ValueError(f'No place name in the file found for {origin_english}')
origin_coords = origin_query.iloc[0]['coords']

dests = places.query('pop > 100 and pop < 1000').copy().reset_index().drop(columns='index')
print(f'{len(dests)} destinations')
orig_lon, orig_lat, orig_lonlat = coords_to_lonlat(origin_coords)

698 destinations


In [106]:
dests['distance'] = None
dests['duration'] = None
dests['g_orig'] = None
dests['g_dest'] = None
dests['crow_dist'] = None
dests['mode'] = MODE

s_idx = 0
dlist = dests['lonlat'].to_list()
while s_idx < len(dlist):
    print(s_idx, end='...')
    e_idx = min(s_idx+MAX_ELEM, len(dests))
    r = range(s_idx, e_idx)
    dests_call = dlist[s_idx:e_idx]
    response = get_distances(origin, dests_call, api_key)
    time.sleep(0.5)
    matrix = response.json()
    DESTINATIONS=matrix['destination_addresses']
    ORIGIN=matrix['origin_addresses'][0]
    matrix = matrix['rows'][0]['elements']

    o_origin, o_dest, o_distance, o_duration, o_crow = [], [], [], [], []

    num_failed = 0
    for i, element in enumerate(matrix):
        o_origin.append(ORIGIN)
        o_dest.append(DESTINATIONS[i])
        o_crow.append(1000.*haversine(orig_lon, orig_lat, dests.loc[s_idx+i, 'lon'], dests.loc[s_idx+i, 'lat']))
        if element['status'] == 'OK':
            o_distance.append(element['distance']['value'])
            o_duration.append(element['duration']['value'])
        else:
            num_failed += 1
            o_distance.append(None)
            o_duration.append(None)
    if num_failed/len(dests_call) > 0.1:
        raise ValueError(f'Too many failed calls: {num_failed}/{len(dests_call)}, check the origin place name')
    
    dests.loc[r, 'distance'] = o_distance
    dests.loc[r, 'duration'] = o_duration
    dests.loc[r, 'g_orig'] = o_origin
    dests.loc[r, 'g_dest'] = o_dest
    dests.loc[r, 'crow_dist'] = o_crow
    
    s_idx += MAX_ELEM


0...25...50...75...100...125...150...175...200...225...250...275...300...325...350...375...400...425...450...475...500...525...550...575...600...625...650...675...

In [21]:
def batch_get_distances(places, origin_idx, destination_idxs, api_key, mode='driving', max_elem=25):
    results = []#= pd.DataFrame(columns=['distance', 'duration', 'g_orig', 'g_dest', 'crow_dist', 'mode', 'origin_idx', 'dest_idx'])
    
    origin = places.loc[origin_idx, 'lonlat']
    origin_lat, origin_lon, origin_name = places.loc[origin_idx, ['lat', 'lon', 'name']]

    s_idx = 0
    while s_idx < len(destination_idxs):
        print(s_idx, end='...')
        e_idx = min(s_idx + max_elem, len(destination_idxs))
        
        batch_dest_idxs = destination_idxs[s_idx:e_idx]
        destinations = places.loc[batch_dest_idxs, 'lonlat'].tolist()
        
        response = get_distances(origin, destinations, api_key)
        time.sleep(0.25)
        matrix = response.json()
        
        DESTINATIONS = matrix['destination_addresses']
        ORIGIN = matrix['origin_addresses'][0]
        elements = matrix['rows'][0]['elements']

        for j, element in enumerate(elements):
            dest_idx = batch_dest_idxs[j]
            dest_lat, dest_lon, dest_name = places.loc[dest_idx, ['lat', 'lon', 'name']]
            
            result_row = {
                'g_orig': ORIGIN,
                'g_dest': DESTINATIONS[j],
                'crow_dist': 1000 * haversine(origin_lon, origin_lat, dest_lon, dest_lat),
                'mode': mode,
                'origin_idx': origin_idx,
                'dest_idx': dest_idx,
                'orig_name': origin_name,
                'dest_name': dest_name,
            }
            
            if element['status'] == 'OK':
                result_row['distance'] = element['distance']['value']
                result_row['duration'] = element['duration']['value']
            else:
                result_row['distance'] = None
                result_row['duration'] = None
            
            results.append(result_row)
    
        s_idx += max_elem
    
    ret = pd.DataFrame(results)
    ret['speed'] = ret['crow_dist']/ret['duration']*3.6
    ret['ratio'] = ret['crow_dist']/ret['distance']

    return ret

origin_query = places.query('name == @origin_english')
if len(origin_query) == 0:
    raise ValueError(f'No place name in the file found for {origin_name}')
origin_idx = origin_query.index[0]
destination_idxs = places.query('pop > 100 and pop < 1000').index.to_list()
results = batch_get_distances(places, origin_idx, destination_idxs, api_key, MODE, MAX_ELEM)

0...25...50...75...100...125...150...175...200...225...250...275...300...325...350...375...400...425...450...475...500...525...550...575...600...625...650...675...700...725...750...775...

In [22]:
results.query('crow_dist < 15000').sort_values(by='ratio')[['distance', 'duration', 'speed', 'crow_dist', 'orig_name', 'dest_name']]

Unnamed: 0,distance,duration,speed,crow_dist,orig_name,dest_name
265,19859,1286,5.083085,1815.790763,Kahal,Huqoq
403,20325,1313,9.174789,3346.249373,Kahal,Livnim
67,35609,2295,14.168824,9032.62511,Kahal,Arbel
363,36901,2302,15.33481,9805.759369,Kahal,Kefar Zetim
479,39509,1833,21.975958,11189.425378,Kahal,Mizpa
175,17343,1117,15.920815,4939.875251,Kahal,Ginnosar
684,21643,1108,20.110319,6189.509318,Kahal,Ravid
667,11995,764,16.427653,3486.313007,Kahal,Qaddarim
365,33719,2173,16.552368,9991.193445,Kahal,Kefar Hittim
337,15985,850,22.945147,5417.604215,Kahal,Kallanit


In [19]:
results.to_csv(filename, index=False)

### All pairs of nearby places which are far in driving distance

In [14]:
def nearby_places(places, max_dist):
    # find all pairs of places which are less than max_dist apart in aerial distance
    pairs = []
    for i, p1 in places.iterrows():
        for j, p2 in places.iterrows():
            crow_dist = haversine(p1['lon'], p1['lat'], p2['lon'], p2['lat'])
            if i < j and crow_dist < max_dist:
                pairs.append((p1['name'], p2['name']))
    return pairs

In [25]:
nearby_pairs = nearby_places(places.query("pop < 1200"), 2)
print(len(nearby_pairs))

448


In [26]:
def get_pair_distances(places, nearby_pairs):
    pairs = pd.DataFrame(data=nearby_pairs, columns=['place1', 'place2'])
    # for each value of place1, find all pairs which have it as the first element, and call get_batch_distances
    for place1 in pairs['place1'].unique():
        pair_query = pairs.query('place1 == @place1')
        dest_indices = places.query('name in @pair_query.place2').index.tolist()
        results = batch_get_distances(places, places.query('name == @place1').index[0], dest_indices, api_key, MODE, MAX_ELEM)
        if results is not None:
            pairs = pd.concat([pairs, results], axis=0, ignore_index=True)
    return pairs

In [27]:
results_pairs = get_pair_distances(places, nearby_pairs[:1000])

0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...0...

In [28]:
results_pairs.sort_values(by='ratio')[['ratio', 'distance', 'duration', 'speed', 'crow_dist', 'orig_name', 'dest_name']]


Unnamed: 0,ratio,distance,duration,speed,crow_dist,orig_name,dest_name
860,0.063501,21621.0,1248.0,3.960441,1372.952831,Sawa'id(Hamriyye),Ras Ali
718,0.090020,20171.0,1343.0,4.867347,1815.790763,Huqoq,Kahal
881,0.090417,14222.0,907.0,5.103968,1285.916506,Zurit,Shorashim
635,0.090436,8761.0,678.0,4.206933,792.305731,Gid'ona,Nurit
691,0.093884,21053.0,1202.0,5.919779,1976.548517,Harduf,Ras Ali
...,...,...,...,...,...,...,...
443,,,,,,,
444,,,,,,,
445,,,,,,,
446,,,,,,,


In [29]:
pairs_filename = f'{datapath}dist_{MODE}_PAIRS.csv'

results_pairs.to_csv(pairs_filename, index=False)

In [None]:
places