This notebook is used to calculate the duration matrix. We use [Open Source Routing Engine](https://github.com/Project-OSRM/osrm-backend) running locally for imporved performance. We build on [Michael Yan's](https://www.thinkdatascience.com/post/2020-03-03-osrm/osrm/) tutorial

In [1]:
import requests
import numpy as np
import pandas as pd
import warnings
import pickle

## Testing
This first test calculates

In [64]:
url = "http://127.0.0.1:5000/route/v1/driving/9.1895,45.4643;10.4036000,43.7085300"
r = requests.get(url)
res = r.json()
res

{'code': 'Ok',
 'routes': [{'geometry': 'iwntGkyaw@kCa[~u@cyDkCcsBhmA{rBJmeAt}@eHfs@g_@gRyyDu_@epCvm@goG|d@c^lPyy@kA{b@rMmSuHamBxsA{{EnPssCtj@slBbkAieNp_AoUtIazAzVo_@~RgoA~jA_h@xNw[pjDcn@|qAqwD`eBg~BpEix@|X{G@uqCb_A_GnbAu`A_VsvB~]sx@',
   'legs': [{'steps': [],
     'summary': '',
     'weight': 42062.9,
     'duration': 42062.9,
     'distance': 58198.1}],
   'weight_name': 'duration',
   'weight': 42062.9,
   'duration': 42062.9,
   'distance': 58198.1}],
 'waypoints': [{'hint': 'NuQLgNt1AYA6AQAAKgAAAAAAAABvAAAAz90uQlGxtkAAAAAAnRl3QToBAAAqAAAAAAAAAG8AAAABAAAAfjiMAC27tQJ8OIwA7Lq1AgAAvwXjQlzw',
   'distance': 7.225861389,
   'name': '',
   'location': [9.189502, 45.464365]},
  {'hint': '-CgDgPooA4B9AAAAAAAAALABAAAAAAAAtG-LQQAAAACOIXBCAAAAAH0AAAAAAAAAsAEAAAAAAAABAAAAxcWUACqrsgIQv54AcvCaAgQAnwrjQlzw',
   'distance': 180464.148026036,
   'name': 'Via Pallavicina',
   'location': [9.749957, 45.263658]}]}

In [70]:
def get_route(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat):
# car, bike or foot
    loc = "{},{};{},{}".format(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat)
    url = "http://127.0.0.1:5000/route/v1/driving/"
    r = requests.get(url + loc)
    if r.status_code!= 200:
        return {}

    res = r.json()
    duration = res['routes'][0]['duration']
    distance = res['routes'][0]['distance']

    dict_out = {'duration':duration,
           'distance':distance
           }

    return duration

In [10]:
get_route(9.1930893, 45.5053767	,9.23444555, 45.49610165
)

3344.7

In [73]:
lon1 = np.random.uniform(45.5025,45.4368, 1000).round(6)
lon2 = np.random.uniform(45.5025,45.4368, 1000).round(6)
lat1 = np.random.uniform(9.1459,9.3056, 1000).round(6)
lat2 = np.random.uniform(9.1459,9.3056, 1000).round(6)
df = pd.DataFrame({'pickup_lon': lon1,
                   'pickup_lat': lat1,
                   'dropoff_lon': lon2,
                   'dropoff_lat': lat2,
                   })

In [74]:
print(len(df)*4)
df.head()

4000


Unnamed: 0,pickup_lon,pickup_lat,dropoff_lon,dropoff_lat
0,45.438514,9.152328,45.475939,9.218967
1,45.46321,9.14756,45.45198,9.212398
2,45.498095,9.249516,45.495051,9.287159
3,45.451607,9.240196,45.48114,9.246026
4,45.441173,9.281007,45.487307,9.26954


In [85]:
%%time
df['routes'] = df.apply(lambda x: get_route(x['pickup_lat'],
                                            x['pickup_lon'],
                                            x['dropoff_lat'],
                                            x['dropoff_lon']), axis=1)

CPU times: total: 828 ms
Wall time: 15.7 s


In [86]:
df

Unnamed: 0,pickup_lon,pickup_lat,dropoff_lon,dropoff_lat,routes
0,45.438514,9.152328,45.475939,9.218967,5504.3
1,45.463210,9.147560,45.451980,9.212398,4229.1
2,45.498095,9.249516,45.495051,9.287159,3224.2
3,45.451607,9.240196,45.481140,9.246026,3110.8
4,45.441173,9.281007,45.487307,9.269540,6464.6
...,...,...,...,...,...
995,45.492740,9.280712,45.488020,9.227810,3662.4
996,45.468197,9.164892,45.450982,9.167218,1689.0
997,45.486291,9.219670,45.485807,9.177628,2817.7
998,45.455273,9.224342,45.491985,9.299629,6495.4


## Full Code Sperimentation

In [2]:
#importing datasets

mm_df=pd.read_csv('mm_dataset.csv')
mm_df = mm_df.replace('not found', np.nan)
services_df=pd.read_csv('services_df.csv')
calc_df=pd.read_csv('calc_dataset.csv')

# ignore warnings
warnings.filterwarnings('ignore')

#last_row = pickle.load(open("row_file","rb"))
#last_row=0 #to restrart from beginning remove comment

In [24]:
def get_route(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat):
    # car, bike or foot
    loc = "{},{};{},{}".format(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat)
    url = "http://127.0.0.1:5000/route/v1/driving/"
    r = requests.get(url + loc)
    if r.status_code!= 200:
        return {}

    res = r.json()
    duration = res['routes'][0]['duration']
    distance = res['routes'][0]['distance']

    dict_out = {'duration':duration,
                'distance':distance
                }

    return duration

In [4]:
def get_fake_route(pickup_lon, pickup_lat, dropoff_lon, dropoff_lat):
    duration=2
    return duration

In [3]:
dd_dict=mm_df[['CODICE EDIFICIO', 'lat', 'lon']].set_index('CODICE EDIFICIO').T.to_dict('list')
dd_dict

{10001708: ['45.4518933', '9.2280527'],
 10001709: ['45.4518933', '9.2280527'],
 10001710: ['45.4518933', '9.2280527'],
 10001711: ['45.4518933', '9.2280527'],
 10001712: ['45.4518933', '9.2280527'],
 10001713: ['45.4518933', '9.2280527'],
 10001714: ['45.4518933', '9.2280527'],
 10001716: ['45.4516612', '9.2271411'],
 10001717: ['45.4518933', '9.2280527'],
 10001801: ['45.4615841', '9.2111592'],
 10001901: ['45.4364827', '9.2160527'],
 10001902: ['45.436934', '9.2111513'],
 10002001: ['45.4373796', '9.2270668'],
 10002002: ['45.4373796', '9.2270668'],
 10002003: ['45.4373796', '9.2270668'],
 10002004: ['45.4373796', '9.2270668'],
 10002005: ['45.4373796', '9.2270668'],
 10002006: ['45.4373796', '9.2270668'],
 10002101: ['45.5201425', '9.205604'],
 10002102: ['45.5044215', '9.1946081'],
 10002201: ['45.5087493', '9.1917205'],
 10002202: ['45.5087493', '9.1917205'],
 10002203: ['45.5087493', '9.1917205'],
 10002204: ['45.5087493', '9.1917205'],
 10002205: ['45.5087493', '9.1917205'],
 1

In [17]:

def check_duplicate_values(data):
    values_seen = {}
    result = []

    for key, value in data.items():
        # Convert the list of values to a tuple to use as a key in the dictionary
        value_key = tuple(value)

        # Check if the values have already been seen
        if value_key in values_seen:
            result.append(values_seen[value_key])
        else:
            result.append(np.nan)
            values_seen[value_key] = key

    for i, key in enumerate(data.keys()):
        data[key].append(result[i])

    return data


check_duplicate_values(dd_dict)

In [13]:
my_dict = {"key1": [1, 2, 3], "key2": [4, 5, 6], "key3": [7, 8, 9]}

if [4, 5, 6] in my_dict.values():
    print("The value [4, 5, 6] is present in the dictionary as a value.")
else:
    print("The value [4, 5, 6] is not present in the dictionary as a value.")

The value [4, 5, 6] is present in the dictionary as a value.


In [46]:
def lat_form_id(id):
    return float(dd_dict.get(id)[0])
def long_form_id(id):
    return float(dd_dict.get(id)[1])

long_form_id(10001708)

9.2280527

In [27]:
%%time
def find_loc(index):
    row_df=services_df.iloc[[index]]
    r_lat=float(row_df['lat'])
    r_long=float(row_df['long'])
    for key, values in dd_dict.items():
        if values[2]!=np.nan:
            d_lat=float(values[0])
            d_long=float(values[1])
            time=get_route(d_long, d_lat, r_long, r_lat)
            row_df[str(key)]=time
        else:
            key_dup=values[2]
            row_df[str(key)]=row_df[str(key_dup)]
    return row_df
find_loc(2)

CPU times: total: 344 ms
Wall time: 12.9 s


Unnamed: 0.1,Unnamed: 0,name,long,lat,df_name,cat1,cat2,cat1_name,cat2_name,10001708,...,51039101,51039401,51039402,51039403,51039404,51039405,51039406,51039407,51039408,51039410
2,2,BANDE NERE,9.136149,45.461792,acqua,,,,,5797.8,...,,7023.8,7023.8,7023.8,7023.8,7023.8,7023.8,7023.8,7023.8,7023.8


In [28]:
%%time
def find_loc(index):
    row_df=services_df.iloc[[index]]
    df_calc=mm_df[['CODICE EDIFICIO', 'lat', 'lon']]
    r_lat=float(row_df['lat'])
    r_long=float(row_df['long'])
    df_calc['time']=df_calc.apply(lambda x:get_route(x['lon'], x['lat'], r_long, r_lat), axis=1)
    df_calc=df_calc[['CODICE EDIFICIO', 'time']].T
    df_calc.columns=df_calc.iloc[0]
    row_df=pd.concat([row_df, df_calc.iloc[1]], axis=0, ignore_index=True)
    return row_df
find_loc(2)

CPU times: total: 2.2 s
Wall time: 19.2 s


Unnamed: 0.1,Unnamed: 0,name,long,lat,df_name,cat1,cat2,cat1_name,cat2_name,0
0,2.0,BANDE NERE,9.136149,45.461792,acqua,,,,,
1,,,,,,,,,,5797.8
2,,,,,,,,,,5797.8
3,,,,,,,,,,5797.8
4,,,,,,,,,,5797.8
...,...,...,...,...,...,...,...,...,...,...
976,,,,,,,,,,7023.8
977,,,,,,,,,,7023.8
978,,,,,,,,,,7023.8
979,,,,,,,,,,7023.8


In [None]:
def main(df):
    last_row=len(df)
    last_row_txt= pickle.load(open("row_file","rb"))
    df = df.reset_index()
    try:
        last_row = last_row_txt
        print('No conflict detected. Starting program')
    except Exception as e:
        print('CONFLICT DETECTED. Stopping code', e)

    print("Process starting form" + last_row)
    if last_row!=len(df):
        df_calc=find_loc(last_row)
        df_calc=pd.concat([df, df_calc])
        df_calc.to_csv('calc_dataset.csv')
        last_row=last_row+1
        pickle.dump(last_row,open("row_file","wb"))
        print('Calculated correctly row', last_row)