#### Expérimentation du principe de tirages de plus courts chemins entre sources et destinations à l'échelle globale

Objectif :

- Avant tout, convertir les données utiles du graphe Neo4j dans les bons types (code insee -> Integer, travel_time -> Float)

- Récupérer, à l'échelle du territoire global cette fois, les deux listes sources et destinations, organisées selon la distribution respective du nombre de véhicules immatriculés (pour les sources) et du score touristique (pour les destinations)

- Expérimenter une preuve de concept de 10 000 tirages aléatoires entre source et destination (pris aléatoirement dans les deux listes), avec requête sous Neo4j et algorithme de plus court chemin.

- Les résultats s'organisent en chemins de nœuds. Dans la même requête Neo4j, il faut ensuite transformer ces nœuds ROAD_POINT en communes les plus proches. La requête est ainsi décomposée en 2 sous-requêtes.

- Récupérer le résultat final sous forme d'un dataframe (ou d'une liste de liste)

Imports :

In [117]:
import numpy as np

import random

import pandas as pd

import pickle

from neo4j import GraphDatabase, basic_auth

import plotly.express as px

import geopy.distance

In [118]:
import os

# Prefer the NEO4J_PASSWORD environment variable so the credential does not
# have to live in the notebook; the literal is only kept as a local fallback.
PASSWORD_NEO4J = os.environ.get('NEO4J_PASSWORD', 'passwordneo4j')

In [119]:
# Driver shared by every session opened in this notebook; it targets the
# Bolt endpoint on localhost:7687 with basic authentication (user "neo4j").
driver = GraphDatabase.driver(
  "bolt://localhost:7687",
  auth=basic_auth("neo4j", PASSWORD_NEO4J))

In [120]:
# Number of random draws.
NB_DRAWS = 10_000

# Share of the nodes excluded from each path (to avoid over-estimating the
# large departure cities).
RATIO_EXCLUDE = 0.3

# Minimal length of the final path (after the reduction by the ratio above)
# for that path to be taken into account and passed on.
MINIMAL_SUBPATH = 6

---

Cypher Request :

In [1]:
def convert_insee_to_integer(tx):
    """Cast the `insee` property of every CITY node to an integer.

    Args:
        tx: transaction-like object exposing `run(query)`.

    Returns:
        Whatever `data()` yields for the statement result.
    """
    cypher = "MATCH (c:CITY) \
                SET c.insee = ToInteger(c.insee)"

    return tx.run(cypher).data()

In [9]:
def convert_travel_time_to_float(tx):
    """Cast `travel_time` of every RELATED relationship to a float rounded to 2 decimals.

    The pattern is directed on purpose: an undirected `()-[r]-()` pattern
    yields each relationship once per direction, so every SET would be
    executed twice for the same final value.

    Args:
        tx: transaction-like object exposing `run(query)`.

    Returns:
        Whatever `data()` yields for the statement result.
    """
    query = "MATCH ()-[r:RELATED]->() \
                SET r.travel_time = round(ToFloat(r.travel_time), 2)"

    result = tx.run(query)
    return result.data()

In [88]:
def travel_time_to_NEARLY_rel(tx):
    """Set `travel_time` to 0.0 on every NEARLY_TO relationship.

    The pattern is directed on purpose: an undirected `()-[r]-()` pattern
    yields each relationship once per direction, so every SET would be
    executed twice for the same final value.

    Args:
        tx: transaction-like object exposing `run(query)`.

    Returns:
        Whatever `data()` yields for the statement result.
    """
    query = "MATCH ()-[r:NEARLY_TO]->() \
                SET r.travel_time = ToFloat(0)"

    result = tx.run(query)
    return result.data()

In [16]:
def projection_graph(tx):
    """Create the GDS projection named 'map_shortest_path_5'.

    The projection holds the CITY nodes (insee, x, y) and the ROAD_POINT
    nodes (x, y), plus the NEARLY_TO relationships (projected UNDIRECTED)
    and the RELATED relationships (projected NATURAL), each carrying the
    `travel_time` property.

    Args:
        tx: transaction-like object exposing `run(query)`.

    Returns:
        The yielded projection summary, as produced by `data()`.
    """
    cypher = "CALL gds.graph.project( \
    'map_shortest_path_5', \
    { \
        CITY: {properties: ['insee', 'x', 'y']}, \
        ROAD_POINT: {properties: ['x', 'y']} \
            }, \
    { \
        NEARLY_TO: {properties:'travel_time', orientation : 'UNDIRECTED'}, \
        RELATED: {properties:'travel_time', orientation : 'NATURAL'} \
        }) \
    YIELD graphName, nodeProjection,  nodeCount, relationshipProjection, relationshipCount \
    RETURN graphName, nodeProjection,  nodeCount, relationshipProjection, relationshipCount"

    return tx.run(cypher).data()


Algorithme de Dijkstra :

In [71]:
def shortest_path(tx, props_list, ratio):
    """Run Dijkstra for every (source, target) pair and map each path to communes.

    For each entry of `props_list`, the query streams the shortest path
    (weighted by `travel_time`) on the 'map_shortest_path_5' projection,
    drops the leading `ratio` share of the returned node ids, then collects
    the distinct `insee` codes of the CITY nodes reached through a NEARLY_TO
    relationship from the remaining ROAD_POINT nodes.

    Args:
        tx: transaction-like object exposing `run(query, **parameters)`.
        props_list: list of dicts with the keys `source_insee` and `target_insee`.
        ratio: share (between 0 and 1) of the nodes removed from the start of each path.

    Returns:
        The records produced by `data()`, one `collection` entry per path.
    """
    # `ratio` travels as a query parameter instead of being concatenated into
    # the Cypher text, and `[:NEARLY_TO]` filters on the relationship type
    # (a bare `[NEARLY_TO]` only names a variable and matches any type).
    query = "UNWIND $props_list AS map \
            MATCH (source:CITY {insee: map.source_insee}), (target:CITY {insee: map.target_insee}) \
                CALL gds.shortestPath.dijkstra.stream( \
                'map_shortest_path_5', \
                    { \
                    sourceNode: source, \
                    targetNode: target, \
                    relationshipWeightProperty: 'travel_time' \
                        }) \
            YIELD nodeIds \
            CALL { \
                    WITH nodeIds \
                    WITH nodeIds[ToInteger(size(nodeIds) * $ratio)..] AS path_list \
                    MATCH (p:ROAD_POINT)-[:NEARLY_TO]->(c:CITY) \
                    WHERE ID(p) IN path_list \
                    RETURN apoc.coll.toSet(collect(c.insee)) AS collection \
                } \
                RETURN collection"

    result = tx.run(query, props_list=props_list, ratio=ratio)
    return result.data()


Autre essai d'algorithme (A*) (attention : en réalité, trois fois plus long !) :

In [25]:
""" def shortest_path_A(tx, props_list):

    query = "UNWIND $props_list AS map \
            MATCH (source:CITY {insee: map.source_insee}), (target:CITY {insee: map.target_insee}) \
                CALL gds.shortestPath.astar.stream( \
                'map_shortest_path_4', \
                    { \
                    sourceNode: source, \
                    targetNode: target, \
                    latitudeProperty: 'y', \
                    longitudeProperty: 'x', \
                    relationshipWeightProperty: 'travel_time' \
                        }) \
            YIELD nodeIds \
            CALL { \
                        WITH nodeIds \
                        MATCH (p:ROAD_POINT)-[NEARLY_TO]->(c:CITY) \
                        WHERE ID(p) IN nodeIds \
                        RETURN apoc.coll.toSet(collect(c.insee)) AS collection \
                }\
                RETURN collection"
    
    
    result = tx.run(query, props_list = props_list)
    return result.data() """

In [453]:
def get_communes_nearliest(tx, props_list):
    """Map lists of node ids to the distinct communes attached to them.

    For each entry of `props_list`, the query collects the distinct `insee`
    codes of the CITY nodes reached through a NEARLY_TO relationship from the
    ROAD_POINT nodes whose id is listed in the entry's `nodeIds`.

    Args:
        tx: transaction-like object exposing `run(query, **parameters)`.
        props_list: list of dicts, each with a `nodeIds` list.

    Returns:
        The records produced by `data()`, one `collection` entry per input entry.
    """
    # `[:NEARLY_TO]` filters on the relationship type; a bare `[NEARLY_TO]`
    # only names a variable and would match relationships of any type.
    query = "UNWIND $props_list AS map \
                CALL { \
                        WITH map \
                        MATCH (p:ROAD_POINT)-[:NEARLY_TO]->(c:CITY) \
                        WHERE ID(p) IN map.nodeIds \
                        RETURN apoc.coll.toSet(collect(c.insee)) AS collection \
                } \
                RETURN collection"

    result = tx.run(query, props_list=props_list)
    return result.data()

In [27]:
def get_all_communes_with_coords(tx):
    """Fetch the `insee`, `x` and `y` properties of every CITY node.

    Args:
        tx: transaction-like object exposing `run(query)`.

    Returns:
        The records produced by `data()`, keyed `insee`, `x` and `y`.
    """
    return tx.run(
        "MATCH (n:CITY) RETURN n.insee as insee, n.x as x, n.y as y"
    ).data()


---

Requêtes de preprocessing :

Conversion des codes insee en entiers (pour accélérer les calculs)

In [228]:
# Run the INSEE conversion in a write transaction.
# The shared driver is deliberately left open: the following cells open new
# sessions on it, and a driver must not be used once close() has been called.
with driver.session() as session:
    result = session.execute_write(convert_insee_to_integer)

print(result)

[]


Conversion des "travel_time" des routes en float (car les "travel_time" vont servir de poids aux algorithmes de plus courts chemins)

In [229]:
# Run the travel_time conversion in a write transaction.
# The shared driver is deliberately left open: the following cells open new
# sessions on it, and a driver must not be used once close() has been called.
with driver.session() as session:
    result = session.execute_write(convert_travel_time_to_float)

print(result)

[]


Ajout d'un "travel_time" par défaut à zéro pour les relations "NEARLY_TO" entre CITY et ROAD POINT pour prévenir les bugs

In [89]:
# Give every NEARLY_TO relationship its default travel_time in a write transaction.
# The shared driver is deliberately left open: the following cells open new
# sessions on it, and a driver must not be used once close() has been called.
with driver.session() as session:
    result = session.execute_write(travel_time_to_NEARLY_rel)

print(result)

[]


---

Requête de projection (pour emploi de l'algo de plus court chemin)

In [18]:
# Create the graph projection that the shortest-path query refers to.
# The shared driver is deliberately left open: the following cells open new
# sessions on it, and a driver must not be used once close() has been called.
with driver.session() as session:
    result = session.execute_write(projection_graph)

print(result)

[{'graphName': 'map_shortest_path_5', 'nodeProjection': {'ROAD_POINT': {'label': 'ROAD_POINT', 'properties': {'x': {'defaultValue': None, 'property': 'x'}, 'y': {'defaultValue': None, 'property': 'y'}}}, 'CITY': {'label': 'CITY', 'properties': {'insee': {'defaultValue': None, 'property': 'insee'}, 'x': {'defaultValue': None, 'property': 'x'}, 'y': {'defaultValue': None, 'property': 'y'}}}}, 'nodeCount': 557768, 'relationshipProjection': {'RELATED': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'RELATED', 'properties': {'travel_time': {'defaultValue': None, 'property': 'travel_time', 'aggregation': 'DEFAULT'}}}, 'NEARLY_TO': {'orientation': 'UNDIRECTED', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'NEARLY_TO', 'properties': {'travel_time': {'defaultValue': None, 'property': 'travel_time', 'aggregation': 'DEFAULT'}}}}, 'relationshipCount': 1359891}]


----

Récupération des listes aléatoires de sources et destinations :

In [6]:
# Load the two pickled lists the draws are taken from.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('./datas/source_list_test.pkl', 'rb') as source_file:
    source_list = pickle.load(source_file)

with open('./datas/target_list_test.pkl', 'rb') as target_file:
    target_list = pickle.load(target_file)

Tirage aléatoire avec remise (selon le nombre de tirages)

In [121]:
# Draw NB_DRAWS sources and NB_DRAWS targets, each pick being independent
# (the same element can therefore be drawn several times).
shuffle_source = [random.choice(source_list) for _ in range(NB_DRAWS)]
shuffle_target = [random.choice(target_list) for _ in range(NB_DRAWS)]

Conversion en dataframe :

In [122]:
# Pair the i-th drawn source with the i-th drawn target, one pair per row.
df_shuffle = pd.DataFrame({"source_insee" : shuffle_source, "target_insee" : shuffle_target})

In [99]:
# df_shuffle_test = df_shuffle.iloc[0:10,:]

In [123]:
# One dict per row ({'source_insee': ..., 'target_insee': ...}), the shape
# the shortest-path query unwinds as $props_list.
props_list = df_shuffle.to_dict('records')

In [124]:
# Display the drawn (source, target) pairs.
props_list

[{'source_insee': 6085, 'target_insee': 5097},
 {'source_insee': 6117, 'target_insee': 13004},
 {'source_insee': 85178, 'target_insee': 3023},
 {'source_insee': 77058, 'target_insee': 67348},
 {'source_insee': 59378, 'target_insee': 13055},
 {'source_insee': 33051, 'target_insee': 1030},
 {'source_insee': 92048, 'target_insee': 1275},
 {'source_insee': 60036, 'target_insee': 14478},
 {'source_insee': 33090, 'target_insee': 59374},
 {'source_insee': 74056, 'target_insee': 66171},
 {'source_insee': 82037, 'target_insee': 38176},
 {'source_insee': 30004, 'target_insee': 34322},
 {'source_insee': 26198, 'target_insee': 62225},
 {'source_insee': 74019, 'target_insee': 69287},
 {'source_insee': 31555, 'target_insee': 33544},
 {'source_insee': 93001, 'target_insee': 74013},
 {'source_insee': 74112, 'target_insee': 7186},
 {'source_insee': 38151, 'target_insee': 24585},
 {'source_insee': 34172, 'target_insee': 83009},
 {'source_insee': 29161, 'target_insee': 36228},
 {'source_insee': 92044, 't

In [125]:
# Compute the commune path of every drawn (source, target) pair.
# The shared driver is deliberately left open: other cells open new sessions
# on it, and a driver must not be used once close() has been called.
with driver.session() as session:
    result = session.execute_write(shortest_path, props_list, RATIO_EXCLUDE)

    #print(result[0])

    #result = session.execute_write(get_communes_nearliest, result)

In [126]:
# One row per returned record; the query returns a single `collection` field.
paths = pd.DataFrame(result)

In [127]:
# Display the minimal path length used by the filter below.
MINIMAL_SUBPATH

6

In [128]:
# Flag the paths whose commune list holds at least MINIMAL_SUBPATH entries.
paths['condition'] = paths['collection'].map(len) >= MINIMAL_SUBPATH

In [129]:
# Keep only the rows flagged by the `condition` column.
paths = paths[paths['condition']]

In [131]:
# Rebind instead of dropping in place: `paths` may be a slice of the
# unfiltered frame, and in-place edits on a slice are flagged by pandas.
paths = paths.drop(columns=['condition'])

In [133]:
# The sample size in the file name follows NB_DRAWS (10000 gives the
# historical "path_sample_10000_new.parquet").
paths.to_parquet(f"./datas/path_sample_{NB_DRAWS}_new.parquet")

Re-import :

In [38]:
# Disabled re-import, kept for reference (the original quoting was a syntax error):
# test_df = pd.read_parquet("./datas/test_paths_mini.parquet")

----

Visualisations de test

Récupération des coordonnées des communes :

In [106]:
# Fetch the coordinates of every commune. The query only reads, so it runs in
# a read transaction. The shared driver is deliberately left open: other cells
# open new sessions on it, and a driver must not be used after close().
with driver.session() as session:
    result = session.execute_read(get_all_communes_with_coords)

df_communes = pd.DataFrame(result)

In [107]:
def visualize_one_path(df, index):
    """Print one stored path and plot its communes on an OpenStreetMap background.

    Args:
        df: DataFrame whose first column holds, for each row, the collection
            of INSEE codes of a path.
        index: positional row number of the path to display.

    Reads the module-level `df_communes` frame (columns `insee`, `x`, `y`).
    """
    insee_codes = df.iloc[index, :].tolist()[0]
    print(f" Path : {insee_codes}")

    # Restrict the commune table to the communes met along the path.
    on_path = df_communes['insee'].isin(insee_codes)
    communes_on_path = df_communes.loc[on_path, :]

    px.scatter_mapbox(
        communes_on_path,
        lat='y',
        lon='x',
        hover_name='insee',
        mapbox_style='open-street-map',
    ).show()

    

In [108]:
# Plot the path stored at positional row 3 of `paths`.
visualize_one_path(paths, 3)

 Path : [11399, 11284, 11076, 11430, 11438, 11024, 66136]


In [80]:
# Print the number of communes of every path (first column of `paths`).
for communes in paths.iloc[:, 0]:
    print(len(communes))

18
38
15
66
49
51
73
93
73
59
20
74
42
28
18
59
24
18
42
15
38
58
59
12
6
49
58
77
32
74
20
25
85
69
23
54
39
31
68
30
40
57
66
10
65
58
49
43
63
60
7
18
47
28
57
61
53
15
58
33
59
35
87
57
9
21
82
32
15
12
10
50
51
60
39
53
58
26
36
61
61
15
45
46
48
28
48
64
59
76
38
52
48
20
21
68
51
11
44
24
4
20
82
28
39
42
33
56
57
34
50
8
12
5
26
34
10
3
82
6
42
49
30
32
18
67
65
34
31
36
25
37
81
23
3
57
48
7
29
38
11
55
35
40
60
55
16
14
37
72
36
60
21
49
54
73
28
9
19
30
35
25
22
37
53
91
35
14
62
61
41
78
71
23
46
21
29
58
42
25
58
17
7
46
56
56
44
51
6
37
17
29
107
76
20
59
96
3
39
8
9
49
74
64
20
39
28
70
30
47
62
78
7
64
28
30
42
24
57
81
92
37
37
33
84
57
39
49
21
12
21
8
48
44
8
30
61
27
57
50
79
14
35
62
4
41
39
67
44
12
69
44
39
22
21
82
13
54
42
20
62
10
89
11
16
54
68
30
5
28
13
44
32
9
22
4
30
66
15
18
20
5
25
61
11
82
97
63
46
28
66
75
16
67
22
46
70
59
13
33
28
72
12
90
12
44
108
40
63
67
63
15
15
35
21
15
82
18
39
18
89
22
48
24
30
80
64
52
15
38
14
9
63
18
64
34
10
66
27
46
35
