In [1]:
from datetime import datetime
from pyproj import CRS, Transformer
from itertools import combinations
from os.path import isfile, join
from os import listdir
from collections import Counter
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import sys
import math
import requests
import json
import time

# Bus

***Dictionnaire : ville -> position GPS en France, et inversement***

In [2]:
# Read every served stop in Europe and map its name to the coordinates obtained by
# transforming the file's coordinates from EPSG:2154 to EPSG:4326.
transformer = Transformer.from_crs("EPSG:2154", "EPSG:4326")
tree = ET.parse("bus/flexibus/arrets.xml")
root_arret = tree.getroot()
dic_temp_1 = {}
for arret in root_arret[2][0][0]:
    # The coordinate text holds whitespace-separated values; only the first two are used.
    coordonnees = arret[1][0][0].text.split()
    dic_temp_1[arret[0].text] = transformer.transform(coordonnees[0], coordonnees[1])

# First filter: keep the stops lying strictly inside the bounding box given by the
# extreme GPS points of continental metropolitan France (decimal degrees).
lat_nord = 51.09  # rounded up to the next hundredth so that the extreme point is included
lat_sud = 42.34
long_est = 8.24
long_ouest = -4.80
dic_temp_2 = {
    nom: position
    for nom, position in dic_temp_1.items()
    # position[0] is compared with the latitude bounds, position[1] with the longitude bounds.
    if lat_sud < position[0] < lat_nord and long_ouest < position[1] < long_est
}

# Second filter: keep the stops whose name contains the name of a French commune.
array_communes_fr = pd.read_csv("bus/flexibus/liste-des-communes-francaises.csv", sep=";", usecols=[1]).values
# Membership on the NumPy array rescans every commune for each candidate name;
# a set gives the same answer with a constant-time lookup.
communes_fr = set(array_communes_fr.ravel())
dic_temp_3 = {}
for key, value in dic_temp_2.items():
    key_split = key.split()  # the stop name, word by word
    # Candidates are every order-preserving selection of 1..n words of the stop name,
    # joined by single spaces. The search stops at the first candidate that is a commune.
    if any(
        " ".join(j) in communes_fr
        for i in range(1, len(key_split) + 1)
        for j in combinations(key_split, i)
    ):
        dic_temp_3[key] = value

# Third filter, done by hand: stops that passed the two previous filters but are
# explicitly excluded by name.
to_remove = ["Cologne central station (FlixTrain)", "Cologne East (Bergisch Gladbach, Refrath)",
             "Siegen (Koblenzer Straße)", "Cologne South (Airport CGN Terminal 2)",
             "Maastricht (International Bus Station)", "Luxembourg, P+R Bouillon", "Fribourg (Poya)",
             "Liège Guillemins (Bus parking Rue Varin/Rue Bovy)"]
# Stop name -> position, for every stop that is not in the exclusion list.
dic_villes_pos = {
    nom: position
    for nom, position in dic_temp_3.items()
    if nom not in to_remove
}
        
# Reverse lookup: position -> stop name.
# Author's note: it was checked that each position has exactly one stop and vice versa.
dic_pos_villes = {position: nom for nom, position in dic_villes_pos.items()}

***DataFrame (ville1, ville2, temps)***

In [3]:
# Build the list of direct legs
# [from, lat from, lon from, to, lat to, lon to, duration in minutes]
# from every file found in the network folder.
NETEX_NS = "{http://www.netex.org.uk/netex}"
RESEAU_DIR = "bus/flexibus/reseau_FlixBuseu_52fae30d90072cf43c6ff50fbd00c07d"

ville1_to_ville2_temps = []
xmls = [f for f in listdir(RESEAU_DIR) if isfile(join(RESEAU_DIR, f))]

for xml in xmls:
    tree = ET.parse(f"{RESEAU_DIR}/{xml}")
    root = tree.getroot()
    doc = root[2][0][0]  # element holding the patterns, stop points and journeys (positional access)

    # Journey-pattern id -> ordered stop names (`patterns`) and -> leg durations in
    # seconds (`patterns_time`). Both start at None and are filled below.
    patterns = {}
    for child in doc.findall(f"{NETEX_NS}ServiceJourneyPattern"):
        patterns[child.attrib["id"]] = None
    patterns_time = patterns.copy()

    # Consecutive ScheduledStopPoint elements are grouped by the pattern id rebuilt
    # from their own id; each group becomes the stop list of that pattern.
    scheduled = None
    racine = "FR:ServiceJourneyPattern:"
    for child in doc.findall(f"{NETEX_NS}ScheduledStopPoint"):
        pattern = child.attrib["id"].split("_")[0].split(":")[-1] + ":"
        if scheduled != pattern:
            if scheduled is not None:
                patterns[racine + scheduled] = stop_points
            stop_points = []
        to_gps = child[0][0].text.split()
        # Stops that are not in dic_pos_villes (filtered out earlier) are stored as None.
        stop_points.append(dic_pos_villes.get(transformer.transform(to_gps[0], to_gps[1]), None))
        scheduled = pattern
    if scheduled is not None:
        # Flush the last group; skipped when the file contains no ScheduledStopPoint.
        patterns[racine + scheduled] = stop_points

    # Leg durations: for each run of consecutive journeys sharing a pattern, only the
    # first journey is read (all journeys of a pattern are assumed to take the same time).
    scheduled = None
    for child in doc.findall(f"{NETEX_NS}ServiceJourney"):
        pattern = child[1].attrib["ref"]
        departure_time = None
        if scheduled != pattern:
            leg_seconds = []
            for child_bis in child[3].findall(f"{NETEX_NS}TimetabledPassingTime"):
                # The day offset is turned into a day of month (offset 0 -> day 1) so that
                # passing times on following days subtract correctly.
                arrival_day_offset = int(child_bis[1].text) + 1
                arrival_time = datetime.strptime(f"{arrival_day_offset} {child_bis[0].text}",
                                                 "%d %H:%M:%S")
                if departure_time is not None:
                    # Duration = arrival at this stop - departure from the previous one.
                    delta = arrival_time - departure_time
                    leg_seconds.append(delta.days*86400 + delta.seconds)
                departure_day_offset = int(child_bis[3].text) + 1
                departure_time = datetime.strptime(f"{departure_day_offset} {child_bis[2].text}",
                                                   "%d %H:%M:%S")
            patterns_time[pattern] = leg_seconds
        scheduled = pattern

    for key, temps in patterns_time.items():
        villes = patterns[key]
        if temps is None or villes is None:
            # Pattern declared without any journey or without any stop point: no leg to emit.
            continue
        # Pair each stop with the next one and with the duration of that leg.
        for ville_1, ville_2, secondes in zip(villes, villes[1:], temps):
            if ville_1 is not None and ville_2 is not None:
                pos_1 = dic_villes_pos[ville_1]
                pos_2 = dic_villes_pos[ville_2]
                ville1_to_ville2_temps.append([ville_1, round(pos_1[0], 5), round(pos_1[1], 5), ville_2,
                                               round(pos_2[0], 5), round(pos_2[1], 5), int(secondes/60)])
            

# Flat table of the legs, one row per (departure stop, arrival stop, duration).
df_bus = pd.DataFrame(
    ville1_to_ville2_temps,
    columns=[
        "depart", "latitude depart", "longitude depart",
        "arrivee", "latitude arrivee", "longitude arrivee",
        "duree (min)",
    ],
)

# Same data under two-level column labels: (depart | arrivee, nom | latitude | longitude)
# plus ("duree (min)", "").
col = pd.MultiIndex.from_arrays([
    ["depart", "depart", "depart", "arrivee", "arrivee", "arrivee", "duree (min)"],
    ["nom", "latitude", "longitude", "nom", "latitude", "longitude", ""],
])
# The flat columns are already in the order of `col`, so relabelling a copy is enough.
df_2_bus = df_bus.copy()
df_2_bus.columns = col

***DataFrame MultiIndex, suite (suppression des doublons, ajout du prix et de l'empreinte carbone)***

In [4]:
# Reload the legs from the intermediate CSV, which also carries a "distance (km)" column.
# NOTE(review): nothing visible in this notebook writes bus/df_bus_temp.csv; presumably
# it is produced by a separate step that adds the distances. Confirm before a fresh run.
col = pd.MultiIndex.from_arrays([
    ["point_1", "point_1", "point_1", "point_2", "point_2", "point_2", "duree (min)", "distance (km)"],
    ["nom", "latitude", "longitude", "nom", "latitude", "longitude", "", ""],
])
# header=1 makes the data start after the second line of the file; `names` supplies
# the column labels instead of the file's own header.
df_2_bus = pd.read_csv("bus/df_bus_temp.csv", sep=",", header=1, names=col)

# Round the four coordinate columns to 5 decimals (vectorised; replaces the
# deprecated DataFrame.applymap).
colonnes_gps = [("point_1", "latitude"), ("point_1", "longitude"),
                ("point_2", "latitude"), ("point_2", "longitude")]
df_2_bus[colonnes_gps] = df_2_bus[colonnes_gps].round(5)
# When both A->B and B->A exist, B->A is dropped and the duration kept for A->B is the
# mean over the rows of both directions. The orientation kept is the one seen first.
nom_1, nom_2 = ("point_1", "nom"), ("point_2", "nom")
to_keep = []    # kept [point_1, point_2] pairs, in order of first appearance
duree = {}      # "A : B" -> sum of the durations (turned into the mean below)
to_divide = {}  # "A : B" -> number of rows summed
cle_paire = {}  # (A, B) and (B, A) -> "A : B" key of the kept orientation
for v1, v2, t in df_2_bus[[nom_1, nom_2, ("duree (min)", "")]].values:
    if (v1, v2) not in cle_paire:
        # First time this pair is seen in either direction: keep this orientation.
        to_keep.append([v1, v2])
        cle_paire[(v1, v2)] = cle_paire[(v2, v1)] = f"{v1} : {v2}"
    cle = cle_paire[(v1, v2)]
    duree[cle] = duree.get(cle, 0.0) + t
    to_divide[cle] = to_divide.get(cle, 0) + 1

# Keep only the rows written in the kept orientation.
paires_gardees = {tuple(paire) for paire in to_keep}
masque = np.array(
    [(v1, v2) in paires_gardees for v1, v2 in df_2_bus[[nom_1, nom_2]].values],
    dtype=bool,
)
df_2_bus = df_2_bus.loc[masque]

# Sum -> mean, divided key by key so the two dicts need not stay in the same order.
# Values stay NumPy floats, as before.
duree = {cle: np.float64(total) / to_divide[cle] for cle, total in duree.items()}

# One row per (point_1, point_2) pair, most frequent pair first (ties keep the order of
# first appearance). Names, coordinates and distance come from the first row of the
# pair; the duration is the mean stored in `duree`.
nom_1, nom_2 = ("point_1", "nom"), ("point_2", "nom")
colonnes_points = [("point_1", "nom"), ("point_1", "latitude"), ("point_1", "longitude"),
                   ("point_2", "nom"), ("point_2", "latitude"), ("point_2", "longitude")]

# Pairs are handled as tuples, so a stop name containing " : " cannot be split wrongly.
paires = list(zip(df_2_bus[nom_1].values, df_2_bus[nom_2].values))

# Row position of the first occurrence of each pair.
premiere_ligne = {}
for position, paire in enumerate(paires):
    premiere_ligne.setdefault(paire, position)

points = df_2_bus[colonnes_points].values
distances = df_2_bus[("distance (km)", "")].values

df_3_bus = []
for paire, _ in Counter(paires).most_common():
    position = premiere_ligne[paire]
    df_3_bus.append(points[position].tolist()
                    + [round(duree[f"{paire[0]} : {paire[1]}"], 2)]
                    + [round(distances[position], 3)])
df_3_bus = pd.DataFrame(df_3_bus, columns=col)

# Derived columns, both computed from the distance in km, then export to CSV.
GCO2_PAR_KM = 58.5         # factor applied to the distance for the "(gCO2)" column
EUROS_PAR_100_KM = 7       # factor applied to the distance / 100 for the "(euros)" column


def _empreinte_gco2(distance_km):
    """Return the carbon-footprint value for a distance, rounded to 1 decimal."""
    return round(GCO2_PAR_KM*distance_km, 1)


def _prix_euros(distance_km):
    """Return the price value for a distance, rounded to 2 decimals."""
    return round(EUROS_PAR_100_KM*distance_km/100, 2)


distances_km = df_3_bus[("distance (km)", "")]
df_3_bus[("empreinte carbone (gCO2)", "")] = distances_km.apply(_empreinte_gco2)
df_3_bus[("prix (euros)", "")] = distances_km.apply(_prix_euros)
df_3_bus.to_csv("results/df_bus.csv", index=False)

# Résultats

In [5]:
# Display the final table of bus legs (last expression of the cell, so it is rendered).
df_3_bus

Unnamed: 0_level_0,point_1,point_1,point_1,point_2,point_2,point_2,duree (min),distance (km),prix (euros),empreinte carbone (gCO2)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Paris Charles de Gaulle (Central Bus Station -...,49.01067,2.55926,Paris (Bercy Seine),48.83532,2.38052,42.69,29.749,2.08,1740.3
1,Lyon Perrache (Central Bus Station),45.74875,4.82602,Grenoble (central bus station),45.19283,5.71394,95.50,104.401,7.31,6107.5
2,Paris (Orly Airport),48.73156,2.37358,Paris (Bercy Seine),48.83532,2.38052,31.25,15.046,1.05,880.2
3,Lyon Perrache (Central Bus Station),45.74875,4.82602,Clermont-Ferrand (bus terminal Les Salins),45.77076,3.08225,128.12,166.360,11.65,9732.1
4,Paris Charles de Gaulle (Central Bus Station -...,49.01067,2.55926,Lille,50.63920,3.07620,138.93,198.490,13.89,11611.7
...,...,...,...,...,...,...,...,...,...,...
224,Saint-Étienne,45.44250,4.40286,Givors (South Lyon),45.59105,4.77193,35.00,37.148,2.60,2173.2
225,Givors (South Lyon),45.59105,4.77193,Lyon Perrache (Central Bus Station),45.74875,4.82602,20.00,23.717,1.66,1387.4
226,Dunkerque,51.03119,2.36896,Lille,50.63920,3.07620,60.00,82.963,5.81,4853.3
227,Calais (Bus Stop on Terminal Ferry),50.96679,1.86345,Lille,50.63920,3.07620,90.00,116.484,8.15,6814.3
