In [1]:
from datetime import datetime
from pyproj import CRS, Transformer
from itertools import combinations
from os.path import isfile, join
from os import listdir
from collections import Counter
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import sys
import math
import requests
import json
import time

PATH_data = "data" # path to the input data for the Intercités network
PATH_result = "../results" # path to the folder where the resulting dataframe is written

# Intercites

***Dictionnaire : ville -> position GPS en France, et inversement***

In [2]:
# Build the stop-name -> GPS position lookup, and its inverse, from the stops file.
errors = []  # ids of the entries for which no position could be extracted
# The positions in the XML are expressed in EPSG:2154 and converted to EPSG:4326.
transformer = Transformer.from_crs("EPSG:2154", "EPSG:4326")
tree = ET.parse(f"{PATH_data}/arrets.xml")
root_arret = tree.getroot()
dic_villes_pos = {}
for child in root_arret[2][0][0]:
    try:
        to_gps = child[1][0][0].text.split()
        # Explicit float conversion: do not rely on pyproj coercing strings.
        to_gps = transformer.transform(float(to_gps[0]), float(to_gps[1]))
        dic_villes_pos[child[0].text] = to_gps
    except Exception:
        # Best effort: entries without a usable position are only recorded.
        # `Exception` (rather than a bare `except`) lets Ctrl-C interrupt the cell.
        errors.append(child.attrib["id"])

# Inverse lookup. The original author checked that names and positions are one-to-one.
dic_pos_villes = {position: ville for ville, position in dic_villes_pos.items()}

errors

['FR:Quay:StopPoint_OCECar TER-87733931:',
 'FR:Quay:StopPoint_OCECar TER-87733923:',
 'FR:Quay:StopPoint_OCECar TER-87733857:',
 'FR:Quay:StopPoint_OCECar TER-87733840:',
 'FR:Quay:StopPoint_OCECar TER-87733899:',
 'FR:Quay:StopPoint_OCECar TER-87699298:',
 'FR:Quay:StopPoint_OCECar TER-00:',
 'FR:Quay:StopPoint_OCECar TER-87732222:',
 'FR:Quay:StopPoint_OCECar TER-87733733:',
 'FR:StopPlace:StopArea_OCE87733931_bus:',
 'FR:StopPlace:StopArea_OCE87733931:',
 'FR:StopPlace:StopArea_OCE87733923_bus:',
 'FR:StopPlace:StopArea_OCE87733923:',
 'FR:StopPlace:StopArea_OCE87733857_bus:',
 'FR:StopPlace:StopArea_OCE87733857:',
 'FR:StopPlace:StopArea_OCE87733840_bus:',
 'FR:StopPlace:StopArea_OCE87733840:',
 'FR:StopPlace:StopArea_OCE87733899_bus:',
 'FR:StopPlace:StopArea_OCE87733899:',
 'FR:StopPlace:StopArea_OCE87699298_bus:',
 'FR:StopPlace:StopArea_OCE87699298:',
 'FR:StopPlace:StopArea_OCE00_bus:',
 'FR:StopPlace:StopArea_OCE00:',
 'FR:StopPlace:StopArea_OCE87732222_bus:',
 'FR:StopPlace

***DataFrame (ville1, ville2, temps)***

In [3]:
# For every offer file of the network, extract the consecutive legs
# (stop -> next stop) together with their travel time in minutes.
ville1_to_ville2_temps = []
patterns_ignores = []  # (file, pattern id) pairs whose stops and timings could not be paired
dossier_reseau = f"{PATH_data}/reseau_SNCF_89ccb86166d6498e06e8ca28adb2dec1"
NETEX = "{http://www.netex.org.uk/netex}"  # XML namespace prefix of the tags searched below
xmls = [f for f in listdir(dossier_reseau) if isfile(join(dossier_reseau, f))]


def _horodatage(heure, decalage_jour):
    """Combine a "HH:MM:SS" text and a day-offset text into a comparable datetime.

    The offset is shifted by one because the "%d" directive does not accept day 0.
    """
    return datetime.strptime(f"{int(decalage_jour) + 1} {heure}", "%d %H:%M:%S")


for xml in xmls:
    tree = ET.parse(f"{dossier_reseau}/{xml}")
    root = tree.getroot()
    doc = root[2][0][0]

    # Every declared journey pattern starts with neither stops nor timings.
    patterns = {child.attrib["id"]: None
                for child in doc.findall(f"{NETEX}ServiceJourneyPattern")}
    patterns_time = patterns.copy()

    # Ordered stop names of each pattern. Consecutive ScheduledStopPoint elements
    # sharing the same id prefix are grouped under one pattern.
    scheduled = None
    racine = "FR:ServiceJourneyPattern:"
    try:
        for child in doc.findall(f"{NETEX}ScheduledStopPoint"):
            pattern = child.attrib["id"].split("_")[0].split(":")[-1] + ":"
            if scheduled != pattern:
                if scheduled is not None:
                    patterns[racine + scheduled] = stop_points
                stop_points = []
            to_gps = child[0][0].text.split()
            position = transformer.transform(float(to_gps[0]), float(to_gps[1]))
            # Unknown positions are kept as None so that indices stay aligned with timings.
            stop_points.append(dic_pos_villes.get(position))
            scheduled = pattern
        patterns[racine + scheduled] = stop_points
    except Exception:
        # Best effort: report the file whose stop points could not be read and carry on.
        # A file without any ScheduledStopPoint also ends up here.
        print(xml)

    # Travel time (seconds) between consecutive passing times of each pattern.
    scheduled = None
    for child in doc.findall(f"{NETEX}ServiceJourney"):
        pattern = child[1].attrib["ref"]
        departure_time = None
        if scheduled != pattern:  # assumes identical travel times for consecutive journeys of a pattern
            stop_points = []
            for child_bis in child[3].findall(f"{NETEX}TimetabledPassingTime"):
                arrival_time = _horodatage(child_bis[0].text, child_bis[1].text)
                if departure_time is not None:
                    delta = arrival_time - departure_time
                    stop_points.append(delta.days*86400 + delta.seconds)
                departure_time = _horodatage(child_bis[2].text, child_bis[3].text)
            patterns_time[pattern] = stop_points
        scheduled = pattern

    # Pair stops and timings: leg i goes from stop i to stop i+1 in temps[i] seconds.
    for key, temps in patterns_time.items():
        villes = patterns[key]
        try:
            for i in range(len(temps)):
                if villes[i] is None or villes[i+1] is None:
                    continue
                pos_i = dic_villes_pos[villes[i]]
                pos_i_1 = dic_villes_pos[villes[i+1]]
                ville1_to_ville2_temps.append([villes[i], round(pos_i[0], 5), round(pos_i[1], 5), villes[i+1],
                                               round(pos_i_1[0], 5), round(pos_i_1[1], 5), int(temps[i]/60)])
        except Exception:
            # Missing stops or timings, or mismatched lengths: keep the legs already
            # extracted and remember the pattern instead of silently ignoring it.
            patterns_ignores.append((xml, key))
            

# Flat table of the legs: one row per (departure stop, arrival stop, duration).
df_intercites = pd.DataFrame(
    ville1_to_ville2_temps,
    columns=[
        "depart", "latitude depart", "longitude depart",
        "arrivee", "latitude arrivee", "longitude arrivee",
        "duree (min)",
    ],
)

# Same data under two-level column labels: (depart | arrivee, nom | latitude | longitude).
col = pd.MultiIndex.from_arrays([
    ["depart", "depart", "depart", "arrivee", "arrivee", "arrivee", "duree (min)"],
    ["nom", "latitude", "longitude", "nom", "latitude", "longitude", ""],
])
df_2_intercites = df_intercites.copy()
df_2_intercites.columns = col

offre_6606e9b387694381bad9670de669ce1e.xml
offre_5158a30264e43367495e74b8c6735a7d.xml
offre_4a823d2a6a44543506265922a2815090.xml
offre_87ad5d4ce01ebbce4a7d08a35aaa73d9.xml
offre_ee971893e93fa0966860efcd711977d9.xml
offre_c2470a0ab5aba7edaba195520336f846.xml
offre_cd4698571d2697dbd8df59e9bdcdf6cf.xml
offre_f3972a93319038a8608b172c340856ee.xml
offre_9d5f0a8a4683373297428372ceb01b05.xml
offre_d1b9a952abe4aa6472a7a034ad4965b3.xml
offre_b2dc1344b6765647b68d1824eb506d41.xml


In [4]:
# Exception (found by looking at the graphs): legs departing from "Millançay-Bourg".
df_2_intercites[df_2_intercites[("depart", "nom")].eq("Millançay-Bourg")]

Unnamed: 0_level_0,depart,depart,depart,arrivee,arrivee,arrivee,duree (min)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1
2922,Millançay-Bourg,1.77289,47.44759,Chaumont-sur-Tharonne,47.61062,1.90438,16


In [5]:
# Legs arriving at "Millançay-Bourg".
df_2_intercites[df_2_intercites[("arrivee", "nom")].eq("Millançay-Bourg")]

Unnamed: 0_level_0,depart,depart,depart,arrivee,arrivee,arrivee,duree (min)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1
2921,Romorantin-Gendarmerie,1.7379,47.38437,Millançay-Bourg,1.77289,47.44759,7


In [6]:
# Legs departing from "Romorantin-Gendarmerie".
df_2_intercites[df_2_intercites[("depart", "nom")].eq("Romorantin-Gendarmerie")]

Unnamed: 0_level_0,depart,depart,depart,arrivee,arrivee,arrivee,duree (min)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1
2921,Romorantin-Gendarmerie,1.7379,47.38437,Millançay-Bourg,1.77289,47.44759,7


In [7]:
# Legs arriving at "Romorantin-Gendarmerie".
df_2_intercites[df_2_intercites[("arrivee", "nom")].eq("Romorantin-Gendarmerie")]

Unnamed: 0_level_0,depart,depart,depart,arrivee,arrivee,arrivee,duree (min)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1
2920,Romorantin-Lycée-de-F,47.36936,1.73913,Romorantin-Gendarmerie,1.7379,47.38437,3


In [8]:
# The tables displayed above show latitude and longitude swapped for these two stops.
# The correction is applied by stop name, on both the departure and the arrival side,
# so it does not depend on row labels (which follow the file listing order).
# `.loc` is used because `.at` is a single-value accessor.
positions_corrigees = {
    "Millançay-Bourg": (47.44759, 1.77289),
    "Romorantin-Gendarmerie": (47.38437, 1.7379),
}
for nom_arret, (latitude, longitude) in positions_corrigees.items():
    for cote in ("depart", "arrivee"):
        lignes = df_2_intercites[(cote, "nom")] == nom_arret
        df_2_intercites.loc[lignes, (cote, "latitude")] = latitude
        df_2_intercites.loc[lignes, (cote, "longitude")] = longitude

In [9]:
# Exception (found by looking at the graphs): inconsistent position, updated from an
# internet search. Legs departing from "Riedseltz S Polyvalente".
df_2_intercites[df_2_intercites[("depart", "nom")].eq("Riedseltz S Polyvalente")]

Unnamed: 0_level_0,depart,depart,depart,arrivee,arrivee,arrivee,duree (min)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1
2584,Riedseltz S Polyvalente,48.93195,-4.4771,Wissembourg-Altenstadt.,49.02825,7.96388,8
2588,Riedseltz S Polyvalente,48.93195,-4.4771,Riedseltz-Chapelle,48.99034,7.95072,1


In [10]:
# Legs arriving at "Riedseltz S Polyvalente".
df_2_intercites[df_2_intercites[("arrivee", "nom")].eq("Riedseltz S Polyvalente")]

Unnamed: 0_level_0,depart,depart,depart,arrivee,arrivee,arrivee,duree (min)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1
2583,Riedseltz-Chapelle,48.99034,7.95072,Riedseltz S Polyvalente,48.93195,-4.4771,1
2587,Wissembourg-Altenstadt.,49.02825,7.96388,Riedseltz S Polyvalente,48.93195,-4.4771,8


In [11]:
# Replace the inconsistent position of "Riedseltz S Polyvalente" on both the departure
# and the arrival side. Rows are selected by stop name rather than by hard-coded row
# labels, and `.loc` is used because `.at` is a single-value accessor.
nom_arret = "Riedseltz S Polyvalente"
latitude, longitude = 48.9907019, 7.9628874
for cote in ("depart", "arrivee"):
    lignes = df_2_intercites[(cote, "nom")] == nom_arret
    df_2_intercites.loc[lignes, (cote, "latitude")] = latitude
    df_2_intercites.loc[lignes, (cote, "longitude")] = longitude

***DataFrame multi-index, suite (suppression des doublons, ajout du prix et de l'empreinte carbone)***

In [12]:
col = pd.MultiIndex.from_arrays([
    ["point_1", "point_1", "point_1", "point_2", "point_2", "point_2"],
    ["nom", "latitude", "longitude", "nom", "latitude", "longitude"],
])

# When both A->B and B->A exist, only the orientation seen first is kept and its
# duration is the mean over every leg observed in either direction.
# Pairs are handled as (depart, arrivee) tuples: hashable (constant-time membership)
# and safe even if a stop name contains " : ".
to_keep = set()  # orientations that are kept
duree = {}       # kept pair -> summed duration, replaced by the mean further down
to_divide = {}   # kept pair -> number of legs summed
noms = df_2_intercites[[("depart", "nom"), ("arrivee", "nom")]].values.tolist()
temps_min = df_2_intercites[("duree (min)", "")].tolist()
for (v1, v2), t in zip(noms, temps_min):
    if (v1, v2) in to_keep:
        paire = (v1, v2)
    elif (v2, v1) in to_keep:
        paire = (v2, v1)
    else:
        paire = (v1, v2)
        to_keep.add(paire)
    duree[paire] = duree.get(paire, 0.0) + t
    to_divide[paire] = to_divide.get(paire, 0) + 1

# Keep only the rows written in the retained orientation.
# NOTE: this overwrites df_2_intercites, so re-running this cell alone averages over
# the already-filtered rows; re-run from the cell that builds df_2_intercites.
a_garder = np.array([(v1, v2) in to_keep for v1, v2 in noms], dtype=bool)
df_2_intercites = df_2_intercites.loc[a_garder]

# Mean duration per kept pair (both dicts share the same insertion order).
moyennes = np.array(list(duree.values())) / np.array(list(to_divide.values()))
duree = dict(zip(list(duree), moyennes))

# One row per kept pair, carrying the coordinates of its first occurrence.
# Rows are ordered by decreasing number of occurrences, ties by first appearance.
colonnes = [("depart", "nom"), ("depart", "latitude"), ("depart", "longitude"),
            ("arrivee", "nom"), ("arrivee", "latitude"), ("arrivee", "longitude")]
occurrences = Counter()
premiere_ligne = {}
for ligne in df_2_intercites[colonnes].values.tolist():
    paire = (ligne[0], ligne[3])
    occurrences[paire] += 1
    premiere_ligne.setdefault(paire, ligne)
paires_ordonnees = [paire for paire, _ in occurrences.most_common()]

df_3_intercites = pd.DataFrame([premiere_ligne[paire] for paire in paires_ordonnees], columns=col)
df_3_intercites[("duree (min)", "")] = [round(duree[paire], 2) for paire in paires_ordonnees]
    
def ecart(lat_depart,lat_arrivee,long_depart,long_arrivee, earth_radius_km=6378):
    """Great-circle distance between two points, computed with the haversine formula.

    Note the argument order: both latitudes first, then both longitudes.

    Args:
        lat_depart: latitude of the first point, in degrees.
        lat_arrivee: latitude of the second point, in degrees.
        long_depart: longitude of the first point, in degrees.
        long_arrivee: longitude of the second point, in degrees.
        earth_radius_km: sphere radius used for the conversion. The default (6378)
            is the value historically used here; the mean Earth radius is about 6371.

    Returns:
        The distance, in the unit of ``earth_radius_km`` (kilometres by default).
    """
    lat_depart, long_depart = math.radians(lat_depart), math.radians(long_depart)
    lat_arrivee, long_arrivee = math.radians(lat_arrivee), math.radians(long_arrivee)
    haversine = (math.sin((lat_depart - lat_arrivee)/2)**2
                 + math.cos(lat_depart)*math.cos(lat_arrivee)*math.sin((long_depart - long_arrivee)/2)**2)
    # Floating-point rounding can push the term marginally outside [0, 1], which
    # would make sqrt/asin raise a "math domain error".
    haversine = min(1.0, max(0.0, haversine))
    return 2*earth_radius_km*math.asin(math.sqrt(haversine))

# Factors applied to the distance of each leg (previously inlined in the formulas).
EMISSION_G_CO2_PAR_KM = 10.8
PRIX_EUROS_PAR_100_KM = 9

# Great-circle length of every leg, in kilometres.
dist = [
    round(ecart(lat_depart, lat_arrivee, lng_depart, lng_arrivee), 3)
    for lat_depart, lng_depart, lat_arrivee, lng_arrivee in df_3_intercites[
        [("point_1", "latitude"), ("point_1", "longitude"),
         ("point_2", "latitude"), ("point_2", "longitude")]
    ].values
]
df_3_intercites[("distance (km)", "")] = dist
# Explicit two-level keys: `("name")` is a plain string, not a tuple. pandas pads it
# to ("name", ""), so the resulting column labels are unchanged.
df_3_intercites[("empreinte carbone (gCO2)", "")] = [round(EMISSION_G_CO2_PAR_KM*d, 1) for d in dist]
df_3_intercites[("prix (euros)", "")] = [round(PRIX_EUROS_PAR_100_KM*d/100, 2) for d in dist]

In [13]:
# Legs with a zero distance (both end points identical): to be removed.
df_3_intercites[df_3_intercites[("distance (km)", "")].eq(0.0)]

Unnamed: 0_level_0,point_1,point_1,point_1,point_2,point_2,point_2,duree (min),distance (km),empreinte carbone (gCO2),prix (euros)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1446,Le Mont Dore Ski,45.54211,2.81673,Le Mont Dore Ski,45.54211,2.81673,10.0,0.0,0.0,0.0


In [14]:
# Remove the degenerate legs whose distance is zero (same stop at both ends).
# Filtering on the value, not on a hard-coded row label, keeps the cell valid when
# the row order changes and lets it be re-run without raising KeyError.
df_3_intercites = df_3_intercites.loc[df_3_intercites[("distance (km)", "")] != 0.0]

In [15]:
# Write the final table to CSV, leaving out the row index.
chemin_csv = f"{PATH_result}/df_intercites.csv"
df_3_intercites.to_csv(chemin_csv, index=False)

# Résultats

In [16]:
# Display the final table of legs (duration, distance, carbon footprint and price).
df_3_intercites

Unnamed: 0_level_0,point_1,point_1,point_1,point_2,point_2,point_2,duree (min),distance (km),empreinte carbone (gCO2),prix (euros)
Unnamed: 0_level_1,nom,latitude,longitude,nom,latitude,longitude,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Gare de Cormery,47.26235,0.83382,Cormery-Pl-du-Croissant,47.26861,0.83527,1.95,0.705,7.6,0.06
1,Cormery-Pl-du-Croissant,47.26861,0.83527,Truyes-Chapelle,47.27528,0.83055,1.96,0.824,8.9,0.07
2,Gare de Loches,47.13040,1.00097,Loches-St-Jacques,47.13755,0.99982,2.11,0.801,8.7,0.07
3,Chambourg-Chopin,47.18221,0.96805,Azay-Rivieres,47.20921,0.94616,3.86,3.431,37.1,0.31
4,Chambray-CHR-Trousseau,47.34777,0.71000,Gare de Tours,47.38981,0.69351,15.28,4.842,52.3,0.44
...,...,...,...,...,...,...,...,...,...,...
3293,Gare de Albens,45.78613,5.94855,Grésy-S/Aix-Ondea,45.72324,5.91845,7.00,7.381,79.7,0.66
3294,Grésy-S/Aix-Ondea,45.72324,5.91845,Gare de Aix-les-Bains-le-Revard,45.68786,5.90935,14.00,4.001,43.2,0.36
3295,Gare de Aillevillers,47.91387,6.33739,St-Loup-sur-Semouse,47.88563,6.27575,10.00,5.572,60.2,0.50
3296,St-Loup-sur-Semouse,47.88563,6.27575,Gare de Luxeuil-les-Bains,47.81492,6.37268,17.00,10.695,115.5,0.96
