In [None]:
import pandas as pd
import geopandas as gpd
import h3pandas
from shapely.geometry import Point, Polygon, LineString
import json
import folium
import os
import numpy as np
from mappymatch.constructs.geofence import Geofence
from shapely import prepare

In [None]:
# Configuration: path of the JSON report written by the final cell of this notebook.
OUTPUT_FILE = "../static/data/emg_compare.json"

In [None]:
# Accumulator for every indicator computed below; dumped to OUTPUT_FILE at the end.
result_dict = dict()

In [None]:
# Load the three geofences used to classify journey endpoints.
# NOTE(review): "pc" presumably means "petite couronne" — confirm against the GeoJSON sources.
geofence_sources = (
    "sources/paris.geojson",
    "sources/region-ile-de-france.geojson",
    "sources/idf_pc_only.geojson",
)
geofence_paris, geofence_idf, geofence_idf_pc_only = map(Geofence.from_geojson, geofence_sources)

In [None]:
# Load one CSV extract per day (2024-07-24 through 2024-08-06) and keep only
# the rows whose `code` column equals 11.
# NOTE(review): code 11 is presumably the Ile-de-France region code — confirm.
li = []
for day in pd.date_range("2024-07-24", "2024-08-06").strftime("%Y-%m-%d"):
    daily = pd.read_csv(f"sources/data_france_{day}.csv", engine="pyarrow")
    li.append(daily[daily["code"] == 11])

df_src = pd.concat(li, axis=0, ignore_index=True)
# These columns are not used by the rest of the notebook.
df_src = df_src.drop(columns=["geometry", "index_right", "nom", "code"])
df_src

In [None]:
# Numeric transportation-mode codes -> symbolic mode names.
tr = {
    -10: "NOT_DEFINED",
    0: "UNKNOWN",
    1: "PASSENGER_CAR",
    2: "MOTORCYCLE",
    3: "HEAVY_DUTY_VEHICLE",
    4: "BUS",
    5: "COACH",
    6: "RAIL_TRIP",
    7: "BOAT_TRIP",
    8: "BIKE_TRIP",
    9: "PLANE",
    10: "SKI",
    11: "FOOT",
    12: "IDLE",
    13: "OTHER",
    101: "SCOOTER",
    102: "HIGH_SPEED_TRAIN",
}
# Numeric engine-type codes -> display labels (French).
tre = {
    -10: "",
    1: "Essence",
    2: "Diesel",
    3: "Electrique",
    4: "Essence Hybride",
    5: "Diesel Hybride",  # label typo fixed (was "Diesiel Hybride")
}
# Direct dict indexing: a code missing from the table raises KeyError rather
# than silently producing NaN.
df_src['transportation_mode_tr'] = df_src['transportation_mode'].apply(lambda x: tr[x])
df_src['engine_type_tr'] = df_src['engine_type'].apply(lambda x: tre[x])  # Unused

In [None]:
# Parse both timestamp columns (format="mixed" infers the format per value).
for time_col in ("start_time", "end_time"):
    df_src[time_col] = pd.to_datetime(df_src[time_col], format="mixed")

# Duration of each row, in seconds.
df_src["duration"] = df_src["end_time"].sub(df_src["start_time"]).dt.total_seconds()

In [None]:
# Journey-level summary: start taken from the first row of each journey, end
# from the last row, duration and distance summed over all of its rows.
# NOTE(review): 'first'/'last' follow row order; this assumes rows of a journey
# are already in chronological order — confirm.
journey_aggregations = {
    "journey_starting_longitude": ('starting_longitude', 'first'),
    "journey_starting_latitude": ('starting_latitude', 'first'),
    "journey_ending_longitude": ('ending_longitude', 'last'),
    "journey_ending_latitude": ('ending_latitude', 'last'),
    "journey_start_time": ('start_time', 'first'),
    "journey_end_time": ('end_time', 'last'),
    "journey_duration": ('duration', 'sum'),
    "journey_distance": ('distance_km', 'sum'),
}
journey_df = df_src.groupby("journey_id").agg(**journey_aggregations).reset_index()

# Attach the journey-level columns to every row of the journey.
df_src = df_src.merge(journey_df, on="journey_id")
df_src

In [None]:
# Active geometry = journey start point (EPSG:4326); the journey end point is
# kept in a secondary `end_geometry` column.
journey_start_points = gpd.points_from_xy(
    df_src["journey_starting_longitude"],
    df_src["journey_starting_latitude"],
)
gdf = gpd.GeoDataFrame(df_src, geometry=journey_start_points, crs="EPSG:4326")
gdf["end_geometry"] = gpd.points_from_xy(
    gdf["journey_ending_longitude"],
    gdf["journey_ending_latitude"],
)

In [None]:
# Flag, for each zone, whether the journey start / end point lies inside it.
zone_fences = (
    ("paris", geofence_paris),
    ("idf", geofence_idf),
    ("pc", geofence_idf_pc_only),
)

# Prepare all geometries first to speed up the repeated containment tests.
for _, fence in zone_fences:
    prepare(fence.geometry)

for zone, fence in zone_fences:
    gdf[f"start_in_{zone}"] = fence.geometry.contains(gdf.geometry)
    gdf[f"end_in_{zone}"] = fence.geometry.contains(gdf.end_geometry)

# "gc" = inside the region but in neither Paris nor the "pc" geofence.
gdf["start_in_gc"] = gdf["start_in_idf"] & ~(gdf["start_in_paris"] | gdf["start_in_pc"])
gdf["end_in_gc"] = gdf["end_in_idf"] & ~(gdf["end_in_paris"] | gdf["end_in_pc"])

In [None]:
# Keep only the rows of journeys that both start and end inside the region.
both_ends_in_idf = gdf["start_in_idf"] & gdf["end_in_idf"]
df = gdf[both_ends_in_idf]

In [None]:
# One row per journey: the first row of each journey_id group.
rows_by_journey = df.groupby("journey_id")
df_gp_journey = rows_by_journey.nth(0)

## Nombre de déplacements par jour de la semaine

In [None]:
# Result slots for this section: measured data, EMG reference, and their difference.
result_dict["nb_journey_days"] = {slot: {} for slot in ("data", "emg", "diff")}

In [None]:
# Journeys per weekday of their start time, plus each weekday's fraction of the total.
weekday_of_start = df_gp_journey["journey_start_time"].dt.dayofweek
nb_journey_days = df_gp_journey.groupby(weekday_of_start).agg(NbJourney=('journey_id', 'count'))
total = nb_journey_days["NbJourney"].sum()
nb_journey_days["percent"] = nb_journey_days["NbJourney"] / total
# Positional relabelling (dayofweek 0 = Monday); requires exactly seven groups.
weekday_labels = pd.Index(['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche'])
nb_journey_days = nb_journey_days.set_index(weekday_labels)
result_dict["nb_journey_days"]["data"] = nb_journey_days.to_dict()
nb_journey_days

In [None]:
# EMG reference values per weekday, normalised to a fraction of their sum.
emg_nb_journey_days = pd.DataFrame(
    {'NbJourney': [34.5, 35.5, 34.9, 34.2, 33.8, 29.0, 19.0]},
    index=pd.Index(
        ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche'],
        name='index',
    ),
)
total = emg_nb_journey_days["NbJourney"].sum()
emg_nb_journey_days["percent"] = emg_nb_journey_days["NbJourney"] / total
result_dict["nb_journey_days"]["emg"] = emg_nb_journey_days.to_dict()
emg_nb_journey_days

In [None]:
# Difference between measured and EMG weekday shares, in percentage points.
weekday_share_gap = nb_journey_days["percent"].sub(emg_nb_journey_days["percent"])
result_dict["nb_journey_days"]["diff"] = weekday_share_gap.mul(100).to_dict()

## % des individus ne se déplaçant pas par jour de la semaine
Pour éviter les faux positifs (des gens qui quittent l'IdF ou y arrivent pendant la période), je ne considère que les utilisateurs avec au moins 7 jours de déplacement au cours des 2 semaines.

In [None]:
# Result slots for this section: measured data, EMG reference, and their difference.
result_dict["users_not_moving"] = {slot: {} for slot in ("data", "emg", "diff")}

In [None]:
# Minimum number of distinct travel days for a user to count as "active".
MIN_ACTIVE_DAYS = 7

# Count distinct travel days per user. The helper `day` column lives only on a
# temporary copy (via assign), so `df_gp_journey` itself is left untouched; the
# previous version added the column in place and its `drop` result was discarded.
# Day-of-month is enough to tell days apart here because the loaded period
# spans 14 consecutive days.
distinctDay = (
    df_gp_journey
    .assign(day=df_gp_journey["journey_start_time"].dt.day)
    .groupby('moover_id')
    .agg(DistinctDay=('day', 'nunique'))
)
active_users = distinctDay[distinctDay.DistinctDay >= MIN_ACTIVE_DAYS].index

In [None]:
# Among active users only: number of distinct users with at least one journey
# on each weekday, and the complementary share of users with none.
active_journeys = df_gp_journey[df_gp_journey['moover_id'].isin(active_users)]
weekday_of_start = active_journeys["journey_start_time"].dt.dayofweek
nb_users_days = active_journeys.groupby(weekday_of_start).agg(NbUsers=('moover_id', 'nunique'))
total = active_journeys["moover_id"].nunique()
nb_users_days["percent"] = nb_users_days["NbUsers"] * 100 / total
nb_users_days["percent_users_not_moving"] = 100 - nb_users_days["percent"]
# Positional relabelling (dayofweek 0 = Monday); requires exactly seven groups.
weekday_labels = pd.Index(['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche'])
nb_users_days = nb_users_days.set_index(weekday_labels)
result_dict["users_not_moving"]["data"] = nb_users_days.to_dict()
nb_users_days

In [None]:
# EMG reference: percentage of people not travelling, per weekday.
emg_users_not_moving = pd.DataFrame(
    {'percent_users_not_moving': [5.1, 4.4, 4.9, 5.4, 6.4, 10.8, 24]},
    index=pd.Index(['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']),
)
result_dict["users_not_moving"]["emg"] = emg_users_not_moving.to_dict()
emg_users_not_moving

In [None]:
# Measured minus EMG share of non-travelling users (both already in percent).
not_moving_gap = nb_users_days["percent_users_not_moving"].sub(
    emg_users_not_moving["percent_users_not_moving"]
)
result_dict["users_not_moving"]["diff"] = not_moving_gap.to_dict()

## Nb déplacement par heure de la journée

In [None]:
# Result slots for this section: measured data, EMG reference, and their difference.
result_dict["nb_journey_hour"] = {slot: {} for slot in ("data", "emg", "diff")}

In [None]:
# Journeys per hour of their start time, with each hour's share in percent.
hour_of_start = df_gp_journey["journey_start_time"].dt.hour
nb_journey_hour = df_gp_journey.groupby(hour_of_start).agg(NbJourney=('journey_id', 'count'))
total = nb_journey_hour["NbJourney"].sum()
nb_journey_hour["percent"] = nb_journey_hour["NbJourney"] * 100 / total
# Shift the hour labels by +2 (wrapping at 24), then restore ascending order.
# NOTE(review): presumably a UTC -> Paris summer-time conversion — confirm.
shifted_hours = (nb_journey_hour.reset_index()["journey_start_time"] + 2) % 24
nb_journey_hour = nb_journey_hour.set_index(shifted_hours).sort_index()
result_dict["nb_journey_hour"]["data"] = nb_journey_hour.to_dict()
nb_journey_hour

In [None]:
# Quick visual check of the hourly journey counts (line plot).
nb_journey_hour["NbJourney"].plot.line()

In [None]:
# EMG reference: journey counts for 24 consecutive hourly slots, with each
# slot's share in percent.
emg_hourly_counts = [
    1126, 7314, 37907, 80506, 183966, 714481, 1105175, 641844,
    611138, 790746, 981349, 766795, 597375, 705335, 1072581, 1287459,
    1035738, 603330, 343792, 182376, 119474, 61715, 28364, 5942,
]
emg_nb_journey_hour = pd.DataFrame({'NbJourney': emg_hourly_counts}, index=pd.Index(range(0, 24)))
emg_nb_journey_hour["percent"] = emg_nb_journey_hour["NbJourney"] * 100 / emg_nb_journey_hour["NbJourney"].sum()
# Same +2 hour shift (wrapping at 24) as applied to the measured data.
shifted_hours = (emg_nb_journey_hour.reset_index()["index"] + 2) % 24
emg_nb_journey_hour = emg_nb_journey_hour.set_index(shifted_hours).sort_index()
result_dict["nb_journey_hour"]["emg"] = emg_nb_journey_hour.to_dict()
emg_nb_journey_hour

In [None]:
# Measured minus EMG hourly share (both already in percent), aligned on the hour label.
hourly_share_gap = nb_journey_hour["percent"].sub(emg_nb_journey_hour["percent"])
result_dict["nb_journey_hour"]["diff"] = hourly_share_gap.to_dict()

## Grandes moyennes

In [None]:
# Naive average number of journeys per person per day: all users, divided by
# the 14 days of the loaded period.
total_users = df_gp_journey["moover_id"].nunique()
total_journey = df_gp_journey["journey_id"].nunique()
journeys_per_day = total_journey / 14
journeys_per_day / total_users

In [None]:
# Average number of journeys per person per day, restricted to active users.
journeys_of_active_users = df_gp_journey[df_gp_journey['moover_id'].isin(active_users)]
total_users = journeys_of_active_users["moover_id"].nunique()
total_journey = journeys_of_active_users["journey_id"].nunique()
journeys_per_day = total_journey / 14
journeys_per_day / total_users

In [None]:
# Average number of journeys per person per day, counting only the days on
# which the person actually travelled (one group per user and day-of-month).
day_of_start = df_gp_journey["journey_start_time"].dt.day
journeys_per_user_day = df_gp_journey.groupby(['moover_id', day_of_start])['journey_id'].nunique()
value = journeys_per_user_day.reset_index(drop=True).mean()
result_dict["nb_daily_journey_per_user"] = {"data": value, "emg": 3.82}
value

In [None]:
# Average number of journeys per person per day, EMG reference value
3.82

In [None]:
# Daily travel time per person in minutes, travel days only.
# NOTE(review): the statistic is the MEDIAN of the per-user-per-day totals,
# although the result key says "avg" — confirm which one the EMG value (92) is.
day_of_start = df_gp_journey["journey_start_time"].dt.day
seconds_per_user_day = df_gp_journey.groupby(['moover_id', day_of_start])['journey_duration'].sum()
value = seconds_per_user_day.reset_index(drop=True).median() / 60
result_dict["avg_daily_traveling_time"] = {"data": value, "emg": 92}
value

In [None]:
# Same daily travel time (median, in minutes, travel days only), restricted to
# journeys that both start and end in Paris.
paris_only_journeys = df_gp_journey[df_gp_journey["start_in_paris"] & df_gp_journey["end_in_paris"]]
day_of_start = df_gp_journey["journey_start_time"].dt.day
paris_seconds_per_user_day = paris_only_journeys.groupby(['moover_id', day_of_start])['journey_duration'].sum()
paris_seconds_per_user_day.reset_index(drop=True).median() / 60

In [None]:
# Average daily travel time per person, EMG reference value
92

## Parts modales

In [None]:
# Result slots for this section: measured data, EMG reference, and their difference.
result_dict["modal_shares"] = {slot: {} for slot in ("data", "emg", "diff")}

In [None]:
# Ignore journeys whose total duration is 18000 s (5 hours) or more; they are
# considered implausible.
# NOTE(review): if a 6-hour cutoff was intended, the threshold should be 21600 — confirm.
df_filtered = df[df["journey_duration"] < 18000]

# Total time spent in each mode within each journey, ordered so that the
# longest-used mode of every journey comes first.
duration_per_journey_mode = (
    df_filtered
    .groupby(['journey_id', 'transportation_mode_tr'])[['duration']]
    .sum()
    .sort_values(by=['journey_id', 'duration'], ascending=[True, False])
    .reset_index()
)

# Dominant mode = first (longest) mode of each journey.
dominant_mode = (
    duration_per_journey_mode
    .groupby('journey_id')
    .nth(0)
    .drop(columns="duration")
    .rename(columns={'transportation_mode_tr': 'dominant_transportation_mode_tr'})
)

# Inner merge: journeys removed by the duration filter drop out here.
df_gp_journey_with_dominant = df_gp_journey.merge(dominant_mode, on='journey_id')
df_gp_journey_with_dominant

In [None]:
# Overall: fraction of journeys per dominant mode, all origin/destination pairs together.
journeys_per_mode = df_gp_journey_with_dominant.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
journey_count = df_gp_journey_with_dominant[['journey_id']].nunique()
overall_modal_share = journeys_per_mode.div(journey_count).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["overall"] = overall_modal_share.to_dict()
overall_modal_share

In [None]:
# Paris -> Paris: fraction of journeys per dominant mode.
within_paris = df_gp_journey_with_dominant["start_in_paris"] & df_gp_journey_with_dominant["end_in_paris"]
ddf = df_gp_journey_with_dominant[within_paris]
journeys_per_mode = ddf.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
paris_paris_modal_share = journeys_per_mode.div(ddf[['journey_id']].nunique()).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["paris_paris"] = paris_paris_modal_share.to_dict()
paris_paris_modal_share

In [None]:
# Paris <-> Idf GC (either direction): fraction of journeys per dominant mode.
paris_to_gc = df_gp_journey_with_dominant["start_in_paris"] & df_gp_journey_with_dominant["end_in_gc"]
gc_to_paris = df_gp_journey_with_dominant["start_in_gc"] & df_gp_journey_with_dominant["end_in_paris"]
ddf = df_gp_journey_with_dominant[paris_to_gc | gc_to_paris]
journeys_per_mode = ddf.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
paris_idf_gc_modal_share = journeys_per_mode.div(ddf[['journey_id']].nunique()).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["paris_idf_gc"] = paris_idf_gc_modal_share.to_dict()
paris_idf_gc_modal_share


In [None]:
# Paris <-> Idf PC (either direction): fraction of journeys per dominant mode.
paris_to_pc = df_gp_journey_with_dominant["start_in_paris"] & df_gp_journey_with_dominant["end_in_pc"]
pc_to_paris = df_gp_journey_with_dominant["start_in_pc"] & df_gp_journey_with_dominant["end_in_paris"]
ddf = df_gp_journey_with_dominant[paris_to_pc | pc_to_paris]
journeys_per_mode = ddf.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
paris_idf_pc_modal_share = journeys_per_mode.div(ddf[['journey_id']].nunique()).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["paris_idf_pc"] = paris_idf_pc_modal_share.to_dict()
paris_idf_pc_modal_share


In [None]:
# Idf GC <-> Idf GC: fraction of journeys per dominant mode.
within_gc = df_gp_journey_with_dominant["start_in_gc"] & df_gp_journey_with_dominant["end_in_gc"]
ddf = df_gp_journey_with_dominant[within_gc]
journeys_per_mode = ddf.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
idf_idf_gc_modal_share = journeys_per_mode.div(ddf[['journey_id']].nunique()).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["idf_idf_gc"] = idf_idf_gc_modal_share.to_dict()
idf_idf_gc_modal_share

In [None]:
# Idf PC <-> Idf PC: fraction of journeys per dominant mode.
within_pc = df_gp_journey_with_dominant["start_in_pc"] & df_gp_journey_with_dominant["end_in_pc"]
ddf = df_gp_journey_with_dominant[within_pc]
journeys_per_mode = ddf.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
idf_idf_pc_modal_share = journeys_per_mode.div(ddf[['journey_id']].nunique()).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["idf_idf_pc"] = idf_idf_pc_modal_share.to_dict()
idf_idf_pc_modal_share

In [None]:
# EMG reference modal shares (fractions) per origin/destination pair.
# All five tables share the same mode order; "PT" is the public-transport aggregate.
emg_modes = pd.Index(['FOOT', 'BIKE_TRIP', 'PT', 'PASSENGER_CAR', 'MOTORCYCLE'])

emg_paris_paris_modal_share = pd.DataFrame({'percent': [0.535, 0.112, 0.3, 0.047, 0.006]}, index=emg_modes)
result_dict["modal_shares"]["emg"]["paris_paris"] = emg_paris_paris_modal_share.to_dict()

emg_paris_idf_gc_modal_share = pd.DataFrame({'percent': [0.005, 0.029, 0.773, 0.179, 0.013]}, index=emg_modes)
result_dict["modal_shares"]["emg"]["paris_idf_gc"] = emg_paris_idf_gc_modal_share.to_dict()

emg_paris_idf_pc_modal_share = pd.DataFrame({'percent': [0.055, 0.14, 0.661, 0.125, 0.018]}, index=emg_modes)
result_dict["modal_shares"]["emg"]["paris_idf_pc"] = emg_paris_idf_pc_modal_share.to_dict()

emg_idf_idf_pc_modal_share = pd.DataFrame({'percent': [0.418, 0.098, 0.195, 0.213, 0.016]}, index=emg_modes)
result_dict["modal_shares"]["emg"]["idf_idf_pc"] = emg_idf_idf_pc_modal_share.to_dict()

emg_idf_idf_gc_modal_share = pd.DataFrame({'percent': [0.244, 0.04, 0.092, 0.619, 0.005]}, index=emg_modes)
result_dict["modal_shares"]["emg"]["idf_idf_gc"] = emg_idf_idf_gc_modal_share.to_dict()

In [None]:
# Diff computations

# Aggregate public transport: the EMG tables have a single "PT" row, so the
# measured BUS / RAIL_TRIP / HIGH_SPEED_TRAIN shares are summed into one.
pt_modes = ["BUS", "RAIL_TRIP", "HIGH_SPEED_TRAIN"]
for modal_share in (
    paris_paris_modal_share,
    paris_idf_gc_modal_share,
    paris_idf_pc_modal_share,
    idf_idf_gc_modal_share,
    idf_idf_pc_modal_share,
):
    # reindex + fillna(0): a mode that never dominates a journey for this
    # origin/destination pair counts as 0 instead of raising KeyError.
    modal_share.loc["PT"] = modal_share.reindex(pt_modes).fillna(0).sum()

# Measured minus EMG share, in percentage points. Modes present on only one
# side are dropped. Each key pairs the measured table with the EMG table of the
# SAME zone (the pc/gc tables were previously crossed).
result_dict["modal_shares"]["diff"]["paris_paris"] = ((paris_paris_modal_share.percent - emg_paris_paris_modal_share.percent).dropna() * 100).to_dict()
result_dict["modal_shares"]["diff"]["paris_idf_gc"] = ((paris_idf_gc_modal_share.percent - emg_paris_idf_gc_modal_share.percent).dropna() * 100).to_dict()
result_dict["modal_shares"]["diff"]["paris_idf_pc"] = ((paris_idf_pc_modal_share.percent - emg_paris_idf_pc_modal_share.percent).dropna() * 100).to_dict()
result_dict["modal_shares"]["diff"]["idf_idf_pc"] = ((idf_idf_pc_modal_share.percent - emg_idf_idf_pc_modal_share.percent).dropna() * 100).to_dict()
result_dict["modal_shares"]["diff"]["idf_idf_gc"] = ((idf_idf_gc_modal_share.percent - emg_idf_idf_gc_modal_share.percent).dropna() * 100).to_dict()

## Durée moyenne des deplacements par mode
Dans les deux cas, les résultats sont très différents de l'EMG ; je ne sais pas trop si c'est dû aux JO ou aux données, qui ne sont pas précises sur les horaires.

Aussi, on ne mesure pas la même chose : durée moyenne des déplacements vs budget temps par mode (ce que je comprends, c'est le temps passé dans le bus par jour... pour les gens qui prennent le bus).

Le budget temps est un indicateur qui a plus de valeur avec la régularité des déplacements, ce qui n'existe pas trop pendant les JO.

In [None]:
# Result slots for this section: measured data, EMG reference, and their difference.
result_dict["avg_duration_per_mode"] = {slot: {} for slot in ("data", "emg", "diff")}

In [None]:
# Median journey duration (minutes) per dominant mode, giving the whole journey
# duration to the dominant mode.
# Computed over every journey that has a dominant mode. The previous version
# read `ddf`, which at this point still held only the subset left behind by the
# last modal-share cell, so the statistic silently covered just that subset.
avg_duration_per_mode_full_journey = (
    df_gp_journey_with_dominant
    .groupby('dominant_transportation_mode_tr')[['journey_duration']]
    .median()
    / 60
)
result_dict["avg_duration_per_mode"]["data"]["full_journey"] = avg_duration_per_mode_full_journey.to_dict()
avg_duration_per_mode_full_journey

In [None]:
# Median time (minutes) spent in each mode per journey. Durations are first
# summed per (journey, mode), because one mode is often split across several rows.
seconds_per_journey_mode = df.groupby(['journey_id', 'transportation_mode_tr'])[['duration']].sum().reset_index()
median_seconds_per_mode = seconds_per_journey_mode.groupby('transportation_mode_tr')[['duration']].median()
avg_duration_per_mode_mode_only = median_seconds_per_mode / 60
result_dict["avg_duration_per_mode"]["data"]["mode_only"] = avg_duration_per_mode_mode_only.to_dict()
avg_duration_per_mode_mode_only

In [None]:
# EMG reference durations per mode.
# NOTE(review): presumably expressed in minutes, like the measured values — confirm.
emg_avg_duration_per_mode = pd.DataFrame(
    {'duration': [12, 25, 31, 20, 26, 57, 32, 11]},
    index=pd.Index(['FOOT', 'BIKE_TRIP', 'BUS', 'PASSENGER_CAR', 'MOTORCYCLE', 'HIGH_SPEED_TRAIN', 'RAIL_TRIP', 'SCOOTER']),
)
result_dict["avg_duration_per_mode"]["emg"] = emg_avg_duration_per_mode.to_dict()

In [None]:
# Measured (per-mode variant) minus EMG duration; modes present on only one side are dropped.
duration_gap = avg_duration_per_mode_mode_only.sub(emg_avg_duration_per_mode)
result_dict["avg_duration_per_mode"]["diff"] = duration_gap.dropna().to_dict()

## Multimodalité des trajets avec du train
La marche est ignorée dans le comptage des modes multimodaux

Difficilement comparable avec l'EMD, car on différencie difficilement entre Metro et Train

Plus grave, les stats de bus me semblent vraiment faibles : dans l'EMG, on a 10 fois plus de trajets en BUS qu'en voiture pour ces trajets avec une composante train. Ici, on a 20 fois plus de voiture que de bus !!

In [None]:
# Result slots for this section: measured data, EMG reference, and their difference.
result_dict["multimodal_train_trips"] = {slot: {} for slot in ("data", "emg", "diff")}

In [None]:
# Journeys containing at least one rail row (RAIL_TRIP or HIGH_SPEED_TRAIN).
train_modes = ["RAIL_TRIP", "HIGH_SPEED_TRAIN"]
journeys_with_train = df.loc[df["transportation_mode_tr"].isin(train_modes), "journey_id"]

# Rows of those journeys, walking excluded.
non_foot_train_rows = df[df["journey_id"].isin(journeys_with_train) & (df["transportation_mode_tr"] != "FOOT")]

# Number of distinct modes per journey, then number of journeys per mode count.
modes_per_journey = non_foot_train_rows.groupby('journey_id').agg(NbUniqueModes=('transportation_mode_tr', 'nunique')).reset_index()
res = modes_per_journey.groupby('NbUniqueModes').nunique()
res['percent'] = res['journey_id'] / non_foot_train_rows['journey_id'].nunique()
res = res.rename(columns={"journey_id": "nb_journey"})
result_dict["multimodal_train_trips"]["data"]["nb_unique_modes"] = res.to_dict()
res


In [None]:
# Within journeys that include rail: rows of every mode other than walking and
# rail itself, then the number and percentage of journeys using each such mode.
excluded_modes = ["FOOT", "RAIL_TRIP", "HIGH_SPEED_TRAIN"]
ddf = df[df["journey_id"].isin(journeys_with_train) & ~df["transportation_mode_tr"].isin(excluded_modes)]
ddfg = ddf.groupby('transportation_mode_tr')[['journey_id']].nunique()
ddfg["percent"] = ddfg["journey_id"] * 100 / ddf["journey_id"].nunique()
ddfg = ddfg.rename(columns={"journey_id": "nb_journey"})
result_dict["multimodal_train_trips"]["data"]["nb_journey_per_modes"] = ddfg.to_dict()
ddfg

In [None]:
# EMG reference tables for multimodal rail journeys, given separately for RER
# and metro. "nb_unique_modes" tables hold fractions; "nb_journey_per_modes"
# tables are scaled to percent (x100).
mode_count_labels = pd.Index(["1", "2", "3", "4+"])

# --- RER ---
emg_multimodal_train_trips_nb_unique_modes_rer = pd.DataFrame(
    {'percent': [0.281, 0.529, 0.187, 0.003]},
    index=mode_count_labels,
)
result_dict["multimodal_train_trips"]["emg"]["nb_unique_modes_rer"] = emg_multimodal_train_trips_nb_unique_modes_rer.to_dict()

emg_multimodal_train_trips_nb_journey_per_modes_rer = pd.DataFrame(
    {'percent': [x*100 for x in [0.038, 0.27, 0.025, 0.42, 0.009, 0.78]]},
    index=pd.Index(['BIKE_TRIP', 'BUS', 'PASSENGER_CAR', 'RAIL_TRIP', 'SCOOTER', 'TRAM']),
)
result_dict["multimodal_train_trips"]["emg"]["nb_journey_per_modes_rer"] = emg_multimodal_train_trips_nb_journey_per_modes_rer.to_dict()

# --- Metro ---
emg_multimodal_train_trips_nb_unique_modes_metro = pd.DataFrame(
    {'percent': [0.535, 0.332, 0.131, 0.002]},
    index=mode_count_labels,
)
result_dict["multimodal_train_trips"]["emg"]["nb_unique_modes_metro"] = emg_multimodal_train_trips_nb_unique_modes_metro.to_dict()

emg_multimodal_train_trips_nb_journey_per_modes_metro = pd.DataFrame(
    {'percent': [x*100 for x in [0.006, 0.135, 0.009, 0.355, 0.001, 0.065]]},
    index=pd.Index(['BIKE_TRIP', 'BUS', 'PASSENGER_CAR', 'HIGH_SPEED_TRAIN', 'SCOOTER', 'TRAM']),
)
result_dict["multimodal_train_trips"]["emg"]["nb_journey_per_modes_metro"] = emg_multimodal_train_trips_nb_journey_per_modes_metro.to_dict()

In [None]:
# The EMG tables have a "4+" bucket, so journeys with four OR MORE distinct
# modes are folded into bucket 4 before comparing (previously only journeys
# with exactly four modes were used). Buckets with no journey count as a 0
# share rather than NaN.
unique_modes_share = (
    res["percent"]
    .groupby(res.index.to_series().clip(upper=4).to_numpy())
    .sum()
    .reindex([1, 2, 3, 4], fill_value=0.0)
)

# EMG bucket labels "1", "2", "3", "4+" mapped positionally onto 1..4.
emg_unique_modes_rer = emg_multimodal_train_trips_nb_unique_modes_rer.set_index(pd.Index([1, 2, 3, 4])).percent
emg_unique_modes_metro = emg_multimodal_train_trips_nb_unique_modes_metro.set_index(pd.Index([1, 2, 3, 4])).percent

result_dict["multimodal_train_trips"]["diff"] = {
    # Fractions on both sides -> converted to percentage points.
    "nb_unique_modes_rer": ((unique_modes_share - emg_unique_modes_rer) * 100).to_dict(),
    "nb_unique_modes_metro": ((unique_modes_share - emg_unique_modes_metro) * 100).to_dict(),
    # Already in percent on both sides; modes present on only one side are dropped.
    "nb_journey_per_modes_rer": (ddfg.percent - emg_multimodal_train_trips_nb_journey_per_modes_rer.percent).dropna().to_dict(),
    "nb_journey_per_modes_metro": (ddfg.percent - emg_multimodal_train_trips_nb_journey_per_modes_metro.percent).dropna().to_dict()
}

## Taux d'occupation des véhicules
Biais covoiturage ici, 1.73 est énorme comparé aux 1.11 normaux

In [None]:
# Mean passenger count over all PASSENGER_CAR rows, stored next to the EMG reference.
is_passenger_car = df["transportation_mode_tr"] == "PASSENGER_CAR"
val = df.loc[is_passenger_car, "passenger_count"].mean()
result_dict["occupancy"] = {"data": val, "emg": 1.11}
val

## Save to file

In [None]:
# Write every collected indicator to OUTPUT_FILE as indented JSON.
with open(OUTPUT_FILE, 'w') as out_file:
    json.dump(result_dict, out_file, indent=2)