In [1]:
import json
import os

import branca.colormap as cm
import folium
import geopandas as gpd
import h3pandas
import numpy as np
import pandas as pd
from mappymatch.constructs.geofence import Geofence
from shapely import prepare
from shapely.geometry import Point, Polygon, LineString

In [2]:
# import osmnx as ox

# # Fetch the boundary of the city of Lyon
# lyon = ox.geocode_to_gdf("Lyon, France")
# lyon.to_file("lyon.geojson", driver="GeoJSON")

# # Fetch the boundary of the Greater Lyon metropolitan area
# grand_lyon = ox.geocode_to_gdf("Métropole de Lyon, France")
# grand_lyon.to_file("grand_lyon.geojson", driver="GeoJSON")

In [106]:
# Configuration: output locations, region filter and input folder.
OUTPUT_FILE = "../static/data/grand_lyon.json"  # aggregated indicators (JSON)
DEPARTURES_OUTPUT_FILE = "../static/data/lyon/departures.geojson"  # departure hexagons
OUTPUT_FOLDER = "../static/data/lyon"  # per-start-cell arrival GeoJSON files
REGION_CODE = 84  # value matched against the `code` column of the source CSVs
CSV_SOURCE_FOLDER = "sources"  # folder holding the daily CSV extracts

In [4]:
# Accumulator for every indicator computed below; written to OUTPUT_FILE in the "Save to file" cell.
result_dict = dict()

In [5]:
# Load the two perimeters used to classify journey endpoints:
# the city of Lyon and the surrounding metropolitan area.
geofence_city, geofence_metro_area = (
    Geofence.from_geojson(geojson_path)
    for geojson_path in ("sources/lyon.geojson", "sources/grand_lyon.geojson")
)

In [None]:
# Load the daily CSV extracts, keeping only the rows of the configured region.
# The 14 file names follow the pattern data_france_YYYY-MM-DD.csv, so they are
# generated from a date range instead of being listed by hand.
source_days = pd.date_range("2024-07-24", "2024-08-06", freq="D")

daily_frames = []
for source_day in source_days:
    csv_path = f"{CSV_SOURCE_FOLDER}/data_france_{source_day:%Y-%m-%d}.csv"
    day_df = pd.read_csv(csv_path, engine="pyarrow")
    # Filter each day before concatenating so only the region's rows are kept in memory.
    daily_frames.append(day_df[day_df["code"] == REGION_CODE])

df_src = pd.concat(daily_frames, axis=0, ignore_index=True)
# Drop the columns that are not used by the rest of the analysis.
df_src = df_src.drop(columns=["geometry", "index_right", "nom", "code"])
df_src

In [7]:
# Numeric transportation-mode code -> symbolic label.
tr = {
    -10: "NOT_DEFINED",
    0: "UNKNOWN",
    1: "PASSENGER_CAR",
    2: "MOTORCYCLE",
    3: "HEAVY_DUTY_VEHICLE",
    4: "BUS",
    5: "COACH",
    6: "RAIL_TRIP",
    7: "BOAT_TRIP",
    8: "BIKE_TRIP",
    9: "PLANE",
    10: "SKI",
    11: "FOOT",
    12: "IDLE",
    13: "OTHER",
    101: "SCOOTER",
    102: "HIGH_SPEED_TRAIN",
}
# Numeric engine-type code -> French display label.
tre = {
    -10: "",
    1: "Essence",
    2: "Diesel",
    3: "Electrique",
    4: "Essence Hybride",
    5: "Diesel Hybride",  # label typo fixed (was "Diesiel Hybride")
}
# Direct dictionary lookups: an unknown code raises KeyError rather than
# silently producing a missing label.
df_src['transportation_mode_tr'] = df_src['transportation_mode'].apply(lambda code: tr[code])
# engine_type_tr is not read by any later cell of this notebook.
df_src['engine_type_tr'] = df_src['engine_type'].apply(lambda code: tre[code])

In [8]:
# Parse both timestamp columns; format="mixed" lets pandas infer the format of each value.
for time_col in ("start_time", "end_time"):
    df_src[time_col] = pd.to_datetime(df_src[time_col], format="mixed")

# Duration of each row (end - start), expressed in seconds.
elapsed = df_src["end_time"] - df_src["start_time"]
df_src["duration"] = elapsed.dt.total_seconds()

In [None]:
# Journey-level attributes derived from the rows sharing a journey_id:
# origin = first row, destination = last row, duration/distance = sums.
# NOTE(review): "first"/"last" follow the current row order of df_src; this
# presumably matches chronological order within a journey — confirm.
journey_aggregations = {
    "journey_starting_longitude": ("starting_longitude", "first"),
    "journey_starting_latitude": ("starting_latitude", "first"),
    "journey_ending_longitude": ("ending_longitude", "last"),
    "journey_ending_latitude": ("ending_latitude", "last"),
    "journey_start_time": ("start_time", "first"),
    "journey_end_time": ("end_time", "last"),
    "journey_duration": ("duration", "sum"),
    "journey_distance": ("distance_km", "sum"),
}
journey_df = df_src.groupby("journey_id").agg(**journey_aggregations).reset_index()

# Attach the journey-level columns to every row of the journey.
df_src = df_src.merge(journey_df, on="journey_id")
df_src

In [10]:
# Active geometry = journey origin; the destination is kept in a secondary column.
journey_origins = gpd.points_from_xy(
    df_src["journey_starting_longitude"], df_src["journey_starting_latitude"]
)
gdf = gpd.GeoDataFrame(df_src, geometry=journey_origins, crs="EPSG:4326")
gdf["end_geometry"] = gpd.points_from_xy(
    gdf["journey_ending_longitude"], gdf["journey_ending_latitude"]
)

In [11]:
# Prepare both perimeters, then flag whether each journey's origin and
# destination fall inside the city and inside the metropolitan area.
city_shape = geofence_city.geometry
metro_area_shape = geofence_metro_area.geometry
for perimeter in (city_shape, metro_area_shape):
    prepare(perimeter)

gdf["start_in_city"] = city_shape.contains(gdf.geometry)
gdf["end_in_city"] = city_shape.contains(gdf["end_geometry"])
gdf["start_in_metro_area"] = metro_area_shape.contains(gdf.geometry)
gdf["end_in_metro_area"] = metro_area_shape.contains(gdf["end_geometry"])

In [12]:
# Keep only the rows of journeys that both start and end inside the metropolitan area.
inside_metro_area = gdf["start_in_metro_area"] & gdf["end_in_metro_area"]
df = gdf[inside_metro_area]

In [13]:
# One row per journey: keep the first row of each journey_id group. The
# journey_* columns are identical on every row of a journey (they come from the
# journey-level merge), so this row carries all journey-level attributes.
# NOTE(review): later cells read `journey_id` as a regular column, which assumes
# nth() leaves it as a column rather than moving it to the index — confirm
# against the installed pandas version.
df_gp_journey = df.groupby('journey_id').nth(0)

## Nombre de déplacements par jour de la semaine

In [14]:
# Placeholder entry; its "data" value is filled in by the next cell.
result_dict["nb_journey_days"] = dict(data={})

In [None]:
# Journeys per day of week (dayofweek: 0 = Monday ... 6 = Sunday).
start_weekday = df_gp_journey["journey_start_time"].dt.dayofweek
nb_journey_days = df_gp_journey.groupby(start_weekday).agg(NbJourney=("journey_id", "count"))

total = nb_journey_days["NbJourney"].sum()
# NOTE(review): stored as a fraction (0-1), whereas the hourly and per-user
# tables multiply by 100 — confirm what the consumer of the JSON expects.
nb_journey_days["percent"] = nb_journey_days["NbJourney"] / total

# Labels are applied positionally, so all seven weekdays must be present in the data.
weekday_labels = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
nb_journey_days = nb_journey_days.set_index(pd.Index(weekday_labels))
result_dict["nb_journey_days"]["data"] = nb_journey_days.to_dict()
nb_journey_days

## % des individus ne se déplaçant pas par jour de la semaine
Pour éviter les faux positifs (personnes qui quittent la métropole de Lyon ou y arrivent pendant la période), je ne considère que les utilisateurs ayant au moins 7 jours de déplacement au cours des 2 semaines.

In [16]:
# Placeholder entry; its "data" value is filled in further down.
result_dict["users_not_moving"] = dict(data={})

In [None]:
# Number of distinct days on which each user started at least one journey.
# The helper `day` column only exists on the temporary frame built by assign(),
# so df_gp_journey is left untouched: no column is written onto a slice of `df`
# and nothing has to be dropped afterwards.
# `day` is the day of the month; it identifies a calendar day unambiguously only
# while the studied period contains each day-of-month at most once.
distinctDay = (
    df_gp_journey
    .assign(day=df_gp_journey["journey_start_time"].dt.day)
    .groupby("moover_id")
    .agg(DistinctDay=("day", "nunique"))
)

# "Active" users: journeys recorded on at least 7 distinct days.
active_users = distinctDay[distinctDay.DistinctDay > 6].index
distinctDay.head()

In [None]:
# Total number of distinct users (active or not) in the per-journey frame.
df_gp_journey["moover_id"].nunique()

In [None]:
# Restrict to active users, then count distinct users seen on each day of week.
active_journeys = df_gp_journey[df_gp_journey["moover_id"].isin(active_users)]
active_weekday = active_journeys["journey_start_time"].dt.dayofweek
nb_users_days = active_journeys.groupby(active_weekday).agg(NbUsers=("moover_id", "nunique"))

total = active_journeys["moover_id"].nunique()
nb_users_days["percent"] = nb_users_days["NbUsers"] * 100 / total
# Complement: share of active users with no journey on that day of week.
nb_users_days["percent_users_not_moving"] = 100 - nb_users_days["percent"]

# Labels are applied positionally, so all seven weekdays must be present in the data.
weekday_labels = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
nb_users_days = nb_users_days.set_index(pd.Index(weekday_labels))
result_dict["users_not_moving"]["data"] = nb_users_days.to_dict()
nb_users_days

## Nb déplacement par heure de la journée

In [19]:
# Placeholder entry; its "data" value is filled in by the next cell.
result_dict["nb_journey_hour"] = dict(data={})

In [None]:
# Journeys per hour of the journey start time.
start_hour = df_gp_journey["journey_start_time"].dt.hour
nb_journey_hour = df_gp_journey.groupby(start_hour).agg(NbJourney=("journey_id", "count"))

total = nb_journey_hour["NbJourney"].sum()
nb_journey_hour["percent"] = nb_journey_hour["NbJourney"] * 100 / total

# Shift the hour by +2 (wrapping at 24) and re-sort.
# NOTE(review): presumably converts UTC timestamps to local summer time — confirm.
shifted_hour = (nb_journey_hour.reset_index()["journey_start_time"] + 2) % 24
nb_journey_hour = nb_journey_hour.set_index(shifted_hour).sort_index()
result_dict["nb_journey_hour"]["data"] = nb_journey_hour.to_dict()
nb_journey_hour

In [None]:
# Line plot of the hourly journey counts. The figure is labelled so it can be
# read on its own; the trailing semicolon hides the Axes repr.
nb_journey_hour['NbJourney'].plot(
    title="Journeys per hour of day",
    xlabel="Hour of day (start hour + 2)",
    ylabel="Number of journeys",
);

## Grandes moyennes

In [None]:
# Average number of journeys per person per day, naive version:
# every user is assumed to be present on each of the 14 days of the period.
observed_days = 14
total_users = df_gp_journey["moover_id"].nunique()
total_journey = df_gp_journey["journey_id"].nunique()
total_journey / observed_days / total_users

In [None]:
# Average number of journeys per person per day, active users only.
observed_days = 14
active_journeys = df_gp_journey[df_gp_journey["moover_id"].isin(active_users)]
total_users = active_journeys["moover_id"].nunique()
total_journey = active_journeys["journey_id"].nunique()
total_journey / observed_days / total_users

In [None]:
# Average number of journeys per person per day, counting only (user, day)
# pairs on which the user actually travelled.
start_day = df_gp_journey["journey_start_time"].dt.day
journeys_per_user_day = df_gp_journey.groupby(["moover_id", start_day])["journey_id"].nunique()
value = journeys_per_user_day.reset_index(drop=True).mean()
result_dict["nb_daily_journey_per_user"] = {"data": value}
value

In [None]:
# Daily travelling time per person in minutes, travel days only.
# Despite the "avg" key, the statistic stored here is the median of the
# per-(user, day) totals.
start_day = df_gp_journey["journey_start_time"].dt.day
seconds_per_user_day = df_gp_journey.groupby(["moover_id", start_day])["journey_duration"].sum()
value = seconds_per_user_day.reset_index(drop=True).median() / 60
result_dict["avg_daily_traveling_time"] = {"data": value}
value

In [None]:
# Same indicator restricted to journeys that start and end inside the city:
# median, in minutes, of the per-(user, day) travelling time on travel days.
city_journeys = df_gp_journey[df_gp_journey["start_in_city"] & df_gp_journey["end_in_city"]]
city_start_day = city_journeys["journey_start_time"].dt.day
city_seconds_per_user_day = city_journeys.groupby(["moover_id", city_start_day])["journey_duration"].sum()
city_seconds_per_user_day.reset_index(drop=True).median() / 60

## Parts modales

In [28]:
# Container for the modal-share tables (overall, city_city, city_metro_area, ...).
result_dict["modal_shares"] = dict(data={})

In [None]:
# Discard journeys of 18000 s (5 hours) or more: they are considered implausible.
# NOTE(review): the previous comment said "6 hours", which would be 21600 s —
# confirm which threshold is intended.
df_filtered = df[df["journey_duration"] < 18000]

# Total duration per (journey, mode), ordered so that the longest mode of each
# journey comes first.
mode_durations = df_filtered.groupby(['journey_id', 'transportation_mode_tr'])[['duration']].sum()
ranked_modes = mode_durations.sort_values(
    by=['journey_id', 'duration'], ascending=[True, False]
).reset_index()

# Dominant mode = the mode with the largest summed duration within the journey.
dominant_mode = (
    ranked_modes
    .groupby('journey_id')
    .nth(0)
    .drop(columns="duration")
    .rename(columns={'transportation_mode_tr': 'dominant_transportation_mode_tr'})
)

# Inner merge: journeys removed by the duration filter are dropped here as well.
df_gp_journey_with_dominant = df_gp_journey.merge(dominant_mode, on='journey_id')
df_gp_journey_with_dominant

In [None]:
# Overall: share (fraction, 0-1) of journeys per dominant mode.
journeys_per_mode = df_gp_journey_with_dominant.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
journey_total = df_gp_journey_with_dominant[['journey_id']].nunique()
overall_modal_share = (journeys_per_mode / journey_total).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["overall"] = overall_modal_share.to_dict()
overall_modal_share

In [None]:
# City -> City: journeys whose origin and destination are both inside the city.
within_city = df_gp_journey_with_dominant["start_in_city"] & df_gp_journey_with_dominant["end_in_city"]
ddf = df_gp_journey_with_dominant[within_city]
journeys_per_mode = ddf.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
city_city_modal_share = (journeys_per_mode / ddf[['journey_id']].nunique()).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["city_city"] = city_city_modal_share.to_dict()
city_city_modal_share

In [None]:
# City <-> Metro Area: journeys with one end in the city and the other in the metro area.
# NOTE(review): the frame was built from journeys that start and end in the metro
# area, so this mask presumably also keeps city -> city journeys — confirm intent.
city_to_metro = df_gp_journey_with_dominant["start_in_city"] & df_gp_journey_with_dominant["end_in_metro_area"]
metro_to_city = df_gp_journey_with_dominant["start_in_metro_area"] & df_gp_journey_with_dominant["end_in_city"]
ddf = df_gp_journey_with_dominant[city_to_metro | metro_to_city]
journeys_per_mode = ddf.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
city_metro_area_modal_share = (journeys_per_mode / ddf[['journey_id']].nunique()).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["city_metro_area"] = city_metro_area_modal_share.to_dict()
city_metro_area_modal_share


In [None]:
# Metro Area <-> Metro Area: journeys whose origin and destination are both in the metro area.
within_metro_area = (
    df_gp_journey_with_dominant["start_in_metro_area"]
    & df_gp_journey_with_dominant["end_in_metro_area"]
)
ddf = df_gp_journey_with_dominant[within_metro_area]
journeys_per_mode = ddf.groupby('dominant_transportation_mode_tr')[['journey_id']].nunique()
metro_area_metro_area_modal_share = (journeys_per_mode / ddf[['journey_id']].nunique()).rename(columns={"journey_id": "percent"})
result_dict["modal_shares"]["data"]["metro_area_metro_area"] = metro_area_metro_area_modal_share.to_dict()
metro_area_metro_area_modal_share

## Durée moyenne des deplacements par mode
Dans les deux cas, les résultats sont très différents de l'EMG ; je ne sais pas si cela vient des JO ou de données qui ne sont pas précises sur les horaires.

De plus, on ne mesure pas la même chose : durée moyenne des déplacements vs budget temps par mode (d'après ce que je comprends, c'est le temps passé dans le bus par jour, pour les gens qui prennent le bus).

Le budget temps est un indicateur qui a plus de valeur quand les déplacements sont réguliers, ce qui n'est pas vraiment le cas pendant les JO.

In [34]:
# Container for the two per-mode duration tables ("full_journey" and "mode_only").
result_dict["avg_duration_per_mode"] = dict(data={})

In [None]:
# Giving the whole journey duration to the dominant mode.
# The input frame is built explicitly here rather than read from the scratch
# variable `ddf`, whose content depends on which cell was executed last. The
# selection (journeys starting and ending in the metro area) is the one `ddf`
# holds when the notebook is run top to bottom.
metro_area_journeys = df_gp_journey_with_dominant[
    df_gp_journey_with_dominant["start_in_metro_area"]
    & df_gp_journey_with_dominant["end_in_metro_area"]
]
# Median journey duration per dominant mode, converted from seconds to minutes.
avg_duration_per_mode_full_journey = metro_area_journeys.groupby('dominant_transportation_mode_tr')[['journey_duration']].median() / 60
result_dict["avg_duration_per_mode"]["data"]["full_journey"] = avg_duration_per_mode_full_journey.to_dict()
avg_duration_per_mode_full_journey

In [None]:
# Duration attributed to each mode separately. Durations are first summed per
# (journey, mode), because the same mode is often split into several rows within
# a journey; the median over journeys is then taken per mode, in minutes.
seconds_per_journey_mode = (
    df.groupby(['journey_id', 'transportation_mode_tr'])[['duration']]
    .sum()
    .reset_index()
)
avg_duration_per_mode_mode_only = seconds_per_journey_mode.groupby('transportation_mode_tr')[['duration']].median() / 60
result_dict["avg_duration_per_mode"]["data"]["mode_only"] = avg_duration_per_mode_mode_only.to_dict()
avg_duration_per_mode_mode_only

## Multimodalité des trajets avec du train
La marche est ignorée dans le comptage des modes multimodaux

In [37]:
# Container for the two multimodality tables of journeys involving a train.
result_dict["multimodal_train_trips"] = dict(data={})

In [None]:
# journey_id of every row travelled by train (regular or high-speed).
train_modes = ["RAIL_TRIP", "HIGH_SPEED_TRAIN"]
journeys_with_train = df[df["transportation_mode_tr"].isin(train_modes)]["journey_id"]

# Rows of those journeys, walking excluded.
train_journey_rows = df[df["journey_id"].isin(journeys_with_train) & (df["transportation_mode_tr"] != "FOOT")]

# Number of journeys for each count of distinct (non-walking) modes.
modes_per_journey = (
    train_journey_rows
    .groupby('journey_id')
    .agg(NbUniqueModes=('transportation_mode_tr', 'nunique'))
    .reset_index()
)
res = modes_per_journey.groupby('NbUniqueModes').nunique()
# Share expressed as a fraction (0-1) of the journeys with a train.
res['percent'] = res['journey_id'] / train_journey_rows['journey_id'].nunique()
res = res.rename(columns={"journey_id": "nb_journey"})
result_dict["multimodal_train_trips"]["data"]["nb_unique_modes"] = res.to_dict()
res


In [None]:
# Rows of train journeys travelled with another mode: walking and both train modes are excluded.
excluded_modes = ["FOOT", "RAIL_TRIP", "HIGH_SPEED_TRAIN"]
ddf = df[df["journey_id"].isin(journeys_with_train) & ~df["transportation_mode_tr"].isin(excluded_modes)]

# Number of journeys using each complementary mode, and its share in percent of
# the train journeys that use at least one complementary mode.
ddfg = ddf.groupby('transportation_mode_tr')[['journey_id']].nunique()
share_of_journeys = ddfg * 100 / ddf[['journey_id']].nunique()
ddfg["percent"] = share_of_journeys
ddfg = ddfg.rename(columns={"journey_id": "nb_journey"})
result_dict["multimodal_train_trips"]["data"]["nb_journey_per_modes"] = ddfg.to_dict()
ddfg

## Taux d'occupation des véhicules

In [None]:
# Mean passenger_count over the rows travelled by passenger car.
car_rows = df[df["transportation_mode_tr"] == "PASSENGER_CAR"]
val = car_rows["passenger_count"].mean()
result_dict["occupancy"] = {"data": val}
val

## Save to file

In [41]:
# Serialise every indicator gathered in result_dict to the configured JSON file.
with open(OUTPUT_FILE, 'w') as output_handle:
    json.dump(result_dict, output_handle, indent=2)

In [None]:
# Display the per-journey frame for inspection before building the H3 maps.
df_gp_journey

In [97]:
# Tag every journey with the H3 cell (resolution 7) of its origin; the cell id
# is added as the `h3_07` column and the existing index is kept.
dfh3_start = df_gp_journey.h3.geo_to_h3(
    7,
    lat_col="journey_starting_latitude",
    lng_col="journey_starting_longitude",
    set_index=False,
)

In [None]:
# Journeys and distinct users per departure cell.
start_cell_stats = dfh3_start.groupby(['h3_07']).agg(
    Count=('journey_id', 'nunique'),
    UserCount=('moover_id', 'nunique'),
)
# Keep only the cells with more than 3 distinct users.
# NOTE(review): presumably an anonymity threshold — confirm.
has_enough_users = start_cell_stats["UserCount"] > 3
# Attach the hexagon boundary of each remaining cell as its geometry.
drawgeoframe_start = start_cell_stats[has_enough_users].h3.h3_to_geo_boundary()
drawgeoframe_start

In [102]:
# `cm` (branca.colormap) is imported with the other dependencies in the first
# cell of the notebook rather than here, so every import lives in one place.
# Green -> yellow -> red scale spanning 0 .. the busiest departure cell.
colormap = cm.LinearColormap(["green", "yellow", "red"], vmin=0, vmax=drawgeoframe_start["Count"].max())
# Drop the last two characters of the colour string.
# NOTE(review): presumably the alpha channel of a '#rrggbbaa' value — confirm.
drawgeoframe_start["color"] = drawgeoframe_start["Count"].apply(lambda x: colormap(x)[:-2])

# Export the departure hexagons with their colour and counters.
drawgeoframe_start[["geometry", "color", "Count", 'UserCount']].to_file(DEPARTURES_OUTPUT_FILE, driver="GeoJSON")

In [None]:
# Preview of the departure hexagons. The map is centred on the centroid of the
# city geofence instead of hard-coded coordinates (the previous values pointed
# at Paris, far from the data shown). The geofence is tested against lon/lat
# points earlier in the notebook, so centroid.y is the latitude and centroid.x
# the longitude. folium is already imported in the first cell.
city_center = geofence_city.geometry.centroid
start_lat = city_center.y
start_long = city_center.x
m = folium.Map(location=[start_lat, start_long], zoom_start=13)
folium.GeoJson(drawgeoframe_start[["geometry", "color"]], style_function=lambda f: {"color": f['properties']['color']}).add_to(m)
m

In [98]:
# Replace each journey's geometry with the centre of its departure cell, then
# keep a copy of that centre point in `center_geom`.
start_cells_indexed = dfh3_start.set_index("h3_07")
dfh3_start = start_cells_indexed.h3.h3_to_geo().reset_index()
dfh3_start["center_geom"] = dfh3_start["geometry"]

In [None]:
def gen_end_map(start_h3):
    """Export the arrival cells of the journeys leaving one departure cell.

    Journeys whose departure cell is ``start_h3`` are grouped by the H3 cell
    (resolution 7) of their destination. Arrival cells with more than 3
    distinct users are written to two GeoJSON files in ``OUTPUT_FOLDER``:
    ``arrival_<start_h3>.geojson`` (hexagons) and
    ``arrival_<start_h3>_lines.geojson`` (a line from the centre of the
    departure cell to the centre of each arrival cell).

    Args:
        start_h3: H3 id of the departure cell, as found in ``dfh3_start["h3_07"]``.

    Returns:
        The frame of arrival cells, with the hexagon in ``geometry``, the cell
        centre in ``center_geom``, the connecting line in ``geom`` and the
        ``color``, ``Count`` and ``UserCount`` columns.
    """
    departures = dfh3_start[dfh3_start["h3_07"] == start_h3]
    # Centre of the departure cell: every line drawn below starts here.
    fixed_point = departures["center_geom"].iloc[0]

    # Swap the departure geometry / cell id for the destination ones, then
    # compute the H3 cell of each destination.
    dfh3_start_filter = departures.drop(columns=["geometry", "h3_07"])
    dfh3_start_filter["geometry"] = dfh3_start_filter["end_geometry"]
    dfh3_end = dfh3_start_filter.h3.geo_to_h3(
        7,
        lat_col="journey_ending_latitude",
        lng_col="journey_ending_longitude",
        set_index=False,
    )

    # Journeys and distinct users per arrival cell; cells with 3 users or fewer are dropped.
    arrival_stats = dfh3_end.groupby(['h3_07']).agg(
        Count=('journey_id', 'nunique'),
        UserCount=('moover_id', 'nunique'),
    )
    drawgeoframe_end = arrival_stats[arrival_stats["UserCount"] > 3]

    # Centre of each arrival cell, saved in center_geom before the default
    # geometry is replaced by the hexagon boundary.
    drawgeoframe_end = drawgeoframe_end.h3.h3_to_geo()
    drawgeoframe_end["center_geom"] = drawgeoframe_end["geometry"]
    drawgeoframe_end = drawgeoframe_end.h3.h3_to_geo_boundary()

    # Line from the centre of the departure cell to the centre of each arrival
    # cell, used by the lines visualisation.
    drawgeoframe_end['geom'] = drawgeoframe_end['center_geom'].apply(
        lambda cell_center: LineString([fixed_point, cell_center])
    )

    # Colour scale local to this departure cell: 0 .. its busiest arrival cell.
    colormap = cm.LinearColormap(["green", "yellow", "red"], vmin=0, vmax=drawgeoframe_end["Count"].max())
    drawgeoframe_end["color"] = drawgeoframe_end["Count"].apply(lambda x: colormap(x)[:-2])

    arrival_file_stem = OUTPUT_FOLDER + "/arrival_" + start_h3
    hexagon_layer = drawgeoframe_end[["geometry", "color", "Count", 'UserCount']]
    hexagon_layer.to_file(arrival_file_stem + ".geojson", driver="GeoJSON")
    line_layer = drawgeoframe_end[["geom", "color", "Count", "UserCount"]].rename(columns={"geom": "geometry"})
    line_layer.to_file(arrival_file_stem + "_lines.geojson", driver="GeoJSON")
    return drawgeoframe_end

# Generate the arrival files for every departure cell kept in drawgeoframe_start.
departure_cells = drawgeoframe_start.reset_index()["h3_07"].tolist()
for start_h3 in departure_cells:
    gen_end_map(start_h3)

