In [13]:
from datetime import datetime, timedelta
from google.transit import gtfs_realtime_pb2
import pandas as pd
import requests

### Parse GTFS (Update)

In [7]:
def parse_gtfs_rt(
    url: str = "https://proxy.transport.data.gouv.fr/resource/sncf-tgv-gtfs-rt-trip-updates",
    stops_path: str = "../datas/gtfs/stops.txt",
    timeout: float = 30.0,
) -> pd.DataFrame:
    """
    Download a GTFS-RT trip-updates feed and flatten it into one row per trip.

    Parameters:
        url (str): Address of the GTFS-RT protobuf feed to download.
        stops_path (str): Path to the GTFS ``stops.txt`` file used to translate
            stop ids into stop names.
        timeout (float): Maximum number of seconds to wait for the HTTP request.

    Returns:
        pd.DataFrame: One row per trip update, with columns:
            - 'Trip ID': The ID of the trip.
            - 'Departure Date': The start date of the trip as given by the feed.
            - 'Departure Time': The start time of the trip as given by the feed.
            - 'Arrival Time': The arrival time at the last listed stop.
            - 'Stops': A list of
              [stop name, arrival time, arrival delay, departure time, departure delay]
              entries, one per stop-time update.
        An empty DataFrame with these columns is returned when the download fails.

    Notes:
        - Stop times are formatted as 'HH:MM:SS' in the local time zone of the
          machine running the code (``datetime.fromtimestamp`` without a tz).
        - A stop time that is absent from the feed is left as 0.
        - Entities without a trip update, or with no stop-time update at all,
          are skipped because they carry no origin, terminus or times.
    """
    columns = ['Trip ID', 'Departure Date', 'Departure Time', 'Arrival Time', 'Stops']

    # Fetch the feed; the timeout prevents the call from hanging forever
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        # Keep the expected columns so downstream column access does not fail
        return pd.DataFrame(columns=columns)

    # Decode the protobuf payload
    feed = gtfs_realtime_pb2.FeedMessage()
    feed.ParseFromString(response.content)

    # Build the stop_id -> stop_name lookup
    df_stops = pd.read_csv(stops_path)
    stop_dict = dict(zip(df_stops['stop_id'], df_stops['stop_name']))

    def format_stop_time(timestamp):
        """Format an epoch timestamp as 'HH:MM:SS'; 0 (time not set) is kept as is."""
        if timestamp == 0:
            return timestamp
        return datetime.fromtimestamp(timestamp).strftime('%H:%M:%S')

    data = []
    for entity in feed.entity:
        # Ignore entities that are not trip updates
        if not entity.HasField('trip_update'):
            continue
        trip_update = entity.trip_update

        stops = [
            [
                # Fall back to the raw id when the stop is unknown to stops.txt
                stop_dict.get(stop.stop_id, stop.stop_id),
                format_stop_time(stop.arrival.time),
                stop.arrival.delay,
                format_stop_time(stop.departure.time),
                stop.departure.delay,
            ]
            for stop in trip_update.stop_time_update
        ]

        # Without any stop there is no terminus to read the arrival time from
        if not stops:
            continue

        # Last stop is the terminus, so its arrival time is the trip arrival time
        arrival_time = stops[-1][1]

        data.append([
            trip_update.trip.trip_id,
            trip_update.trip.start_date,
            trip_update.trip.start_time,
            arrival_time,
            stops,
        ])

    return pd.DataFrame(data, columns=columns)

In [8]:
# Fetch the feed and build the raw trip DataFrame, then preview its last rows
df = parse_gtfs_rt()
df.tail()

Unnamed: 0,Trip ID,Departure Date,Departure Time,Arrival Time,Stops
178,OCESN5316F3160808:2024-08-10T00:26:19Z,20240810,15:56:00,22:24:00,"[[Le Havre, 0, 0, 15:56:00, 0], [Rouen Rive Dr..."
179,OCESN7226F2993648:2024-08-05T00:28:13Z,20240810,15:56:00,17:15:00,"[[Lille Europe, 0, 0, 15:56:00, 0], [Arras, 16..."
180,OCESN7228F3099391:2024-08-05T00:28:13Z,20240810,15:56:00,17:15:00,"[[Lille Europe, 0, 0, 15:56:00, 0], [Arras, 16..."
181,OCESN9896F3338294:2024-06-22T00:38:13Z,20240810,15:56:00,23:02:00,"[[Marseille Saint-Charles, 0, 0, 15:56:00, 0],..."
182,OCESN5444F3087948:2024-08-07T00:26:45Z,20240810,15:57:00,21:29:00,"[[Bordeaux Saint-Jean, 0, 0, 15:57:00, 0], [An..."


### Clean DF

In [4]:
def clean_gtfs_dataframe(df:pd.DataFrame, trips_path: str = "./gtfs/export_gtfs_voyages/trips.txt") -> pd.DataFrame:
    """
    Clean the GTFS dataframe by performing the following operations:
    1. Add 'Origin' and 'Destination' (names of the first and last stop).
    2. Replace Trip ID values by the circulation number (trip_headsign).
    3. Drop the 'Trip ID' / 'trip_id' columns.
    4. Remove rows with missing trip_headsign values.
    5. Convert the trip_headsign column to integer.
    6. Convert the Departure Date column to datetime.
    7. Reorganize the columns order.

    The input dataframe is left untouched; a new dataframe is returned.

    Parameters:
        df (pd.DataFrame): The input GTFS dataframe to be cleaned. Expected columns:
            'Trip ID', 'Departure Date', 'Departure Time', 'Arrival Time', 'Stops'.
        trips_path (str): Path to the GTFS ``trips.txt`` file providing the
            'trip_id' -> 'trip_headsign' mapping.
    Returns:
        pd.DataFrame: The cleaned GTFS dataframe with columns 'trip_headsign',
        'Departure Date', 'Departure Time', 'Origin', 'Arrival Time',
        'Destination', 'Stops'. An empty input yields an empty dataframe
        with these columns.
    """
    output_columns = ['trip_headsign', 'Departure Date', 'Departure Time', 'Origin', 'Arrival Time', 'Destination', 'Stops']

    # Nothing to clean (e.g. the feed could not be fetched)
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    # Get Origin & Destination on a copy so the caller's frame is not modified;
    # a trip with an empty stop list gets None instead of raising
    df = df.assign(
        Origin=df["Stops"].apply(lambda stops: stops[0][0] if stops else None),
        Destination=df["Stops"].apply(lambda stops: stops[-1][0] if stops else None),
    )

    # Replace Trip ID values by circulation nb
    df_trips = pd.read_csv(trips_path)[['trip_id', 'trip_headsign']]
    df = pd.merge(df, df_trips, left_on='Trip ID', right_on='trip_id', how='left')

    df = (
        df
        # Drop the join keys, then the trips that have no circulation nb
        .drop(columns=['Trip ID', 'trip_id'])
        .dropna(subset=['trip_headsign'])
        # Convert dtypes: circulation nb to int, departure date to datetime
        .assign(**{
            'trip_headsign': lambda frame: frame['trip_headsign'].astype('int'),
            'Departure Date': lambda frame: pd.to_datetime(frame['Departure Date'], format='%Y%m%d'),
        })
    )

    # Reorganize columns order
    return df[output_columns]

In [11]:
# Clean the raw frame with clean_gtfs_dataframe and preview the first 30 rows
df_cleaned = clean_gtfs_dataframe(df)
df_cleaned.head(30)

Unnamed: 0,trip_headsign,Departure Date,Departure Time,Origin,Arrival Time,Destination,Stops
0,5537,2024-08-03,12:26:00,Nancy,22:28:00,Nice-Ville,"[[Nancy, 0, 0, 12:26:00, 0], [Strasbourg, 13:2..."
1,2239,2024-08-03,13:37:00,Strasbourg,22:28:00,Nice-Ville,"[[Strasbourg, 0, 0, 14:07:00, 1800], [Colmar, ..."
2,9580,2024-08-03,14:39:00,Mannheim Hbf,21:46:00,Marseille Saint-Charles,"[[Mannheim Hbf, 0, 0, 14:49:00, 600], [Karlsru..."
3,9255,2024-08-03,14:48:00,Paris Gare de Lyon Hall 1 - 2,23:10:00,MILANO PORTA GARIBALDI,"[[Paris Gare de Lyon Hall 1 - 2, 0, 0, 14:48:0..."
4,9250,2024-08-03,15:10:00,MILANO PORTA GARIBALDI,00:34:00,Paris Gare de Lyon Hall 1 - 2,"[[MILANO PORTA GARIBALDI, 0, 0, 15:10:00, 0], ..."
5,2096,2024-08-03,15:56:00,Marseille Saint-Charles,22:07:00,Strasbourg,"[[Marseille Saint-Charles, 0, 0, 15:56:00, 0],..."
6,5316,2024-08-03,15:56:00,Le Havre,22:24:00,Marseille Saint-Charles,"[[Le Havre, 0, 0, 15:56:00, 0], [Rouen Rive Dr..."
7,9896,2024-08-03,15:56:00,Marseille Saint-Charles,23:02:00,Metz,"[[Marseille Saint-Charles, 0, 0, 15:56:00, 0],..."
9,6168,2024-08-03,16:03:00,Nice-Ville,21:50:00,Paris Gare de Lyon Hall 1 - 2,"[[Nice-Ville, 0, 0, 16:03:00, 0], [Antibes, 16..."
10,8544,2024-08-03,16:05:00,Hendaye,21:45:00,Paris Montparnasse Hall 1 - 2,"[[Hendaye, 0, 0, 16:35:00, 1800], [Saint-Jean-..."


### Next Trains

In [6]:
def get_next_trains(df: pd.DataFrame, nb_trains = 10, station = "all", now = None) -> pd.DataFrame:
    """
    Return the first upcoming trains, optionally restricted to one origin station.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing train data. Must provide the
      'Departure Time' ('HH:MM:SS' strings) and 'Origin' columns.
    - nb_trains (int): The number of next trains to retrieve (default: 10).
    - station (str): The origin station to filter the trains by. "all" (default)
      or None disables the filter.
    - now (str): Reference time as 'HH:MM:SS'. Defaults to the current local time.
    Returns:
    - pd.DataFrame: The first `nb_trains` rows of `df`, in their existing order,
      whose departure time is at or after `now`. When a station is requested and
      has no upcoming departure, the result is empty rather than falling back to
      trains from every station.

    Notes:
    - Times are compared as strings and 'Departure Date' is not taken into account.
    """
    if now is None:
        now = datetime.now().strftime('%H:%M:%S')

    upcoming = df[df['Departure Time'] >= now]

    # Only filter when a specific station is requested
    if station is not None and station != "all":
        upcoming = upcoming[upcoming['Origin'] == station]

    return upcoming.head(nb_trains)

In [7]:
# Every station seen as an origin or as a destination, sorted by name
origin_stations = set(df_cleaned['Origin'].unique())
destination_stations = set(df_cleaned['Destination'].unique())
available_stations = sorted(origin_stations | destination_stations)
available_stations

['Annecy',
 'Basel SBB',
 'Bordeaux Saint-Jean',
 'Brest',
 'Bruxelles Midi',
 'Dunkerque',
 'Francfort sur le Main',
 'Geneve',
 'Grenoble',
 'Hendaye',
 'La Rochelle',
 'Le Croisic',
 'Le Havre',
 "Les Sables-d'Olonne",
 'Lille Europe',
 'Lille Flandres',
 'Lyon Part Dieu',
 'Lyon Perrache',
 'MILANO PORTA GARIBALDI',
 'Mannheim Hbf',
 'Marseille Saint-Charles',
 'Metz',
 'Miramas',
 'Modane',
 'Montpellier Saint-Roch',
 'Montpellier Sud de France',
 'Mulhouse',
 'Munich',
 'Nancy',
 'Nantes',
 'Nice-Ville',
 'Paris Est',
 'Paris Gare de Lyon Hall 1 - 2',
 'Paris Gare du Nord',
 'Paris Montparnasse Hall 1 - 2',
 'Perpignan',
 'Quimper',
 'Rang-du-Fliers - Verton - Berck',
 'Reims',
 'Rennes',
 'Saint-Gervais-les-Bains-Le Fayet',
 'Saint-Étienne Châteaucreux',
 'Strasbourg',
 'Stuttgart Hbf',
 'Tarbes',
 'Toulouse Matabiau',
 'Tourcoing',
 'Zuerich HB',
 'Évian-les-Bains']

In [8]:
# Get next trains by station (up to 10 upcoming departures from Paris Gare du Nord)
next_trains = get_next_trains(df_cleaned, nb_trains = 10, station = "Paris Gare du Nord")
next_trains

Unnamed: 0,trip_headsign,Departure Date,Departure Time,Origin,Arrival Time,Destination,Stops
92,7297,2024-08-03,22:22:00,Paris Gare du Nord,00:05:00,Tourcoing,"[[Paris Gare du Nord, 0, 0, 22:22:00, 0], [Arr..."


### Train Infos

In [9]:
def get_train_informations(circulation_nb:int, df: pd.DataFrame) -> pd.DataFrame:
    """
    List the stops of the train(s) matching a circulation number.
    Parameters:
    - circulation_nb (int): Value to look up in the 'trip_headsign' column.
    - df (pd.DataFrame): The DataFrame holding the trains, with a 'trip_headsign'
      column and a 'Stops' column of per-stop 5-item entries.
    Returns:
    - pd.DataFrame: One row per stop of every matching train, with columns
      'Stop Name', 'Arrival Time', 'Arrival Delay (secs)', 'Departure Time'
      and 'Departure Delay (secs)'. Empty when no train matches.
    """
    column_names = [
        'Stop Name',
        'Arrival Time',
        'Arrival Delay (secs)',
        'Departure Time',
        'Departure Delay (secs)',
    ]

    # Keep only the rows of the requested train
    matching_trips = df[df['trip_headsign'] == circulation_nb]

    # Flatten the nested stop lists into one row per stop (first five fields)
    rows = [
        [stop[position] for position in range(5)]
        for trip_stops in matching_trips["Stops"]
        for stop in trip_stops
    ]

    return pd.DataFrame(rows, columns=column_names)

In [12]:
# Get the stop-by-stop information for circulation number 9882
train = get_train_informations(9882, df_cleaned)
train

Unnamed: 0,Stop Name,Arrival Time,Arrival Delay (secs),Departure Time,Departure Delay (secs)
0,Marseille Saint-Charles,0,0,17:12:00,0
1,Aix-en-Provence TGV,17:28:00,300,17:31:00,300
2,Avignon TGV,17:51:00,300,17:54:00,300
3,Lyon Part Dieu,18:54:00,0,19:00:00,0
4,Marne-la-Vallée Chessy,20:47:00,0,20:51:00,0
5,Aéroport Charles de Gaulle 2 TGV,21:01:00,0,21:07:00,0
6,Lille Europe,21:58:00,0,22:14:00,300
7,Bruxelles Midi,22:48:00,300,0,0
