In [53]:
from datetime import datetime, timedelta
from google.transit import gtfs_realtime_pb2
import pandas as pd
import requests

### Parse GTFS (Update)

In [167]:
def parse_gtfs_rt() -> pd.DataFrame:
    """
    Parse GTFS-RT feed data and return a DataFrame.
    Returns:
        pd.DataFrame: A DataFrame containing the parsed GTFS-RT feed data with columns:
            - 'Trip ID': The ID of the trip.
            - 'Departure Time': The departure time of the trip.
            - 'Departure Date': The departure date of the trip.
            - 'Stops': A list of stops with their respective arrival and departure times and delays.
    """
    
    # Feed URL
    url = "https://proxy.transport.data.gouv.fr/resource/sncf-tgv-gtfs-rt-trip-updates"
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

    # Get feed instance
    feed = gtfs_realtime_pb2.FeedMessage()
    feed.ParseFromString(response.content)


    # Get stop id dictionary
    df_stops = pd.read_csv("../datas/gtfs/stops.txt")
    df_stops = df_stops[['stop_id', 'stop_name']]
    # Get stop dictionary
    stop_dict = dict(zip(df_stops['stop_id'], df_stops['stop_name']))
       
    
    # Initialize data
    data = []

    # Iterate over entities
    for entity in feed.entity:
        trip_id = entity.trip_update.trip.trip_id
        departure_time = entity.trip_update.trip.start_time
        departure_date = entity.trip_update.trip.start_date


        # Initialize stops and iterate over them
        stops = []
        for stop in entity.trip_update.stop_time_update:
            stop_id = stop.stop_id
            # Get stop_name based on stop_id dictionary
            stop_name = stop_dict.get(stop_id, stop_id)

            stop_arrival_time = stop.arrival.time
            stop_arrival_delay = stop.arrival.delay
            stop_departure_time = stop.departure.time
            stop_departure_delay = stop.departure.delay

            # Convert stop times to datetime
            if stop_arrival_time != 0:
                stop_arrival_time = datetime.fromtimestamp(stop_arrival_time).strftime('%H:%M:%S')
            if stop_departure_time != 0:
                stop_departure_time = datetime.fromtimestamp(stop_departure_time).strftime('%H:%M:%S')


            stops.append([stop_name, stop_arrival_time, stop_arrival_delay, stop_departure_time, stop_departure_delay])



        # Last stop is terminus, so arrival time:
        arrival_time = stops[-1][1]

        # Append data    
        data.append([trip_id, departure_date, departure_time, arrival_time, stops])




    # Return DataFrame
    return pd.DataFrame(data, columns=['Trip ID', 'Departure Date', 'Departure Time', 'Arrival Time', 'Stops'])

In [168]:
# Get DF raw
df = parse_gtfs_rt()
df.tail()

Unnamed: 0,Trip ID,Departure Date,Departure Time,Arrival Time,Stops
172,OCESN2738F3165320:2024-08-01T00:28:09Z,20240811,14:15:00,15:01:00,"[[Reims, 0, 0, 14:15:00, 0], [Paris Est, 15:01..."
173,OCESN7124F3229888:2024-08-06T00:26:38Z,20240811,14:16:00,16:08:00,"[[Valenciennes, 0, 0, 14:16:00, 0], [Douai, 14..."
174,OCESN8485F3268103:2024-08-11T00:28:08Z,20240811,14:16:00,17:42:00,"[[Paris Montparnasse Hall 1 - 2, 0, 0, 14:16:0..."
175,OCESN9775F3061460:2024-08-11T00:28:08Z,20240811,14:18:00,17:29:00,"[[Paris Gare de Lyon Hall 1 - 2, 0, 0, 14:18:0..."
176,OCESN7672F3062801:2024-08-11T00:28:08Z,20240811,14:21:00,18:52:00,"[[Toulouse Matabiau, 0, 0, 14:21:00, 0], [Mont..."


### Clean DF

In [169]:
def clean_gtfs_dataframe(df:pd.DataFrame) -> pd.DataFrame:
    """
    Clean the GTFS dataframe by performing the following operations:
    1. Replace Trip ID values by circulation nb.
    2. Drop unnecessary columns.
    3. Remove rows with missing trip_headsign values.
    4. Convert trip_headsign column to integer.
    5. Convert Departure Date column to datetime.
    6. Reorganize the columns order.
    Parameters:
        df (pd.DataFrame): The input GTFS dataframe to be cleaned.
    Returns:
        pd.DataFrame: The cleaned GTFS dataframe.
    """
    # Get Origin & Destination
    df["Origin"] = df["Stops"].apply(lambda x: x[0][0])
    df["Destination"] = df["Stops"].apply(lambda x: x[-1][0])


    # Replace Trip ID values by circulation nb
    df_trips = pd.read_csv("../datas/gtfs/trips.txt")
    df_trips = df_trips[['trip_id', 'trip_headsign']]
    df = pd.merge(df, df_trips, left_on='Trip ID', right_on='trip_id', how='left')

    # Drop columns
    df = df.drop(columns=['Trip ID', "trip_id"])
    df.dropna(subset=['trip_headsign'], inplace=True)

    # Convert trip_headsign to int
    df['trip_headsign'] = df['trip_headsign'].astype('int')

    # Convert Departure Date to datetime
    df['Departure Date'] = pd.to_datetime(df['Departure Date'], format='%Y%m%d')



    # Reorganize columns order
    df = df[['trip_headsign', 'Departure Date', 'Departure Time', 'Origin', 'Arrival Time', 'Destination', 'Stops']]

    return df

In [170]:
# Get DF cleaned
df_cleaned = clean_gtfs_dataframe(df)
df_cleaned.tail(30)

Unnamed: 0,trip_headsign,Departure Date,Departure Time,Origin,Arrival Time,Destination,Stops
145,8378,2024-08-11,13:47:00,La Rochelle,16:54:00,Paris Montparnasse Hall 1 - 2,"[[La Rochelle, 0, 0, 13:47:00, 0], [Surgères, ..."
146,9752,2024-08-11,13:47:00,Marseille Saint-Charles,18:50:00,Lausanne,"[[Marseille Saint-Charles, 0, 0, 13:47:00, 0],..."
147,6204,2024-08-11,13:50:00,Montpellier Saint-Roch,17:18:00,Paris Gare de Lyon Hall 1 - 2,"[[Montpellier Saint-Roch, 0, 0, 13:50:00, 0], ..."
148,6620,2024-08-11,13:50:00,Lyon Perrache,16:08:00,Paris Gare de Lyon Hall 1 - 2,"[[Lyon Perrache, 0, 0, 13:50:00, 0], [Lyon Par..."
149,7371,2024-08-11,13:52:00,Paris Gare du Nord,16:04:00,Dunkerque,"[[Paris Gare du Nord, 0, 0, 13:52:00, 0], [Arr..."
150,7571,2024-08-11,13:52:00,Paris Gare du Nord,16:41:00,Rang-du-Fliers - Verton - Berck,"[[Paris Gare du Nord, 0, 0, 13:52:00, 0], [Arr..."
151,2369,2024-08-11,13:53:00,Paris Est,16:27:00,Colmar,"[[Paris Est, 0, 0, 13:53:00, 0], [Strasbourg, ..."
152,6617,2024-08-11,13:54:00,Paris Gare de Lyon Hall 1 - 2,16:10:00,Lyon Perrache,"[[Paris Gare de Lyon Hall 1 - 2, 0, 0, 13:54:0..."
154,8922,2024-08-11,13:54:00,Le Croisic,17:22:00,Paris Montparnasse Hall 1 - 2,"[[Le Croisic, 0, 0, 13:54:00, 0], [La Baule-Es..."
155,7324,2024-08-11,13:56:00,Dunkerque,16:08:00,Paris Gare du Nord,"[[Dunkerque, 0, 0, 13:56:00, 0], [Hazebrouck, ..."


### Departures by station

In [174]:
chosen_station = 'Montpellier Saint-Roch'

In [179]:
import pandas as pd
from datetime import datetime

def display_station_info(chosen_station: str, train_display_nb: int = 10, filter_from_now=True) -> pd.DataFrame:
    """
    Display the next train departures from a given station.
    """

    datas = []

    # Iterate through each row in the DataFrame
    for index, row in df_cleaned.iterrows():
        trip_headsign = row['trip_headsign']
        stops = row['Stops']
    


    
        # Iterate through each stop in Stops
        for stop in stops:
            station, arrival_time, arrival_delay, departure_time, departure_delay = stop
            if station == chosen_station:
                datas.append({
                    'trip_headsign': trip_headsign,
                    'arrival_time': arrival_time,
                    'arrival_delay': arrival_delay,
                    'departure_time': departure_time,
                    'departure_delay': departure_delay
                })
    
    # Convert the list to a new DataFrame
    df = pd.DataFrame(datas, columns=['trip_headsign', 'arrival_time', 'arrival_delay', 'departure_time', 'departure_delay'])
    
    # Filter according to the next 'train_display_nb' trains
    df = df.head(train_display_nb)

    # Generate df_departures and df_arrivals
    df_departures = df.drop(columns=['arrival_time', 'arrival_delay'])
    df_departures = df_departures[df_departures['departure_time'] != 0]
    df_departures['departure_time'] = pd.to_datetime(df_departures['departure_time'], format='%H:%M:%S').dt.time

    df_arrivals = df.drop(columns=['departure_time', 'departure_delay'])
    df_arrivals = df_arrivals[df_arrivals['arrival_time'] != 0]
    df_arrivals['arrival_time'] = pd.to_datetime(df_arrivals['arrival_time'], format='%H:%M:%S').dt.time

    if filter_from_now:
        # Get the current time without microseconds
        now = datetime.now().time()

        # Filter departures and arrivals to include only times from now
        df_departures = df_departures[df_departures['departure_time'] >= now]
        df_arrivals = df_arrivals[df_arrivals['arrival_time'] >= now]


    return df_departures, df_arrivals

In [180]:
df_departures, df_arrivals = display_station_info(chosen_station, filter_from_now=True)
df_departures

Unnamed: 0,trip_headsign,departure_time,departure_delay
6,7885,14:17:00,0
8,6204,13:50:00,0
9,2204,13:57:00,0


### Next Trains

In [6]:
def get_next_trains(df: pd.DataFrame, nb_trains = 10, station = "all") -> pd.DataFrame:
    """
    Parameters:
    - df (pd.DataFrame): The DataFrame containing train data.
    - nb_trains (int): The number of next trains to retrieve (default: 10).
    - station (str): The station to filter the trains by (default: "all").
    Returns:
    - pd.DataFrame: The DataFrame containing the next trains.
    """
    now = datetime.now().strftime('%H:%M:%S')
    df_filtered = df[df['Departure Time'] >= now]

    if station in df_filtered['Origin'].unique():
        df_filtered = df_filtered[df_filtered['Origin'] == station]
        next_trains = df_filtered.head(nb_trains)
    else:
        next_trains = df_filtered.head(nb_trains)

    return next_trains

In [7]:
origin_stations = set(df_cleaned['Origin'].unique())
available_stations = list(origin_stations)
for station in df_cleaned['Destination'].unique():
    if station not in origin_stations:
        available_stations.append(station)
available_stations = sorted(available_stations)
available_stations

['Annecy',
 'Basel SBB',
 'Bordeaux Saint-Jean',
 'Brest',
 'Bruxelles Midi',
 'Dunkerque',
 'Francfort sur le Main',
 'Geneve',
 'Grenoble',
 'Hendaye',
 'La Rochelle',
 'Le Croisic',
 'Le Havre',
 "Les Sables-d'Olonne",
 'Lille Europe',
 'Lille Flandres',
 'Lyon Part Dieu',
 'Lyon Perrache',
 'MILANO PORTA GARIBALDI',
 'Mannheim Hbf',
 'Marseille Saint-Charles',
 'Metz',
 'Miramas',
 'Modane',
 'Montpellier Saint-Roch',
 'Montpellier Sud de France',
 'Mulhouse',
 'Munich',
 'Nancy',
 'Nantes',
 'Nice-Ville',
 'Paris Est',
 'Paris Gare de Lyon Hall 1 - 2',
 'Paris Gare du Nord',
 'Paris Montparnasse Hall 1 - 2',
 'Perpignan',
 'Quimper',
 'Rang-du-Fliers - Verton - Berck',
 'Reims',
 'Rennes',
 'Saint-Gervais-les-Bains-Le Fayet',
 'Saint-Étienne Châteaucreux',
 'Strasbourg',
 'Stuttgart Hbf',
 'Tarbes',
 'Toulouse Matabiau',
 'Tourcoing',
 'Zuerich HB',
 'Évian-les-Bains']

In [8]:
# Get next rains by station
next_trains = get_next_trains(df_cleaned, nb_trains = 10, station = "Paris Gare du Nord")
next_trains

Unnamed: 0,trip_headsign,Departure Date,Departure Time,Origin,Arrival Time,Destination,Stops
92,7297,2024-08-03,22:22:00,Paris Gare du Nord,00:05:00,Tourcoing,"[[Paris Gare du Nord, 0, 0, 22:22:00, 0], [Arr..."


### Train Infos

In [9]:
def get_train_informations(circulation_nb:int, df: pd.DataFrame) -> pd.DataFrame:
    """
    Retrieve train information based on the circulation number.
    Parameters:
    - circulation_nb (int): The circulation number of the train.
    - df (pd.DataFrame): The DataFrame containing the train information.
    Returns:
    - pd.DataFrame: A DataFrame containing the train stops information, including stop ID, arrival time, arrival delay, departure time, and departure delay.
    """

    df = df[df['trip_headsign'] == circulation_nb]
 
    # Decompile Stops
    stops = []
    for stops_list in df["Stops"]:
        for elt in stops_list:
            stop_name = elt[0]
            stop_arrival_time = elt[1]
            stop_arrival_delay = elt[2]
            stop_departure_time = elt[3]
            stop_departure_delay = elt[4]

            stops.append([stop_name, stop_arrival_time, stop_arrival_delay, stop_departure_time, stop_departure_delay])

    # Create DataFrame
    df = pd.DataFrame(stops, columns=['Stop Name', 'Arrival Time', 'Arrival Delay (secs)', 'Departure Time', 'Departure Delay (secs)'])

    return df

In [12]:
# Get train infos
train = get_train_informations(9882, df_cleaned)
train

Unnamed: 0,Stop Name,Arrival Time,Arrival Delay (secs),Departure Time,Departure Delay (secs)
0,Marseille Saint-Charles,0,0,17:12:00,0
1,Aix-en-Provence TGV,17:28:00,300,17:31:00,300
2,Avignon TGV,17:51:00,300,17:54:00,300
3,Lyon Part Dieu,18:54:00,0,19:00:00,0
4,Marne-la-Vallée Chessy,20:47:00,0,20:51:00,0
5,Aéroport Charles de Gaulle 2 TGV,21:01:00,0,21:07:00,0
6,Lille Europe,21:58:00,0,22:14:00,300
7,Bruxelles Midi,22:48:00,300,0,0
