In [66]:
import pandas as pd
import numpy as np
from datetime import datetime, time
import networkx as nx

In [67]:
atac_state = pd.read_csv(r"../output_data/atac_state_full.csv")
atac_state['arrival_time'] = pd.to_datetime(atac_state['arrival_time'], format='%H:%M:%S', errors='coerce')
atac_state['departure_time'] = pd.to_datetime(atac_state['departure_time'], format='%H:%M:%S', errors='coerce')
# Filter out rows with invalid times (e.g., "25:00:00" becomes NaT)
atac_state = atac_state.dropna(subset=['arrival_time', 'departure_time'])

  atac_state = pd.read_csv(r"../output_data/atac_state_full.csv")


In [68]:
atac_state.dtypes

trip_id                        object
arrival_time           datetime64[ns]
departure_time         datetime64[ns]
stop_id                        object
stop_sequence                   int64
stop_headsign                 float64
shape_dist_traveled             int64
timepoint                       int64
route_id                       object
service_id                     object
trip_headsign                  object
dtype: object

In [69]:
atac_state.columns

Index(['trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence',
       'stop_headsign', 'shape_dist_traveled', 'timepoint', 'route_id',
       'service_id', 'trip_headsign'],
      dtype='object')

In [70]:
start_time_range = datetime.strptime('13:55:00', "%H:%M:%S")
end_time_range = datetime.strptime('16:00:00', "%H:%M:%S")

# Apply the filter
atac_state_100 = atac_state[(atac_state['arrival_time'] >= start_time_range) & (atac_state['arrival_time'] <= end_time_range) &
                 (atac_state['departure_time'] >= start_time_range) & (atac_state['departure_time'] <= end_time_range)]

atac_state_100 = atac_state_100[['trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence',
       'stop_headsign', 'shape_dist_traveled',
       'timepoint', 'route_id', 'service_id', 'trip_headsign']]
atac_state_100.to_csv("../output_data/atac_state_100.csv", index = False)

In [71]:
# Mapping trip_headsigns to stop_ids

In [72]:
atac_state_100.shape

(450951, 11)

In [29]:
atac_state_100.sort_values(by=['trip_id', 'stop_sequence'])

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,shape_dist_traveled,timepoint,route_id,service_id,trip_headsign
27067,0#1-7,1900-01-01 14:00:32,1900-01-01 14:00:32,72710,33,,19761,0,C11,0#31,CINECITTA' (MA)
27068,0#1-7,1900-01-01 14:01:04,1900-01-01 14:01:04,72711,34,,19992,0,C11,0#31,CINECITTA' (MA)
27069,0#1-7,1900-01-01 14:02:22,1900-01-01 14:02:22,72712,35,,20563,0,C11,0#31,CINECITTA' (MA)
27070,0#1-7,1900-01-01 14:02:56,1900-01-01 14:02:56,72713,36,,20809,0,C11,0#31,CINECITTA' (MA)
27071,0#1-7,1900-01-01 14:04:30,1900-01-01 14:04:30,72714,37,,21484,0,C11,0#31,CINECITTA' (MA)
...,...,...,...,...,...,...,...,...,...,...,...
3764595,1#999-9,1900-01-01 15:19:32,1900-01-01 15:19:32,71366,33,,9867,0,19L,1#58,VALLE GIULIA
3764596,1#999-9,1900-01-01 15:20:52,1900-01-01 15:20:52,71369,34,,10106,0,19L,1#58,VALLE GIULIA
3764597,1#999-9,1900-01-01 15:23:36,1900-01-01 15:23:36,71373,35,,10588,0,19L,1#58,VALLE GIULIA
3764598,1#999-9,1900-01-01 15:26:32,1900-01-01 15:26:32,71378,36,,11113,0,19L,1#58,VALLE GIULIA


In [119]:
df = atac_state_100.sort_values(by=['trip_id', 'stop_sequence']).drop(columns = ["stop_headsign","timepoint","service_id"])
df['next_stop_id'] = df.sort_values(["trip_id", "stop_sequence"]).groupby('trip_id')['stop_id'].shift(-1)
df['distance'] = df.sort_values(["trip_id", "stop_sequence"]).groupby('trip_id')['shape_dist_traveled'].diff()
df["departure_next_stop_time"] = df.sort_values(["trip_id", "stop_sequence"]).groupby('trip_id')['departure_time'].shift(-1)

df["estimated_trip_time"] = (df["departure_next_stop_time"]-df["departure_time"])
df["estimated_trip_time"] = df["estimated_trip_time"].apply(lambda x: x.total_seconds())
df["meters_per_second"] = df["distance"]/df["estimated_trip_time"]
df.reset_index(drop = True, inplace = True)
df.sort_values(["trip_id", "stop_sequence"], ascending = [False, False]).head(50)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,route_id,trip_headsign,next_stop_id,distance,departure_next_stop_time,estimated_trip_time,meters_per_second
450950,1#999-9,1900-01-01 15:28:00,1900-01-01 15:28:00,71842,37,11371,19L,VALLE GIULIA,,258.0,NaT,,
450949,1#999-9,1900-01-01 15:26:32,1900-01-01 15:26:32,71378,36,11113,19L,VALLE GIULIA,71842.0,525.0,1900-01-01 15:28:00,88.0,5.965909
450948,1#999-9,1900-01-01 15:23:36,1900-01-01 15:23:36,71373,35,10588,19L,VALLE GIULIA,71378.0,482.0,1900-01-01 15:26:32,176.0,2.738636
450947,1#999-9,1900-01-01 15:20:52,1900-01-01 15:20:52,71369,34,10106,19L,VALLE GIULIA,71373.0,239.0,1900-01-01 15:23:36,164.0,1.457317
450946,1#999-9,1900-01-01 15:19:32,1900-01-01 15:19:32,71366,33,9867,19L,VALLE GIULIA,71369.0,338.0,1900-01-01 15:20:52,80.0,4.225
450945,1#999-9,1900-01-01 15:17:38,1900-01-01 15:17:38,70546,32,9529,19L,VALLE GIULIA,71366.0,289.0,1900-01-01 15:19:32,114.0,2.535088
450944,1#999-9,1900-01-01 15:16:02,1900-01-01 15:16:02,72086,31,9240,19L,VALLE GIULIA,70546.0,373.0,1900-01-01 15:17:38,96.0,3.885417
450943,1#999-9,1900-01-01 15:13:56,1900-01-01 15:13:56,71363,30,8867,19L,VALLE GIULIA,72086.0,529.0,1900-01-01 15:16:02,126.0,4.198413
450942,1#999-9,1900-01-01 15:10:58,1900-01-01 15:10:58,71359,29,8338,19L,VALLE GIULIA,71363.0,270.0,1900-01-01 15:13:56,178.0,1.516854
450941,1#999-9,1900-01-01 15:09:26,1900-01-01 15:09:26,72089,28,8068,19L,VALLE GIULIA,71359.0,245.0,1900-01-01 15:10:58,92.0,2.663043


In [120]:
df.sort_values("meters_per_second", ascending = False).head(50)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,route_id,trip_headsign,next_stop_id,distance,departure_next_stop_time,estimated_trip_time,meters_per_second
295564,1#22-5,1900-01-01 15:32:18,1900-01-01 15:32:18,72401,25,49546,C4,CIMITERO FLAMINIO,72402,37852.0,1900-01-01 15:32:30,12.0,3154.333333
54132,0#21-5,1900-01-01 15:32:18,1900-01-01 15:32:18,72401,25,49546,C4,CIMITERO FLAMINIO,72402,37852.0,1900-01-01 15:32:30,12.0,3154.333333
300075,1#23-5,1900-01-01 15:32:18,1900-01-01 15:32:18,72401,25,49546,C4,CIMITERO FLAMINIO,72402,37852.0,1900-01-01 15:32:30,12.0,3154.333333
48057,0#20-5,1900-01-01 15:32:18,1900-01-01 15:32:18,72401,25,49546,C4,CIMITERO FLAMINIO,72402,37852.0,1900-01-01 15:32:30,12.0,3154.333333
105915,0#31-4,1900-01-01 15:44:40,1900-01-01 15:44:40,20033,54,46091,C7,CIMITERO FLAMINIO,78527,25441.0,1900-01-01 15:45:00,20.0,1272.05
362539,1#33-4,1900-01-01 15:44:40,1900-01-01 15:44:40,20033,54,46091,C7,CIMITERO FLAMINIO,78527,25441.0,1900-01-01 15:45:00,20.0,1272.05
355686,1#32-4,1900-01-01 15:44:40,1900-01-01 15:44:40,20033,54,46091,C7,CIMITERO FLAMINIO,78527,25441.0,1900-01-01 15:45:00,20.0,1272.05
100126,0#30-4,1900-01-01 15:44:40,1900-01-01 15:44:40,20033,54,46091,C7,CIMITERO FLAMINIO,78527,25441.0,1900-01-01 15:45:00,20.0,1272.05
178511,0#4987-12,1900-01-01 15:56:18,1900-01-01 15:56:18,82952,7,10766,712,GIOJA,72110,3754.0,1900-01-01 15:56:24,6.0,625.666667
334158,1#2886-12,1900-01-01 15:56:18,1900-01-01 15:56:18,82952,8,10953,712,GIOJA,72110,3754.0,1900-01-01 15:56:26,8.0,469.25


In [121]:
# Assuming you have a DataFrame called 'df' with columns like 'source', 'target', and 'route_id'
# Group the edges by 'route_id' and count the occurrences
route_counts = df.groupby('route_id').size().reset_index(name='count')
# Sort the routes by count in descending order to find the most common ones
most_common_routes = route_counts.sort_values(by='count', ascending=False)
# Print the top N most common routes (adjust N as needed)
N = 100  # Change this to the number of most common routes you want to retrieve
top_routes = most_common_routes.head(N).reset_index(drop=True)
print("Top", N, "most common routes:")
print(top_routes)

Top 100 most common routes:
   route_id  count
0      8BUS   7197
1        63   4848
2        87   4574
3        85   4494
4       716   4444
..      ...    ...
95       01   2057
96      781   2044
97      715   2042
98       53   2020
99      905   2000

[100 rows x 2 columns]


In [122]:
dfopt = df.merge(top_routes.drop_duplicates(), on = "route_id", how = "inner")

In [129]:
dfopt.shape

(298577, 14)

In [128]:
dfopt["next_stop_id"].isna().sum()

12951

In [123]:
dfopt.reset_index(drop = True, inplace = True)

In [124]:
dfopt.sort_values(by=['departure_time','meters_per_second'], ascending = [True, False])

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,route_id,trip_headsign,next_stop_id,distance,departure_next_stop_time,estimated_trip_time,meters_per_second,count
903,0#990-11,1900-01-01 13:55:00,1900-01-01 13:55:00,75090,31,11258,170,AGRICOLTURA,,,NaT,,,2814
2182,1#967-11,1900-01-01 13:55:00,1900-01-01 13:55:00,75090,31,11258,170,AGRICOLTURA,,,NaT,,,2814
3944,0#1024-8,1900-01-01 13:55:00,1900-01-01 13:55:00,71317,42,12119,19L,GERANI,,,NaT,,,2336
4094,1#1001-8,1900-01-01 13:55:00,1900-01-01 13:55:00,71317,42,12119,19L,GERANI,,,NaT,,,2336
5905,0#1132-15,1900-01-01 13:55:00,1900-01-01 13:55:00,20501,24,16213,20,CAMBELLOTTI,,,NaT,,,3666
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278995,0#597-10,1900-01-01 16:00:00,1900-01-01 16:00:00,80685,34,15381,058,PONTE MAMMOLO (MB),,433.0,NaT,,,2111
280055,1#597-10,1900-01-01 16:00:00,1900-01-01 16:00:00,80685,34,15381,058,PONTE MAMMOLO (MB),,433.0,NaT,,,2111
291357,0#905-10,1900-01-01 16:00:00,1900-01-01 16:00:00,74416,37,13092,160,MONTAGNOLA,,145.0,NaT,,,3000
292857,1#882-10,1900-01-01 16:00:00,1900-01-01 16:00:00,74416,37,13092,160,MONTAGNOLA,,145.0,NaT,,,3000


In [130]:
# Create a new DataFrame with 'stop_id' as source and 'next_stop_id' as destination
df_edges = dfopt.copy()

In [131]:
df_edges.reset_index(drop = True, inplace = True)

In [132]:
df_edges.head(3)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,route_id,trip_headsign,next_stop_id,distance,departure_next_stop_time,estimated_trip_time,meters_per_second,count
0,0#1000-11,1900-01-01 13:56:32,1900-01-01 13:56:32,70078,24,8874,170,TERMINI (MA-MB-FS),70079,,1900-01-01 13:59:12,160.0,,2814
1,0#1000-11,1900-01-01 13:59:12,1900-01-01 13:59:12,70079,25,9433,170,TERMINI (MA-MB-FS),70081,559.0,1900-01-01 14:00:18,66.0,8.469697,2814
2,0#1000-11,1900-01-01 14:00:18,1900-01-01 14:00:18,70081,26,9664,170,TERMINI (MA-MB-FS),70082,231.0,1900-01-01 14:01:34,76.0,3.039474,2814


In [133]:
# Create a directed graph
G = nx.DiGraph()

# Add edges with 'stop_id' as source, 'next_stop_id' as target, and 'distance' as an attribute
edges = df_edges[['stop_id', 'next_stop_id', 'distance']].values
G.add_weighted_edges_from(edges, weight='distance')

In [199]:
def greedy_dumb_search(start_node, path_length =20, max_search=20):# Assuming you already have a graph G with nodes, edges, and weights
    # Choose a starting node
    # Initialize variables
    current_node = start_node
    longest_path = []
    visited = set()
    edge_count = 0
    total_distance = 0
    # Continue until at least 2 edges are added or we reach 20 nodes
    while edge_count < path_length and len(visited) < max_search:
        # Mark the current node as visited
        visited.add(current_node)
        # Find neighbors of the current node with the highest edge weights
        neighbors = list(G.neighbors(current_node))
        neighbors = [i for i in neighbors if not math.isnan(i)]
        max_weight = -1
        next_node = None
    
        for neighbor in neighbors:
            print(neighbor)
            if neighbor not in visited:
                weight = G[current_node][neighbor]['distance']
                if weight > max_weight:
                    max_weight = weight
                    next_node = neighbor
    
        # If there is a neighbor with a higher weight, add it to the path
        if next_node is not None:
            longest_path.extend([current_node, next_node])
            current_node = next_node
            edge_count += 1
            total_distance += max_weight
        else:
            # No more neighbors to add, break the loop
            break
    return total_distance, longest_path

float

In [149]:
G[73886][70031]['distance']

KeyError: 70031

In [193]:
neighbors = [70034, 70037, "na"]
neighbors.remove("nan")

ValueError: list.remove(x): x not in list

In [202]:
import math

In [205]:
# Assuming you already have a graph G with nodes, edges, and weights
# Choose a starting node
start_node = 70032  # Replace with your desired starting node

# Initialize variables
current_node = start_node
longest_path = []
visited = set()
edge_count = 0
total_distance = 0
# Continue until at least 2 edges are added or we reach 20 nodes
while edge_count < 20 and len(visited) < 20:
    # Mark the current node as visited
    visited.add(current_node)

    # Find neighbors of the current node with the highest edge weights
    neighbors = list(G.neighbors(current_node))
    neighbors = [i for i in neighbors if not math.isnan(i)]
    print("current_neighbors", neighbors)
    max_weight = -1
    next_node = None

    for neighbor in neighbors:
        print(neighbor)
        if neighbor not in visited:
            weight = G[current_node][neighbor]['distance']

            if weight > max_weight:
                max_weight = weight
                next_node = neighbor

    # If there is a neighbor with a higher weight, add it to the path
    if next_node is not None:
        longest_path.extend([current_node, next_node])
        current_node = next_node
        edge_count += 1
        total_distance += max_weight
    else:
        # No more neighbors to add, break the loop
        break

print("Longest path with at least two edges:", longest_path, total_distance)

current_neighbors [70034, 70037]
70034
70037
current_neighbors [80610, 72983, 70038]
80610
72983
70038
current_neighbors [70078]
70078
Longest path with at least two edges: [70032, 70037, 70037, 72983] 1427.0


In [181]:
for i in longest_path:
    print(dfopt.loc[dfopt["stop_id"] == i]["trip_headsign"].unique())

['AGRICOLTURA' 'P.ZA VENEZIA' 'P.ZA STAZIONE S. PIETRO (FL)' 'CLODIO'
 'DEI CAPASSO']
[]


In [162]:
visited

{70032, 70037}

In [156]:
longest_paths = [[70032]]
longest_paths[0]

[70032]

In [32]:
nx.dag_longest_path(G, weight="distance")

NetworkXUnfeasible: Graph contains a cycle or graph changed during iteration

In [31]:
G.nodes

NodeView((72710, 72711, 72712, 72713, 72714, 74671, 76701, 76702, 78129, 82030, 72680, 72681, 72683, 72684, 72686, 72687, 72688, 75567, 75568, 70081, 70082, 70084, 70085, 72044, 70031, 70032, 70034, 70035, 70037, 80610, 70540, 70543, 70611, 70612, 70613, 79670, 72071, 77708, 72075, 70951, 71440, 80812, 70955, 72053, 70961, 72054, 70963, 70964, 78798, 70965, 70715, 73153, 72151, 75090, 72115, 78478, 70656, 75370, 78755, 70924, 70926, 70928, 72065, 70930, 72067, 80813, 70933, 70934, 72045, 72046, 80566, 70646, 70518, 80237, 70078, 70079, 71023, 74851, 74167, 74170, 74176, 79665, 73115, 73120, 73122, 78541, 72271, 72274, 72277, 72805, 72930, 72181, 73347, 71155, 72252, 72255, 79659, 73108, 73113, 73114, 72941, 72945, 72947, 74850, 79728, 71022, 75356, 71017, 71018, 71019, 71020, '74215', '78139', '74462', '73237', '75936', '75937', '75938', '75939', '72258', '72259', '82286', '82288', '82287', '72265', '77096', '80693', '72157', '72014', '73231', '73232', '73233', '73234', '73235', '73236