In [1]:
import pandas as pd
import gtfs_kit as gk
import geopandas as gpd
import pyproj as pj
import matplotlib as mpl
import matplotlib.pyplot as plt
import folium
from folium import plugins
import warnings

warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

### Function Definition

In [2]:
# Function to visualize on map the route
def route_and_stop_visualization():
    ### We create a map with a view on the zone we are interested in
    map = folium.Map(location=[43.9941, 10.2301], tiles="OpenStreetMap", zoom_start=9)
    
    ### We Circle The Stops belonging to the route
    for point in stops.geometry:
        folium.CircleMarker((point.xy[1][0],point.xy[0][0]), color="blue", weight=0.5, opacity=1).add_to(map)
    
    ### The lists of shapes have duplicates because each shape describe the trip for both direction
    ### so for drawing purposes we want to select only one of the way. Then we draw the shapes
    shape_id_list_duplicates = shapes["shape_id"].to_list()
    shape_id_list = list(set(shape_id_list_duplicates))
    
    for shape_id in shape_id_list:
        shape_df_list = []
        shape_to_draw = shapes.loc[(shapes['shape_id'] == shape_id)]
        shape_to_draw.drop_duplicates(subset='shape_pt_sequence', keep="first",inplace=True)
        for point in shape_to_draw.geometry:
            shape_df_list.append((point.xy[1][0],point.xy[0][0]))
        folium.PolyLine(shape_df_list, color="red", weight=1.5, opacity=1).add_to(map)
    
    return map

In [3]:
def get_sec(time_str):
    """Get seconds from time."""
    h, m, s = time_str.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

### Initial Exploration

In [4]:
# Load the gtfs we generated with the various csv file created in the preprocessing
feed = gk.feed.read_feed('../preprocessing.zip',dist_units="km")
routes = gk.routes.get_routes(feed)
stop_times = feed.get_stop_times()
trips = feed.get_trips()
stops = feed.get_stops()
shapes = feed.shapes
calendar_dates = feed.calendar_dates
calendar = feed.calendar

In [5]:
### Checking dimensions
routes_dimension = routes.shape
print(f" Routes Dimension: {routes.shape}")
trips_dimension = trips.shape
print(f" Trips Dimension: {trips.shape}")
stops_dimension = stops.shape
print(f" Stops Dimension: {stops.shape}")
stop_times_dimension = stop_times.shape
print(f" Stop Times Dimension: {stop_times.shape}")
shapes_dimension = shapes.shape
print(f" Shapes Dimension: {shapes.shape}")
calendar_dates_dimension = calendar_dates.shape
print(f" Calenda Dates Dimension: {calendar_dates.shape}")
calendar_dimension = calendar.shape
print(f" Calend Dimension: {calendar.shape}")

 Routes Dimension: (3, 7)
 Trips Dimension: (1201, 7)
 Stops Dimension: (36, 5)
 Stop Times Dimension: (12497, 6)
 Shapes Dimension: (104363, 5)
 Calenda Dates Dimension: (28260, 2)
 Calend Dimension: (201, 10)


In [6]:
### Look at the head of each table
print("Route Head of Dataframe")
print(routes.head())
print('-' * 50)
print("Trips Head of Dataframe")
print(trips.head())
print('-' * 50)
print("Stops Head of Dataframe")
print(stops.head())
print('-' * 50)
print("Stop Times Head of Dataframe")
print(stop_times.head())
print('-' * 50)
print("Shape Head of Dataframe")
print(shapes.head())
print('-' * 50)
print("Calendar Dates Head of Dataframe")
print(calendar_dates.head())
print('-' * 50)
print("Calendar Head of Dataframe")
print(calendar.head())

Route Head of Dataframe
     route_id agency_id route_short_name       route_long_name  route_type  \
0  1085727347       163              NaN  Pisa - Lucca - Aulla           2   
1  1199544473       163              NaN  Pisa - Lucca - Aulla           2   
2  1011259294       163              NaN  Pisa - Lucca - Aulla           2   

  route_color route_text_color  
0      FFFFFF           000000  
1      FFFFFF           000000  
2      FFFFFF           000000  
--------------------------------------------------
Trips Head of Dataframe
     route_id   service_id      trip_id trip_headsign trip_short_name  \
0  1085727347  1814_171965  1814_191301     Pisa C.Le           06772   
1  1085727347  1814_171965  1814_191302     Pisa C.Le           06732   
2  1085727347  1814_171965  1814_191303         Lucca           06745   
3  1085727347  1814_171965  1814_191304     Pisa C.Le           06794   
4  1085727347  1814_171965  1814_191305         Lucca           06733   

   direction_id  

### Adding information on the dataset

In [7]:
# Calculating Distance
# Duplicated distance traveled for calculation
stop_times['shape_dist_traveled_shift'] = stop_times.loc[:,'shape_dist_traveled']
stop_times['shape_dist_traveled_shift'] = stop_times.shape_dist_traveled_shift.shift(1)

# The shift created NaN in the shifted column, we need to replace it with 0
stop_times['shape_dist_traveled_shift'] = stop_times['shape_dist_traveled_shift'].fillna(0)
stop_times.loc[:,['shape_dist_traveled', 'shape_dist_traveled_shift']].head()

### Calculate the difference
stop_times['dist_diff'] = stop_times['shape_dist_traveled'] - stop_times['shape_dist_traveled_shift']
stop_times.head()

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,shape_dist_traveled_shift,dist_diff
0,1814_191301,15:42:00,15:42:00,S06404_1,1,0.0,0.0,0.0
1,1814_191301,15:48:00,15:49:00,S06402_1,2,8.01853,0.0,8.01853
2,1814_191301,15:56:00,15:57:00,S06400_1,3,14.83468,8.01853,6.81615
3,1814_191301,16:07:00,16:08:00,S06501_1,4,20.90887,14.83468,6.07419
4,1814_191301,16:13:00,16:13:00,S06500_1,5,23.13283,20.90887,2.22396


In [8]:
# Calculate time diff

# We create an empty column for time different
stop_times = stop_times.assign(time_diff=None)

for index, row  in stop_times.iterrows():
    if row['stop_sequence'] == 1:
         stop_times['time_diff'][index] = 0
    else:
        value = get_sec(row['arrival_time']) - get_sec(stop_times.loc[index - 1,'departure_time'])
        stop_times['time_diff'][index] = value

In [16]:
### Calculating speed
stop_times = stop_times.assign(speed=None)

for index, row  in stop_times.iterrows():
    if(row['time_diff'] == 0):
        stop_times['speed'][index] = 0
    else:
        # Dist unit in Km while time_diff is in seconds so we transform it in hour
        stop_times['speed'][index] = (row['dist_diff'] / row['time_diff']) * 60 * 60


In [17]:
stop_times

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,shape_dist_traveled,shape_dist_traveled_shift,dist_diff,time_diff,speed
0,1814_191301,15:42:00,15:42:00,S06404_1,1,0.00000,0.00000,0.00000,0,0
1,1814_191301,15:48:00,15:49:00,S06402_1,2,8.01853,0.00000,8.01853,360,80.1853
2,1814_191301,15:56:00,15:57:00,S06400_1,3,14.83468,8.01853,6.81615,420,58.424143
3,1814_191301,16:07:00,16:08:00,S06501_1,4,20.90887,14.83468,6.07419,600,36.44514
4,1814_191301,16:13:00,16:13:00,S06500_1,5,23.13283,20.90887,2.22396,300,26.68752
...,...,...,...,...,...,...,...,...,...,...
12492,4118_296638,17:42:00,17:42:00,S06404_1,1,0.00000,23.13283,-23.13283,0,0
12493,4118_296638,17:48:00,17:49:00,S06402_1,2,8.01853,0.00000,8.01853,360,80.1853
12494,4118_296638,17:57:00,17:58:00,S06400_1,3,14.83468,8.01853,6.81615,480,51.121125
12495,4118_296638,18:07:00,18:08:00,S06501_1,4,20.90887,14.83468,6.07419,540,40.4946


In [None]:
shapes

### Plotting our root

In [None]:
### Our dataframe contains geographical coordinates and we convert them 
### in GeoDataFrame for better visualization
shapes = gpd.GeoDataFrame(shapes, 
        geometry=gpd.points_from_xy(shapes.shape_pt_lon, shapes.shape_pt_lat)).set_crs(epsg=4326)

stops = gpd.GeoDataFrame(stops, 
        geometry=gpd.points_from_xy(stops.stop_lon, stops.stop_lat)).set_crs(epsg=4326)

In [None]:
### We plot the shape which will result as the route
rt_df = pd.merge(routes, trips, on=['route_id','route_id']).reset_index(drop=True)
rts_df = pd.merge(rt_df, shapes, on=['shape_id','shape_id']).reset_index(drop=True)
rts_df.plot.scatter(x="shape_pt_lon",y="shape_pt_lat")

In [None]:
### We visualize the route to understand if there are impurities
route_and_stop_visualization()

## Data cleaning

In [None]:
stops

In [None]:
# Delete the stops
stops_id_to_delete = ["S06725_1","S06506_1","S06505_1","S06504_1","S06502_1","S06351_1","S06350_1","S06040_1"]
for stops_to_delete in stops_id_to_delete:
    stops = stops[stops.stop_id != stops_to_delete]

In [None]:
# Based on the stops we calculate the list of stop_times to delete. We also take the list of
# trip ids to delete

trip_ids_to_delete = []
indexes_stops_to_delete = []
for ind in stop_times.index:
    for stops_to_delete in stops_id_to_delete:
        if stop_times["stop_id"][ind] == stops_to_delete:
            trip_ids_to_delete.append(stop_times["trip_id"][ind])
            indexes_stops_to_delete.append(ind)


### For safety reasons we delete rows from the dataframe after we calculate the speed in the 
# for ind_to_delete in indexes_stops_to_delete:
#    stop_times = stop_times[stop_times.index != ind_to_delete]

In [None]:
# Based on the trips_ids we calculate the list of shapes and trips to delete. 

shape_ids_to_delete = []

for ind in trips.index:
    for trip_to_delete in trip_ids_to_delete:
        if trips["trip_id"][ind] == trip_to_delete:
            shape_ids_to_delete.append(trips["shape_id"][ind])

### For safety reasons we delete rows from the dataframe after the complete for sequence
for trip_to_delete in trip_ids_to_delete:
    trips = trips[trips.trip_id != trip_to_delete]
    
## We do deletion for shapes too
for shape_to_delete in shape_ids_to_delete:
    shapes = shapes[shapes.shape_id != shape_to_delete]

In [None]:
### We visualize the route after the cleaning
route_and_stop_visualization()

In [None]:
# Subtraction of dimensions
subtracted_dimensions = pd.DataFrame(columns=["Name","Initial","Updated","Difference"])

res = tuple(map(lambda i, j: i - j, stops_dimension, stops.shape))
subtracted_dimensions.loc[0] = ["Stops",stops_dimension, stops.shape, res]

res = tuple(map(lambda i, j: i - j, trips_dimension, trips.shape))
subtracted_dimensions.loc[1] = ["Trips",trips_dimension, trips.shape, res]

res = tuple(map(lambda i, j: i - j, stop_times_dimension,stop_times.shape))
subtracted_dimensions.loc[2] = ["Stop_Times",stop_times_dimension, stop_times.shape, res]

res = tuple(map(lambda i, j: i - j, shapes_dimension, shapes.shape))
subtracted_dimensions.loc[3] = ["Shapes",shapes_dimension, shapes.shape, res]

subtracted_dimensions

In [None]:
# Join the dataset for easy at use
df_analytics = trips
df_analytics = pd.merge(df_analytics, routes, on=['route_id','route_id'])
df_analytics = pd.merge(df_analytics, stop_times, on=['trip_id','trip_id'])
df_analytics = pd.merge(df_analytics, stops, on=['stop_id','stop_id'])


In [None]:
# Basic statistics on the number of stops
print("Mean of stop_sequence : " + str(df_analytics.stop_sequence.mean()))
print('-' * 50)
print("Median of stop_sequence : " + str(df_analytics.stop_sequence.median()))
print('-' * 50)
print("Standard Deviation of stop_sequence : " + str(df_analytics.stop_sequence.std()))
print('-' * 50)
print("Skewness of stop_sequence : " + str(df_analytics.stop_sequence.skew()))

In [None]:
# Distribution of number of stops in the dataset
plt.title('Distribution of the no. stops in the dataset')
plt.hist(df_analytics.stop_sequence)
plt.xlabel('No. Stops')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Number of stops per trip
plt.boxplot(df_analytics.stop_sequence)

plt.title('No. stops per trip')
plt.show()

## Saving general data processed

In [None]:
#Load the previous zip for setup
result = gk.feed.read_feed('../preprocessing.zip',dist_units="km")

#Overwrite modified tables
result.routes = routes
result.stop_times = stop_times
result.trips = trips
result.stops = stops
result.shapes = shapes
result.calendar_dates = calendar_dates
result.calendar = calendar

#Write
result.write('../general_data_analytics.zip',9)

## Generate more usefull info