In [1]:
import datetime
from zipfile import ZipFile, Path

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import contextily as ctx
import pyproj
from shapely.geometry import LineString, MultiPoint, Point

In [2]:
with ZipFile("C:\\Users\\zare\\GEO\\geopanda\\gtfs.zip") as myzip:
    stops_df = pd.read_csv(myzip.open("stops.txt"), dtype={ 
    'stop_id': 'str', 
    'stop_code': 'str',
    'stop_name': 'str',
    'stop_desc' : 'str',                                              
    'stop_lat': 'float',
    'stop_lon': 'float',
    'location_type': 'Int64',
    'parent_station': 'str',
    'wheelchair_boarding': 'str', 
    'platform_code': 'str',
    'zone_id': 'str',
    'level_id' : 'str'
    })
    
    
    stop_times_df = pd.read_csv(myzip.open("stop_times.txt"), dtype={
        'trip_id': 'str',
        'arrival_time': 'str',
        'stop_id': 'str', 
        'departure_time': 'str', 
        'stop_id': 'str',
        'stop_sequence': 'Int64',
        'stop_headsign': 'str',
        'pickup_type': 'Int64',
        'drop_off_type': 'Int64',
    })
    
    
    
    routes_df = pd.read_csv(myzip.open("routes.txt"), dtype={
        'route_id': 'str',  
        'agency_id': 'str',  
        'route_short_name': 'str',  
        'route_long_name': 'str', 
        'route_desc': 'str', 
        'route_type': 'Int64',
        'route_color': 'str',  
        'route_text_color': 'str', 
        'rout_desc': 'str'
    })
    
    trips_df = pd.read_csv(myzip.open("trips.txt"), dtype={
        'route_id': 'str', 
        'service_id': 'str',  
        'trip_id': 'str',
        'shape_id': 'str', 
        'trip_headsign': 'str',
        'trip_short_name': 'str',
        'direction_id': 'Int64',  
        'block_id': 'str',
        'shape_id': 'str',
        'wheelchair_accessible': 'str',  
        'bikes_allowed': 'str'
    })
    
    shapes_df = pd.read_csv(myzip.open("shapes.txt"), dtype={
        'shape_id': 'str', 
        'shape_pt_lat': 'float', 
        'shape_pt_lon': 'float',  
        'shape_pt_sequence': 'Int64'
    })
    
    calendar_df = pd.read_csv(myzip.open("calendar.txt"), dtype={
        'service_id': 'str',  
        'monday': 'bool',  
        'tuesday': 'bool',  
        'wednesday': 'bool',  
        'thursday': 'bool',  
        'friday': 'bool', 
        'saturday': 'bool',  
        'sunday': 'bool',  
        'start_date': 'str', 
        'end_date': 'str',
    })
    
    calendar_dates_df = pd.read_csv(myzip.open("calendar_dates.txt"), dtype={
        'service_id': 'str',  
        'date': 'str',
        'exception_type': 'Int64',
    })
    
    agency_df = pd.read_csv(myzip.open("agency.txt"), dtype={
        'agency_id': 'str', 
        'agency_name': 'str', 
        'agency_url': 'str',  
        'agency_timezone': 'str',
        'agency_lang': 'str', 
        'agency_phone': 'str',
    })

In [3]:
show_date_str = "2023-11-27"

date = datetime.datetime.strptime(show_date_str, "%Y-%m-%d")
date_string = date.strftime("%Y%m%d")
day_of_week_name = date.strftime('%A').lower()

services_for_day_1 = calendar_df[(calendar_df[day_of_week_name]) & (date_string >= calendar_df.start_date) & (date_string <= calendar_df.end_date)].service_id.to_numpy()

services_added_for_day = calendar_dates_df[(calendar_dates_df.date == date_string) & (calendar_dates_df.exception_type == 1)].service_id.to_numpy()
services_removed_for_day = calendar_dates_df[(calendar_dates_df.date == date_string) & (calendar_dates_df.exception_type == 2)].service_id.to_numpy()
services_for_day_2 = np.concatenate([services_for_day_1, services_added_for_day])
services_for_day = np.setdiff1d(services_for_day_2, services_removed_for_day)

trips_for_day = trips_df[trips_df.service_id.isin(services_for_day)]
berlin_bus_route_ids = routes_df[(routes_df['route_type'] == 700) | (routes_df['route_type'] == 3) ].route_id.unique()
day_trip_buses = trips_for_day[trips_for_day.route_id.isin(berlin_bus_route_ids)]

In [4]:
#creating stop_gdf
stops_gdf = gpd.GeoDataFrame(stops_df, geometry = gpd.points_from_xy(stops_df.stop_lon, stops_df.stop_lat)).set_crs(epsg=4326)

In [6]:
#creating shape_gdf called shapes
shapes = shapes_df[["shape_id", "shape_pt_lat", "shape_pt_lon"]].groupby("shape_id").agg(list).apply(lambda x: LineString(zip(x.iloc[1], x.iloc[0])), axis=1)

In [8]:
shapes = gpd.GeoDataFrame( data=shapes.index, geometry = shapes.values, crs=4326)

In [9]:
shapes['shape_id'] = shapes.shape_id.astype(str)

In [11]:
shapes = shapes.rename(columns={'geometry': 'geometry_shapes'})
stops_gdf = stops_gdf.rename(columns={'geometry': 'geometry_stops'})

In [13]:
#merging data to get all info for shape_stop
stop_data_shape = pd.merge(day_trip_buses, stop_times_df[['trip_id','stop_id','stop_sequence']], on='trip_id')
stop_data_shape1 = pd.merge(stop_data_shape, stops_gdf[['stop_id','stop_name','geometry_stops']], on='stop_id')
stop_data_shape2 = pd.merge(stop_data_shape1, routes_df[['route_id','route_short_name']], on='route_id')

req_columns = ["shape_id", "stop_sequence", "stop_id", "geometry_stops"]
add_columns = ["route_id", "route_short_name","direction_id", "stop_name"]

df_shape_stop = stop_data_shape2[req_columns + add_columns].drop_duplicates()


In [15]:
#getting finall shapes of stops
df_shape_stop = pd.merge(df_shape_stop, shapes[['shape_id','geometry_shapes']], on='shape_id')


In [16]:
# Display the merged stop/shape table; the rendered output is truncated to the first and last rows.
df_shape_stop

Unnamed: 0,shape_id,stop_sequence,stop_id,geometry_stops,route_id,route_short_name,direction_id,stop_name,geometry_shapes
0,137,0,de:12051:900275125::4,POINT (12.53589 52.41884),21947_700,2,0,"Brandenburg, Fontanestr.","LINESTRING (12.53589 52.41884, 12.53568 52.418..."
1,137,1,de:12051:900275226::1,POINT (12.51434 52.41764),21947_700,2,0,"Brandenburg, August-Sonntag-Str.","LINESTRING (12.53589 52.41884, 12.53568 52.418..."
2,137,2,de:12051:900275224::1,POINT (12.52295 52.41319),21947_700,2,0,"Brandenburg, Dreifertstr.","LINESTRING (12.53589 52.41884, 12.53568 52.418..."
3,137,3,de:12051:900275225::1,POINT (12.51621 52.41355),21947_700,2,0,"Brandenburg, Südtor","LINESTRING (12.53589 52.41884, 12.53568 52.418..."
4,137,4,de:12051:900275869::3,POINT (12.51130 52.41389),21947_700,2,0,"Brandenburg, Frankenstr.","LINESTRING (12.53589 52.41884, 12.53568 52.418..."
...,...,...,...,...,...,...,...,...,...
165805,13822,1,de:12062:900415005:1:50,POINT (13.71030 51.63692),19715_700,RB43,0,"Finsterwalde, Bahnhof","LINESTRING (13.56419 51.62061, 13.56429 51.620..."
165806,13824,1,de:12062:900415112:1:50,POINT (13.56416 51.62053),19715_700,RB43,1,"Doberlug-Kirchhain, Bahnhof","LINESTRING (13.71045 51.63700, 13.70806 51.636..."
165807,13824,0,de:12062:900415005:1:50,POINT (13.71030 51.63692),19715_700,RB43,1,"Finsterwalde, Bahnhof","LINESTRING (13.71045 51.63700, 13.70806 51.636..."
165808,13802,0,de:12070:900215696:1:50,POINT (11.85094 53.07086),19706_700,RE6,1,"Perleberg, Bahnhof","LINESTRING (11.85178 53.07105, 11.85094 53.070..."


In [21]:
#getting_distance
df_shape_stop["cut_distance_stop_point"] = df_shape_stop[["geometry_stops", "geometry_shapes"]].apply(lambda x: x.iloc[1].project(x.iloc[0], normalized=True), axis=1)

  return lib.line_locate_point_normalized(line, other)


In [22]:
df_shape_stop["projected_stop_point"] = df_shape_stop[["geometry_shapes", "cut_distance_stop_point"]].apply(lambda x: x.iloc[0].interpolate(x.iloc[1], normalized=True), axis=1)


In [23]:
#calculate distances
from shapely.geometry import LineString, MultiPoint

df_shape = shapes[shapes.shape_id.isin(stop_data_shape2.shape_id.unique())]
df_shape["list_of_points"] = df_shape.geometry_shapes.apply(lambda x: list(MultiPoint(x.coords).geoms))
df_shape_exp = df_shape.explode("list_of_points")
df_shape_exp["projected_line_points"] = df_shape_exp[["geometry_shapes", "list_of_points"]].apply(lambda x: x.iloc[0].project(x.iloc[1], normalized=True), axis=1)


  result = super().apply(func, convert_dtype=convert_dtype, args=args, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
  return lib.line_locate_point_normalized(line, other)


In [25]:
#renaming dataframes and concatenating
df_shape_stop.rename({ "projected_stop_point": "geometry", "cut_distance_stop_point": "normalized_distance_along_shape"},axis=1,inplace=True)
df_shape_stop["cut_flag"] = True

df_shape_exp = df_shape_exp[["shape_id", "list_of_points", "projected_line_points"]]
df_shape_exp.rename({ "list_of_points": "geometry", "projected_line_points": "normalized_distance_along_shape"}, axis=1, inplace=True)
df_shape_exp["cut_flag"] = False

In [26]:
# combine stops and shape points

gdf = pd.concat([df_shape_stop, df_shape_exp], ignore_index=False)
gdf.sort_values(["shape_id", "normalized_distance_along_shape"], inplace=True)
gdf.reset_index(inplace=True, drop=True)




In [27]:
 # drop all non stops

cuts = gdf.where(gdf.cut_flag).dropna(subset="cut_flag")
cuts = cuts.astype({"shape_id": str, "stop_sequence": int, "direction_id": int})
cuts[["end_stop_id", "end_stop_name"]] = cuts.groupby("shape_id")[['stop_id', "stop_name"]].shift(-1)

In [28]:
#create segments for buses

segment_geometries = []
for shape_id in cuts.shape_id.drop_duplicates():
    cut_idx = cuts[cuts.shape_id == shape_id].index
    for i, cut in enumerate(cut_idx[:-1]):
        segment_geometries.append(LineString(gdf.iloc[cut_idx[i]:cut_idx[i+1]+1].geometry))

In [29]:
#creating bus_segments_gdf

segment_df = cuts.dropna(subset="end_stop_id", axis=0)
segment_gdf = gpd.GeoDataFrame(segment_df, geometry=segment_geometries)
segment_gdf.drop(["geometry_shapes", "cut_flag", "normalized_distance_along_shape", "geometry_stops"], axis=1, inplace=True)
segment_gdf.crs = "EPSG:4326"

segment_gdf['segment_id'] = segment_gdf.stop_id.astype(str) + ' - ' + segment_gdf.end_stop_id.astype(str)
segment_gdf['segment_name'] = segment_gdf.stop_name + ' - ' + segment_gdf.end_stop_name

col_ordered = ['route_short_name','direction_id','stop_sequence', 'segment_name', 'stop_name', 'end_stop_name', 'segment_id','stop_id', 'end_stop_id','geometry']

segment_gdf = segment_gdf[col_ordered]
segment_gdf.rename( columns=dict(stop_name='start_nm', stop_id='start_id', route_short_name="Bus_Num",direction_id='dir', stop_sequence = 'seq',
                                segment_name = 'seg_nm', end_stop_name = 'end_nm', segment_id = 'seg_id', end_stop_id= 'end_id'),inplace=True)

In [30]:
#saving bus_segments_gdf

bus_segments_gdf = gpd.GeoDataFrame(pd.DataFrame(segment_gdf), geometry='geometry')

bus_segments_gdf.to_file("C:\\Users\\zare\\GEO\\geopanda\\bus_segments.shp", driver="ESRI Shapefile")