In [1]:
#######################################
###
### environment setup
###
#######################################

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pyproj import Proj, transform, CRS
from shapely.geometry import Polygon, Point
import datetime

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# display full
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [112]:
import geopandas as gpd

In [113]:
import geoplot as gplt

In [114]:
#######################################
###
### identify source data sets
###
#######################################

In [115]:
files = {
    'via_data': '../resource_files/via_datadict.csv',
    'tracts' : '../resource_files/bexar_county/Bexar_County_Census_Tracts-shp/Bexar_County_Census_Tracts.shp', 
    'block_groups' : '../resource_files/bexar_county/Bexar_County_Census_Block_Groups-shp/Bexar_County_Census_Block_Groups.shp', 
    'blocks' : '../resource_files/bexar_county/Bexar_County_Census_Blocks-shp/Bexar_County_Census_Blocks.shp', 
    'adherence_2020' : '../resource_files/Adherence_2020.csv',
    'bus_otp_2020' : '../resource_files/BusOnTimePerformance_2020.csv',
    'bus_fare_box_2020' : '../fred/bus_fare_fixed.csv',
    'message_logs_2020' : '../resource_files/Logged_Messages.csv',
    'service_miles_2020' : '../resource_files/TotalServiceMiles_2020.csv',
    'stops_addr_2020' : '../resource_files/Stops_LatLong_GeocodeData_2020.csv',
    'stops_201909' : '../resource_files/via_201909/stops.txt', 
    'stops_202004' : '../resource_files/via_202004/stops.csv', 
    'trips_201909' : '../resource_files/via_201909/trips.txt', 
    'trips_202004' : '../resource_files/via_202004/trips.csv', 
    'routes_201909' : '../resource_files/via_201909/routes.txt', 
    'routes_202004' : '../resource_files/via_202004/routes.csv', 
    'stop_times_201909' : '../resource_files/via_201909/stop_times.txt', 
    'stop_times_202004' : '../resource_files/via_202004/stop_times.csv', 
    'transfers_201909' : '../resource_files/via_201909/transfers.txt', 
    'transfers_202004' : '../resource_files/via_202004/transfers.csv', 
    'shapes_201909' : '../resource_files/via_201909/shapes.txt', 
    'shapes_202004' : '../resource_files/via_202004/shapes.csv', 
}

In [133]:
via_datadict = pd.read_csv(files['via_data'])
via_datadict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   via_column   54 non-null     object 
 1   gotb_column  54 non-null     object 
 2   datatype     54 non-null     object 
 3   description  0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 1.8+ KB


In [134]:
via_rename = via_datadict.set_index('via_column')['gotb_column']
via_rename.head()

via_column
ServiceDate           service_date
Routes                       route
Block                  route_block
RouteDirectionName       route_dir
StopNumber                 stop_id
Name: gotb_column, dtype: object

In [135]:
via_datatype = via_datadict.set_index('via_column')['datatype']
via_datatype.head()

via_column
ServiceDate           datetime64
Routes                    string
Block                     string
RouteDirectionName        string
StopNumber                string
Name: datatype, dtype: object

In [136]:
gotb_datatype = via_datadict.set_index('gotb_column')['datatype']
gotb_datatype.head()

gotb_column
service_date    datetime64
route               string
route_block         string
route_dir           string
stop_id             string
Name: datatype, dtype: object

In [11]:
#######################################
###
### set up census data
###
#######################################

In [12]:
gdf_tracts = gpd.read_file(files['tracts']).set_index('OBJECTID')
gdf_tracts.index.name = 'TRACTID'
gdf_tracts['density']=gdf_tracts.SUM_POPULA / gdf_tracts.ShapeSTAre.min()
gdf_tracts = gdf_tracts.to_crs('epsg:4326')
gdf_tracts = gdf_tracts.astype({'TRACT':'string'})
gdf_tracts.head()

Unnamed: 0_level_0,TRACT,SUM_POPULA,ShapeSTAre,ShapeSTLen,geometry,density
TRACTID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,48029110100,3379,38168580.0,26074.749275,"POLYGON ((-98.48695 29.43502, -98.48626 29.434...",0.000433
2,48029110300,2542,16679050.0,22363.990201,"POLYGON ((-98.47326 29.41461, -98.47326 29.413...",0.000325
3,48029110500,2238,13666530.0,16048.493069,"POLYGON ((-98.50799 29.42314, -98.50812 29.422...",0.000287
4,48029110600,7553,21173790.0,20426.412628,"POLYGON ((-98.50111 29.42735, -98.50121 29.426...",0.000967
5,48029110700,1398,10583710.0,16998.776148,"POLYGON ((-98.50401 29.44202, -98.50386 29.441...",0.000179


In [13]:
gdf_tracts.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 366 entries, 1 to 366
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   TRACT       366 non-null    string  
 1   SUM_POPULA  366 non-null    int64   
 2   ShapeSTAre  366 non-null    float64 
 3   ShapeSTLen  366 non-null    float64 
 4   geometry    366 non-null    geometry
 5   density     366 non-null    float64 
dtypes: float64(3), geometry(1), int64(1), string(1)
memory usage: 20.0 KB


In [14]:
gdf_groups = gpd.read_file(files['block_groups']).set_index('OBJECTID')
gdf_groups.index.name = 'GROUPID'
gdf_groups['density']=gdf_groups.SUM_POPULA / gdf_groups.ShapeSTAre
gdf_groups = gdf_groups.to_crs('epsg:4326')
gdf_groups = gdf_groups.astype({'BLOCKGROUP':'string'})
gdf_groups.head()

Unnamed: 0_level_0,BLOCKGROUP,SUM_POPULA,Shape_STAr,Shape_STLe,ShapeSTAre,ShapeSTLen,geometry,density
GROUPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,480291101001,955,19517060.0,23279.128568,19517060.0,23279.128568,"POLYGON ((-98.48695 29.43502, -98.48626 29.434...",4.9e-05
2,480291101002,694,7319464.0,14609.29787,7319464.0,14609.29787,"POLYGON ((-98.48824 29.41933, -98.48826 29.418...",9.5e-05
3,480291101003,1730,11332050.0,14546.232289,11332050.0,14546.232289,"POLYGON ((-98.49432 29.43370, -98.49426 29.433...",0.000153
4,480291103001,1041,4292228.0,9662.074836,4292228.0,9662.074836,"POLYGON ((-98.47984 29.40970, -98.47983 29.409...",0.000243
5,480291103002,797,4074139.0,10523.20497,4074139.0,10523.20497,"POLYGON ((-98.47989 29.40860, -98.47992 29.408...",0.000196


In [15]:
gdf_groups.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1084 entries, 1 to 1084
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   BLOCKGROUP  1084 non-null   string  
 1   SUM_POPULA  1084 non-null   int64   
 2   Shape_STAr  1084 non-null   float64 
 3   Shape_STLe  1084 non-null   float64 
 4   ShapeSTAre  1084 non-null   float64 
 5   ShapeSTLen  1084 non-null   float64 
 6   geometry    1084 non-null   geometry
 7   density     1084 non-null   float64 
dtypes: float64(5), geometry(1), int64(1), string(1)
memory usage: 76.2 KB


In [16]:
gdf_blocks_cols = [
    'BLOCK', 'BLOCKGROUP', 'TRACT', 'ID', 'COLORING', 'MCD', 
    'PLACE', 'VTD', 'CONGRESS', 'LOWERSLD', 'UPPERSLD', 'UNIFSCHOOL',
    'POPULATION', 'HISPANIC_O', 'NH_WHT', 'NH_BLK', 'NH_ASN', 'NH_OTH',
    'gecovector', 'ShapeSTAre', 'ShapeSTLen', 'geometry'
]

In [17]:
gdf_blocks = gpd.read_file(files['blocks']).set_index('OBJECTID')
gdf_blocks.index.name = 'BLOCKOBJ'
gdf_blocks['NH_OTH'] = (
    gdf_blocks.POPULATION 
    - gdf_blocks.HISPANIC_O 
    - gdf_blocks.NH_WHT 
    - gdf_blocks.NH_BLK 
    - gdf_blocks.NH_ASN
)
# gdf_blocks['density']=gdf_blocks.POPULATION / gdf_blocks.ShapeSTAre
gdf_blocks = gdf_blocks[gdf_blocks_cols]
gdf_blocks = gdf_blocks.to_crs('epsg:4326')
gdf_blocks = gdf_blocks.astype({
    'BLOCK': 'string',
    'BLOCKGROUP': 'string',
    'TRACT': 'string',
    'MCD': 'string',
    'PLACE': 'string',
    'VTD': 'string',
    'CONGRESS': 'string',
    'LOWERSLD': 'string',
    'UPPERSLD': 'string',
    'UNIFSCHOOL': 'string',
})
gdf_blocks.head()

Unnamed: 0_level_0,BLOCK,BLOCKGROUP,TRACT,ID,COLORING,MCD,PLACE,VTD,CONGRESS,LOWERSLD,UPPERSLD,UNIFSCHOOL,POPULATION,HISPANIC_O,NH_WHT,NH_BLK,NH_ASN,NH_OTH,gecovector,ShapeSTAre,ShapeSTLen,geometry
BLOCKOBJ,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,480291919003005,480291919003,48029191900,1400310,4,4802993407,4865000,480294004,4821,48120,48019,4838730,13,11,0,0,0,2,0.001038,28896.121094,1063.271402,"POLYGON ((-98.46781 29.42506, -98.46783 29.424..."
2,480291411011002,480291411011,48029141101,1400328,3,4802993407,4865000,480291074,4823,48119,48019,4838730,0,0,0,0,0,0,0.018304,509512.736328,8615.618889,"POLYGON ((-98.46881 29.36511, -98.46815 29.364..."
3,480291411011006,480291411011,48029141101,1400348,3,4802993407,4865000,480291074,4823,48119,48019,4838730,102,82,16,2,0,2,0.012803,356394.378906,2836.366689,"POLYGON ((-98.46323 29.36640, -98.46324 29.365..."
4,480291410002007,480291410002,48029141000,1400365,1,4802993407,4865000,480291074,4823,48119,48019,4838730,162,149,13,0,0,0,0.012892,358877.382812,2852.30438,"POLYGON ((-98.46667 29.36662, -98.46668 29.367..."
5,480291410002006,480291410002,48029141000,1400383,3,4802993407,4865000,480291074,4823,48119,48019,4838730,133,124,5,4,0,0,0.016129,448943.710938,2994.127235,"POLYGON ((-98.46668 29.36753, -98.46667 29.368..."


In [18]:
gdf_blocks.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 26452 entries, 1 to 26452
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   BLOCK       26452 non-null  string  
 1   BLOCKGROUP  26452 non-null  string  
 2   TRACT       26452 non-null  string  
 3   ID          26452 non-null  int64   
 4   COLORING    26452 non-null  int64   
 5   MCD         26452 non-null  string  
 6   PLACE       22406 non-null  string  
 7   VTD         26452 non-null  string  
 8   CONGRESS    26452 non-null  string  
 9   LOWERSLD    26452 non-null  string  
 10  UPPERSLD    26452 non-null  string  
 11  UNIFSCHOOL  26452 non-null  string  
 12  POPULATION  26452 non-null  int64   
 13  HISPANIC_O  26452 non-null  int64   
 14  NH_WHT      26452 non-null  int64   
 15  NH_BLK      26452 non-null  int64   
 16  NH_ASN      26452 non-null  int64   
 17  NH_OTH      26452 non-null  int64   
 18  gecovector  26452 non-null  float64 
 

In [19]:
gdf_counties = gdf_tracts[['TRACT','geometry']].copy()
gdf_counties['COUNTY'] = gdf_counties['TRACT'].str[:5]
gdf_counties.drop(columns='TRACT', inplace=True)
gdf_counties = gdf_counties.dissolve(by='COUNTY').reset_index()
gdf_counties.index.name = 'COUNTYID'
gdf_counties = gdf_counties.astype({'COUNTY':"string"})
gdf_counties.head()

Unnamed: 0_level_0,COUNTY,geometry
COUNTYID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,48029,"POLYGON ((-98.32720 29.20987, -98.32725 29.209..."


In [20]:
# For simplification, using distance per degree latitude to determine buffer amount. 
# This is slightly inaccurate due to the curvature of the earth. Buffer distances is
# set at 800 meters, which is roughly a half mile.

meters_per_degree = 111111
buffer_in_meters = 750
buffer_in_degrees = buffer_in_meters / meters_per_degree
buffer_in_degrees

0.00675000675000675

In [21]:
#######################################
###
### set up via datathon data
###
#######################################

In [22]:
#######################################
###
### via datathon 2020
###
### adherence
###
#######################################

In [139]:
def retype_dataframe(df, type_dict):
    retype_dict = {k:v for (k,v) in type_dict.items() if k in df.columns}
    # print(retype_dict)
    string_cols = [k for (k,v) in retype_dict.items() if v == 'string']
    # print(string_cols)
    for col in string_cols:
        df[col] = df[col].apply(str)
    df = df.astype(retype_dict).copy()

    return df

In [152]:
def wrangle_adherence(source_file):
    # adherence_cols_rename = {
    #     'ServiceDate' : 'service_date', 
    #     'Routes' : 'route', 
    #     'Block' : 'route_block', 
    #     'RouteDirectionName' : 'route_dir', 
    #     'StopNumber' : 'stop_id',
    #     'Location' : 'stop_name', 
    #     'Latitude' : 'stop_lat', 
    #     'Longitude' : 'stop_lon',
    #     'ScheduledTime(S)' : 'sched_time_s', 
    #     'ScheduledTime(HHMMSS)' : 'sched_time_hms', 
    #     'ArrivalTime(S)' : 'arr_time_s',
    #     'ArrivalTime(HHMMSS)' : 'arr_time_hms', 
    #     'DepartureTime(S)' : 'dep_time_s', 
    #     'DepartureTime(HHMMSS)' : 'dep_time_hms',
    #     'Odometer' : 'odometer', 
    #     'VehicleNumber' : 'vehicle_id'
    # }
    adherence_stops_cols = ['stop_id','stop_name','deg_lat','deg_lon']
    adherence_stops_drops = adherence_stops_cols.copy()
    adherence_stops_drops.remove('stop_id')
    # print(adherence_stops_drops)
    
    df_adh = pd.read_csv(source_file).rename(columns=via_rename)
    df_adh.route_dir = df_adh.route_dir.str.slice(0,1)
    df_adh.vehicle_id = df_adh.vehicle_id.apply(str)
    # print('Check F1: df_adh')
    # print(df_adh.info())

    
    gdf_stops = df_adh[adherence_stops_cols]
    # print('\nCheck F2: df_stops')
    # print(gdf_stops.info())
    
    df_adh = df_adh.drop(columns=adherence_stops_drops)
    # df_adh = retype_dataframe(df_adh, gotb_datatype)
    
    # print('\nCheck F3: df_adh')
    # print(df_adh.info())

    adherence_stops_rename = {
        'deg_lat' : 'stop_lat', 
        'deg_lon' : 'stop_lon',
    }
    
    gdf_stops['stops'] = 1
    gdf_stops.deg_lat.fillna(0, inplace=True)
    gdf_stops.deg_lon.fillna(0, inplace=True)
    gdf_stops.deg_lat = gdf_stops.deg_lat * 10 ** -7
    gdf_stops.deg_lon = gdf_stops.deg_lon * 10 ** -7
    gdf_stops = gdf_stops.groupby(adherence_stops_cols).agg({'stops':['sum']}).reset_index()
    gdf_stops.columns = [col[0] for col in gdf_stops.columns]
    # print(new_cols)
    gdf_stops.rename(columns=adherence_stops_rename, inplace=True)
    gdf_stops['stop_code'] = gdf_stops.stop_id
    gdf_stops.set_index('stop_code', inplace=True)
    gdf_stops = retype_dataframe(gdf_stops, gotb_datatype)
    gdf_stops = gpd.GeoDataFrame(
        gdf_stops,
        geometry = gpd.points_from_xy(gdf_stops.stop_lon, gdf_stops.stop_lat),
        crs={'epsg:4326'}
    )
    # print('\nCheck F4: df_stops')
    # print(gdf_stops.info())
    return df_adh, gdf_stops

In [153]:
df_adherence, gdf_stops_adh = wrangle_adherence(files['adherence_2020'])
# print(adh.type())
# df_adherence = adh[0]
# gdf_stops_adh = adh[1]
print('\ndf_adherence')
print(df_adherence.info())
print('\ngdf_stops_adh')
print(gdf_stops_adh.info())


df_adherence
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358328 entries, 0 to 358327
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   service_date    358328 non-null  object 
 1   route           358328 non-null  int64  
 2   route_block     358328 non-null  object 
 3   route_dir       358328 non-null  object 
 4   stop_id         358328 non-null  object 
 5   sched_time_s    358328 non-null  int64  
 6   sched_time_hms  358328 non-null  object 
 7   arr_time_s      329729 non-null  float64
 8   arr_time_hms    329729 non-null  object 
 9   dep_time_s      329729 non-null  float64
 10  dep_time_hms    329729 non-null  object 
 11  odometer        329684 non-null  float64
 12  vehicle_id      358328 non-null  object 
dtypes: float64(3), int64(2), object(8)
memory usage: 35.5+ MB
None

gdf_stops_adh
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 649 entries, 10013 to GARA
Data columns (total 6 

In [154]:
df_adherence.columns

Index(['service_date', 'route', 'route_block', 'route_dir', 'stop_id',
       'sched_time_s', 'sched_time_hms', 'arr_time_s', 'arr_time_hms',
       'dep_time_s', 'dep_time_hms', 'odometer', 'vehicle_id'],
      dtype='object')

In [161]:
adherence_routes_cols = ['service_date','vehicle_id','route','sched_time_hms','route_block','route_dir','stop_id']
adh_routes = df_adherence[adherence_routes_cols]
adh_routes['recs'] = 1
adh_routes = adh_routes.groupby(adherence_routes_cols).agg({'recs':['count']}).reset_index()
adh_routes.columns = [col[0] for col in adh_routes.columns]
# adh_routes = retype_dataframe(adh_routes, gotb_datatype)
adh_routes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 357465 entries, 0 to 357464
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   service_date    357465 non-null  object
 1   vehicle_id      357465 non-null  object
 2   route           357465 non-null  int64 
 3   sched_time_hms  357465 non-null  object
 4   route_block     357465 non-null  object
 5   route_dir       357465 non-null  object
 6   stop_id         357465 non-null  object
 7   recs            357465 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 21.8+ MB


In [162]:
adh_routes.recs.value_counts()

1    356606
2       857
4         2
Name: recs, dtype: int64

In [163]:
df_adherence.route_dir.value_counts()

S    116631
N    116201
E     62920
W     62576
Name: route_dir, dtype: int64

In [173]:
df_adherence.vehicle_id.value_counts()

nan      1744
525.0    1374
529.0    1290
533.0    1274
585.0    1273
536.0    1237
522.0    1231
577.0    1226
481.0    1221
534.0    1207
457.0    1196
524.0    1184
464.0    1168
554.0    1155
424.0    1151
500.0    1139
558.0    1128
523.0    1128
509.0    1120
604.0    1113
599.0    1111
526.0    1100
477.0    1097
520.0    1092
551.0    1091
352.0    1088
463.0    1074
553.0    1073
542.0    1073
515.0    1072
545.0    1072
543.0    1067
446.0    1064
550.0    1061
475.0    1060
537.0    1058
478.0    1055
552.0    1054
473.0    1054
506.0    1049
587.0    1048
513.0    1047
474.0    1038
514.0    1038
415.0    1032
518.0    1032
625.0    1031
652.0    1026
531.0    1026
546.0    1026
507.0    1025
566.0    1022
465.0    1020
583.0    1019
426.0    1017
479.0    1010
435.0    1006
512.0    1006
497.0    1006
423.0    1004
540.0    1004
532.0    1003
575.0    1002
594.0     997
434.0     996
476.0     993
716.0     992
605.0     992
682.0     991
596.0     991
467.0     990
547.0 

In [164]:
adh_routes[adh_routes.recs>2].head(1000)

Unnamed: 0,service_date,vehicle_id,route,sched_time_hms,route_block,route_dir,stop_id,recs
54051,9/12/2019,500.0,2,15:59:00,002-008,S,10059,4
54070,9/12/2019,500.0,2,20:20:00,002-008,S,10059,4


In [172]:
df_adherence[(df_adherence.route == '2') & (df_adherence.vehicle_id == '500.0') & (df_adherence.stop_id == "10059") & (df_adherence.route_dir == 'S') ]

Unnamed: 0,service_date,route,route_block,route_dir,stop_id,sched_time_s,sched_time_hms,arr_time_s,arr_time_hms,dep_time_s,dep_time_hms,odometer,vehicle_id


In [32]:
#######################################
###
### stops
###
#######################################

In [33]:
def wrangle_stops(source_file, actv_date='20_04'):
    
    keep_cols = [
        'stop_id', 'stop_name', 'stop_lat', 'stop_lon'
    ]
    
    gdf = pd.read_csv(source_file)
    gdf.stop_code = gdf.stop_code.apply(str)
    gdf.stop_id = gdf.stop_id.apply(str)
    gdf = gdf.astype({'stop_code':'string', 'stop_id':'string', 'stop_name':'string'})
    gdf.set_index('stop_code', inplace=True)
    gdf = gdf[keep_cols]
    gdf = gpd.GeoDataFrame(
        gdf,
        geometry = gpd.points_from_xy(gdf.stop_lon, gdf.stop_lat),
        crs={'epsg:4326'}
    )
    # gdf.stop_id = gdf.stop_id.apply(str)
    gdf['actv_' + str(actv_date)] = 1
    return gdf


In [34]:
#######################################
###
### stops
###
### stops adh
###
#######################################

In [35]:

gdf_stops_adh.head(5)


Unnamed: 0_level_0,stop_id,stop_name,stop_lat,stop_lon,stops,geometry
stop_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10013,10013,NORTH STAR TRANSIT CENTER,29.519705,-98.49834,1511,POINT (-98.49834 29.51970)
10019,10019,NORTH STAR TRANSIT CENTER,29.519579,-98.498064,836,POINT (-98.49806 29.51958)
10023,10023,NORTH STAR TRANSIT CENTER,29.519261,-98.498801,575,POINT (-98.49880 29.51926)
10029,10029,NORTH STAR TRANSIT CENTER,29.519446,-98.498229,646,POINT (-98.49823 29.51945)
10033,10033,NORTH STAR TRANSIT CENTER,29.519403,-98.498651,567,POINT (-98.49865 29.51940)


In [36]:
gdf_stops_adh[gdf_stops_adh.stop_lat==0].stop_id.to_list()

['19863', '23799', '98013']

In [37]:
#######################################
###
### stops
###
### stops 200404
###
#######################################

In [38]:
gdf_stops_202004 = wrangle_stops(files['stops_202004'], actv_date='20_04')

gdf_stops_202004.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 6491 entries, 25316 to 74369
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   stop_id     6491 non-null   string  
 1   stop_name   6491 non-null   string  
 2   stop_lat    6491 non-null   float64 
 3   stop_lon    6491 non-null   float64 
 4   geometry    6491 non-null   geometry
 5   actv_20_04  6491 non-null   int64   
dtypes: float64(2), geometry(1), int64(1), string(2)
memory usage: 355.0+ KB


In [39]:
gdf_stops_202004.head()

Unnamed: 0_level_0,stop_id,stop_name,stop_lat,stop_lon,geometry,actv_20_04
stop_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25316,25316,BLANCO & DRESDEN,29.499014,-98.507783,POINT (-98.50778 29.49901),1
72479,72479,FRESNO & IH-10 W ACCESS RD.,29.474488,-98.516238,POINT (-98.51624 29.47449),1
56239,56239,FREDERICKSBURG RD. & N. FLORES,29.441883,-98.503773,POINT (-98.50377 29.44188),1
76759,76759,MARTIN & N. FRIO,29.430081,-98.503058,POINT (-98.50306 29.43008),1
88973,88973,DOLOROSA & S. FLORES,29.424073,-98.494741,POINT (-98.49474 29.42407),1


In [40]:
#######################################
###
### stops
###
### stops 201909
###
#######################################

In [41]:
gdf_stops_201909 = wrangle_stops(files['stops_201909'], actv_date='19_09')
gdf_stops_201909 = gdf_stops_201909.join(gdf_stops_202004[['actv_20_04']], how='left')
gdf_stops_201909.actv_20_04.fillna(0, inplace=True)

gdf_stops_201909.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 6895 entries, 25316 to 56683
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   stop_id     6895 non-null   string  
 1   stop_name   6895 non-null   string  
 2   stop_lat    6895 non-null   float64 
 3   stop_lon    6895 non-null   float64 
 4   geometry    6895 non-null   geometry
 5   actv_19_09  6895 non-null   int64   
 6   actv_20_04  6895 non-null   float64 
dtypes: float64(3), geometry(1), int64(1), string(2)
memory usage: 750.9+ KB


In [42]:
gdf_stops = gdf_stops_202004.join(gdf_stops_201909[['actv_19_09']], how='left')
gdf_stops.actv_19_09 = gdf_stops.actv_19_09.fillna(0)
gdf_stops.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 6491 entries, 25316 to 74369
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   stop_id     6491 non-null   string  
 1   stop_name   6491 non-null   string  
 2   stop_lat    6491 non-null   float64 
 3   stop_lon    6491 non-null   float64 
 4   geometry    6491 non-null   geometry
 5   actv_20_04  6491 non-null   int64   
 6   actv_19_09  6491 non-null   float64 
dtypes: float64(3), geometry(1), int64(1), string(2)
memory usage: 725.7+ KB


In [43]:
gdf_stops_201909.head()

Unnamed: 0_level_0,stop_id,stop_name,stop_lat,stop_lon,geometry,actv_19_09,actv_20_04
stop_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25316,25316,BLANCO & DRESDEN,29.499014,-98.507783,POINT (-98.50778 29.49901),1,1.0
72479,72479,FRESNO & IH-10 W ACCESS RD.,29.474488,-98.516238,POINT (-98.51624 29.47449),1,1.0
56239,56239,FREDERICKSBURG RD. & N. FLORES,29.441883,-98.503773,POINT (-98.50377 29.44188),1,1.0
76759,76759,MARTIN & N. FRIO,29.430081,-98.503058,POINT (-98.50306 29.43008),1,1.0
88973,88973,DOLOROSA & S. FLORES,29.424073,-98.494741,POINT (-98.49474 29.42407),1,1.0


In [44]:
gdf_stops_202004[gdf_stops_202004.stop_id == '98013']

Unnamed: 0_level_0,stop_id,stop_name,stop_lat,stop_lon,geometry,actv_20_04
stop_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
98013,98013,INGRAM TRANSIT CENTER,29.464792,-98.626006,POINT (-98.62601 29.46479),1


In [45]:
adh_join = gdf_stops_adh.join(gdf_stops_201909[['stop_id']], how='left', on='stop_id', rsuffix='_all')
adh_join.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 649 entries, 10013 to GARA
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   stop_id      649 non-null    object  
 1   stop_name    649 non-null    string  
 2   stop_lat     649 non-null    float64 
 3   stop_lon     649 non-null    float64 
 4   stops        649 non-null    int64   
 5   geometry     649 non-null    geometry
 6   stop_id_all  644 non-null    object  
dtypes: float64(2), geometry(1), int64(1), object(2), string(1)
memory usage: 40.6+ KB


In [46]:
# adherence_stops_cols = ['StopNumber','Location','Latitude','Longitude','geometry']

In [47]:
#######################################
###
### stops - all
###
#######################################

In [48]:
adh_stops_merge = gdf_stops_adh[['geometry', 'stop_id']].rename(columns={'geometry': 'geometry_adh', 'stop_id': 'stop_id_adh'})
all_stops = gdf_stops_202004[['geometry','stop_id']].join(gdf_stops_201909[['geometry','stop_id']], how='outer', lsuffix='_20', rsuffix='_19')
all_stops = all_stops.join(adh_stops_merge, how='outer')

all_stops['actv_20_04'] = all_stops.stop_id_20.isna()==False
all_stops['actv_19_09'] = all_stops.stop_id_19.isna()==False
all_stops['actv_adh'] = all_stops.stop_id_adh.isna() == False
all_stops['same_actv'] = all_stops.actv_20_04.astype('int32') + all_stops.actv_19_09.astype('int32') + all_stops.actv_adh.astype('int32')

# all_stops['same_name'] = all_stops.stop_name_20 == all_stops.stop_name_19
all_stops['same_geom'] = all_stops.geometry_20 == all_stops.geometry_19
all_stops['geom_adh_20'] = all_stops.geometry_20 == all_stops.geometry_adh
all_stops['geom_adh_19'] = all_stops.geometry_19 == all_stops.geometry_adh
# all_stops['same_lat'] = all_stops.stop_lat_20 == all_stops.stop_lat_19
# all_stops['same_lon'] = all_stops.stop_lon_20 == all_stops.stop_lon_19
# all_stops['same_wheel'] = all_stops.wheelchair_boarding_20 == all_stops.wheelchair_boarding_19
all_stops.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 6915 entries, 10013 to GARA
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   geometry_20   6491 non-null   geometry
 1   stop_id_20    6491 non-null   string  
 2   geometry_19   6895 non-null   geometry
 3   stop_id_19    6895 non-null   string  
 4   geometry_adh  649 non-null    geometry
 5   stop_id_adh   649 non-null    string  
 6   actv_20_04    6915 non-null   bool    
 7   actv_19_09    6915 non-null   bool    
 8   actv_adh      6915 non-null   bool    
 9   same_actv     6915 non-null   int32   
 10  same_geom     6915 non-null   bool    
 11  geom_adh_20   6915 non-null   bool    
 12  geom_adh_19   6915 non-null   bool    
dtypes: bool(6), geometry(3), int32(1), string(3)
memory usage: 445.7+ KB


In [49]:
adh_stops_chk = all_stops[all_stops.actv_adh]
# adh_stops_chk[adh_stops_chk.same_actv == 3]
# adh_stops_chk[all_stops.columns]
adh_stops_chk.sample(10)

Unnamed: 0_level_0,geometry_20,stop_id_20,geometry_19,stop_id_19,geometry_adh,stop_id_adh,actv_20_04,actv_19_09,actv_adh,same_actv,same_geom,geom_adh_20,geom_adh_19
stop_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
99396,POINT (-98.49043 29.42250),99396,POINT (-98.49043 29.42250),99396,POINT (-98.49041 29.42263),99396,True,True,True,3,True,False,False
85599,POINT (-98.56985 29.46904),85599,POINT (-98.56985 29.46904),85599,POINT (-98.56900 29.46849),85599,True,True,True,3,True,False,False
85779,POINT (-98.47323 29.42509),85779,POINT (-98.47323 29.42509),85779,POINT (-98.47309 29.42511),85779,True,True,True,3,True,False,False
47169,POINT (-98.46621 29.49018),47169,POINT (-98.46621 29.49018),47169,POINT (-98.46621 29.49023),47169,True,True,True,3,True,False,False
10503,POINT (-98.55035 29.48876),10503,POINT (-98.55035 29.48876),10503,POINT (-98.55033 29.48883),10503,True,True,True,3,True,False,False
26526,POINT (-98.40656 29.49732),26526,POINT (-98.40656 29.49732),26526,POINT (-98.40657 29.49731),26526,True,True,True,3,True,False,False
78393,POINT (-98.52649 29.43207),78393,POINT (-98.52649 29.43207),78393,POINT (-98.52663 29.43206),78393,True,True,True,3,True,False,False
53113,POINT (-98.54804 29.44277),53113,POINT (-98.54804 29.44277),53113,POINT (-98.54810 29.44275),53113,True,True,True,3,True,False,False
69419,POINT (-98.52578 29.43638),69419,POINT (-98.52578 29.43638),69419,POINT (-98.52581 29.43642),69419,True,True,True,3,True,False,False
91817,POINT (-98.58777 29.42040),91817,POINT (-98.58777 29.42040),91817,POINT (-98.58783 29.42056),91817,True,True,True,3,True,False,False


In [50]:
print(adh_stops_merge.index.to_list()[:20])
print(all_stops.index.to_list()[:20])


['10013', '10019', '10023', '10029', '10033', '10039', '10043', '10049', '10059', '10103', '10109', '10203', '10209', '10503', '10509', '11126', '11127', '11137', '11206', '11267']
['10013', '10019', '10023', '10029', '10033', '10039', '10043', '10049', '10059', '10103', '10109', '10203', '10209', '10303', '10309', '10403', '10409', '10503', '10509', '10603']


In [51]:
same_stops = all_stops[all_stops['same_actv']]
same_stops.info()

KeyError: "None of [Int64Index([3, 3, 3, 3, 3, 3, 3, 3, 3, 3,\n            ...\n            2, 2, 2, 2, 2, 2, 2, 2, 2, 1],\n           dtype='int64', length=6915)] are in the [columns]"

In [52]:
same_stops[['same_geom']].sum()

NameError: name 'same_stops' is not defined

In [53]:
block_stops = gdf_stops_blocks.BLOCK.value_counts()
len(block_stops)

NameError: name 'gdf_stops_blocks' is not defined

In [54]:
# all_stops[all_stops.same_geom == False]

In [55]:
# gdf_stops_202004_blocks[['BLOCK','geometry']].info()

In [56]:
def buffer_stops(stops_gdf, geo_col='geometry', buffer=.01):

    gdf = stops_gdf.copy()
    
    gdf['geometry'] = gdf.buffer(buffer)
    gdf = gdf.set_geometry('geometry')

    return gdf

In [57]:
gdf_stops_202004_buff = buffer_stops(gdf_stops_202004[['stop_id','geometry']], buffer=buffer_in_degrees)

gdf_stops_202004_buff.head()

Unnamed: 0_level_0,stop_id,geometry
stop_code,Unnamed: 1_level_1,Unnamed: 2_level_1
25316,25316,"POLYGON ((-98.50103 29.49901, -98.50107 29.498..."
72479,72479,"POLYGON ((-98.50949 29.47449, -98.50952 29.473..."
56239,56239,"POLYGON ((-98.49702 29.44188, -98.49706 29.441..."
76759,76759,"POLYGON ((-98.49631 29.43008, -98.49634 29.429..."
88973,88973,"POLYGON ((-98.48799 29.42407, -98.48802 29.423..."


In [58]:
gdf_stops_202004_buff.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 6491 entries, 25316 to 74369
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   stop_id   6491 non-null   string  
 1   geometry  6491 non-null   geometry
dtypes: geometry(1), string(1)
memory usage: 152.1+ KB


In [59]:
def match_stops(stops_gdf, area_gdf, stops_geo='geometry', area_geo='geometry'):
    sidx_name = stops_gdf.index.name
    aidx_name = area_gdf.index.name
    sidx_geo = sidx_name + '_geo'
    aidx_geo = aidx_name + 'geo'
    sgdf = stops_gdf[[stops_geo]].copy().reset_index()
    agdf = area_gdf[[area_geo]].copy().reset_index()
    
    return sidx_name

# check_match = match_stops(area_gdf=gdf_blocks[['geometry']], stops_gdf=gdf_stops_202004_buff)
# check_match

In [60]:
# gdf_stops_blocks = gpd.sjoin(gdf_blocks[['BLOCK','geometry']], gdf_stops_202004_buff, how='inner', op='intersects')
gdf_stops_blocks = gpd.sjoin(gdf_stops_buff, gdf_blocks[['BLOCK','geometry']], how='inner', op='intersects')
gdf_stops_blocks = gdf_stops_blocks[['BLOCK','stop_id','geometry']]
# gdf_stops_blocks = gdf_stops_blocks.rename(columns={'index_right':'stop_id'})
gdf_stops_blocks.head(5)

NameError: name 'gdf_stops_buff' is not defined

In [61]:
gdf_stops_blocks.info()

NameError: name 'gdf_stops_blocks' is not defined

In [62]:
stop_blocks = gdf_stops_blocks.stop_id.value_counts()
len(stop_blocks)

NameError: name 'gdf_stops_blocks' is not defined

In [63]:
gdf_stops_blocks[['BLOCK','geometry']].head()

NameError: name 'gdf_stops_blocks' is not defined

In [64]:
#######################################
###
### via datathon 2020
###
### on time performance
###
#######################################

In [65]:
df_otp = pd.read_csv(files['bus_otp_2020'])
df_otp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107319 entries, 0 to 107318
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   ServiceDateTime   107319 non-null  object 
 1   Route             107319 non-null  int64  
 2   RouteName         107319 non-null  object 
 3   EarlyDeparture    107319 non-null  int64  
 4   OnTime            107319 non-null  int64  
 5   LateArrival       107319 non-null  int64  
 6   Missing           107319 non-null  int64  
 7   TimePointCount    107319 non-null  int64  
 8   OTP               107156 non-null  float64
 9   AverageDwellTime  107155 non-null  float64
dtypes: float64(2), int64(6), object(2)
memory usage: 8.2+ MB


In [66]:
df_otp.columns

Index(['ServiceDateTime', 'Route', 'RouteName', 'EarlyDeparture', 'OnTime',
       'LateArrival', 'Missing', 'TimePointCount', 'OTP', 'AverageDwellTime'],
      dtype='object')

In [67]:
df_otp.head()

Unnamed: 0,ServiceDateTime,Route,RouteName,EarlyDeparture,OnTime,LateArrival,Missing,TimePointCount,OTP,AverageDwellTime
0,2/12/18,2,BLANCO,83,626,87,82,796,78.64,103.97
1,2/12/18,3,SAN PEDRO SKIP,5,518,78,11,601,86.19,167.29
2,2/12/18,4,SAN PEDRO Frequent,21,454,109,130,584,77.74,78.31
3,2/12/18,5,McCULLOUGH,7,403,30,10,440,91.59,211.44
4,2/12/18,6,US-281 Express,1,54,22,11,77,70.13,163.19


In [68]:
#######################################
###
### via datathon 2020
###
### fare box activity
###
#######################################

In [69]:
df_fare = pd.read_csv(files['bus_fare_box_2020'], index_col=0)
df_fare.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89623 entries, 0 to 89622
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ServiceDateTime      89623 non-null  object 
 1   Route                89623 non-null  int64  
 2   RouteName            87728 non-null  object 
 3   CurrentRevenue       89623 non-null  float64
 4   Ridership            89623 non-null  int64  
 5   TokenCount           89623 non-null  int64  
 6   TicketCount          89623 non-null  int64  
 7   PassCount            89623 non-null  int64  
 8   BillCount            89623 non-null  int64  
 9   UnclassifiedRevenue  89623 non-null  float64
 10  DumpCount            89623 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 8.2+ MB


In [70]:
df_fare.columns

Index(['ServiceDateTime', 'Route', 'RouteName', 'CurrentRevenue', 'Ridership',
       'TokenCount', 'TicketCount', 'PassCount', 'BillCount',
       'UnclassifiedRevenue', 'DumpCount'],
      dtype='object')

In [71]:
df_fare.head()

Unnamed: 0,ServiceDateTime,Route,RouteName,CurrentRevenue,Ridership,TokenCount,TicketCount,PassCount,BillCount,UnclassifiedRevenue,DumpCount
0,2017-01-01,2,2-BLANCO RD.,510.82,1244,0,0,588,312,20.88,116
1,2017-01-01,3,3-SAN PEDRO,203.93,689,0,0,371,119,9.55,49
2,2017-01-01,4,4-SAN PEDRO,356.58,1377,0,0,671,230,20.44,94
3,2017-01-01,5,5-MC CULLOUGH,224.8,474,0,0,239,124,19.5,64
4,2017-01-01,8,8-N.ST.MARYS,252.29,471,0,0,214,147,6.45,53


In [72]:
#######################################
###
### via datathon 2020
###
### message logs
###
#######################################

In [73]:
df_logs = pd.read_csv(files['message_logs_2020'])
df_logs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155517 entries, 0 to 155516
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   dDate              155517 non-null  object 
 1   Local_Timestamp    155517 non-null  object 
 2   Latitude           155517 non-null  float64
 3   Longitude          155517 non-null  float64
 4   MESSAGE_TYPE_TEXT  155517 non-null  object 
 5   ADHERENCE          155517 non-null  int64  
 6   Odometer           155517 non-null  float64
 7   VehicleNumber      155517 non-null  object 
 8   degLat             0 non-null       float64
 9   degLong            0 non-null       float64
dtypes: float64(5), int64(1), object(4)
memory usage: 11.9+ MB


In [74]:
df_logs.columns

Index(['dDate', 'Local_Timestamp', 'Latitude', 'Longitude',
       'MESSAGE_TYPE_TEXT', 'ADHERENCE', 'Odometer', 'VehicleNumber', 'degLat',
       'degLong'],
      dtype='object')

In [75]:
df_logs.head(5)


Unnamed: 0,dDate,Local_Timestamp,Latitude,Longitude,MESSAGE_TYPE_TEXT,ADHERENCE,Odometer,VehicleNumber,degLat,degLong
0,2/14/2020,2/14/20 6:56 AM,29.44451,-98.50143,vehicle location,0,0.0,102,,
1,2/14/2020,2/14/20 6:56 AM,29.44452,-98.50142,vehicle location,0,0.0,102,,
2,2/14/2020,2/14/20 6:57 AM,29.44452,-98.50142,vehicle location,0,0.0,102,,
3,2/14/2020,2/14/20 6:57 AM,29.44452,-98.50142,vehicle location,0,0.0,102,,
4,2/14/2020,2/14/20 6:58 AM,29.44452,-98.50142,vehicle location,0,0.0,102,,


In [76]:
df_logs.MESSAGE_TYPE_TEXT.value_counts()

vehicle location     155152
Return to Network       365
Name: MESSAGE_TYPE_TEXT, dtype: int64

In [77]:
df_logs.ADHERENCE.value_counts().describe()

count       82.000000
mean      1896.548780
std       9608.117182
min          1.000000
25%         16.000000
50%         59.500000
75%        269.250000
max      85096.000000
Name: ADHERENCE, dtype: float64

In [175]:
vehicle_ids = df_logs.VehicleNumber.value_counts()
print(len(vehicle_ids))
vehicle_ids.head(20)

704


PD-29     1324
717        859
PD-22      689
PD-23      626
T-189      602
T-188      566
T-116      511
T-151      476
T-132      460
491        440
C-117      406
T-100      405
T-166      394
982        390
T-195      385
T-120      382
970        371
952        371
953        355
M-2121     351
Name: VehicleNumber, dtype: int64

In [79]:
df_logs[df_logs.VehicleNumber==450].head(20)

Unnamed: 0,dDate,Local_Timestamp,Latitude,Longitude,MESSAGE_TYPE_TEXT,ADHERENCE,Odometer,VehicleNumber,degLat,degLong
28851,2/14/2020,2/14/20 6:00 AM,29.45275,-98.51462,vehicle location,-1,24.28,450,,
28852,2/14/2020,2/14/20 6:00 AM,29.45083,-98.51278,vehicle location,-1,24.45,450,,
28853,2/14/2020,2/14/20 6:01 AM,29.44892,-98.51093,vehicle location,-1,24.63,450,,
28854,2/14/2020,2/14/20 6:02 AM,29.44463,-98.50669,vehicle location,-1,25.01,450,,
28855,2/14/2020,2/14/20 6:03 AM,29.44217,-98.5043,vehicle location,-1,25.23,450,,
28856,2/14/2020,2/14/20 6:04 AM,29.44123,-98.5023,vehicle location,-1,25.37,450,,
28857,2/14/2020,2/14/20 6:05 AM,29.4396,-98.49864,vehicle location,-1,25.66,450,,
28858,2/14/2020,2/14/20 6:06 AM,29.43702,-98.49696,vehicle location,-1,25.86,450,,
28859,2/14/2020,2/14/20 6:06 AM,29.43506,-98.49544,vehicle location,0,26.03,450,,
28860,2/14/2020,2/14/20 6:07 AM,29.43287,-98.4932,vehicle location,0,26.23,450,,


In [80]:
#######################################
###
### via datathon 2020
###
### service miles
###
#######################################

In [81]:
df_svc_miles = pd.read_csv(files['service_miles_2020'])
df_svc_miles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63622 entries, 0 to 63621
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Route                63622 non-null  int64  
 1   RouteName            63622 non-null  object 
 2   Class                62115 non-null  object 
 3   ServiceType          63622 non-null  object 
 4   ScheduledMiles       63622 non-null  float64
 5   AVLRevenueMiles      63622 non-null  float64
 6   AVLDeadHeadMiles     63622 non-null  float64
 7   AVLExemptMiles       63622 non-null  float64
 8   AVLMilesAdjustments  63622 non-null  float64
 9   ServiceDate          63622 non-null  object 
dtypes: float64(5), int64(1), object(4)
memory usage: 4.9+ MB


In [82]:
df_svc_miles.columns


Index(['Route', 'RouteName', 'Class', 'ServiceType', 'ScheduledMiles',
       'AVLRevenueMiles', 'AVLDeadHeadMiles', 'AVLExemptMiles',
       'AVLMilesAdjustments', 'ServiceDate'],
      dtype='object')

In [83]:
df_svc_miles.head()

Unnamed: 0,Route,RouteName,Class,ServiceType,ScheduledMiles,AVLRevenueMiles,AVLDeadHeadMiles,AVLExemptMiles,AVLMilesAdjustments,ServiceDate
0,2,2-BLANCO RD.,Major Radial,SUNDAY,924.35,886.97,46.91,0.0,26.0,1/1/2019
1,3,3-SAN PEDRO SKIP,Major Limited,SUNDAY,707.88,683.66,19.57,0.0,0.0,1/1/2019
2,4,4-SAN PEDRO,Major Radial,SUNDAY,731.48,692.08,34.76,0.0,0.0,1/1/2019
3,5,5-MC CULLOUGH,Major Radial,SUNDAY,484.17,457.37,22.12,0.0,0.0,1/1/2019
4,6,6-US 281 Express,Major Express,SUNDAY,0.0,0.0,0.0,0.0,0.0,1/1/2019


In [84]:
#######################################
###
### via datathon 2020
###
### stop addresses
###
#######################################

In [85]:
df_stop_addrs = pd.read_csv(files['stops_addr_2020'])
df_stop_addrs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9859 entries, 0 to 9858
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   StopNumber    9859 non-null   object 
 1   Location      9859 non-null   object 
 2   StreetNumber  9715 non-null   float64
 3   Street        9854 non-null   object 
 4   City          9854 non-null   object 
 5   State         9854 non-null   object 
 6   Zip           9854 non-null   float64
dtypes: float64(2), object(5)
memory usage: 539.3+ KB


In [86]:
df_stop_addrs.columns

Index(['StopNumber', 'Location', 'StreetNumber', 'Street', 'City', 'State',
       'Zip'],
      dtype='object')

In [87]:
df_stop_addrs.head()

Unnamed: 0,StopNumber,Location,StreetNumber,Street,City,State,Zip
0,79877,ST. MARY'S BETWEEN CONVENT & MART,603.0,N St Mary's St,San Antonio,TX,78205.0
1,79977,ST. MARY'S & PECAN,152.0,E Pecan St,San Antonio,TX,78205.0
2,89377,ST. MARY'S BETWEEN TRAVIS & HOUSTON,175.0,E Houston St,San Antonio,TX,78205.0
3,89867,ST. MARY'S & COMMERCE,106.0,S St Mary's St,San Antonio,TX,78205.0
4,89839,COMMERCE & SOLEDAD,102.0,Main Plz,San Antonio,TX,78205.0
