In [1]:
import pandas as pd
import numpy as np
import random
import json
from shapely.geometry import shape, Point, MultiPolygon
import geopandas as gpd

In [2]:
base_path = '/usr/local/google/home/atalekar/External/iisc/markov_simuls/staticInst/data/base/mumbai/'

In [3]:
mumbai_route_csv = base_path + 'mumbai_routes.csv'
routes_df = pd.read_csv(mumbai_route_csv)
# Clean up
routes_df['trainLine'] = routes_df.trainLine.str.strip()
routes_df = routes_df[routes_df.stationCode != 'RMAR'].copy() # drop ram mandir station

city_gj = base_path + 'city.geojson'
geoDF = gpd.read_file(city_gj)
geoDF['wardNo'] = geoDF['wardNo'].astype(int)
geoDF['wardIndex'] = geoDF['wardNo'] - 1
geoDF = geoDF[['wardIndex','wardNo', 'wardName', 'geometry']]
geoDF['wardBounds'] = geoDF.apply(lambda row: MultiPolygon(row['geometry']).bounds, axis=1)
geoDF = geoDF.sort_values('wardNo')
##!! Note that the geojson file has coordinates in (longitude, latitude) order!
geoDF['wardCentre'] = geoDF.apply(lambda row: (MultiPolygon(row['geometry']).centroid.x, MultiPolygon(row['geometry']).centroid.y), axis=1)


In [4]:
# Assigning wards to trains stations. 
def getStationWard(station):
  for index, row in geoDF.iterrows():
    poly = MultiPolygon(row['geometry'])
    pt = Point(station.longitude, station.latitude)
    if poly.contains(pt):
      return row.wardIndex
  return None
routes_df['wardIndex'] = routes_df.apply(lambda x: getStationWard(x), axis=1)
routes_df = routes_df[routes_df.wardIndex.notna()]
routes_df.reset_index(drop=True,inplace=True)
routes_df['stationId'] = routes_df.index
routes_df = routes_df[['trainLine','stationId', 'stationName', 'stationCode', 'slowTimeFromPreviousStation', 'latitude', 'longitude', 'wardIndex']]

In [5]:
# Splitting  train lines based on https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/Mumbai_Rail_Map_-_English.jpg/1280px-Mumbai_Rail_Map_-_English.jpg
western_route_df = routes_df[routes_df.trainLine == 'western'].reset_index(drop=True)
central_route_df = routes_df[routes_df.trainLine == 'central'].reset_index(drop=True)
harbour1_route_df = routes_df[routes_df.trainLine == 'harbour'].reset_index(drop=True)[:15]
harbour2_route_df = routes_df[routes_df.trainLine == 'harbour'][:8]
harbour3_route_df = routes_df[routes_df.trainLine == 'harbour'][-7:]
harbour2_route_df = harbour2_route_df.append(harbour3_route_df, ignore_index=True)
del harbour3_route_df

In [6]:
# Ensuring station ids are correct across routes for same stations. 
central_route_df.loc[central_route_df.stationName == 'dadar', 'stationId'] = western_route_df[western_route_df.stationName == 'dadar']['stationId'].values[0]
harbour1_route_df.loc[harbour1_route_df.stationCode == 'ST', 'stationId'] = central_route_df[central_route_df.stationCode == 'ST']['stationId'].values[0]
harbour1_route_df.loc[harbour1_route_df.stationCode == 'MSD', 'stationId'] = central_route_df[central_route_df.stationCode == 'MSD']['stationId'].values[0]
harbour1_route_df.loc[harbour1_route_df.stationCode == 'SNRD', 'stationId'] = central_route_df[central_route_df.stationCode == 'SNRD']['stationId'].values[0]
harbour1_route_df.loc[harbour1_route_df.stationName == 'kurla', 'stationId'] = central_route_df[central_route_df.stationName == 'kurla']['stationId'].values[0]

harbour2_route_df.loc[harbour2_route_df.stationCode == 'ST', 'stationId'] = central_route_df[central_route_df.stationCode == 'ST']['stationId'].values[0]
harbour2_route_df.loc[harbour2_route_df.stationCode == 'MSD', 'stationId'] = central_route_df[central_route_df.stationCode == 'MSD']['stationId'].values[0]
harbour2_route_df.loc[harbour2_route_df.stationCode == 'SNRD', 'stationId'] = central_route_df[central_route_df.stationCode == 'SNRD']['stationId'].values[0]
harbour2_route_df.loc[harbour2_route_df.stationCode == 'B', 'stationId'] = western_route_df[western_route_df.stationCode == 'B']['stationId'].values[0]
harbour2_route_df.loc[harbour2_route_df.stationCode == 'KHAR', 'stationId'] = western_route_df[western_route_df.stationCode == 'KHAR']['stationId'].values[0]
harbour2_route_df.loc[harbour2_route_df.stationCode == 'STC', 'stationId'] = western_route_df[western_route_df.stationCode == 'STC']['stationId'].values[0]
harbour2_route_df.loc[harbour2_route_df.stationCode == 'VLP', 'stationId'] = western_route_df[western_route_df.stationCode == 'VLP']['stationId'].values[0]
harbour2_route_df.loc[harbour2_route_df.stationName == 'andheri', 'stationId'] = western_route_df[western_route_df.stationName == 'andheri']['stationId'].values[0]


In [7]:
# Splicing train lines to single routes dataframe and file.
western_route_df['trainLineId'] = 1
central_route_df['trainLineId'] = 2
harbour1_route_df['trainLineId'] = 3
harbour2_route_df['trainLineId'] = 4
combined_route_df = western_route_df.append(central_route_df, ignore_index=True)
combined_route_df = combined_route_df.append(harbour1_route_df, ignore_index=True)
combined_route_df = combined_route_df.append(harbour2_route_df, ignore_index=True)
combined_route_df['stationId'] = combined_route_df.stationId.astype('int')
combined_route_df['wardIndex'] = combined_route_df.stationId.astype('int')
combined_route_df = combined_route_df[['trainLineId', 'stationId', 'slowTimeFromPreviousStation', 'latitude', 'longitude', 'wardIndex', 'stationName', 'stationCode']]
combined_route_df.head()

Unnamed: 0,trainLineId,stationId,slowTimeFromPreviousStation,latitude,longitude,wardIndex,stationName,stationCode
0,1,0,0,18.934886,72.827164,0,churchgate,CCG
1,1,1,3,18.9447,72.8244,1,marine lines,MEL
2,1,2,2,18.951565,72.818633,2,charni road,CYR
3,1,3,3,18.9634,72.8161,3,grant road,GTR
4,1,4,2,18.9697,72.8194,4,mumbai central,BCT


In [8]:
train_line_transfer_time_min = 7

In [9]:
def getTravelTimeOnTrainLine(train_line_route_df, src_id, dest_id):
  src = train_line_route_df[train_line_route_df.stationId == src_id]
  dest = train_line_route_df[train_line_route_df.stationId == dest_id]
  start_index = min(src.index[0], dest.index[0])
  end_index = max(src.index[0], dest.index[0])
  return train_line_route_df.loc[start_index+1:end_index].slowTimeFromPreviousStation.sum()

In [12]:
# This can probably solved with a standard graph traversal, 
# but will need significant changes in representation so hand coding traversal.  
def getTravelTimeBetweenStations(src_id, dest_id):
  '''Returns 2 values, first is an int representing time of travel in minutes, and second is a list of legs
      of the route between each station of the format [[trainline1, start1, end1],[trainline2, start2, end2]]'''
  src = combined_route_df[combined_route_df.stationId == src_id]
  dest = combined_route_df[combined_route_df.stationId == dest_id]
  
  # Handle edge cases - malformed input routes.
  if src.shape[0] == 0 or dest.shape[0] == 0:
    return None, None
  # src and dest is same.
  if src.index[0] == dest.index[0]:
    return 0, []
  
  src_w = src_id in western_route_df.stationId.unique()
  src_c = src_id in central_route_df.stationId.unique()
  src_h1 = src_id in harbour1_route_df.stationId.unique()
  src_h2 = src_id in harbour2_route_df.stationId.unique()

  dest_w = dest_id in western_route_df.stationId.unique()
  dest_c = dest_id in central_route_df.stationId.unique()
  dest_h1 = dest_id in harbour1_route_df.stationId.unique()
  dest_h2 = dest_id in harbour2_route_df.stationId.unique()

  # Both stations on same line
  if src_w and dest_w:
        t = getTravelTimeOnTrainLine(western_route_df, src_id, dest_id)
        return t, [[1, src_id, dest_id, t]]
  if src_c and dest_c:
        t = getTravelTimeOnTrainLine(central_route_df, src_id, dest_id)
        return t, [[2, src_id, dest_id, t]]
  if src_h1 and dest_h1:
        t = getTravelTimeOnTrainLine(harbour1_route_df, src_id, dest_id)
        return t, [[3, src_id, dest_id, t]]
  if src_h2 and dest_h2:
        t = getTravelTimeOnTrainLine(harbour2_route_df, src_id, dest_id)
        return t, [[4, src_id, dest_id, t]]
  
  # key interchange points
  dadar_id = western_route_df[western_route_df.stationName == 'dadar']['stationId'].values[0]
  bandra_id = western_route_df[western_route_df.stationCode == 'B']['stationId'].values[0]
  vadala_id = harbour1_route_df[harbour1_route_df.stationCode == 'VD']['stationId'].values[0]
  cst_id = harbour1_route_df[harbour1_route_df.stationCode == 'ST']['stationId'].values[0]
  kurla_id = harbour1_route_df[harbour1_route_df.stationName == 'kurla']['stationId'].values[0]

  # Both stations on connected lines
  if src_w and dest_c:
    t1 = getTravelTimeOnTrainLine(western_route_df, src_id, dadar_id)
    t2 = getTravelTimeOnTrainLine(central_route_df, dadar_id, dest_id)
    return t1 + t2 + train_line_transfer_time_min, [[1, src_id, dadar_id, t1], [2, dadar_id, dest_id, t2]]
  if src_c and dest_w:
    t1 = getTravelTimeOnTrainLine(central_route_df, src_id, dadar_id)
    t2 = getTravelTimeOnTrainLine(western_route_df, dadar_id, dest_id)
    return t1 + t2 + train_line_transfer_time_min, [[2, src_id, dadar_id, t1], [1, dadar_id, dest_id, t2]]

  if src_w and dest_h2:
    t1 = getTravelTimeOnTrainLine(western_route_df, src_id, bandra_id)
    t2 = getTravelTimeOnTrainLine(harbour2_route_df, bandra_id, dest_id)
    return t1 + t2 + train_line_transfer_time_min, [[1, src_id, bandra_id, t1], [4, bandra_id, dest_id, t2]]
  if src_h2 and dest_w:
    t1 = getTravelTimeOnTrainLine(harbour2_route_df, src_id, bandra_id)
    t2 = getTravelTimeOnTrainLine(western_route_df, bandra_id, dest_id)
    return t1 + t2 + train_line_transfer_time_min, [[4, src_id, bandra_id, t1], [1, bandra_id, dest_id, t2]]
  
  if src_h1 and dest_h2:
    t1 = getTravelTimeOnTrainLine(harbour1_route_df, src_id, vadala_id)
    t2 = getTravelTimeOnTrainLine(harbour2_route_df, vadala_id, dest_id)
    return t1 + t2 + train_line_transfer_time_min, [[3, src_id, vadala_id, t1], [4, vadala_id, dest_id, t2]]
  if src_h2 and dest_h1:
    t1 = getTravelTimeOnTrainLine(harbour2_route_df, src_id, vadala_id)
    t2 = getTravelTimeOnTrainLine(harbour1_route_df, vadala_id, dest_id)
    return t1 + t2 + train_line_transfer_time_min, [[4, src_id, vadala_id, t1], [3, vadala_id, dest_id, t2]]

  if src_c and dest_h2:
    t1 = getTravelTimeOnTrainLine(central_route_df, src_id, cst_id)
    t2 = getTravelTimeOnTrainLine(harbour2_route_df, cst_id, dest_id)
    return t1 + t2 + train_line_transfer_time_min, [[2, src_id, cst_id, t1], [4, cst_id, dest_id, t2]]
  if src_h2 and dest_c:
    t1 = getTravelTimeOnTrainLine(harbour2_route_df, src_id, cst_id)
    t2 = getTravelTimeOnTrainLine(central_route_df, cst_id, dest_id)
    return t1 + t2 + train_line_transfer_time_min, [[4, src_id, cst_id, t1], [2, cst_id, dest_id, t2]]

  if src_c and dest_h1:
    t1 = getTravelTimeOnTrainLine(central_route_df, src_id, cst_id)
    t2 = getTravelTimeOnTrainLine(harbour1_route_df, cst_id, dest_id)

    t3 = getTravelTimeOnTrainLine(central_route_df, src_id, kurla_id)
    t4 = getTravelTimeOnTrainLine(harbour1_route_df, kurla_id, dest_id)

    if (t1 + t2) < (t3+t4):
      return t1 + t2 + train_line_transfer_time_min, [[2, src_id, cst_id, t1], [3, cst_id, dest_id, t2]]
      
    return t3 + t4 + train_line_transfer_time_min, [[2, src_id, kurla_id, t3], [3, kurla_id, dest_id, t4]]

  if src_h1 and dest_c:
    t1 = getTravelTimeOnTrainLine(harbour1_route_df, src_id, cst_id)
    t2 = getTravelTimeOnTrainLine(central_route_df, cst_id, dest_id)

    t3 = getTravelTimeOnTrainLine(harbour1_route_df, src_id, kurla_id)
    t4 = getTravelTimeOnTrainLine(central_route_df, kurla_id, dest_id)

    if (t1 + t2) < (t3+t4):
      return t1 + t2 + train_line_transfer_time_min, [[3, src_id, cst_id, t1], [2, cst_id, dest_id, t2]]
      
    return t3 + t4 + train_line_transfer_time_min, [[3, src_id, kurla_id, t3], [2, kurla_id, dest_id, t4]]

  # 3 train line case
  if src_w and dest_h1:
    t1 = getTravelTimeOnTrainLine(western_route_df, src_id, dadar_id)
    t2_1 = getTravelTimeOnTrainLine(central_route_df, dadar_id, cst_id)
    t3_1 = getTravelTimeOnTrainLine(harbour1_route_df, cst_id, dest_id)
    t2_2 = getTravelTimeOnTrainLine(central_route_df, dadar_id, kurla_id)
    t3_2 = getTravelTimeOnTrainLine(harbour1_route_df, kurla_id, dest_id)

    t4 = getTravelTimeOnTrainLine(western_route_df, src_id, bandra_id)
    t5 = getTravelTimeOnTrainLine(harbour2_route_df, bandra_id, vadala_id)
    t6 = getTravelTimeOnTrainLine(harbour1_route_df, vadala_id, dest_id)

    if (t1 + t2_1 + t3_1) < (t1 + t2_2 + t3_2) and (t1 + t2_1 + t3_1) < (t4 + t5 + t6):
      return t1 + t2_1 + t3_1 + train_line_transfer_time_min*2, [[1, src_id, dadar_id, t1], [2, dadar_id, cst_id, t2_1], [3, cst_id, dest_id, t3_1]]
    if (t1 + t2_2 + t3_2) < (t1 + t2_1 + t3_1) and (t1 + t2_2 + t3_2) < (t4 + t5 + t6):
      return t1 + t2_2 + t3_2 + train_line_transfer_time_min*2, [[1, src_id, dadar_id, t1], [2, dadar_id, kurla_id, t2_2], [3, kurla_id, dest_id, t3_2]]
    
    return t4 + t5 + t6 + train_line_transfer_time_min*2, [[1, src_id, bandra_id, t4], [4, bandra_id, vadala_id, t5], [3, vadala_id, dest_id, t6]]

  if src_h1 and dest_w:
    t1_1 = getTravelTimeOnTrainLine(harbour1_route_df, src_id, cst_id)
    t2_1 = getTravelTimeOnTrainLine(central_route_df, cst_id, dadar_id)
    t3 = getTravelTimeOnTrainLine(western_route_df, dadar_id, dest_id)
    t1_2 = getTravelTimeOnTrainLine(harbour1_route_df, src_id, kurla_id)
    t2_2 = getTravelTimeOnTrainLine(central_route_df, kurla_id, dadar_id)

    t4 = getTravelTimeOnTrainLine(harbour1_route_df, src_id, vadala_id)
    t5 = getTravelTimeOnTrainLine(harbour2_route_df, bandra_id, vadala_id)
    t6 = getTravelTimeOnTrainLine(western_route_df, bandra_id, dest_id)

    if (t1_1 + t2_1 + t3) < (t1_2 + t2_2 + t3) and (t1_1 + t2_1 + t3) < (t4 + t5 + t6):
      return t1_1 + t2_1 + t3 + train_line_transfer_time_min*2, [[3, src_id, cst_id, t1_1], [2, cst_id, dadar_id, t2_1], [1, dadar_id, dest_id, t3]]
    if (t1_2 + t2_2 + t3) < (t1_1 + t2_1 + t3) and (t1_2 + t2_2 + t3) < (t4 + t5 + t6):
      return t1_2 + t2_2 + t3 + train_line_transfer_time_min*2, [[3, src_id, kurla_id, t1_2], [2, kurla_id, dadar_id, t2_2], [1, dadar_id, dest_id, t3]]
    
    return t4 + t5 + t6 + train_line_transfer_time_min*2, [[3, src_id, vadala_id, t4], [4, vadala_id, bandra_id, t5], [1, bandra_id, dest_id, t6]]    
  
  return None, None

In [13]:
stations = combined_route_df.stationId.unique()
travel_time = []
travel_route = []
for start in stations:
  for end in stations:
    t, r = getTravelTimeBetweenStations(start, end)
    travel_time.append([start, end, t, str(r)])
    travel_route.append(r)
travel_time_df = pd.DataFrame(travel_time)
travel_time_df.columns = ['start', 'end', 'travel_time_min', 'route_str']

In [14]:
combined_route_df.to_json(base_path + 'train_route.json', orient='records')
travel_time_df.to_json(base_path + 'travel_time.json', orient='records')

In [15]:
travel_time_df.head(50)

Unnamed: 0,start,end,travel_time_min,route_str
0,0,0,0,[]
1,0,1,3,"[[1, 0, 1, 3]]"
2,0,2,5,"[[1, 0, 2, 5]]"
3,0,3,8,"[[1, 0, 3, 8]]"
4,0,4,10,"[[1, 0, 4, 10]]"
5,0,5,13,"[[1, 0, 5, 13]]"
6,0,6,16,"[[1, 0, 6, 16]]"
7,0,7,19,"[[1, 0, 7, 19]]"
8,0,8,21,"[[1, 0, 8, 21]]"
9,0,9,23,"[[1, 0, 9, 23]]"


In [12]:
def getOverlapTimeOnSameTrainLine(trainLine, srcId1, destId1, srcId2, destId2):
  src1 = trainLine[trainLine.stationId == srcId1]
  dest1 = trainLine[trainLine.stationId == destId1]
  src2 = trainLine[trainLine.stationId == srcId2]
  dest2 = trainLine[trainLine.stationId == destId2]

  # Stations need to be on the same line
  if src1.shape[0] == 0 or dest1.shape[0] == 0 or src2.shape[0] == 0 or dest2.shape[0] == 0:
    return 0
  # Overlap only exists if they are traveling in same direction.
  if ((src1.index[0] < dest1.index[0]) and (src2.index[0] > dest2.index[0])) or ((src1.index[0] > dest1.index[0]) and (src2.index[0] < dest2.index[0])):
    return 0

  # identify overlap start
  startId=None
  if (src1.index[0] <= src2.index[0]) and (src2.index[0] <= dest1.index[0]):
    startId = src2.stationId.values[0]
  elif (src2.index[0] <= src1.index[0]) and (src1.index[0] <= dest2.index[0]):
    startId = src1.stationId.values[0]
  else: # There is no overlap
    return 0
  endIndex = min(dest1.index[0], dest2.index[0])
  endId = trainLine.loc[endIndex].stationId
  return travel_time_df[(travel_time_df.start == startId) & (travel_time_df.end == endId)].travel_time_min.values[0]

In [13]:
train_lines = [combined_route_df, western_route_df, central_route_df, harbour1_route_df, harbour2_route_df]
def getOverlapTime(srcId1, destId1, srcId2, destId2):
  travel1 = travel_time_df[(travel_time_df.start == src1) & (travel_time_df.end == dest1)]
  travel2 = travel_time_df[(travel_time_df.start == src2) & (travel_time_df.end == dest2)]
  route1 = travel_route[travel1.index[0]]
  route2 = travel_route[travel2.index[0]]
  overlap_time = 0
  for leg1 in route1:
    for leg2 in route2:
      # overlap time only if on same line
      if leg1[0]==leg2[0]:
        overlap_time += getOverlapTimeOnSameTrainLine(train_lines[leg1[0]], leg1[1], leg1[2], leg2[1], leg2[2])

  return overlap_time

In [17]:
overlaps = []
stations = combined_route_df.stationId.unique()
for src1 in stations:
  print(src1)
  for dest1 in stations:
    for src2 in stations:
      for dest2 in stations:
        overlap_time = getOverlapTime(src1, dest1, src2, dest2)
        overlaps.append([src1, dest1, src2, dest2, overlap_time])
overlaps_df = pd.DataFrame(overlaps)
overlaps_df.columns = ['src1', 'dest1', 'src2', 'dest2', 'overlap_time_min']

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
30
31
32
33
34
35
36
37
38
39
43
44
45
46
47
48
49
51
52
53
54
55
56


In [18]:
overlaps_df.to_json(base_path + 'overlap_time.json', orient='records')

In [None]:
route_df = pd.
# combined_route_df.to_json(base_path + 'train_route.json', orient='records')