In [None]:
import pandas as pd
import numba as nb
import numpy as np
import re
import ast

## Reading the bus stop and train station datasets

*   The necessary columns of the datasets are read into Pandas dataframes. 

>*   stationInfo dataset: Each row in this dataset contains information on each train station that is on tramway, Cercanías, or metro network. The columns used;

>>*  CODIGOMUNICIPIO: The identification code of the districts.
>>*  DENOMINAPARADA: The name of the train station.
>>*  DPAYPOINT: The code of the payment point device.
>>*  LATITUD: Latitude of the train station.
>>*  LONGITUD: Longitude of the train station.




>*   stopInfo dataset: This dataset contains information on all the bus stops in Madrid metropolitan area. The columns used;

>>*  CODIGOMUNICIPIO: The identification code of the districts.
>>*  DPAYPOINT: The code of the payment point device.
>>*  LATITUD: Latitude of the bus stop.
>>*  LONGITUD: Longitude of the bus stop.
>>*  IDLINEA: The identification number of the bus (bus number)



Processing:
*   Bus stops without IDLINEA value are removed.
*   'type' column is created to store type of the transportation.
>* 0 = Rail
>* 1 = Bus
*   Two datasets are concatenated to create the allStopDf data frame.
*   The allUniqueStopDf data frame is obtained by dropping the records that share the same location and transportation type, leaving one row for each unique stop.
*   The index of the data frame is assigned to the IDSTOP column.

In [None]:
# Rail stations (tramway, Cercanías, metro); tagged with type 0 = Rail.
railStopDf = pd.read_csv('stationInfo.csv', usecols=['LATITUD', 'LONGITUD', 'DPAYPOINT', 'DENOMINAPARADA', 'CODIGOMUNICIPIO'])
railStopDf['type'] = 0

# Bus stops; rows without a line number (IDLINEA) are discarded.
busStopDf = pd.read_csv('stopInfo.csv', usecols=['LATITUD', 'LONGITUD', 'DPAYPOINT', 'IDLINEA', 'CODIGOMUNICIPIO'])
# .copy() detaches the filtered rows from the frame that was read, so adding
# the 'type' column below does not raise pandas' SettingWithCopyWarning.
busStopDf = busStopDf[busStopDf['IDLINEA'].notna()].copy()
busStopDf['type'] = 1  # 1 = Bus

# One row per payment point of either mode.
allStopDf = pd.concat([railStopDf, busStopDf], ignore_index=True)

# One row per distinct (location, mode); the fresh 0..n-1 index becomes IDSTOP.
allUniqueStopDf = allStopDf.drop_duplicates(subset=['LATITUD','LONGITUD','type'], keep="first")
allUniqueStopDf = allUniqueStopDf.reset_index(drop=True)
allUniqueStopDf['IDSTOP'] = allUniqueStopDf.index
allUniqueStopDf


allPaypointDf is created by merging allStopDf and allUniqueStopDf. Thereby, the IDSTOP values are assigned to each record with different payment point (DPAYPOINT).

In [None]:
# Attach the IDSTOP of the matching unique stop to every payment point.
# 'type' is part of the join key because allUniqueStopDf is de-duplicated on
# (LATITUD, LONGITUD, type): joining on the coordinates alone would give a
# payment point an extra row carrying the IDSTOP of a co-located stop of the
# other transport mode.
allPaypointDf = pd.merge(
    allStopDf,
    allUniqueStopDf[['LATITUD', 'LONGITUD', 'type', 'IDSTOP']],
    on=['LATITUD', 'LONGITUD', 'type'],
    how='left',
)
allPaypointDf = allPaypointDf.sort_values(['IDSTOP'], ascending=[True])
allPaypointDf


### Smart Card Transaction Dataset

Dataset is formed by the smart card information of the passenger as well as the data of the boarding transportation mode. The transaction records dataset is limited to the records of the users with senior smart card profiles. 
*   cardID: This attribute is the hashed version of the smart card serial number. This identification number is unique for each card. The CRTM provided the dataset with the encoded card IDs, and the hash function is not disclosed in order to protect the privacy and confidentiality of the users.
*  date: The date of the transaction with the format of year-month-day hour-min-
sec.
*  DPAYPOINT: The code of the payment point device. (Same as the DPAYPOINT
column of the train stations, and bus stop tables.)

Processing:
*   Data is read into the usersDf data frame.
*   'date' column is set as datetime.
*   Desired time interval is selected to filter the data frame.
*   Single transaction records are removed by counting the occurrence of the cardID value.
*   usersDf is merged with the allPaypointDf to obtain a data frame with transaction records and the information on the boarding stop of the record.
*   Records that have the same cardID and date are removed.
*   Lastly, all the records are sorted by cardID and then by date.

In [None]:
# --- Load: only the first three '#'-separated fields of each record are used.
usersDf = pd.read_csv('transactionData.txt', sep='#', engine='python', usecols=[0,1,2])
usersDf.columns = ['cardID', 'date', 'DPAYPOINT']
usersDf['date'] = pd.to_datetime(usersDf['date'])

# --- Keep the transactions strictly inside the studied time window.
afterStart = usersDf['date'] > '2020-02-01 04:00:00'
beforeEnd = usersDf['date'] < '2020-02-02 03:59:59'
usersDf = usersDf[afterStart & beforeEnd]
print('Full dataset size: ', len(usersDf))


# --- Drop the cards that appear only once in the window.
v = usersDf.cardID.value_counts()
usersDf = usersDf[usersDf['cardID'].isin(v[v > 1].index)]

# --- Attach the boarding-stop attributes through the payment point code,
# --- discard rows without coordinates and repeated (cardID, date) pairs.
usersDf = usersDf.merge(allPaypointDf, how='inner', on='DPAYPOINT')
usersDf = usersDf[usersDf['LATITUD'].notna()]
usersDf = usersDf.drop_duplicates(subset=['cardID','date'], keep="first")
print('Dataset size after removing the single transactions: ', len(usersDf))

# --- Chronological order within each card; the later cells walk the rows by position.
usersDf = usersDf.sort_values(['cardID', 'date'], ascending=[True, True]).reset_index(drop=True)
usersDf

### Lines dataset
This dataset contains the station name and a list of line numbers for that station.

Processing:
*   lines.csv is read into linesRailDf.
*   The layout of the data frame is edited so that each station has a list of lines.
*   The same data frame is created for the bus stops by grouping the IDLINEA column with the stop ID.
*   The allStopWithLinesOnly data frame is generated by concatenating these two data frames.



In [None]:
# Rail: one row per station name; 'lines' arrives as a ';'-separated string.
linesRailDf = pd.read_csv('lines.csv', names=["DENOMINAPARADA", "lines"])
# Vectorised split turns each string into a list of line names
# (replaces the former row-by-row .iat loop).
linesRailDf["lines"] = linesRailDf["lines"].str.split(";")

# Bus: collect the line numbers of every bus payment point under its IDSTOP.
linesBusDf = allPaypointDf[allPaypointDf['type'] == 1].groupby('IDSTOP')['IDLINEA'].apply(list).to_frame()
linesBusDf = linesBusDf.rename(columns = {'IDLINEA':'lines'})
linesBusDf = linesBusDf.reset_index()

# Map each station name to its IDSTOP(s). allPaypointDf holds one row per
# payment point, so the (IDSTOP, name) pairs are de-duplicated first; otherwise
# a station with k payment points would appear k times here and multiply rows
# again in every later merge on IDSTOP.
railStopNames = allPaypointDf[['IDSTOP', 'DENOMINAPARADA']].drop_duplicates()
merged_linesRailDf = pd.merge(linesRailDf, railStopNames, how='left', right_on='DENOMINAPARADA', left_on='DENOMINAPARADA')

# Resulting columns: DENOMINAPARADA, lines, IDSTOP (bus rows have no station name).
allStopWithLinesOnly = pd.concat([merged_linesRailDf, linesBusDf], ignore_index=True)
allStopWithLinesOnly

In [None]:
# Left-join the per-stop line lists onto every payment point through IDSTOP;
# payment points without a known line list keep missing values in the added columns.
allStopWithLines = allPaypointDf.merge(allStopWithLinesOnly, how='left', on='IDSTOP')
allStopWithLines.head()


### Near stops data frame

This data frame is generated with the nearStop.py file. It contains nearStops column that store a list of stops which are located in a 650 meter radius circle of the stop in the first column.

Processing:
*   The format of the created csv, as well as the type of its columns, is changed for better presentation and application.

In [None]:
# The 'stop' column is stored as text wrapped in square brackets (e.g. "[12]");
# remove the brackets and convert the remaining text to float.
nearStopDf = pd.read_csv('near.csv').assign(
    stop=lambda frame: frame['stop'].str.strip("[]").astype(float)
)
nearStopDf.head()

A function called distance_calculate is created to calculate the distance between two points by using the latitude and longitude pairs of these locations. The result returns in meters.

In [None]:
def distance_calculate(lat1_input,lat2_input,lon1_input,lon2_input):
  """Haversine distance in meters between two points given in degrees.

  Mind the argument order: both latitudes come first, then both longitudes.
  """
  EARTH_RADIUS_M = 6371000
  phi1, phi2 = np.radians(lat1_input), np.radians(lat2_input)
  lam1, lam2 = np.radians(lon1_input), np.radians(lon2_input)

  half_dphi = (phi2 - phi1) / 2
  half_dlam = (lam2 - lam1) / 2
  haversine = np.sin(half_dphi)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam)**2
  # Central angle (radians) scaled by the sphere radius.
  return 2 * np.arcsin(np.sqrt(haversine)) * EARTH_RADIUS_M

Another function called isin_list is created to check if two lists have any elements in common. Returns a boolean.

In [None]:
def isin_list(list1, list2):
  """Return True when list1 and list2 have at least one element in common.

  list1 may be any iterable (list, tuple, set, generator, ...); list2 only
  needs to support the `in` operator. An empty list1 yields False.
  The scan stops at the first shared element.
  """
  return any(item in list2 for item in list1)

# Transfer Types

### Train-to-train transfer detection algorithm
The transferRail() function is created to detect transfers between two consecutive transactions.


Process:


*   The given record is saved into a dataframe and the 'date' column type set as datetime. 
*   actualTimeDiff: The time difference between these given 2 consecutive rides.
*   currentLineList: The list of the lines that pass through the first boarding stop.
*   nextLineList: The list of the lines that pass through the second boarding stop.
*   dist: The distance between 2 boarding locations.

Cases:

It is checked if currentLineList and nextLineList have a common element.
>If yes, maxTime is calculated by the given equation. 
>>Transfer detected if actualTimeDiff < maxTime. 

>If no, nearStop list is checked for the second boarding station to find candidate transfer stations.
>>If a candidate station is found, the maxTime variable is calculated for it. 
>>> Transfer detected if actualTimeDiff < maxTime. 

>> If no candidate station is found, the same operator transfer is considered and maxTime is calculated based on that. 
>>> Transfer detected if actualTimeDiff < maxTime.








In [None]:
# 1. Rail-Rail
def transferRail(cardID, usersDf, record):
  """Decide whether two consecutive rail boardings of one card form a transfer.

  Args:
    cardID: Not used by this function.
    usersDf: Not used by this function.
    record: Two-row DataFrame with the first and the second boarding. Cells
      are read by position; presumably 1 = date, 3 = station name,
      4 = latitude, 5 = longitude, 8 = IDSTOP (TODO confirm against the
      column order of the input files).

  Returns:
    (transfer, dest_stop). transfer is True when the time between the two
    boardings (minutes) is below the computed limit. dest_stop is the IDSTOP
    of the last examined near station that shares a line with the second
    boarding station, or -2 when no such station was examined.

  Reads the notebook-level frames linesRailDf, nearStopDf, allUniqueStopDf and
  allStopWithLinesOnly. Raises IndexError when a station name or stop id is
  missing from those frames.
  """
  transfer = False
  dest_stop = -2
  maxTime = 0
  # Work on a copy so that the caller's frame is never modified.
  recordDf = record.copy()
  recordDf["date"] = pd.to_datetime(recordDf["date"])

  # total_seconds() keeps the day component; .seconds would wrap around for
  # gaps of 24 hours or more and make them look short.
  actualTimeDiff = (recordDf.iloc[1,1] - recordDf.iloc[0,1]).total_seconds()/60
  currentLineList = linesRailDf[linesRailDf.DENOMINAPARADA == recordDf.iloc[0,3]].iat[0,1]
  nextLineList  = linesRailDf[linesRailDf.DENOMINAPARADA == recordDf.iloc[1,3]].iat[0,1]

  lat1 = recordDf.iloc[0,4]
  lon1 = recordDf.iloc[0,5]
  lat2 = recordDf.iloc[1,4]
  lon2 = recordDf.iloc[1,5]
  dist = distance_calculate(lat1,lat2,lon1,lon2)

  # NOTE(review): 500 and 50 look like ride / walking speeds in metres per
  # minute and 5, 10, 20 like fixed allowances in minutes — confirm.
  if isin_list(currentLineList, nextLineList):
    # Both stations lie on a common line.
    maxTime = dist/500 + 5 + 10
    if actualTimeDiff < maxTime:
      transfer = True
    return transfer, dest_stop

  # No common line: look at the stops near the second boarding station.
  bufferZoneList = nearStopDf[nearStopDf['stop'] == recordDf.iloc[1,8]].iat[0,1]
  bufferZoneList = ast.literal_eval(bufferZoneList)

  for station in bufferZoneList:
    stationRows = allUniqueStopDf[allUniqueStopDf['IDSTOP'] == station]
    if stationRows.iat[0,4] != 0:
      continue  # column 4 is read as the transport type; only rail (0) is considered

    currentCandLineDf = allStopWithLinesOnly[allStopWithLinesOnly.IDSTOP == station]
    if len(currentCandLineDf) == 0:
      continue  # no line list known for this candidate

    currentCandList = currentCandLineDf.iat[0,1]
    if isin_list(currentCandList, nextLineList):
      # NOTE(review): the candidate is matched against the lines of the
      # second boarding station — confirm this is the intended list.
      dest_stop = station
      cand_lat2 = stationRows.iat[0,1]
      cand_lon2 = stationRows.iat[0,2]
      can_dist = distance_calculate(lat1, cand_lat2, lon1, cand_lon2)
      walking_dist = distance_calculate(cand_lat2,lat2,cand_lon2,lon2)
      maxTime = (can_dist/500) + 5 + (walking_dist/50) + 10
    else:
      maxTime = dist/500 + 5 + 20

    if actualTimeDiff < maxTime:
      transfer = True
      break

  return transfer, dest_stop


### Bus-to-bus transfer detection algorithm
The transferBus() function is created to detect transfers between two consecutive transactions.


Process:


*   The given record is saved into a dataframe and the 'date' column type set as datetime. 
*   actualTimeDiff: The time difference between these given 2 consecutive rides.
*   currentLine: The line of the first bus.

Cases:


>nearStop list is checked for the second boarding stop to find candidate transfer stop.
>>If a candidate stop is found, the maxTime variable is calculated for it. 
>>> Transfer detected if actualTimeDiff < maxTime. 

>> If no candidate stop is found, there is no transfer.


In [None]:
# 2. Bus-Bus
def transferBus(cardID, usersDf, record):
  """Decide whether two consecutive bus boardings of one card form a transfer.

  Args:
    cardID: Not used by this function.
    usersDf: Not used by this function.
    record: Two-row DataFrame with the first and the second boarding. Cells
      are read by position; presumably 1 = date, 4 = latitude,
      5 = longitude, 7 = bus line, 8 = IDSTOP (TODO confirm against the
      column order of the input files).

  Returns:
    (transfer, dest_stop). Boardings on the same line are never a transfer.
    Otherwise transfer is True when the time between the boardings (minutes)
    is below the limit computed for a bus stop near the second boarding that
    is served by the first line. dest_stop is the IDSTOP of the last such
    stop examined, or -2 when there is none.

  Reads the notebook-level frames nearStopDf, allUniqueStopDf and
  allStopWithLinesOnly.
  """
  transfer = False
  dest_stop = -2
  # Work on a copy so that the caller's frame is never modified.
  recordDf = record.copy()
  recordDf["date"] = pd.to_datetime(recordDf["date"])

  currentLine = recordDf.iloc[0,7]
  if currentLine == recordDf.iloc[1,7]:
    return transfer, dest_stop  # same line twice: not a transfer

  # total_seconds() keeps the day component; .seconds would wrap around for
  # gaps of 24 hours or more and make them look short.
  actualTimeDiff = (recordDf.iloc[1,1] - recordDf.iloc[0,1]).total_seconds()/60
  lat1 = recordDf.iloc[0,4]
  lon1 = recordDf.iloc[0,5]
  lat2 = recordDf.iloc[1,4]
  lon2 = recordDf.iloc[1,5]

  bufferZoneList = nearStopDf[nearStopDf['stop'] == recordDf.iloc[1,8]].iat[0,1]
  bufferZoneList = ast.literal_eval(bufferZoneList)

  for stop in bufferZoneList:
    stopRows = allUniqueStopDf[allUniqueStopDf['IDSTOP'] == stop]
    if stopRows.iat[0,4] != 1:
      continue  # column 4 is read as the transport type; only bus (1) is considered

    candStopBusLineDf = allStopWithLinesOnly[allStopWithLinesOnly.IDSTOP == stop]
    if len(candStopBusLineDf) == 0:
      continue  # no line list known for this candidate

    candStopBusLineList = candStopBusLineDf.iat[0,1]
    if currentLine not in candStopBusLineList:
      continue  # the first bus does not serve this stop

    dest_stop = stop
    cand_lat2 = stopRows.iat[0,1]
    cand_lon2 = stopRows.iat[0,2]
    can_dist = distance_calculate(lat1, cand_lat2, lon1, cand_lon2)
    walking_dist = distance_calculate(cand_lat2,lat2,cand_lon2,lon2)
    # NOTE(review): 250 and 50 look like ride / walking speeds in metres per
    # minute and 10 like a fixed allowance in minutes — confirm.
    maxTime = (can_dist/250) + 10 + (walking_dist/50)

    if actualTimeDiff < maxTime:
      transfer = True
      break

  return transfer, dest_stop

### Train-to-bus transfer detection algorithm
The transferRailBus() function is created to detect transfers between two consecutive transactions.


Process:


*   The given record is saved into a dataframe and the 'date' column type set as datetime. 
*   actualTimeDiff: The time difference between these given 2 consecutive rides.
*   currentLineList: List of the lines that pass through the first boarding station.

Cases:

>nearStop list is checked for the second boarding stop to find candidate station could be arrived by using the first train.
>>If a candidate station is found, the maxTime variable is calculated for it. 
>>> Transfer detected if actualTimeDiff < maxTime. 

>> If no candidate station is found, there is no transfer.

In [None]:
def transferRailBus(cardID, usersDf, record):
  """Decide whether a rail boarding followed by a bus boarding is a transfer.

  Args:
    cardID: Not used by this function.
    usersDf: Not used by this function.
    record: Two-row DataFrame with the rail boarding (row 0) and the bus
      boarding (row 1). Cells are read by position; presumably 1 = date,
      3 = station name, 4 = latitude, 5 = longitude, 8 = IDSTOP (TODO
      confirm against the column order of the input files).

  Returns:
    (transfer, dest_stop). transfer is True when the time between the
    boardings (minutes) is below the limit computed for a rail station near
    the bus boarding stop that shares a line with the first station.
    dest_stop is the IDSTOP of the last such station examined, or -2.

  Reads the notebook-level frames linesRailDf, nearStopDf, allUniqueStopDf and
  allStopWithLinesOnly.
  """
  transfer = False
  dest_stop = -2
  # Work on a copy so that the caller's frame is never modified.
  recordDf = record.copy()
  recordDf["date"] = pd.to_datetime(recordDf["date"])

  # total_seconds() keeps the day component; .seconds would wrap around for
  # gaps of 24 hours or more and make them look short.
  actualTimeDiff = (recordDf.iloc[1,1] - recordDf.iloc[0,1]).total_seconds()/60
  currentLineList = linesRailDf[linesRailDf.DENOMINAPARADA == recordDf.iloc[0,3]].iat[0,1]
  lat1 = recordDf.iloc[0,4]
  lon1 = recordDf.iloc[0,5]
  # Row 1 is the second (bus) boarding. The buffer zone must be centred on it,
  # as in the other transfer functions; this was previously read from row 0,
  # i.e. from the first boarding station.
  nextBoardingStopID = recordDf.iat[1,8]
  lat2 = recordDf.iat[1, 4]
  lon2 = recordDf.iat[1, 5]

  bufferZoneList = nearStopDf[nearStopDf['stop'] == nextBoardingStopID].iat[0,1]
  bufferZoneList = ast.literal_eval(bufferZoneList)

  for station in bufferZoneList:
    stationRows = allUniqueStopDf[allUniqueStopDf['IDSTOP'] == station]
    if stationRows.iat[0,4] != 0:
      continue  # column 4 is read as the transport type; only rail (0) is considered

    candStationLineDf = allStopWithLinesOnly[allStopWithLinesOnly.IDSTOP == station]
    if len(candStationLineDf) == 0:
      continue  # no line list known for this candidate

    candStationLineList = candStationLineDf.iat[0,1]
    if not isin_list(currentLineList, candStationLineList):
      continue  # not reachable with a line of the first station

    dest_stop = station
    cand_lat2 = stationRows.iat[0,1]
    cand_lon2 = stationRows.iat[0,2]
    can_dist = distance_calculate(lat1, cand_lat2, lon1, cand_lon2)
    walking_dist = distance_calculate(cand_lat2,lat2,cand_lon2,lon2)
    # NOTE(review): 500 and 50 look like ride / walking speeds in metres per
    # minute and 10, 10, 5 like fixed allowances in minutes — confirm.
    maxTime =  (can_dist/500) + (walking_dist/50) + 10 + 10 + 5

    if actualTimeDiff < maxTime:
      transfer = True
      break

  return transfer, dest_stop

### Bus-to-train transfer detection algorithm
The transferBusRail() function is created to detect transfers between two consecutive transactions.


Process:


*   The given record is saved into a dataframe and the 'date' column type set as datetime. 
*   actualTimeDiff: The time difference between these given 2 consecutive rides.
*   currentLine: The line of the first boarding.

Cases:

>nearStop list is checked for the second boarding station to find candidate stop could be arrived by using the first bus.
>>If a candidate stop is found, the maxTime variable is calculated for it. 
>>> Transfer detected if actualTimeDiff < maxTime. 

>> If no candidate stop is found, there is no transfer.

In [None]:
def transferBusRail(cardID, usersDf, record):
  """Decide whether a bus boarding followed by a rail boarding is a transfer.

  Args:
    cardID: Not used by this function.
    usersDf: Not used by this function.
    record: Two-row DataFrame with the bus boarding (row 0) and the rail
      boarding (row 1). Cells are read by position; presumably 1 = date,
      4 = latitude, 5 = longitude, 7 = bus line, 8 = IDSTOP (TODO confirm
      against the column order of the input files).

  Returns:
    (transfer, dest_stop). transfer is True when the time between the
    boardings (minutes) is below the limit computed for a bus stop near the
    rail boarding station that is served by the first bus line. dest_stop is
    the IDSTOP of the last such stop examined, or -2 when there is none.

  Reads the notebook-level frames nearStopDf, allUniqueStopDf and
  allStopWithLinesOnly.
  """
  transfer = False
  dest_stop = -2
  # Work on a copy so that the caller's frame is never modified.
  recordDf = record.copy()
  recordDf["date"] = pd.to_datetime(recordDf["date"])

  # total_seconds() keeps the day component; .seconds would wrap around for
  # gaps of 24 hours or more and make them look short.
  actualTimeDiff = (recordDf.iloc[1,1] - recordDf.iloc[0,1]).total_seconds()/60
  currentLine = recordDf.iloc[0,7]

  lat1 = recordDf.iloc[0,4]
  lon1 = recordDf.iloc[0,5]
  lat2 = recordDf.iloc[1,4]
  lon2 = recordDf.iloc[1,5]

  bufferZoneList = nearStopDf[nearStopDf['stop'] == recordDf.iloc[1,8]].iat[0,1]
  bufferZoneList = ast.literal_eval(bufferZoneList)

  for stop in bufferZoneList:
    stopRows = allUniqueStopDf[allUniqueStopDf['IDSTOP'] == stop]
    if stopRows.iat[0,4] != 1:
      continue  # column 4 is read as the transport type; only bus (1) is considered

    candStopBusLineDf = allStopWithLinesOnly[allStopWithLinesOnly.IDSTOP == stop]
    if len(candStopBusLineDf) == 0:
      continue  # no line list known for this candidate

    candStopBusLineList = candStopBusLineDf.iat[0,1]
    if currentLine not in candStopBusLineList:
      continue  # the first bus does not serve this stop

    dest_stop = stop
    # Coordinates come from allUniqueStopDf, the same source the other
    # transfer functions use (one row per IDSTOP).
    cand_lat2 = stopRows.iat[0,1]
    cand_lon2 = stopRows.iat[0,2]
    can_dist = distance_calculate(lat1, cand_lat2, lon1, cand_lon2)
    walking_dist = distance_calculate(cand_lat2,lat2,cand_lon2,lon2)

    # NOTE(review): unlike the other transfer functions, no fixed waiting
    # allowance is added to this limit — confirm that this is intended.
    maxTime = (can_dist/250) + (walking_dist/50)
    if actualTimeDiff < maxTime:
      transfer = True
      break

  return transfer, dest_stop

# Destination estimation

Process:

*   Each consecutive transaction record in the usersDf is checked:
> If they are belonging to the same smart card.
>> If no (which means the record of one user read full and the for loop passed to the next user)
>>>Depending on the first and the last boarding transportation mode, the final destination of the day for the passenger is searched. Similar to the transfer detection algorithms, the zone around the first boarding location is scanned to find a stop/station that can be reached with the last boarding vehicle. 

>> If yes (which means the for loop is still in the same user as the previous iteration)
>>> Depending on the first and the last boarding transportation mode, 
>>>>*   The transfer detection algorithm runs. The boolean value for the transfer is saved to the transfer column.
>>>>*   The destination is obtained from the transfer function without checking the time, only considering the distance.

In [None]:
# Positional columns of usersDf used below; presumably 0 = cardID, 6 = type
# (0 rail / 1 bus), 7 = IDLINEA, 8 = IDSTOP — TODO confirm against the column
# order of the input files.

def _nearStopServes(originStopID, mode, wantedLines):
  """Return True when a stop of the given mode near originStopID serves any of wantedLines."""
  nearList = ast.literal_eval(nearStopDf[nearStopDf['stop'] == originStopID].iat[0,1])
  for stop in nearList:
    stopRows = allStopWithLines[allStopWithLines.IDSTOP == stop]
    # Column 4 is read as the transport type, the last column as the line list.
    if stopRows.iat[0, 4] == mode and isin_list(wantedLines, stopRows.iat[0,-1]):
      return True
  return False


def _dayEndDestination(firstPos, lastPos):
  """Estimate the destination of a card's last trip of the day.

  firstPos and lastPos are the row positions in usersDf of the card's first
  and last boarding. Returns the IDSTOP of the first boarding when that stop
  (or a same-mode stop in its near-stop list) can be reached with the line(s)
  of the last boarding, otherwise -1.
  """
  firstMode = usersDf.iloc[firstPos, 6]
  lastMode = usersDf.iloc[lastPos, 6]
  firstTripStopID = usersDf.iloc[firstPos, 8]
  lastTripStopID = usersDf.iloc[lastPos, 8]
  reachable = False

  if firstMode == 0 and lastMode == 0:
    # Rail -> rail; nothing to estimate when both boardings are at the same stop.
    if firstTripStopID != lastTripStopID:
      firstLineList = allStopWithLines[allStopWithLines.IDSTOP == firstTripStopID].iat[0,-1]
      lastLineList = allStopWithLines[allStopWithLines.IDSTOP == lastTripStopID].iat[0,-1]
      reachable = isin_list(firstLineList, lastLineList) or _nearStopServes(firstTripStopID, 0, lastLineList)

  elif firstMode == 1 and lastMode == 1:
    # Bus -> bus.
    if firstTripStopID != lastTripStopID:
      firstLine = usersDf.iloc[firstPos, 7]
      lastLine = usersDf.iloc[lastPos, 7]
      reachable = firstLine == lastLine or _nearStopServes(firstTripStopID, 1, [lastLine])

  elif firstMode == 1 and lastMode == 0:
    # First boarding by bus, last by rail: look for a rail station near the first stop.
    lastLineList = allStopWithLines[allStopWithLines.IDSTOP == lastTripStopID].iat[0,-1]
    reachable = _nearStopServes(firstTripStopID, 0, lastLineList)

  elif firstMode == 0 and lastMode == 1:
    # First boarding by rail, last by bus: look for a bus stop near the first stop.
    # The last bus line is a single value, so it is wrapped in a list; passing
    # the bare value to isin_list (as before) fails on len().
    reachable = _nearStopServes(firstTripStopID, 1, [usersDf.iloc[lastPos, 7]])

  return firstTripStopID if reachable else -1


cardID_first = usersDf.iloc[0, 0]
transferList = []
destinationList = []

# Number of rows already seen for the current card after its first one, so
# (transaction - count) is the position of the card's first boarding.
count = 0
for transaction in range(len(usersDf)-1):
  destination = -1
  transfer = False
  cardID_second = usersDf.iloc[transaction+1, 0]

  if(cardID_second != cardID_first):
    # The next row belongs to another card: this row is the card's last boarding.
    destination = _dayEndDestination(transaction-count, transaction)
    cardID_first = cardID_second
    count = 0
  else:
    # Same card: test whether this boarding and the next one form a transfer.
    count += 1
    record = usersDf.iloc[[transaction, transaction+1]]
    currentMode = usersDf.iloc[transaction, 6]
    nextMode = usersDf.iloc[transaction+1, 6]
    if currentMode == 0 and nextMode == 0:
      transfer, destination = transferRail(cardID_second, usersDf, record)
    elif currentMode == 1 and nextMode == 1:
      transfer, destination = transferBus(cardID_second, usersDf, record)
    elif currentMode == 0 and nextMode == 1:
      transfer, destination = transferRailBus(cardID_second, usersDf, record)
    elif currentMode == 1 and nextMode == 0:
      transfer, destination = transferBusRail(cardID_second, usersDf, record)

  transferList.append(transfer)
  destinationList.append(destination)

# The last row has no successor, so the loop never reaches it. It is the last
# boarding of the last card and gets the same day-end estimate as every other
# card (it was previously hard-coded to -2).
lastPos = len(usersDf) - 1
transferList.append(False)
destinationList.append(_dayEndDestination(lastPos - count, lastPos))

# Drop the result columns left by an earlier run of this cell so that insert()
# does not fail on re-execution; the first nine columns are unaffected.
usersDf = usersDf.drop(columns=['transfer', 'destination'], errors='ignore')
usersDf.insert(9, 'transfer', transferList)
usersDf.insert(10, 'destination', destinationList)



OD Matrix created for the specified time interval is saved as csv to use later without running the previous steps.

In [None]:
# google.colab exists only inside Google Colab. Tolerate its absence so the
# export also runs in a local Jupyter kernel; `files` stays defined either way.
try:
  from google.colab import files
except ImportError:
  files = None

# utf-8-sig writes a byte-order mark at the start of the file.
usersDf.to_csv('ODMatrix_Feb1_04.csv', encoding = 'utf-8-sig')
