## Feature Engineering
Generate the distance between consecutive stops on each route, which is used as a feature in the modelling.

In [1]:
# Imports and general data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import re

from sklearn import preprocessing
from scipy.stats import chi2_contingency
from matplotlib.backends.backend_pdf import PdfPages


# hide ipykernel warnings 
import warnings
warnings.filterwarnings('ignore')

#Plotting inline
%matplotlib inline

In [2]:
# Import necessary data
# Every input file lives under one project data folder. Keeping that folder in a
# single constant means the notebook can be pointed at another machine by editing
# one line instead of six.
DATA_DIR = r'C:\Users\turlo\OneDrive\Documents\MSC_Computer_Science\Summer_Project\Data'
BUS_2018_DIR = DATA_DIR + r'\DublinBus_2018'

# The trimmed leave-time and trip extracts are read with ';' as the separator;
# the remaining files use read_csv's default separator.
leavetimes_df = pd.read_csv(DATA_DIR + r'\trim_leavetimes.csv', sep=';')
stops_df = pd.read_csv(BUS_2018_DIR + r'\stops_trim.csv')
trips_df = pd.read_csv(DATA_DIR + r'\trim_trips.csv', sep=';')
distance_df = pd.read_csv(DATA_DIR + r'\distance_all.csv')
weather_df = pd.read_csv(BUS_2018_DIR + r'\Weather_2018.csv')
historic_df = pd.read_csv(DATA_DIR + r'\stop_times_hist.txt')

### Standardise Column Names
The data sources use two different names for the same fields; this section renames them so that they are consistent and clear.

In [3]:
# DIRECTION is the last character of the whitespace-trimmed trip_id.
historic_df["DIRECTION"] = historic_df["trip_id"].str.strip().str.get(-1)

In [4]:
def _route_tokens(trip_id):
    """Return the hyphen-free runs captured by non-overlapping '-...-' matches in str(trip_id)."""
    return re.findall(r'-([^-]*)-', str(trip_id))


# Each row gets a list of matches; a later cell explodes the lists into rows.
historic_df["ROUTE"] = historic_df["trip_id"].apply(_route_tokens)

In [5]:
# Turn each list stored in ROUTE into one row per list element.
historic_df = historic_df.explode(column='ROUTE')

In [6]:
# Keep only the columns needed for distances, duplicate the stop sequence as
# PROGRNUMBER (appended last), then switch to the upper-case column names.
historic_df = (
    historic_df[["stop_id", "stop_sequence", "shape_dist_traveled", "ROUTE", "DIRECTION"]]
    .assign(PROGRNUMBER=lambda frame: frame["stop_sequence"])
    .rename(
        columns={
            "stop_id": "STOPID",
            "stop_sequence": "STOP_SEQUENCE",
            "shape_dist_traveled": "DISTANCE_TRAVELLED",
        }
    )
)

In [7]:
# Stack the historic stop distances underneath the existing distance table.
# Row labels from both inputs are kept as they are (no re-indexing).
frames = [distance_df, historic_df]
distance_df = pd.concat(frames, axis=0, ignore_index=False)

### Clean Data
Drop unnecessary data and change format/data types to match the main analysis script.

In [8]:
# Keep the first occurrence of every fully identical row.
distance_df = distance_df[~distance_df.duplicated(keep="first")]

In [None]:
# Store the stop-point key as float.
# NOTE(review): presumably this matches the dtype of STOPPOINTID on the other side of the later merge — confirm.
stops_df['STOPPOINTID'] = stops_df['STOPPOINTID'].astype(float)

In [9]:
# Restrict the leave-time records to the three service days listed here.
days = ['19-JAN-18 00:00:00', '20-JAN-18 00:00:00', '21-JAN-18 00:00:00']
# .copy() detaches the filtered rows from the full frame. A later cell overwrites
# columns on leavetimes_df; without the copy that assignment targets a derived
# slice and raises pandas' SettingWithCopyWarning, which this notebook silences.
leavetimes_df = leavetimes_df[leavetimes_df.DAYOFSERVICE.isin(days)].copy()

In [10]:
# Adjust times from seconds to hh:mm:ss and get the expected and actual trip time
def _format_seconds(seconds, fmt="%H:%M:%S"):
    """Read `seconds` as seconds since the epoch and render each value with strftime `fmt`."""
    # NOTE(review): the default format keeps only the time-of-day part, so a value
    # of 86400 s or more would wrap past midnight — confirm whether that occurs.
    return pd.to_datetime(seconds, unit='s').dt.strftime(fmt)


# Trip durations must be derived while the arrival/departure columns are still numeric.
trips_df['PLANNEDTIME'] = trips_df['PLANNEDTIME_ARR'] - trips_df['PLANNEDTIME_DEP']
trips_df['ACTUALTIME'] = trips_df['ACTUALTIME_ARR'] - trips_df['ACTUALTIME_DEP']
# Keep the numeric duration before ACTUALTIME is turned into text below.
trips_df['ACTUALTIME_SECONDS'] = trips_df['ACTUALTIME']

trips_df['DATE'] = pd.to_datetime(trips_df['DAYOFSERVICE']).dt.strftime('%Y-%m-%d')
# HOUR also reads the numeric PLANNEDTIME_DEP, so it is computed before the loop overwrites that column.
trips_df['HOUR'] = _format_seconds(trips_df['PLANNEDTIME_DEP'], "%H:00:00")

# The same four timestamp columns exist in both tables and get the same treatment.
for frame in (trips_df, leavetimes_df):
    for column in ('PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR', 'ACTUALTIME_DEP'):
        frame[column] = _format_seconds(frame[column])

# Durations are rendered in the same hh:mm:ss form.
for column in ('PLANNEDTIME', 'ACTUALTIME'):
    trips_df[column] = _format_seconds(trips_df[column])

# Weather rows: split the 'dt' seconds value into a date string and a time string.
weather_df['DATE'] = _format_seconds(weather_df['dt'], "%Y-%m-%d")
weather_df['HOUR'] = _format_seconds(weather_df['dt'])


### Merge Data

In [11]:
# Combine data tables and keep only the relevant headings
trip_columns = ['DATE', 'TRIPID', 'LINEID', 'ROUTEID', 'DIRECTION', 'PLANNEDTIME', 'ACTUALTIME',
                'ACTUALTIME_SECONDS', 'HOUR']
leavetime_columns = ['PROGRNUMBER', 'STOPPOINTID', 'PLANNEDTIME_ARR', 'PLANNEDTIME_DEP', 'ACTUALTIME_ARR',
                     'ACTUALTIME_DEP', 'TRIPID']

# Left join: every trip row is kept, including trips with no matching leave-time rows.
result = trips_df[trip_columns].merge(leavetimes_df[leavetime_columns], how='left', on='TRIPID')


In [13]:
# Attach stop id, name and coordinates to each row through the stop-point id.
stop_columns = ['STOPID', 'STOPNAME', 'STOPLAT', 'STOPLON', 'STOPPOINTID']
result = result.merge(stops_df[stop_columns], how='left', on='STOPPOINTID')

In [14]:
# Look up the cumulative distance for each (stop, position-in-route) pair.
# NOTE(review): the join keys do not include route or direction, so a pair that
# appears in distance_df for more than one route will match several rows — confirm
# this is intended.
distance_columns = ['STOPID', 'DISTANCE_TRAVELLED', 'PROGRNUMBER']
stop_detail = result.merge(distance_df[distance_columns], how='left', on=['STOPID', 'PROGRNUMBER'])

In [15]:
# Keep only the route/stop descriptors and the distance; trip-level columns are dropped.
detail_columns = ["LINEID", "ROUTEID", 'DIRECTION', 'PROGRNUMBER', 'STOPID',
                  'STOPNAME', 'STOPLAT', 'STOPLON', 'DISTANCE_TRAVELLED']
stop_detail = stop_detail.loc[:, detail_columns]

In [16]:
# One row per route/stop combination. DISTANCE_TRAVELLED is deliberately left out
# of the key, so when several distances exist the first row encountered wins.
route_stop_key = ["LINEID", "ROUTEID", 'DIRECTION', 'PROGRNUMBER', 'STOPID',
                  'STOPNAME', 'STOPLAT', 'STOPLON']
stop_detail = stop_detail.drop_duplicates(subset=route_stop_key, keep="first")

### Order Results and Clean
If the stop is the first one on a route, its distance is set to zero. Rows with any other unknown distance are dropped; these dropped rows mainly belong to some less-used routes.

In [17]:
# Order rows by route, then by position along the route. The shift-based
# "previous stop" columns built in later cells depend on this row order.
stop_detail = stop_detail.sort_values(by=['ROUTEID', 'PROGRNUMBER'], ascending=True)

In [18]:
# Rows with PROGRNUMBER 1 get a distance of zero; every other row keeps its value.
is_first_stop = stop_detail['PROGRNUMBER'] == 1.0
stop_detail['DISTANCE_TRAVELLED'] = np.where(is_first_stop, 0.0, stop_detail['DISTANCE_TRAVELLED'])

In [19]:
# Discard rows that still have no distance.
stop_detail = stop_detail.dropna(axis="index", how="any", subset=["DISTANCE_TRAVELLED"])

In [22]:
# Go through text so that any comma in a distance value can be removed before the float cast.
distance_text = stop_detail['DISTANCE_TRAVELLED'].astype(str)
stop_detail['DISTANCE_TRAVELLED'] = distance_text.str.replace(",", "", regex=False).astype(float)

### Generate New Columns

In [21]:
# ROUTEID of the row directly above; used later to detect where a new route starts.
stop_detail['previous_bus_id'] = stop_detail['ROUTEID'].shift(1)

In [23]:
# Cumulative distance of the row directly above.
stop_detail['previous_stop_dist'] = stop_detail['DISTANCE_TRAVELLED'].shift(1)

In [24]:
# STOPID of the row directly above.
stop_detail['PREV_STOPID'] = stop_detail['STOPID'].shift(1)

In [25]:
# Rows whose ROUTEID differs from the row above have every column replaced by NaN;
# the row itself stays in the frame at this point.
same_route_as_previous = stop_detail['ROUTEID'] == stop_detail['previous_bus_id']
stop_detail = stop_detail.where(same_route_as_previous)

In [26]:
# Distance between a stop and the previous one, left as NaN where the row above is on another route.
on_same_route = stop_detail['ROUTEID'] == stop_detail['previous_bus_id']
leg_length = stop_detail['DISTANCE_TRAVELLED'] - stop_detail['previous_stop_dist']
stop_detail['DIST_BETWEEN'] = leg_length.where(on_same_route)

In [27]:
# Remove rows that have no STOPID.
stop_detail = stop_detail.dropna(subset=["STOPID"])

### Interesting Stats and Final Table
Investigation needed into negative distances, very small distances and very large distances.

Results generated here populate the dublinbus.tfi_station_distance SQL table.

In [37]:
# Get average
# Mean stop-to-stop distance per ROUTEID, returned as a two-column frame.
avg_dist = (
    stop_detail
    .groupby('ROUTEID')['DIST_BETWEEN']
    .mean()
    .rename('avg_DIST_BETWEEN')
    .reset_index()
)
avg_dist

Unnamed: 0,ROUTEID,avg_DIST_BETWEEN
0,102_10,432.018843
1,102_8,478.322581
2,102_9,455.205535
3,104_15,690.242996
4,104_16,372.314444
...,...,...
478,84_32,399.127660
479,9_5,316.800000
480,9_6,319.531250
481,9_7,350.705882


In [31]:
# Columns for the exported table: route identifiers, the stop pair and both distances.
final_columns = ['LINEID', 'ROUTEID', 'DIRECTION', 'PROGRNUMBER', 'STOPID', 'PREV_STOPID',
                 'DISTANCE_TRAVELLED', 'DIST_BETWEEN']
stop_final = stop_detail.loc[:, final_columns]

In [34]:
# Renumber rows from 0, then write the table to the working directory.
# The fresh index is written too, as the first (unnamed) CSV column.
stop_final = stop_final.reset_index(drop=True)
stop_final.to_csv(path_or_buf="stop_final.csv", index=True)