In [1]:
# Imports and general data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import re

from sklearn import preprocessing
from scipy.stats import chi2_contingency
from matplotlib.backends.backend_pdf import PdfPages


# hide ipykernel warnings 
import warnings
warnings.filterwarnings('ignore')

#Plotting inline
%matplotlib inline

### Import Data & Create Features

In [2]:
# Columns to load from each source file; these lists are passed as `usecols`
# when the CSVs are read.
trips_collist = [
    'DAYOFSERVICE',
    'LINEID',
    'ROUTEID',
    'DIRECTION',
    'PLANNEDTIME_DEP',
    'TRIPID',
]
leavetimes_collist = [
    'PROGRNUMBER',
    'STOPPOINTID',
    'DAYOFSERVICE',
    'ACTUALTIME_ARR',
    'TRIPID',
]
distance_collist = [
    'STOPPOINTID',
    'STOPID',
    'PROGRNUMBER',
    'DISTANCE_TRAVELLED',
]
weather_collist = [
    'temp',
    'weather_main',
    'dt',
]

In [3]:
# Load the four source tables, reading only the columns named in the
# corresponding *_collist. The two trimmed bus extracts are semicolon-delimited;
# the weather and distance files use the default comma delimiter.
leavetimes_df = pd.read_csv(
    'trim_leavetimes.csv',
    sep=';',
    usecols=leavetimes_collist,
)
trips_df = pd.read_csv(
    'trim_trips.csv',
    sep=';',
    usecols=trips_collist,
)
weather_df = pd.read_csv(
    'Weather_2018.csv',
    usecols=weather_collist,
)
# TODO: check whether distance_all.csv is taken from the older records that were
# used previously, and whether anything is missing from it.
distance_df = pd.read_csv(
    'distance_all.csv',
    usecols=distance_collist,
)

In [4]:
# Derive the keys used by the merges that follow:
#   DATE / HOUR - day and hour bucket (as strings) used to attach weather rows.
#   UNIQUEID    - "<yyyymmdd>_<TRIPID>", used to join trips to their arrival rows.
# Each timestamp column is parsed once and reused for every field derived from it.
trips_service_day = pd.to_datetime(trips_df['DAYOFSERVICE'])
trips_df['DATE'] = trips_service_day.dt.strftime('%Y-%m-%d')
# PLANNEDTIME_DEP is interpreted as a count of seconds; minutes and seconds are
# forced to "00:00" so the value is an hour bucket.
# NOTE(review): a value of 86400 or more would wrap to an early hour while DATE
# stays on the service day - confirm whether such departures occur in the data.
trips_df['HOUR'] = pd.to_datetime(trips_df['PLANNEDTIME_DEP'], unit='s').dt.strftime("%H:00:00")
trips_df['DUMMY_DATE'] = trips_service_day.dt.strftime('%Y%m%d')

leavetimes_df['DUMMY_DATE'] = pd.to_datetime(leavetimes_df['DAYOFSERVICE']).dt.strftime('%Y%m%d')
# Plain copy of the arrival time under the name used in the merged table.
leavetimes_df['ACTUALTIME_ARR_TRIP'] = leavetimes_df['ACTUALTIME_ARR']

trips_df['UNIQUEID'] = trips_df["DUMMY_DATE"].astype(str) + "_" + trips_df["TRIPID"].astype(str)
leavetimes_df['UNIQUEID'] = leavetimes_df["DUMMY_DATE"].astype(str) + "_" + leavetimes_df["TRIPID"].astype(str)

# Weather timestamps are epoch seconds. HOUR keeps the full %H:%M:%S, so it only
# matches the trips' hour bucket for rows that fall exactly on the hour
# (presumably all of them - verify against the weather file).
weather_timestamp = pd.to_datetime(weather_df['dt'], unit='s')
weather_df['DATE'] = weather_timestamp.dt.strftime("%Y-%m-%d")
weather_df['HOUR'] = weather_timestamp.dt.strftime("%H:%M:%S")

In [5]:
# Attach the recorded stop arrivals to their trip via UNIQUEID. The left join
# keeps trips without any matching arrival row (their stop columns are NaN).
trip_columns = ['DATE', 'LINEID', 'ROUTEID', 'DIRECTION', 'HOUR', 'UNIQUEID']
arrival_columns = ['PROGRNUMBER', 'STOPPOINTID', 'ACTUALTIME_ARR_TRIP', 'UNIQUEID']
result = trips_df[trip_columns].merge(
    leavetimes_df[arrival_columns],
    how='left',
    on='UNIQUEID',
)

In [6]:
# Add STOPID and DISTANCE_TRAVELLED by matching on (STOPPOINTID, PROGRNUMBER).
# NOTE(review): if distance_df holds several rows for the same key pair, this
# left join repeats the matching arrival rows - verify key uniqueness.
distance_columns = ['STOPPOINTID', 'STOPID', 'PROGRNUMBER', 'DISTANCE_TRAVELLED']
result = result.merge(
    distance_df[distance_columns],
    how='left',
    on=['STOPPOINTID', 'PROGRNUMBER'],
)

In [7]:
# Add temperature and weather category by matching the DATE and HOUR strings;
# rows with no matching weather record keep NaN in those columns.
weather_columns = ['temp', 'weather_main', 'HOUR', 'DATE']
result = result.merge(
    weather_df[weather_columns],
    how='left',
    on=['HOUR', 'DATE'],
)

In [8]:
# Convert DATE from string to datetime, then derive the English weekday name.
service_dates = pd.to_datetime(result['DATE'])
result['DATE'] = service_dates
result['WEEKDAY'] = service_dates.dt.day_name()

### Set Relevant Columns and Order

In [9]:
# Keep only the columns needed downstream, in a fixed order. reindex is used
# rather than plain selection, so a name missing from `result` would produce an
# all-NaN column instead of raising.
stop_detail_columns = [
    'WEEKDAY',
    'LINEID',
    'ROUTEID',
    'HOUR',
    'DIRECTION',
    'PROGRNUMBER',
    'ACTUALTIME_ARR_TRIP',
    'STOPID',
    'temp',
    'weather_main',
    'DISTANCE_TRAVELLED',
    'UNIQUEID',
]
stop_detail = result.reindex(columns=stop_detail_columns)

In [10]:
# Order rows by trip, then by PROGRNUMBER within each trip.
stop_detail = stop_detail.sort_values(['UNIQUEID', 'PROGRNUMBER'])

In [11]:
# Rows with PROGRNUMBER == 1 get a distance of 0.0; every other row keeps the
# value it already has.
is_first_stop = stop_detail['PROGRNUMBER'] == 1.0
stop_detail['DISTANCE_TRAVELLED'] = np.where(
    is_first_stop,
    0.0,
    stop_detail['DISTANCE_TRAVELLED'],
)

In [12]:
# Discard rows that still have no distance value.
has_distance = stop_detail["DISTANCE_TRAVELLED"].notna()
stop_detail = stop_detail[has_distance]

In [13]:
# Distances may carry commas (removed here as thousands separators); go through
# a string form, strip the commas, then convert to float.
distance_text = stop_detail['DISTANCE_TRAVELLED'].astype(str)
stop_detail['DISTANCE_TRAVELLED'] = distance_text.str.replace(",", "").astype(float)

### Generate New Mapping Data

In [14]:
# Keep only trips that have strictly more than 10 remaining rows.
rows_per_trip = stop_detail['UNIQUEID'].value_counts()
trip_row_count = stop_detail['UNIQUEID'].map(rows_per_trip)
stop_detail = stop_detail[trip_row_count > 10]

In [15]:
#stop_detail = stop_detail.head(100000) # Only for test data set

In [16]:
# Draw 10 rows (without replacement) from every trip, then flatten back to a
# plain 0..N-1 index so consecutive rows can be paired up afterwards.
#
# The draw is seeded so that re-running the notebook reproduces the same sample.
# One RandomState object is shared by all groups: its state advances from trip
# to trip, so each trip gets its own draw. Passing a plain integer instead would
# reseed every group identically and pick the same row positions in every trip
# of equal length.
SAMPLE_SEED = 42
sample_rng = np.random.RandomState(SAMPLE_SEED)
dummy_journeys = stop_detail.groupby('UNIQUEID').apply(pd.DataFrame.sample,
                                                       n=10,
                                                       random_state=sample_rng).reset_index(drop=True)

In [17]:
# Number consecutive row pairs 1, 2, 3, ...: positions 0-1 -> 1, 2-3 -> 2, etc.
# Every trip contributes 10 sampled rows (an even count), so a pair never
# straddles two trips.
pair_number = dummy_journeys.index // 2 + 1
dummy_journeys['trip_index'] = pair_number.astype(int)

In [18]:
# Within each pair, put the row with the lower PROGRNUMBER first.
pair_sort_keys = ['UNIQUEID', 'trip_index', 'PROGRNUMBER']
dummy_journeys = dummy_journeys.sort_values(pair_sort_keys)

In [19]:
# Make the identifier unique per pair by appending "_<trip_index>".
pair_suffix = dummy_journeys["trip_index"].astype(str)
dummy_journeys["UNIQUEID"] = dummy_journeys["UNIQUEID"] + "_" + pair_suffix

### Generate New Feature Columns

In [20]:
# Identifier of the preceding row in the current order (NaN for the first row).
dummy_journeys['PREVIOUS_ID'] = dummy_journeys['UNIQUEID'].shift(1)

In [21]:
# DISTANCE_TRAVELLED of the preceding row.
dummy_journeys['previous_stop_dist'] = dummy_journeys['DISTANCE_TRAVELLED'].shift(1)

In [22]:
# STOPID of the preceding row.
dummy_journeys['PREV_STOPID'] = dummy_journeys['STOPID'].shift(1)

In [23]:
# PROGRNUMBER of the preceding row.
dummy_journeys['PREV_PROGRNUMBER'] = dummy_journeys['PROGRNUMBER'].shift(1)

In [24]:
# ACTUALTIME_ARR_TRIP of the preceding row.
dummy_journeys['PREV_TIME'] = dummy_journeys['ACTUALTIME_ARR_TRIP'].shift(1)

In [25]:
# Blank out (set every column to NaN) each row whose preceding row carries a
# different identifier, i.e. the first row of every pair. The frame keeps its
# shape; the blanked rows are only NaN-filled here, not removed.
follows_same_pair = dummy_journeys['UNIQUEID'] == dummy_journeys['PREVIOUS_ID']
dummy_journeys = dummy_journeys.where(follows_same_pair)

In [26]:
# Distance covered between the two rows of a pair; NaN wherever the preceding
# row does not share this row's identifier.
follows_same_pair = dummy_journeys['UNIQUEID'] == dummy_journeys['PREVIOUS_ID']
distance_gap = dummy_journeys['DISTANCE_TRAVELLED'] - dummy_journeys['previous_stop_dist']
dummy_journeys['DIST_BETWEEN'] = distance_gap.where(follows_same_pair)

In [27]:
# Difference in PROGRNUMBER between the two rows of a pair; NaN wherever the
# preceding row does not share this row's identifier.
follows_same_pair = dummy_journeys['UNIQUEID'] == dummy_journeys['PREVIOUS_ID']
progrnumber_gap = dummy_journeys['PROGRNUMBER'] - dummy_journeys['PREV_PROGRNUMBER']
dummy_journeys['STOPS_BETWEEN'] = progrnumber_gap.where(follows_same_pair)

In [28]:
# Difference in ACTUALTIME_ARR_TRIP between the two rows of a pair; NaN wherever
# the preceding row does not share this row's identifier.
follows_same_pair = dummy_journeys['UNIQUEID'] == dummy_journeys['PREVIOUS_ID']
arrival_gap = dummy_journeys['ACTUALTIME_ARR_TRIP'] - dummy_journeys['PREV_TIME']
dummy_journeys['JOURNEY_TIME'] = arrival_gap.where(follows_same_pair)

In [29]:
# Remove every row without a STOPID, which includes the rows blanked out above.
dummy_journeys = dummy_journeys.dropna(subset=["STOPID"])

In [30]:
# Reduce the frame to the columns that are exported, in export order.
output_columns = [
    'WEEKDAY',
    'LINEID',
    'HOUR',
    'ROUTEID',
    'DIRECTION',
    'DIST_BETWEEN',
    'STOPS_BETWEEN',
    'temp',
    'weather_main',
    'JOURNEY_TIME',
]
dummy_journeys = dummy_journeys[output_columns]

In [31]:
# Display the final table as the cell output, as a visual check before export.
dummy_journeys

Unnamed: 0,WEEKDAY,LINEID,HOUR,ROUTEID,DIRECTION,DIST_BETWEEN,STOPS_BETWEEN,temp,weather_main,JOURNEY_TIME
1,Monday,37,06:00:00,37_14,1.0,767.0,2.0,9.00,Drizzle,82.0
3,Monday,37,06:00:00,37_14,1.0,5336.0,14.0,9.00,Drizzle,1230.0
5,Monday,37,06:00:00,37_14,1.0,2664.0,8.0,9.00,Drizzle,494.0
6,Monday,37,06:00:00,37_14,1.0,9241.0,28.0,9.00,Drizzle,1101.0
9,Monday,37,06:00:00,37_14,1.0,14640.0,44.0,9.00,Drizzle,2400.0
...,...,...,...,...,...,...,...,...,...,...
355221,Sunday,79,17:00:00,79_11,2.0,4725.0,16.0,10.03,Clouds,933.0
355223,Sunday,79,17:00:00,79_11,2.0,1293.0,1.0,10.03,Clouds,211.0
355224,Sunday,79,17:00:00,79_11,2.0,4563.0,15.0,10.03,Clouds,929.0
355227,Sunday,79,17:00:00,79_11,2.0,538.0,2.0,10.03,Clouds,74.0


In [32]:
# Write the final table to disk. The DataFrame index is written too (to_csv
# default), so it appears as an unnamed first column in the file.
output_path = "dummy_journeys.csv"
dummy_journeys.to_csv(output_path)