In [1]:
sc

<pyspark.context.SparkContext at 0x7f83e87bcd90>

In [2]:
sc.applicationId

u'application_1529929920393_0376'

## Load the libraries

In [9]:
import os
import sys
sys.path.append("/usr/lib/python2.7/site-packages")
import re

import numpy as np
import pandas as pd

from pyspark.sql.functions import col
from pyspark.sql.functions import year, month, dayofmonth, hour

from IPython import display
import matplotlib.pyplot as plt

import dash
import dash_core_components as dcc
import dash_html_components as html
import dash_table_experiments as dt
from dash.dependencies import Input, Output

# For displaying multiple outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load the data

In [4]:
sqlContext.sql("use bmtc")

DataFrame[]

In [5]:
# Load the schedule/route reference tables from the current database.

# Trip-level schedule rows; only the columns listed below are kept and
# route_number_id is exposed under the name route_id.
schedule_details_df = sqlContext.sql('select * from schedule_details')

schedule_details_df = schedule_details_df.select("form_four_id","schedule_number","trip_number","trip_type", "start_time", "end_time",
                                                 col("route_number_id").alias("route_id"))

# Form-four rows. The backslash continuations sit inside the string literal,
# so the leading spaces of each continued line are part of the SQL text.
form_four_df = sqlContext.sql("select form_four_id,form_four_name,schedule_number_id,\
                                      schedule_number_name,no_of_trips,start_time,\
                                      route_id,route_number,toll_zone,\
                                      area_limit,total_km,total_dead_km,\
                                      actual_km,total_running_time,total_break_time,\
                                      total_steering_time,spread_over_hours,ot_hours \
                               from form_four")

# Get the route_point
route_point_df = sqlContext.sql("select route_id, route_order, bus_stop_id from route_point")
# Bus stop names and coordinates, keyed by bus_stop_id.
bus_stop_df = sqlContext.sql("select bus_stop_id,bus_stop_name,latitude_current,longitude_current \
                                from bus_stop")

# Inner join on bus_stop_id: route points without a matching bus stop are dropped.
# NOTE(review): route_bus_stop_df is not referenced again in the visible cells.
route_bus_stop_df = route_point_df.join(bus_stop_df, ["bus_stop_id"], "inner")

# Get the route map
# start_bus_stop_id / end_bus_stop_id / bus_stop_order are renamed to
# prev_bus_stop_id / bus_stop_id / prev_bus_stop_order.
# NOTE(review): route_map_df is not referenced again in the visible cells.
route_map_df = sqlContext.sql("select route_id,start_bus_stop_id as prev_bus_stop_id,\
                               end_bus_stop_id as bus_stop_id,\
                               bus_stop_order as prev_bus_stop_order from route_map")

In [10]:
# Switch the session to the waybill database.
sqlContext.sql("use bmtcwaybill")

# Load every waybill trip record.
waybill_trip_details_df = sqlContext.sql("select * from waybill_trip_details")

# Keep only the trips whose duty date falls in the year 2017.
waybill_trip_details_filtered_df = waybill_trip_details_df.where(
    year(col("duty_dt")) == 2017
)

# Restrict to schedules whose name starts with "V", then count those trips.
volvo_waybill_df = waybill_trip_details_filtered_df.where(
    col("schedule_name").like("V%")
)
volvo_waybill_count = volvo_waybill_df.count()
volvo_waybill_count

DataFrame[]

2708934

In [26]:
# Distinct schedule numbers / names that occur in the Volvo waybills.
# The de-duplication is done with DataFrame.distinct() inside Spark, so only
# the unique values are collected to the driver instead of mapping every
# waybill row through a Python lambda on the RDD first.
# The order of the collected values is not defined.
volvo_schedule_no_list = [
    row[0] for row in volvo_waybill_df.select("schedule_no").distinct().collect()
]
volvo_schedule_name_list = [
    row[0] for row in volvo_waybill_df.select("schedule_name").distinct().collect()
]

In [27]:
len(volvo_schedule_no_list)
len(volvo_schedule_name_list)

1700

1095

In [12]:
schedule_details_df.filter(col("form_four_id") == 33561).show(10)

+------------+---------------+-----------+---------+----------+--------+--------+
|form_four_id|schedule_number|trip_number|trip_type|start_time|end_time|route_id|
+------------+---------------+-----------+---------+----------+--------+--------+
|       33561|           9256|          1|        3|  08:15:00|08:20:00|   13429|
|       33561|           9256|          2|        2|  08:25:00|09:30:00|   23786|
|       33561|           9256|          3|        2|  09:35:00|11:15:00|   29742|
|       33561|           9256|          4|        2|  11:20:00|13:00:00|   23478|
|       33561|           9256|          5|        2|  13:30:00|15:10:00|   29742|
|       33561|           9256|          6|        2|  15:15:00|16:55:00|   23478|
|       33561|           9256|          7|        2|  17:25:00|19:05:00|   29742|
|       33561|           9256|          8|        2|  19:10:00|19:45:00|   21819|
|       33561|           9256|          9|        3|  19:45:00|19:50:00|   13675|
+------------+--

### Load the OD Matrix

In [13]:
sqlContext.sql("use bmtc_eta_default")

# Read bus stop traversal times with a cap on the gap
# between bus stops
engineered_df = sqlContext.sql("select * from vts_od_matrix")

DataFrame[]

In [14]:
engineered_count = engineered_df.count()
engineered_count

engineered_df.show(5)

154327

+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+---------+--------------+
|prev_bus_stop_id|bus_stop_id|route_id|bus_stop_order|distance|google_time_to_travel|month_block|day_block|hour_block|sum_count|avg_time_taken|
+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+---------+--------------+
|             126|        158|   23067|             8|     764|                  209|          1|        1|         2|      846|           219|
|             126|        158|   23067|             8|     764|                  209|          1|        2|         1|     2732|           169|
|             126|        158|   23067|             8|     764|                  209|          1|        1|         3|     1257|           275|
|             126|        158|   23067|             8|     764|                  209|          1|        2|         3|     1759|        

In [15]:
# Extract the distinct route IDs present in the OD matrix.
# DataFrame.distinct() de-duplicates inside Spark, so only the unique
# route_id values are collected to the driver (order is not defined).
valid_volvo_route_ids = [
    row[0] for row in engineered_df.select("route_id").distinct().collect()
]
valid_volvo_route_ids_count = len(valid_volvo_route_ids)
valid_volvo_route_ids_count

700

### Validation of OD matrix with volvo schedule details

In [16]:
# Take the OD matrix rows for one particular route (route_id 23067), ordered
# by bus_stop_order. show(500) prints at most 500 rows.
engineered_df.filter(col('route_id') == 23067).orderBy('bus_stop_order').show(500)

+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+---------+--------------+
|prev_bus_stop_id|bus_stop_id|route_id|bus_stop_order|distance|google_time_to_travel|month_block|day_block|hour_block|sum_count|avg_time_taken|
+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+---------+--------------+
|             160|       5841|   23067|             1|      80|                    5|          1|        1|         3|     1423|            39|
|             160|       5841|   23067|             1|      80|                    5|          1|        2|         1|     2786|            40|
|             160|       5841|   23067|             1|      80|                    5|          1|        1|         1|      615|            47|
|             160|       5841|   23067|             1|      80|                    5|          1|        1|         2|     1134|        

### Add the bus stop information

In [17]:
bus_stop_df.show(2)

+-----------+-----------------+----------------+-----------------+
|bus_stop_id|    bus_stop_name|latitude_current|longitude_current|
+-----------+-----------------+----------------+-----------------+
|          3|        Kodihalli|     12.96004656|      77.64719738|
|          4|Vijayanagara TTMC|     12.96572391|      77.53510224|
+-----------+-----------------+----------------+-----------------+
only showing top 2 rows



In [18]:
engineered_1_df = engineered_df.join(bus_stop_df,["bus_stop_id"], "left_outer")

In [19]:
engineered_df.count()
engineered_1_df.count()

154327

154327

In [20]:
engineered_1_df.show(2)

+-----------+----------------+--------+--------------+--------+---------------------+-----------+---------+----------+---------+--------------+-------------+----------------+-----------------+
|bus_stop_id|prev_bus_stop_id|route_id|bus_stop_order|distance|google_time_to_travel|month_block|day_block|hour_block|sum_count|avg_time_taken|bus_stop_name|latitude_current|longitude_current|
+-----------+----------------+--------+--------------+--------+---------------------+-----------+---------+----------+---------+--------------+-------------+----------------+-----------------+
|        158|             126|   23067|             8|     764|                  209|          1|        1|         2|      846|           219|  Corporation|     12.96483149|      77.58802933|
|        158|             126|   23067|             8|     764|                  209|          1|        2|         1|     2732|           169|  Corporation|     12.96483149|      77.58802933|
+-----------+----------------+-----

### Construct a pandas DataFrame

In [28]:
engineered_1_df.dtypes

[('bus_stop_id', 'int'),
 ('prev_bus_stop_id', 'int'),
 ('route_id', 'int'),
 ('bus_stop_order', 'int'),
 ('distance', 'int'),
 ('google_time_to_travel', 'int'),
 ('month_block', 'string'),
 ('day_block', 'string'),
 ('hour_block', 'string'),
 ('sum_count', 'bigint'),
 ('avg_time_taken', 'int'),
 ('bus_stop_name', 'string'),
 ('latitude_current', 'double'),
 ('longitude_current', 'double')]

In [29]:
# Materialise the joined OD matrix on the driver as a pandas DataFrame.
engineered_pdf = engineered_1_df.toPandas()
# month_block / day_block / hour_block are Spark 'string' columns and are left
# as strings here. The casts below are intentionally disabled: update_table()
# filters with `engineered_pdf.day_block == str(day_block)`, which would match
# nothing if day_block were converted to int.
# engineered_pdf["month_block"] = engineered_pdf["month_block"].astype(float).round(0).astype(int)
# engineered_pdf["day_block"] = engineered_pdf["day_block"].astype(float).round(0).astype(int)
# engineered_pdf["hour_block"] = engineered_pdf["hour_block"].astype(float).round(0).astype(int)

In [30]:
engineered_pdf.head()

Unnamed: 0,bus_stop_id,prev_bus_stop_id,route_id,bus_stop_order,distance,google_time_to_travel,month_block,day_block,hour_block,sum_count,avg_time_taken,bus_stop_name,latitude_current,longitude_current
0,158,126,23067,8,764,209,1,1,2,846.0,219.0,Corporation,12.964831,77.588029
1,158,126,23067,8,764,209,1,2,1,2732.0,169.0,Corporation,12.964831,77.588029
2,158,126,23067,8,764,209,1,1,3,1257.0,275.0,Corporation,12.964831,77.588029
3,158,126,23067,8,764,209,1,2,3,1759.0,326.0,Corporation,12.964831,77.588029
4,158,126,23067,8,764,209,1,2,4,1297.0,309.0,Corporation,12.964831,77.588029


### Feature 2: Speed

In [31]:
# Derive the speed of travel in kmph.
# The ratio distance / avg_time_taken is scaled by 3.6 (the m/s -> km/h factor),
# so the inputs are presumably metres and seconds -- TODO confirm the units.
# Vectorised instead of a row-wise apply(): rows whose avg_time_taken is
# missing or not strictly positive are masked to NaN before the division, so
# they end up with a NaN speed rather than a division by zero.
valid_time_taken = engineered_pdf["avg_time_taken"].astype(float)
valid_time_taken = valid_time_taken.where(valid_time_taken > 0)
engineered_pdf["speed_kmph"] = 3.6 * (engineered_pdf["distance"] / valid_time_taken)

In [32]:
engineered_pdf.head()

Unnamed: 0,bus_stop_id,prev_bus_stop_id,route_id,bus_stop_order,distance,google_time_to_travel,month_block,day_block,hour_block,sum_count,avg_time_taken,bus_stop_name,latitude_current,longitude_current,speed_kmph
0,158,126,23067,8,764,209,1,1,2,846.0,219.0,Corporation,12.964831,77.588029,12.558904
1,158,126,23067,8,764,209,1,2,1,2732.0,169.0,Corporation,12.964831,77.588029,16.274556
2,158,126,23067,8,764,209,1,1,3,1257.0,275.0,Corporation,12.964831,77.588029,10.001455
3,158,126,23067,8,764,209,1,2,3,1759.0,326.0,Corporation,12.964831,77.588029,8.43681
4,158,126,23067,8,764,209,1,2,4,1297.0,309.0,Corporation,12.964831,77.588029,8.900971


In [34]:
engineered_pdf['speed_kmph'] = engineered_pdf['speed_kmph'].round(2)

In [35]:
engineered_pdf.head()

Unnamed: 0,bus_stop_id,prev_bus_stop_id,route_id,bus_stop_order,distance,google_time_to_travel,month_block,day_block,hour_block,sum_count,avg_time_taken,bus_stop_name,latitude_current,longitude_current,speed_kmph
0,158,126,23067,8,764,209,1,1,2,846.0,219.0,Corporation,12.964831,77.588029,12.56
1,158,126,23067,8,764,209,1,2,1,2732.0,169.0,Corporation,12.964831,77.588029,16.27
2,158,126,23067,8,764,209,1,1,3,1257.0,275.0,Corporation,12.964831,77.588029,10.0
3,158,126,23067,8,764,209,1,2,3,1759.0,326.0,Corporation,12.964831,77.588029,8.44
4,158,126,23067,8,764,209,1,2,4,1297.0,309.0,Corporation,12.964831,77.588029,8.9


In [37]:
form_four_pdf = form_four_df.toPandas()
schedule_details_pdf = schedule_details_df.toPandas()

## Visualization

In [78]:
# Create the Dash application and declare its static layout.
# The component ids declared here are wired together by the @app.callback
# functions defined below the layout:
#   input-schedule_no + dropdown-schedule_type -> dropdown-schedule_name options
#   dropdown-schedule_name                     -> dropdown-form_four_id options
#   dropdown-form_four_id                      -> dropdown-route_id options
#   dropdown-route_id + dropdown-day_block     -> info-table rows
app = dash.Dash()

app.layout = html.Div(children=[
    html.H1(children='BMTC Schedule Rationalization'),
    
    html.Div(children='''
        Select Schedule Attributes:
    '''),
    
    # Free-text schedule number; starts as an empty string (not None).
    dcc.Input(
        id = 'input-schedule_no',
        placeholder='Enter Schedule No.',
        type='text',
        value=''
    ),
    
    # Schedule type; the selected label is used verbatim when matching
    # schedule names in update_schedule_name_dropdown().
    dcc.Dropdown(
        id='dropdown-schedule_type',
        options=[
            {'label': sch_type, 'value': sch_type} for sch_type in ["All Days", "Week Days", "Holiday"]
            ],
        searchable=False
    ),
    
    # NOTE(review): the label reads "Now" while the dropdown below lists
    # schedule names -- confirm whether "Select Schedule Name:" was intended.
    html.Div(children='''
        Select Schedule Now:
    '''),
    
    # Options are filled in by update_schedule_name_dropdown().
    dcc.Dropdown(
        id='dropdown-schedule_name',
        searchable=False
    ),
    
    html.Div(children='''
        Select Form Four ID:
    '''),
    
    # Options are filled in by update_form_four_id().
    dcc.Dropdown(
        id='dropdown-form_four_id',
        searchable=False
    ),
    
    html.Div(children='''
        Select Route No.:
    '''),
    
    # Options are filled in by update_route_id().
    dcc.Dropdown(
        id='dropdown-route_id',
        searchable=False
    ),
    
    html.Div(children='''
        Select Day Type:
    '''),
    
    # The integer value chosen here is converted with str() in update_table()
    # before being compared against the string column engineered_pdf.day_block.
    dcc.Dropdown(
        id='dropdown-day_block',
        options = [{'label': "Weekend", 'value': 1},
                   {'label': "Weekday", 'value': 2}],
        searchable=False
    ),
    
    # Result table; its rows are replaced by update_table().
    # The columns list names the fields handed to the table; the latitude and
    # longitude columns of engineered_pdf are not listed.
    dt.DataTable(
        # Initialise the rows
        rows=[{}],
        columns = [u'route_id', u'bus_stop_id', u'prev_bus_stop_id', u'bus_stop_name', u'bus_stop_order',
                   u'distance', u'month_block', u'day_block', 
                   u'hour_block', u'google_time_to_travel', u'avg_time_taken', u'sum_count', u'speed_kmph'],
        row_selectable=True,
        filterable=True,
        sortable=True,
        selected_row_indices=[],
        id='info-table'
    )
    
])

@app.callback(Output('dropdown-schedule_name', 'options'),
              [Input('input-schedule_no', 'value'), Input('dropdown-schedule_type', 'value')])
def update_schedule_name_dropdown(sch_no, sch_type):
    """Build the schedule-name dropdown options from the user's selections.

    A schedule name from ``volvo_schedule_name_list`` is offered when it
    contains ``sch_no`` followed, anywhere later in the name, by ``sch_type``.

    Args:
        sch_no: text typed into the schedule number input (may be empty).
        sch_type: selected schedule type label, or None when nothing is selected.

    Returns:
        A list of ``{'label': name, 'value': name}`` dicts; an empty list when
        either input is None.
    """
    print("update_schedule_name_dropdown: Sch No: {}, and Type: {}".format(sch_no, sch_type))
    if sch_no is None or sch_type is None:
        # Always hand Dash a list, never an implicit None.
        return []

    # Both values come from the browser: escape them so that characters such
    # as "(" or "+" are matched literally instead of raising re.error or
    # being interpreted as regex syntax.
    pattern = re.compile(".*{}.*{}".format(re.escape(sch_no), re.escape(sch_type)))
    return [{'label': sch, 'value': sch}
            for sch in volvo_schedule_name_list
            if pattern.match(sch)]

@app.callback(Output('dropdown-form_four_id', 'options'),
              [Input('dropdown-schedule_name', 'value')])
def update_form_four_id(schedule_name):
    """Build the form-four dropdown options for the selected schedule name.

    Args:
        schedule_name: value chosen in the schedule-name dropdown, or None.

    Returns:
        A list of ``{'label': form_four_id, 'value': form_four_id}`` dicts for
        the rows of ``form_four_pdf`` whose ``schedule_number_name`` equals
        ``schedule_name``; an empty list when nothing is selected.
    """
    print("update_form_four_id: {}".format(schedule_name))
    if schedule_name is None:
        return []

    # The selected value is consumed by update_route_id(), which filters
    # schedule_details_pdf on its form_four_id column, so the options must be
    # form_four_id values (the previous code offered schedule_number_id here).
    matching_ids = form_four_pdf.loc[
        form_four_pdf['schedule_number_name'] == schedule_name, 'form_four_id']
    # unique() drops repeated ids; tolist() yields plain Python scalars that
    # serialise to JSON without depending on numpy types.
    return [{'label': form_four_id, 'value': form_four_id}
            for form_four_id in matching_ids.dropna().unique().tolist()]
    
@app.callback(Output('dropdown-route_id', 'options'), [Input('dropdown-form_four_id', 'value')])
def update_route_id(form_four_id):
    """Build the route dropdown options for the selected form four.

    Only routes that are both scheduled under ``form_four_id`` (per
    ``schedule_details_pdf``) and present in ``valid_volvo_route_ids`` (the
    route ids found in the OD matrix) are offered.

    Args:
        form_four_id: value chosen in the form-four dropdown, or None.

    Returns:
        A list of ``{'label': route_id, 'value': route_id}`` dicts sorted by
        route id; an empty list when nothing is selected.
    """
    print("update_route_id: {}".format(form_four_id))
    if form_four_id is None:
        # Always hand Dash a list, never an implicit None.
        return []

    route_id_list = schedule_details_pdf.loc[
        schedule_details_pdf['form_four_id'] == form_four_id, "route_id"]
    scheduled_route_ids = set(route_id_list.dropna().tolist())
    # Use valid_volvo_route_ids obtained from the OD matrix.
    # Sorting makes the dropdown order deterministic.
    intersection_list = sorted(scheduled_route_ids.intersection(valid_volvo_route_ids))
    # A single formatted string prints the same way on Python 2 and 3.
    print("Length of route_id_list is {}, Length of intersection_list is {}".format(
        len(route_id_list), len(intersection_list)))
    return [{'label': route_id, 'value': route_id} for route_id in intersection_list]
    
@app.callback(Output('info-table', 'rows'), [Input('dropdown-route_id', 'value'),
                                            Input('dropdown-day_block', 'value')])
def update_table(route_id, day_block):
    """
    Return the table rows matching the user's route and day-type selection.

    With both selections present, the rows of engineered_pdf for that route
    and day block are returned ordered by bus_stop_order; otherwise every row
    of engineered_pdf is returned. Rows are emitted as a list of dicts.
    """
    # Without a complete selection the whole frame is shown.
    if route_id is None or day_block is None:
        return engineered_pdf.to_dict('records')

    print("update_table: route_id: {}, day_block: {}".format(route_id, day_block))
    # day_block is stored as a string column, hence the str() conversion.
    on_route = engineered_pdf.route_id == route_id
    on_day_block = engineered_pdf.day_block == str(day_block)
    selection = engineered_pdf[on_route & on_day_block]
    print("update_table: count: {}".format(selection.distance.count()))

    ordered = selection.sort_values(by=['bus_stop_order'])
    return ordered.to_dict('records')
    

In [79]:
# Start the Dash development server (blocks until interrupted).
# The bind address can be overridden with the DASH_HOST environment variable
# so the notebook runs on other machines; the default keeps the original address.
app.run_server(debug=False, host=os.environ.get("DASH_HOST", "172.21.100.194"))

 * Running on http://172.21.100.194:8050/ (Press CTRL+C to quit)
172.19.98.111 - - [08/Jul/2018 19:10:42] "GET / HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:43] "GET /_dash-layout HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:43] "GET /_dash-dependencies HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:43] "GET /favicon.ico HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:43] "POST /_dash-update-component HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:43] "POST /_dash-update-component HTTP/1.1" 200 -


update_form_four_id: None
update_schedule_name_dropdown: Sch No: , and Type: None


172.19.98.111 - - [08/Jul/2018 19:10:55] "POST /_dash-update-component HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:55] "POST /_dash-update-component HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:56] "POST /_dash-update-component HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:56] "POST /_dash-update-component HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:56] "POST /_dash-update-component HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:10:56] "POST /_dash-update-component HTTP/1.1" 200 -


update_route_id: None
update_schedule_name_dropdown: Sch No: 5, and Type: None
update_schedule_name_dropdown: Sch No: 50, and Type: None
update_schedule_name_dropdown: Sch No: 500, and Type: None
update_schedule_name_dropdown: Sch No: 500, and Type: All Days


172.19.98.111 - - [08/Jul/2018 19:10:59] "POST /_dash-update-component HTTP/1.1" 200 -


update_form_four_id: V-500BC/2-All Days


172.19.98.111 - - [08/Jul/2018 19:11:03] "POST /_dash-update-component HTTP/1.1" 200 -


update_route_id: 7753
('Length of route_id_list is', 14, 'Length of intersection_list is', 6)


172.19.98.111 - - [08/Jul/2018 19:11:17] "POST /_dash-update-component HTTP/1.1" 200 -
172.19.98.111 - - [08/Jul/2018 19:11:18] "POST /_dash-update-component HTTP/1.1" 200 -


update_table: route_id: 31876, day_block: 2
update_table: count: 141


In [62]:
engineered_pdf[engineered_pdf['route_id'] == 6149].head()

Unnamed: 0,bus_stop_id,prev_bus_stop_id,route_id,bus_stop_order,distance,google_time_to_travel,month_block,day_block,hour_block,sum_count,avg_time_taken,bus_stop_name,latitude_current,longitude_current,speed_kmph
11416,5003,5002,6149,1,500,50,1,1,3,25.0,63.0,Wheel And Axle Plant,13.106071,77.582201,28.57
11417,5003,5002,6149,1,500,50,1,2,4,236.0,68.0,Wheel And Axle Plant,13.106071,77.582201,26.47
11418,5003,5002,6149,1,500,50,1,1,4,35.0,88.0,Wheel And Axle Plant,13.106071,77.582201,20.45
11419,5003,5002,6149,1,500,50,1,1,1,43.0,74.0,Wheel And Axle Plant,13.106071,77.582201,24.32
11420,5003,5002,6149,1,500,50,1,1,2,53.0,72.0,Wheel And Axle Plant,13.106071,77.582201,25.0


In [69]:
engineered_pdf.dtypes

bus_stop_id                int64
prev_bus_stop_id           int64
route_id                   int64
bus_stop_order             int64
distance                   int64
google_time_to_travel      int64
month_block               object
day_block                 object
hour_block                object
sum_count                float64
avg_time_taken           float64
bus_stop_name             object
latitude_current         float64
longitude_current        float64
speed_kmph               float64
dtype: object

In [73]:
engineered_pdf[ (engineered_pdf.day_block == str(2)) ].head()

Unnamed: 0,bus_stop_id,prev_bus_stop_id,route_id,bus_stop_order,distance,google_time_to_travel,month_block,day_block,hour_block,sum_count,avg_time_taken,bus_stop_name,latitude_current,longitude_current,speed_kmph
1,158,126,23067,8,764,209,1,2,1,2732.0,169.0,Corporation,12.964831,77.588029,16.27
3,158,126,23067,8,764,209,1,2,3,1759.0,326.0,Corporation,12.964831,77.588029,8.44
4,158,126,23067,8,764,209,1,2,4,1297.0,309.0,Corporation,12.964831,77.588029,8.9
6,158,126,23067,8,764,209,1,2,2,1885.0,252.0,Corporation,12.964831,77.588029,10.91
9,158,126,23078,8,764,209,1,2,1,2732.0,169.0,Corporation,12.964831,77.588029,16.27


In [55]:
engineered_pdf.distance.count()

154327

In [101]:
len(valid_volvo_route_ids)

700

In [102]:
# NOTE(review): volvo_route_schedule_details_list is not assigned in any cell
# visible in this notebook; this cell appears to rely on leftover kernel state
# and would raise NameError after a kernel restart unless it is defined elsewhere.
len(volvo_route_schedule_details_list)

3137