In [1]:
sc

<pyspark.context.SparkContext at 0x7fda8e01ed90>

In [2]:
sc.applicationId

u'application_1529929920393_0148'

## Load libraries

In [3]:
import sys
sys.path.append("/usr/lib/python2.7/site-packages")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import col, unix_timestamp,abs, log, from_unixtime, avg
from pyspark.sql.functions import count, sum, desc, date_format, lit, concat
from pyspark.sql import functions as F

In [4]:
# For displaying multiple outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load data

In [5]:
sqlContext.sql("use bmtc")

# Get the route map
route_map_df = sqlContext.sql("select route_id,start_bus_stop_id,end_bus_stop_id,\
                                      distance,time_to_travel,bus_stop_order,status \
                               from route_map")

# Get the route_point
route_point_df = sqlContext.sql("select route_id, route_order, bus_stop_id from route_point")

bus_stop_df = sqlContext.sql("select bus_stop_id,bus_stop_name,latitude_current,longitude_current from bus_stop")

# Drop corrupted locations
bus_stop_df = bus_stop_df.na.drop(subset=["latitude_current"])
bus_stop_df = bus_stop_df.na.drop(subset=["longitude_current"])

# Join the bus stop ID with lat,long
route_point_joined_df = route_point_df.join(bus_stop_df,\
                                            ["bus_stop_id"],\
                                            "left_outer")

form_four_df = sqlContext.sql("select form_four_id,form_four_name,schedule_number_id,\
                                      schedule_number_name,no_of_trips,start_time,\
                                      route_id,route_number,toll_zone,\
                                      area_limit,total_km,total_dead_km,\
                                      actual_km,total_running_time,total_break_time,\
                                      total_steering_time,spread_over_hours,ot_hours \
                               from form_four")

schedule_df = sqlContext.sql('select * from schedule')

schedule_df = schedule_df.select("schedule_id","schedule_number","depot_id","route_id","schedule_type")

schedule_details_df = sqlContext.sql('select * from schedule_details')

schedule_details_df = schedule_details_df.select("schedule_details_id","form_four_id","schedule_number","number_of_trips",\
                           "trip_number","trip_type","start_point","end_point","route_number_id",\
                           "route_number","route_direction","distance","start_time","end_time",\
                           "running_time","break_type_id","shift_type_id","is_dread_trip")

DataFrame[]

## Load the VTS_Waybill data

In [6]:
sqlContext.sql("use bmtc_eta_default")
sqlContext.sql("show tables").show(truncate=False)

DataFrame[]

+---------------------------+-----------+
|tableName                  |isTemporary|
+---------------------------+-----------+
|vts_365r                   |false      |
|vts_365r_distance          |false      |
|vts_365r_distance_1        |false      |
|vts_365r_filtered          |false      |
|vts_bus_stop_min_dist_apr17|false      |
|vts_bus_stop_min_dist_feb17|false      |
|vts_bus_stop_min_dist_jan17|false      |
|vts_bus_stop_min_dist_mar17|false      |
|vts_bus_stop_min_dist_may17|false      |
|vts_full_bus_stops_apr17   |false      |
|vts_full_bus_stops_feb17   |false      |
|vts_full_bus_stops_jan17   |false      |
|vts_full_bus_stops_mar17   |false      |
|vts_jun2017                |false      |
|vts_normal_speed           |false      |
|vts_volvo_2017             |false      |
|vts_volvo_oct2017          |false      |
|vts_waybill_volvo_apr2017  |false      |
|vts_waybill_volvo_aug2017  |false      |
|vts_waybill_volvo_dec2017  |false      |
+---------------------------+-----

In [7]:
#sqlContext.sql("select count(*) from vts_bus_stop_min_dist_apr17").show()

### >>> Input: Month 

In [8]:
#vts_waybill_volvo_df = sqlContext.sql("select * from vts_waybill_volvo_may2017 where day(ist_date) >=28 and day(ist_date) <= 30")
#UNSURE: vts_waybill_volvo_df = sqlContext.sql("select * from vts_waybill_volvo_may2017 where day(ist_date) >=13 and day(ist_date) <=15")
vts_waybill_volvo_df = sqlContext.sql("select * from vts_waybill_volvo_may2017 where day(ist_date) >=25 and day(ist_date) <=27")

In [9]:
vts_waybill_volvo_count = vts_waybill_volvo_df.count()
vts_waybill_volvo_count

3216245

## Join the augmented table with the bus stops table conditional to the route ID

In [10]:
route_point_joined_df.show(2)

+-----------+--------+-----------+--------------------+----------------+-----------------+
|bus_stop_id|route_id|route_order|       bus_stop_name|latitude_current|longitude_current|
+-----------+--------+-----------+--------------------+----------------+-----------------+
|        160|     809|          1|Kempegowda Bus St...|     12.97751447|      77.57178022|
|       5841|     809|          2|       KBS Connector|     12.97752049|      77.57099145|
+-----------+--------+-----------+--------------------+----------------+-----------------+
only showing top 2 rows



In [11]:
# Take the route id which are plying on roads
route_id_list = vts_waybill_volvo_df.select("route_id").distinct().rdd.map(lambda x:x[0]).collect()

In [12]:
len(route_id_list)

837

In [13]:
route_point_filtered_df = route_point_joined_df.filter(col("route_id").isin(route_id_list))

In [14]:
route_point_joined_count = route_point_joined_df.count()
route_point_joined_count
route_point_filtered_count = route_point_filtered_df.count()
route_point_filtered_count

1078342

19603

In [15]:
vts_waybill_volvo_filtered_df = vts_waybill_volvo_df.select("id", "vts_device_id", "ist_date", "lat", "longitude",
                                                            "ist_epoch", "duty_dt",
                                                            "schedule_no", "schedule_name", "service_type",
                                                            "trip_number", "start_time", "end_time", 
                                                            "route_id", "route_no", "etm_start_time", "etm_end_time")
vts_waybill_volvo_filtered_df.show(2)

+-----------+-------------+--------------------+-----------+-----------+----------+----------+-----------+-------------------+------------+-----------+----------+--------+--------+---------------+--------------------+--------------------+
|         id|vts_device_id|            ist_date|        lat|  longitude| ist_epoch|   duty_dt|schedule_no|      schedule_name|service_type|trip_number|start_time|end_time|route_id|       route_no|      etm_start_time|        etm_end_time|
+-----------+-------------+--------------------+-----------+-----------+----------+----------+-----------+-------------------+------------+-----------+----------+--------+--------+---------------+--------------------+--------------------+
|16987092643|   0e8e5640ff|2017-05-26 08:20:...|12.97711533|77.71913067|1495767034|2017-05-26|      10055|V-335EP/6-Week Days|           3|          4|  08:30:00|09:45:00|   31159|      JFWTC-KBS|2017-05-26 08:22:...|2017-05-26 09:08:...|
|16987092643|   0e8e5640ff|2017-05-26 08:20:

In [16]:
# Join the vts_waybill table with the bus stop table. 
# This will provide the set of bus stop on that route 
vts_waybill_bus_stop_df = vts_waybill_volvo_filtered_df.join(route_point_filtered_df, ['route_id'], "inner")

In [17]:
vts_waybill_bus_stop_count = vts_waybill_bus_stop_df.count()
vts_waybill_bus_stop_count

136183630

In [18]:
vts_waybill_bus_stop_df.show(2)

+--------+-----------+-------------+--------------------+-----------+----------+----------+----------+-----------+------------------+------------+-----------+----------+--------+-----------+--------------------+--------------------+-----------+-----------+-------------+----------------+-----------------+
|route_id|         id|vts_device_id|            ist_date|        lat| longitude| ist_epoch|   duty_dt|schedule_no|     schedule_name|service_type|trip_number|start_time|end_time|   route_no|      etm_start_time|        etm_end_time|bus_stop_id|route_order|bus_stop_name|latitude_current|longitude_current|
+--------+-----------+-------------+--------------------+-----------+----------+----------+----------+-----------+------------------+------------+-----------+----------+--------+-----------+--------------------+--------------------+-----------+-----------+-------------+----------------+-----------------+
|   13289|16987930887|   0e8e563cc2|2017-05-26 08:47:...|12.98657333|77.6479785|14

In [19]:
# Drop corrupted locations after joining with bus stop dataframe.
# These null locations corresponds to route ID not found in the route point joined dataframe

vts_waybill_bus_stop_df = vts_waybill_bus_stop_df.na.drop(subset=["latitude_current"])
vts_waybill_bus_stop_df = vts_waybill_bus_stop_df.na.drop(subset=["longitude_current"])
vts_waybill_bus_stop_count = vts_waybill_bus_stop_df.count()
vts_waybill_bus_stop_count

135423358

## We get the closest bus stop on a route

In [20]:
from pyspark.sql import Window

In [21]:
# Define a UDF for calculating the distance between the two geo points
from pyspark.sql.functions import udf
from math import *

pi_broadcast = sc.broadcast(pi)
pi_broadcast.value

# Return the Haversine distance between two geo-points
# The returned value is in metres
def distBetweenGeoPoints(lat1, long1, lat2, long2):
    dist =  ((acos(sin((lat1*pi_broadcast.value/180)) * 
                   sin((lat2*pi_broadcast.value/180))+cos((lat1*pi_broadcast.value/180)) * 
                   cos((lat2*pi_broadcast.value/180)) * cos(((long1 - long2)* 
              pi_broadcast.value/180))))*180/pi_broadcast.value*60*1.1515*1.60934)*1000
    return round(dist)

# The returned value is in metres

distBetweenGeoPointsUDF = udf(distBetweenGeoPoints)

3.141592653589793

In [22]:
VW_bus_stop_df = vts_waybill_bus_stop_df.withColumn("dist", 
                                                   distBetweenGeoPointsUDF(vts_waybill_bus_stop_df.lat, 
                                                                           vts_waybill_bus_stop_df.longitude, 
                                                                           vts_waybill_bus_stop_df.latitude_current, 
                                                                           vts_waybill_bus_stop_df.longitude_current).cast("float"))

In [None]:
# Cache the dataframe 
VW_bus_stop_df.cache()

In [None]:
VW_bus_stop_filtered_df = VW_bus_stop_df.filter(VW_bus_stop_df.dist <= 50)

In [None]:
VW_bus_stop_filtered_df.show(2)

In [None]:
VW_bus_stop_filtered_count = VW_bus_stop_filtered_df.count()
VW_bus_stop_filtered_count

In [None]:
VW_bus_stop_filtered_count

## Figure out entry nearest the bus stop

In [None]:
VW_window = Window.partitionBy("vts_device_id", "ist_epoch", "schedule_no", "route_id", "trip_number")
VW_window_1 = Window.partitionBy("duty_dt", "vts_device_id", "schedule_no", "route_id", "trip_number", "bus_stop_id", "route_order")

In [None]:
VW_window_df = VW_bus_stop_filtered_df.withColumn("min_dist", F.min("dist").over(VW_window))
VW_window_1_df = VW_window_df.where(col('dist') == col('min_dist'))

In [None]:
VW_window_count = VW_window_df.count()
VW_window_count
VW_window_1_count = VW_window_1_df.count()
VW_window_1_count

In [None]:
VW_window_2_df = VW_window_1_df.withColumn("min_time", F.min("ist_epoch").over(VW_window_1))
VW_window_3_df = VW_window_2_df.where(col('ist_epoch') == col('min_time'))
VW_window_3_count = VW_window_3_df.count()
VW_window_3_count

In [None]:
#sqlContext.sql("select * from vts_bus_stop_min_dist_apr17 where day(duty_dt) == 24").show(2)

### >>> Input: Change month and create/insert

In [None]:
# Instead of creating a persistent table using saveAsTable, make temp table and dump it as a hive table
VW_window_3_df.createOrReplaceTempView("temp_VW_window_3_df") 
#sqlContext.sql("create table vts_bus_stop_min_dist_may17 as select * from temp_VW_window_3_df")
sqlContext.sql("insert into vts_bus_stop_min_dist_may17 select * from temp_VW_window_3_df")

In [None]:
VW_bus_stop_df.unpersist()

In [None]:
VW_window_3_count

### Validation part

In [322]:
sqlContext.sql("use bmtc_eta_default")
VW_window_3_df = sqlContext.sql("select * from vts_bus_stop_min_dist_mar17")
VW_window_3_count = VW_window_3_df.count()
VW_window_3_count
schedule_335 = VW_window_3_df.filter(col('schedule_name').like("%335%")).select("schedule_name")\
                   .distinct().rdd.map(lambda x:x[0]).collect()

DataFrame[]

573730

In [283]:
# Take schedule 365R/1-All Days, and we see that it takes two schedule no, which corresponds to two form four IDs
VW_window_3_df.filter(col('schedule_name')=='V-335E/44-All Days').select("schedule_no").distinct().show()

+-----------+
|schedule_no|
+-----------+
|      35229|
+-----------+



In [284]:
form_four_df.filter(col('form_four_id').isin([35229])).show()

+------------+--------------+------------------+--------------------+-----------+----------+--------+------------+---------+----------+--------+-------------+---------+------------------+----------------+-------------------+-----------------+--------+
|form_four_id|form_four_name|schedule_number_id|schedule_number_name|no_of_trips|start_time|route_id|route_number|toll_zone|area_limit|total_km|total_dead_km|actual_km|total_running_time|total_break_time|total_steering_time|spread_over_hours|ot_hours|
+------------+--------------+------------------+--------------------+-----------+----------+--------+------------+---------+----------+--------+-------------+---------+------------------+----------------+-------------------+-----------------+--------+
|       35229|             1|              9434|  V-335E/44-All Days|          6|  18:20:00|   32611|        null|        0|         0|   113.0|          0.6|    112.4|              6:50|            null|               7:35|              8:5|  

In [285]:
# schedule_details_df.filter((col("schedule_number")==9434) & (col('form_four_id')==35229) ).show()

In [286]:
VW_schedule_df = VW_window_3_df.filter(col("schedule_name") == 'V-335E/44-All Days')

In [287]:
VW_schedule_count = VW_schedule_df.count()
VW_schedule_count

410

In [288]:
VW_schedule_df.show(2)

+--------+-----------+-------------+--------------------+---------+---------+----------+----------+-----------+------------------+------------+-----------+----------+--------+--------+--------------------+--------------------+-----------+-----------+--------------+----------------+-----------------+----+--------+----------+
|route_id|         id|vts_device_id|            ist_date|      lat|longitude| ist_epoch|   duty_dt|schedule_no|     schedule_name|service_type|trip_number|start_time|end_time|route_no|      etm_start_time|        etm_end_time|bus_stop_id|route_order| bus_stop_name|latitude_current|longitude_current|dist|min_dist|  min_time|
+--------+-----------+-------------+--------------------+---------+---------+----------+----------+-----------+------------------+------------+-----------+----------+--------+--------+--------------------+--------------------+-----------+-----------+--------------+----------------+-----------------+----+--------+----------+
|   32612|14190026552|

In [291]:
VW_by_device_df = VW_window_3_df.filter(col("vts_device_id") == 150218239).orderBy("ist_epoch")

In [292]:
VW_by_device_count = VW_by_device_df.count()
VW_by_device_count

983

In [65]:
temp_time_df = VW_by_device_df.filter(dayofmonth(col('ist_date')) == 1)\
                                .select("ist_date", "schedule_no", "route_id", "trip_number", "bus_stop_id", "bus_stop_name", "route_order")
temp_time_filtered_df = temp_time_df.filter(col('trip_number') == 2)

In [66]:
temp_time_filtered_df.show(1323)

+--------------------+-----------+--------+-----------+-----------+--------------------+-----------+
|            ist_date|schedule_no|route_id|trip_number|bus_stop_id|       bus_stop_name|route_order|
+--------------------+-----------+--------+-----------+-----------+--------------------+-----------+
|2017-02-01 06:45:...|      33103|   28798|          2|         33|      KBS Connector2|         15|
|2017-02-01 06:48:...|      33103|   28798|          2|      11378|   Depot-07 Dummy BS|         16|
|2017-02-01 06:51:...|      33103|   28798|          2|        124|   Maharanis College|         17|
|2017-02-01 06:54:...|      33103|   28798|          2|         61|           KR Circle|         18|
|2017-02-01 06:56:...|      33103|   28798|          2|        127|         Corporation|         19|
|2017-02-01 06:59:...|      33103|   28798|          2|        157|St Joseph Boys Hi...|         20|
|2017-02-01 07:01:...|      33103|   28798|          2|        854|     Richmond Circle|   

In [411]:
temp_time_filtered_df.count()

42

In [438]:
temp_route_point_df = route_point_joined_df.filter(col("route_id") == 23067)

In [447]:
temp_time_route_point_joined_df = temp_route_point_df.join(temp_time_filtered_df,
                                                          ['route_id', 'bus_stop_id', 'route_order'],
                                                          'left_outer')

In [448]:
temp_time_route_point_joined_pdf = temp_time_route_point_joined_df.toPandas()

In [449]:
temp_time_route_point_joined_pdf = temp_time_route_point_joined_pdf.drop(['latitude_current', 'longitude_current'], axis = 1)

In [450]:
temp_time_route_point_joined_pdf = temp_time_route_point_joined_pdf.sort_values(by='route_order')

In [451]:
temp_time_route_point_joined_pdf

Unnamed: 0,route_id,bus_stop_id,route_order,bus_stop_name,ist_date,schedule_no,trip_number,bus_stop_name.1
37,23067,160,1,Kempegowda Bus Station,,,,
25,23067,5841,2,KBS Connector,,,,
31,23067,33,3,KBS Connector2,,,,
43,23067,5840,4,KBS connector3,2017-01-01 08:25:24.0,28000.0,2.0,KBS connector3
30,23067,8455,5,KBS Dummy Bus Stand,2017-01-01 08:25:14.0,28000.0,2.0,KBS Dummy Bus Stand
10,23067,124,6,Maharanis College,2017-01-01 08:29:02.0,28000.0,2.0,Maharanis College
45,23067,61,7,KR Circle,2017-01-01 08:30:23.0,28000.0,2.0,KR Circle
33,23067,126,8,St Marthas Hospital,,,,
15,23067,158,9,Corporation,2017-01-01 08:33:42.0,28000.0,2.0,Corporation
29,23067,159,10,Subbaiah Circle,2017-01-01 08:35:21.0,28000.0,2.0,Subbaiah Circle


In [454]:
temp_time_route_point_joined_pdf.route_order_previous = temp_time_route_point_joined_pdf.route_order.shift(-1)

In [455]:
temp_time_route_point_joined_pdf

Unnamed: 0,route_id,bus_stop_id,route_order,bus_stop_name,ist_date,schedule_no,trip_number,bus_stop_name.1
37,23067,160,1,Kempegowda Bus Station,,,,
25,23067,5841,2,KBS Connector,,,,
31,23067,33,3,KBS Connector2,,,,
43,23067,5840,4,KBS connector3,2017-01-01 08:25:24.0,28000.0,2.0,KBS connector3
30,23067,8455,5,KBS Dummy Bus Stand,2017-01-01 08:25:14.0,28000.0,2.0,KBS Dummy Bus Stand
10,23067,124,6,Maharanis College,2017-01-01 08:29:02.0,28000.0,2.0,Maharanis College
45,23067,61,7,KR Circle,2017-01-01 08:30:23.0,28000.0,2.0,KR Circle
33,23067,126,8,St Marthas Hospital,,,,
15,23067,158,9,Corporation,2017-01-01 08:33:42.0,28000.0,2.0,Corporation
29,23067,159,10,Subbaiah Circle,2017-01-01 08:35:21.0,28000.0,2.0,Subbaiah Circle


In [402]:
route_point_joined_df.filter(col("route_id") == 28788).show(90)

+-----------+--------+-----------+--------------------+----------------+-----------------+
|bus_stop_id|route_id|route_order|       bus_stop_name|latitude_current|longitude_current|
+-----------+--------+-----------+--------------------+----------------+-----------------+
|        160|   28788|          1|Kempegowda Bus St...|     12.97751447|      77.57178022|
|       5841|   28788|          2|       KBS Connector|     12.97752049|      77.57099145|
|         33|   28788|          3|      KBS Connector2|     12.97944962|      77.57116351|
|       5840|   28788|          4|      KBS connector3|     12.97989895|      77.57139177|
|       8455|   28788|          5| KBS Dummy Bus Stand|       12.980274|      77.57191479|
|        124|   28788|          6|   Maharanis College|     12.97705906|      77.58594229|
|         61|   28788|          7|           KR Circle|     12.97472667|         77.58691|
|        126|   28788|          8| St Marthas Hospital|     12.96962529|      77.58719475|

In [391]:
sqlContext.sql("use bmtcwaybill")
sqlContext.sql("show tables").show()
waybill_df = sqlContext.sql("select * from waybill_trip_details")

DataFrame[]

+--------------------+-----------+
|           tableName|isTemporary|
+--------------------+-----------+
|         vts_jan0517|      false|
|waybill_trip_details|      false|
| temp_vw_window_3_df|       true|
+--------------------+-----------+



In [397]:
waybill_df.filter(col("schedule_no") == 28000).select('schedule_no','trip_number',\
                                                             'start_bus_stop_name',\
                                                             'end_bus_stop_name',\
                                                             'route_id').distinct().orderBy('trip_number').show(50,truncate=False)

+-----------+-----------+--------------------------------------+------------------------------------+--------+
|schedule_no|trip_number|start_bus_stop_name                   |end_bus_stop_name                   |route_id|
+-----------+-----------+--------------------------------------+------------------------------------+--------+
|28000      |1          |Depot-7                               |Kempegowda Bus Station              |13509   |
|28000      |1          |Depot-07 Subhashnagara                |Kempegowda Bus Station              |13509   |
|28000      |1          |Depot-07                              |Kempegowda Bus Station              |13509   |
|28000      |2          |Kempegowda Bus Station                |Bannerghatta National Park(Arrival) |23067   |
|28000      |2          |Kempegowda Bus Station                |Bannerughatta National Park         |23067   |
|28000      |2          |Kempegowda Bus Station                |Bannerughatta National Park(Arrival)|23067   |
|