In [1]:
sc

<pyspark.context.SparkContext at 0x7f2c157e4d90>

In [2]:
sc.applicationId

u'application_1529929920393_0075'

In [3]:
import sys
sys.path.append("/usr/lib/python2.7/site-packages")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime
from pyspark.sql.functions import year, month, dayofmonth, isnan, hour, format_number
from pyspark.sql.functions import col, unix_timestamp,abs, log, from_unixtime, avg
from pyspark.sql.functions import count, sum, desc, date_format, lit, concat, udf
from pyspark.sql import functions as F
from pyspark.sql import Window

In [4]:
# For displaying multiple outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
sqlContext.sql("use bmtc_eta_default")
months = ["jan17", "feb17", "mar17", "apr17"]

# Full bus stops are frames filled with actual information
# plus null where data is unavailable
all_months = ["vts_full_bus_stops_" + mth for mth in months]
all_months

vts_df = []
for index in range(len(all_months)):
    sql_query = "select * from " + all_months[index]
    vts_df.append(sqlContext.sql(sql_query))

DataFrame[]

['vts_full_bus_stops_jan17',
 'vts_full_bus_stops_feb17',
 'vts_full_bus_stops_mar17',
 'vts_full_bus_stops_apr17']

In [6]:
# Combine the data from all months into a single dataframe
# Declare a counter to store the number of VTS rows
# in each month
counter = []

# Append all of the data frames into a single DF, via union
vts_main_df = vts_df[0]
counter.append(vts_df[0].count())
for index in range(len(vts_df)-1):
    counter.append(vts_df[index+1].count())
    vts_main_df = vts_main_df.union(vts_df[index+1])

In [7]:
counter

[3694562, 3262450, 3428942, 2972514]

In [8]:
vts_main_count = vts_main_df.count()
vts_main_count

13358468

In [9]:
vts_main_df.filter(col("trip_number") > 5).show(1)

+----------+-------------+-----------+--------+-----------+----------+--------+-----------+-----------+--------------+-----------+--------------------+---------+---------+----------+------------------+------------+--------+--------------------+--------------------+----------------+-----------------+----+--------+----------+------------+
|   duty_dt|vts_device_id|schedule_no|route_id|trip_number|start_time|end_time|route_order|bus_stop_id| bus_stop_name|         id|            ist_date|      lat|longitude| ist_epoch|     schedule_name|service_type|route_no|      etm_start_time|        etm_end_time|latitude_current|longitude_current|dist|min_dist|  min_time|lagged_epoch|
+----------+-------------+-----------+--------+-----------+----------+--------+-----------+-----------+--------------+-----------+--------------------+---------+---------+----------+------------------+------------+--------+--------------------+--------------------+----------------+-----------------+----+--------+--------

## Feature Extraction

### 1) Day of week

In [10]:
weekDay =  udf(lambda x: datetime.strptime(x, '%Y-%m-%d').strftime('%w'))

vts_featured_df = vts_main_df
vts_featured_df = vts_featured_df.withColumn('day_of_week', weekDay(vts_featured_df['duty_dt']))

In [11]:
vts_featured_df.show(2)

+----------+-------------+-----------+--------+-----------+----------+--------+-----------+-----------+--------------------+-----------+--------------------+---------+---------+----------+-----------------+------------+---------+--------------------+--------------------+----------------+-----------------+----+--------+----------+------------+-----------+
|   duty_dt|vts_device_id|schedule_no|route_id|trip_number|start_time|end_time|route_order|bus_stop_id|       bus_stop_name|         id|            ist_date|      lat|longitude| ist_epoch|    schedule_name|service_type| route_no|      etm_start_time|        etm_end_time|latitude_current|longitude_current|dist|min_dist|  min_time|lagged_epoch|day_of_week|
+----------+-------------+-----------+--------+-----------+----------+--------+-----------+-----------+--------------------+-----------+--------------------+---------+---------+----------+-----------------+------------+---------+--------------------+--------------------+---------------

### 2) Distance between bus stops

In [12]:
sqlContext.sql("use bmtc")

# Get the route map
route_map_df = sqlContext.sql("select route_id,start_bus_stop_id,end_bus_stop_id,\
                                      distance,time_to_travel, bus_stop_order from route_map")

DataFrame[]

In [13]:
route_map_df.filter(col("route_id") == 19776).show(50)

+--------+-----------------+---------------+--------+--------------+--------------+
|route_id|start_bus_stop_id|end_bus_stop_id|distance|time_to_travel|bus_stop_order|
+--------+-----------------+---------------+--------+--------------+--------------+
|   19776|             4310|           4970|     855|            80|             1|
|   19776|             4970|           5365|    1485|           113|             2|
|   19776|             5365|            214|    1569|           171|             3|
|   19776|              214|            215|     365|            42|             4|
|   19776|              215|            216|     656|            47|             5|
|   19776|              216|            217|     527|            36|             6|
|   19776|              217|           5309|     372|            45|             7|
|   19776|             5309|            218|     365|            66|             8|
|   19776|              218|            219|    1006|           113|        

In [14]:
route_map_df = route_map_df.withColumnRenamed("end_bus_stop_id", "bus_stop_id")
route_map_df = route_map_df.withColumnRenamed("time_to_travel", "google_time_to_travel")
route_map_df = route_map_df.drop(col("start_bus_stop_id"))
route_map_df = route_map_df.drop(col("bus_stop_order"))

In [15]:
route_map_df.show(2)

+--------+-----------+--------+---------------------+
|route_id|bus_stop_id|distance|google_time_to_travel|
+--------+-----------+--------+---------------------+
|     809|       5841|     140|                   10|
|     809|         33|     200|                   14|
+--------+-----------+--------+---------------------+
only showing top 2 rows



In [16]:
# Adding "distance" from route_map_df to reach the current bus stop from the previous one
vts_featured_1_df = vts_featured_df.join(route_map_df,
                                        ["route_id", "bus_stop_id"],
                                        "left_outer")
vts_featured_1_count = vts_featured_1_df.count()
vts_featured_1_count

13358484

In [17]:
vts_featured_1_df.show(1)

+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+---------------+----+--------+----+---------+---------+-------------+------------+--------+--------------+------------+----------------+-----------------+----+--------+--------+------------+-----------+--------+---------------------+
|route_id|bus_stop_id|   duty_dt|vts_device_id|schedule_no|trip_number|start_time|end_time|route_order|  bus_stop_name|  id|ist_date| lat|longitude|ist_epoch|schedule_name|service_type|route_no|etm_start_time|etm_end_time|latitude_current|longitude_current|dist|min_dist|min_time|lagged_epoch|day_of_week|distance|google_time_to_travel|
+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+---------------+----+--------+----+---------+---------+-------------+------------+--------+--------------+------------+----------------+-----------------+----+--------+--------+------------+-----------+--------+------------

In [18]:
# This is an anomaly - perhaps due to the fact that the route_map table has had a new entry??
vts_featured_1_df.filter((col("distance").isNull()) & (col("route_order") != 1)).select("route_id").distinct().show()

+--------+
|route_id|
+--------+
|   19776|
+--------+



### 3) Time taken to travel to a bus stop from the previous one

In [19]:
vts_featured_2_df = vts_featured_1_df.withColumn("time_from_prev_stop", 
                                                 vts_featured_1_df["ist_epoch"] - vts_featured_1_df["lagged_epoch"])

In [20]:
vts_featured_2_df.show(2)

+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+---------------+----+--------+----+---------+---------+-------------+------------+--------+--------------+------------+----------------+-----------------+----+--------+--------+------------+-----------+--------+---------------------+-------------------+
|route_id|bus_stop_id|   duty_dt|vts_device_id|schedule_no|trip_number|start_time|end_time|route_order|  bus_stop_name|  id|ist_date| lat|longitude|ist_epoch|schedule_name|service_type|route_no|etm_start_time|etm_end_time|latitude_current|longitude_current|dist|min_dist|min_time|lagged_epoch|day_of_week|distance|google_time_to_travel|time_from_prev_stop|
+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+---------------+----+--------+----+---------+---------+-------------+------------+--------+--------------+------------+----------------+-----------------+----+--------+--------+------

In [21]:
vts_featured_2_count = vts_featured_2_df.count()
vts_featured_2_count

13358484

### Check for the negative time travel to a bus stop from the previous bus stop

In [22]:
# Remove rows with the negative timestamp
vts_featured_2_neg_df = vts_featured_2_df.filter(col('time_from_prev_stop') < 0)
vts_featured_2_neg_count = vts_featured_2_neg_df.count()
vts_featured_2_neg_count

367896

In [23]:
# Turns out that a significant number of entries have ist_epoch as null
vts_featured_2_df.filter(vts_featured_2_df["ist_epoch"].isNull()).count()

4431102

In [24]:
vts_featured_2_df.describe("time_from_prev_stop").show()

+-------+-------------------+
|summary|time_from_prev_stop|
+-------+-------------------+
|  count|            7987584|
|   mean| 155.10999095596367|
| stddev|  1845.717970044498|
|    min|             -94796|
|    max|              95104|
+-------+-------------------+



In [25]:
vts_featured_2_df.filter(col('trip_number') == 3).show(1)

+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+-------------+----+--------+----+---------+---------+-------------+------------+--------+--------------+------------+----------------+-----------------+----+--------+--------+------------+-----------+--------+---------------------+-------------------+
|route_id|bus_stop_id|   duty_dt|vts_device_id|schedule_no|trip_number|start_time|end_time|route_order|bus_stop_name|  id|ist_date| lat|longitude|ist_epoch|schedule_name|service_type|route_no|etm_start_time|etm_end_time|latitude_current|longitude_current|dist|min_dist|min_time|lagged_epoch|day_of_week|distance|google_time_to_travel|time_from_prev_stop|
+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+-------------+----+--------+----+---------+---------+-------------+------------+--------+--------------+------------+----------------+-----------------+----+--------+--------+------------

In [26]:
vts_featured_2_df.filter((col("duty_dt") == "2017-01-01") &
                                    (col("vts_device_id") == 150219641) &
                                     (col("schedule_no") == 24906) &
                                     (col("route_id") == 28884) &
                                     (col("trip_number") == 3) &
                                     (col("start_time") == "09:20:00") &
                                     (col("end_time") == "11:00:00"))\
                .select("duty_dt","trip_number", "route_order", "bus_stop_name",
                                       "ist_epoch", "lagged_epoch", "time_from_prev_stop" )\
                .orderBy("route_order")\
                .show(10)

+----------+-----------+-----------+--------------------+----------+------------+-------------------+
|   duty_dt|trip_number|route_order|       bus_stop_name| ist_epoch|lagged_epoch|time_from_prev_stop|
+----------+-----------+-----------+--------------------+----------+------------+-------------------+
|2017-01-01|          3|          1|Kadugodi Bus Station|1483243736|        null|               null|
|2017-01-01|          3|          2|      KDGD Connector|1483243856|  1483243736|                120|
|2017-01-01|          3|          3|     KDGD Dummy Stop|1483243686|  1483243856|               -170|
|2017-01-01|          3|          4|           Hope Farm|1483243308|  1483243686|               -378|
|2017-01-01|          3|          5|                 BPL|1483244593|  1483243308|               1285|
|2017-01-01|          3|          6|   GR Tech Park ITPL|1483244632|  1483244593|                 39|
|2017-01-01|          3|          7|                ITPL|1483244693|  1483244632| 

### Make the negative time_from_prev_stop as NULL

In [27]:
def negTimeTravelNullConverter(time_from_prev_stop):
    if(time_from_prev_stop < 0):
        time_from_prev_stop = None
    return time_from_prev_stop

In [28]:
from pyspark.sql.types import IntegerType
negTimeTravelNullConverterUDF = udf(negTimeTravelNullConverter, IntegerType())

In [29]:
vts_featured_3_df = vts_featured_2_df.withColumn("time_from_prev_stop", 
                                                      negTimeTravelNullConverterUDF(vts_featured_2_df.time_from_prev_stop))

In [30]:
# vts_featured_3_df.dtypes

In [31]:
vts_featured_3_df.filter((col("duty_dt") == "2017-01-01") &
                                    (col("vts_device_id") == 150219641) &
                                     (col("schedule_no") == 24906) &
                                     (col("route_id") == 28884) &
                                     (col("trip_number") == 3) &
                                     (col("start_time") == "09:20:00") &
                                     (col("end_time") == "11:00:00"))\
                .select("duty_dt","trip_number", "route_order", "bus_stop_name",
                                       "ist_epoch", "lagged_epoch", "time_from_prev_stop" )\
                .orderBy("route_order")\
                .show(10)

+----------+-----------+-----------+--------------------+----------+------------+-------------------+
|   duty_dt|trip_number|route_order|       bus_stop_name| ist_epoch|lagged_epoch|time_from_prev_stop|
+----------+-----------+-----------+--------------------+----------+------------+-------------------+
|2017-01-01|          3|          1|Kadugodi Bus Station|1483243736|        null|               null|
|2017-01-01|          3|          2|      KDGD Connector|1483243856|  1483243736|                120|
|2017-01-01|          3|          3|     KDGD Dummy Stop|1483243686|  1483243856|               null|
|2017-01-01|          3|          4|           Hope Farm|1483243308|  1483243686|               null|
|2017-01-01|          3|          5|                 BPL|1483244593|  1483243308|               1285|
|2017-01-01|          3|          6|   GR Tech Park ITPL|1483244632|  1483244593|                 39|
|2017-01-01|          3|          7|                ITPL|1483244693|  1483244632| 

### 4) Feature: time_slot

In [32]:
vts_featured_4_df = vts_featured_3_df.withColumn("hour_of_day", hour(vts_featured_1_df.ist_date))

In [33]:
vts_featured_4_df.show(1)

+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+---------------+----+--------+----+---------+---------+-------------+------------+--------+--------------+------------+----------------+-----------------+----+--------+--------+------------+-----------+--------+---------------------+-------------------+-----------+
|route_id|bus_stop_id|   duty_dt|vts_device_id|schedule_no|trip_number|start_time|end_time|route_order|  bus_stop_name|  id|ist_date| lat|longitude|ist_epoch|schedule_name|service_type|route_no|etm_start_time|etm_end_time|latitude_current|longitude_current|dist|min_dist|min_time|lagged_epoch|day_of_week|distance|google_time_to_travel|time_from_prev_stop|hour_of_day|
+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+---------------+----+--------+----+---------+---------+-------------+------------+--------+--------------+------------+----------------+-----------------+----+

In [34]:
vts_featured_4_count = vts_featured_4_df.count()
vts_featured_4_count

13358484

### 5) Feature: month_of_trip

In [49]:
vts_featured_5_df = vts_featured_4_df.withColumn('month_of_trip', month(vts_featured_4_df['duty_dt']))

## Remove null rows

In [50]:
vts_filtered_df = vts_featured_5_df.filter(col("time_from_prev_stop").isNotNull())

In [51]:
vts_filtered_count = vts_filtered_df.count()
vts_filtered_count

7619688

In [52]:
vts_filtered_df.show(1)

+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+---------------+-----------+--------------------+---------+---------+----------+-------------------+------------+--------+--------------------+--------------------+----------------+-----------------+----+--------+----------+------------+-----------+--------+---------------------+-------------------+-----------+-------------+
|route_id|bus_stop_id|   duty_dt|vts_device_id|schedule_no|trip_number|start_time|end_time|route_order|  bus_stop_name|         id|            ist_date|      lat|longitude| ist_epoch|      schedule_name|service_type|route_no|      etm_start_time|        etm_end_time|latitude_current|longitude_current|dist|min_dist|  min_time|lagged_epoch|day_of_week|distance|google_time_to_travel|time_from_prev_stop|hour_of_day|month_of_trip|
+--------+-----------+----------+-------------+-----------+-----------+----------+--------+-----------+---------------+-----------+---------

## Ready for Naive Prediction of ETA

In [None]:
vts_filtered_mean_gdf = vts_filtered_df.groupBy("route_id", "bus_stop_id", \
                                                "month_of_trip", "day_of_week", "hour_of_day")\
                                       .mean("time_from_prev_stop")

In [47]:
vts_filtered_mean_gdf.show(10)

+--------+-----------+-----------+-----------+------------------------+
|route_id|bus_stop_id|day_of_week|hour_of_day|avg(time_from_prev_stop)|
+--------+-----------+-----------+-----------+------------------------+
|   13340|       9297|          0|         11|      236.21621621621622|
|   13340|       9297|          5|         12|      234.07142857142858|
|   20187|       5841|          6|         22|                  184.25|
|   21709|         20|          1|         15|                    71.0|
|   25937|      10287|          0|         17|       333.4861111111111|
|   27857|        695|          2|         15|                   318.0|
|   29006|        162|          0|         23|                    67.6|
|   29006|        162|          0|          7|                    60.0|
|   29086|       5559|          1|         19|                   119.0|
|   30485|       2587|          2|         14|      143.97692307692307|
+--------+-----------+-----------+-----------+------------------

In [48]:
vts_filtered_mean_gdf.count()

517584

## Prediction based on the Third Quartile = 75th percentile

In [60]:
sqlContext.sql("use bmtc_eta_default")
vts_filtered_df.registerTempTable("temp_vts_filtered_df")

vts_quartile3_df = sqlContext.sql("select route_id, bus_stop_id, route_order, distance, \
                                   google_time_to_travel, month_of_trip, day_of_week, hour_of_day, \
                                   round(percentile_approx(time_from_prev_stop, 0.75), 0) as Q3_val,\
                                   count(*) as count \
                             from temp_vts_filtered_df \
                             group by route_id, bus_stop_id, route_order,\
                                   distance, google_time_to_travel, month_of_trip, day_of_week, hour_of_day")

DataFrame[]

In [61]:
vts_quartile3_df.show(10)

+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+
|route_id|bus_stop_id|route_order|distance|google_time_to_travel|month_of_trip|day_of_week|hour_of_day|Q3_val|count|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+
|    4544|       2589|         23|     900|                  122|            1|          3|         17| 308.0|    1|
|    4996|       8345|         16|    1400|                  191|            3|          3|         19| 598.0|    2|
|    5800|       4650|          7|     500|                   56|            2|          0|         21|  71.0|    1|
|    5806|        218|         15|     700|                   57|            1|          1|         19|  80.0|    1|
|    5806|       4941|          9|     200|                   48|            1|          2|         18| 756.0|    5|
|    7362|          7|         13|     665|                   98

In [62]:
vts_quartile3_df.count()

1433448

In [63]:
vts_quartile3_df.filter((col("route_id") == 16924) & 
                        (col("day_of_week") == 1)).orderBy("route_order").show(50)

+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+
|route_id|bus_stop_id|route_order|distance|google_time_to_travel|month_of_trip|day_of_week|hour_of_day|Q3_val|count|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+
|   16924|       5927|          1|    null|                 null|            4|          1|         17|   0.0|    1|
|   16924|        419|          2|    1064|                   99|            3|          1|         16| 175.0|    2|
|   16924|        419|          2|    1064|                   99|            2|          1|         17| 329.0|    6|
|   16924|        419|          2|    1064|                   99|            1|          1|         16| 229.0|    8|
|   16924|        419|          2|    1064|                   99|            1|          1|         17| 288.0|    4|
|   16924|        419|          2|    1064|                   99

In [67]:
# vts_gdf = vts_filtered_df.groupBy("route_id", "bus_stop_id", "route_order", 
#                                   "distance", "google_time_to_travel", 
#                                   "day_of_week", "hour_of_day").agg({"*": "count"})\
#                         .orderBy(col("count(1)"), ascending=True)

In [68]:
# Assuming a bus travels at 6 kmph at a minimum
vts_quartile3_speed_df = vts_quartile3_df.withColumn("time_cap", format_number(col("distance")/1.66,2))

In [69]:
vts_quartile3_speed_df.show(10)

+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+
|route_id|bus_stop_id|route_order|distance|google_time_to_travel|month_of_trip|day_of_week|hour_of_day|Q3_val|count|time_cap|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+
|    4544|       2589|         23|     900|                  122|            1|          3|         17| 308.0|    1|  542.17|
|    4996|       8345|         16|    1400|                  191|            3|          3|         19| 598.0|    2|  843.37|
|    5800|       4650|          7|     500|                   56|            2|          0|         21|  71.0|    1|  301.20|
|    5806|        218|         15|     700|                   57|            1|          1|         19|  80.0|    1|  421.69|
|    5806|       4941|          9|     200|                   48|            1|          2|         18| 756.0|    5|  

In [71]:
vts_normal_speed_df = vts_quartile3_speed_df.filter(col("Q3_val") <= col("time_cap"))
vts_normal_speed_count = vts_normal_speed_df.count()
vts_normal_speed_count

1199113

In [72]:
517588 - 425662

91926

In [73]:
# Delete existing table
sqlContext.sql("use bmtc_eta_default")
sqlContext.sql("drop table vts_normal_speed")

DataFrame[]

DataFrame[]

In [74]:
# Instead of creating a persistent table using saveAsTable, make temp table and dump it as a hive table
sqlContext.sql("use bmtc_eta_default")
vts_normal_speed_df.createOrReplaceTempView("temp_vts_normal_speed") 
sqlContext.sql("create table vts_normal_speed as select * from temp_vts_normal_speed")

# sqlContext.sql("insert into vts_bus_stop_min_dist_apr17 select * from temp_VW_window_3_df")

DataFrame[]

DataFrame[]

In [75]:
sqlContext.sql("show tables").show(truncate=False)

+---------------------------+-----------+
|tableName                  |isTemporary|
+---------------------------+-----------+
|vts_365r                   |false      |
|vts_365r_distance          |false      |
|vts_365r_distance_1        |false      |
|vts_365r_filtered          |false      |
|vts_bus_stop_min_dist_apr17|false      |
|vts_bus_stop_min_dist_feb17|false      |
|vts_bus_stop_min_dist_jan17|false      |
|vts_bus_stop_min_dist_mar17|false      |
|vts_full_bus_stops_apr17   |false      |
|vts_full_bus_stops_feb17   |false      |
|vts_full_bus_stops_jan17   |false      |
|vts_full_bus_stops_mar17   |false      |
|vts_jun2017                |false      |
|vts_normal_speed           |false      |
|vts_volvo_2017             |false      |
|vts_volvo_oct2017          |false      |
|vts_waybill_volvo_apr2017  |false      |
|vts_waybill_volvo_feb2017  |false      |
|vts_waybill_volvo_jan2017  |false      |
|vts_waybill_volvo_jun2017  |false      |
+---------------------------+-----

In [76]:
vts_df = sqlContext.sql("select * from vts_normal_speed")
vts_df.count()

1199113