In [170]:
sc

<pyspark.context.SparkContext at 0x7f06359c5d90>

In [199]:
sc.applicationId

u'application_1529929920393_0152'

## Load the libraries

In [200]:
import sys
sys.path.append("/usr/lib/python2.7/site-packages")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datetime import datetime

from pyspark.sql.functions import year, month, dayofmonth, isnan, hour, format_number
from pyspark.sql.functions import col, unix_timestamp,abs, log, from_unixtime, avg
from pyspark.sql.functions import count, sum, desc, date_format, lit, concat, udf
from pyspark.sql import functions as F
from pyspark.sql import Window
from pyspark.sql.types import IntegerType

In [201]:
# For displaying multiple outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load the data from Part 5 

In [202]:
sqlContext.sql("use bmtc_eta_default")

# Read bus stop traversal times with a cap on the gap
# between bus stops
engineered_df = sqlContext.sql("select * from vts_normal_speed")
engineered_count = engineered_df.count()
engineered_count

DataFrame[]

1199113

In [203]:
engineered_df.show(5)

+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+
|route_id|bus_stop_id|route_order|distance|google_time_to_travel|month_of_trip|day_of_week|hour_of_day|q3_val|count|time_cap|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+
|    4544|       2589|         23|     900|                  122|            1|          3|         17| 308.0|    1|  542.17|
|    4996|       8345|         16|    1400|                  191|            3|          3|         19| 598.0|    2|  843.37|
|    5800|       4650|          7|     500|                   56|            2|          0|         21|  71.0|    1|  301.20|
|    5806|        218|         15|     700|                   57|            1|          1|         19|  80.0|    1|  421.69|
|    7362|         21|         10|     386|                   45|            3|          3|         15|  60.0|    1|  

In [204]:
sqlContext.sql("use bmtc")

# Get the route map
route_map_df = sqlContext.sql("select route_id,start_bus_stop_id as prev_bus_stop_id,\
                               end_bus_stop_id as bus_stop_id,\
                               bus_stop_order as prev_bus_stop_order from route_map")

# Get the route_point
route_point_df = sqlContext.sql("select route_id, route_order, bus_stop_id from route_point")

DataFrame[]

In [205]:
route_map_df.filter(col("route_id") == 4544).show(1)

+--------+----------------+-----------+-------------------+
|route_id|prev_bus_stop_id|bus_stop_id|prev_bus_stop_order|
+--------+----------------+-----------+-------------------+
|    4544|             480|       8882|                  1|
+--------+----------------+-----------+-------------------+
only showing top 1 row



## Data Engineering:

### For OD matrix, add the previous bus stop information based upon the route ID

In [206]:
OD_entry_df = engineered_df.join(route_map_df, ["route_id", "bus_stop_id"], "left_outer")

In [207]:
OD_entry_df.show(5)

+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+
|route_id|bus_stop_id|route_order|distance|google_time_to_travel|month_of_trip|day_of_week|hour_of_day|q3_val|count|time_cap|prev_bus_stop_id|prev_bus_stop_order|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+
|    7114|        887|         29|     605|                   69|            2|          3|          6|  79.0|    1|  364.46|             888|                 28|
|   13340|       6091|         15|     739|                   49|            2|          1|          6|  60.0|    8|  445.18|            7509|                 14|
|   13340|       6091|         15|     739|                   49|            3|          4|         16|  52.0|    2|  445.18|            7509|                 14|
|   13340|       6091|

In [None]:
#TODO: What if the bus stop id is 1

### Add the categorical features

In [208]:
# Information as conveyed to us by BMTC regarding traffic patterns
def monthBlock(mth):
    if(mth <= 5):
        block = 1
    elif(mth <= 10):
        block = 2
    elif(mth <= 12):
        block = 3
    else:
        block = -1
    return(block)

In [209]:
udfMonthBlock = udf(monthBlock)

In [210]:
OD_entry_2_df = OD_entry_df.withColumn("month_block", udfMonthBlock(OD_entry_df.month_of_trip))

In [211]:
OD_entry_2_df.show(1)

+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+-----------+
|route_id|bus_stop_id|route_order|distance|google_time_to_travel|month_of_trip|day_of_week|hour_of_day|q3_val|count|time_cap|prev_bus_stop_id|prev_bus_stop_order|month_block|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+-----------+
|    7114|        887|         29|     605|                   69|            2|          3|          6|  79.0|    1|  364.46|             888|                 28|          1|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+-----------+
only showing top 1 row



In [212]:
def hourBlock(hour_of_day):
    if(hour_of_day <= 7):
        block = 1
    elif(hour_of_day <= 11):
        block = 2
    elif(hour_of_day <= 16):
        block = 3
    elif(hour_of_day <= 20):
        block = 4
    elif(hour_of_day <= 23):
        block = 1
    else:
        block = -1
    return(block)

In [213]:
udfHourBlock = udf(hourBlock)

In [214]:
OD_entry_3_df = OD_entry_2_df.withColumn("hour_block", udfHourBlock(OD_entry_2_df.hour_of_day))

In [215]:
OD_entry_3_df.show(1)

+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+-----------+----------+
|route_id|bus_stop_id|route_order|distance|google_time_to_travel|month_of_trip|day_of_week|hour_of_day|q3_val|count|time_cap|prev_bus_stop_id|prev_bus_stop_order|month_block|hour_block|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+-----------+----------+
|    7114|        887|         29|     605|                   69|            2|          3|          6|  79.0|    1|  364.46|             888|                 28|          1|         1|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+-----------+----------+
only showing top 1 row



In [216]:
OD_entry_3_df = OD_entry_3_df.withColumn("day_of_week", OD_entry_3_df["day_of_week"].cast(IntegerType()))

In [217]:
def dayBlock(day_of_week):
    # print("dayBlock: {}".format(day_of_week))
    if(day_of_week == 0):
        block = 1
    elif(day_of_week <= 5):
        block = 2
    elif(day_of_week == 6):
        block = 1
    else:
        block = -1
    return(block)

In [218]:
udfDayBlock = udf(dayBlock)

In [219]:
OD_entry_4_df = OD_entry_3_df.withColumn("day_block", udfDayBlock(OD_entry_3_df.day_of_week))

In [225]:
OD_entry_4_df = OD_entry_4_df.withColumn("q3_count", col("q3_val") * col("count"))

In [226]:
OD_entry_4_df.show(2)

+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+-----------+----------+---------+--------+
|route_id|bus_stop_id|route_order|distance|google_time_to_travel|month_of_trip|day_of_week|hour_of_day|q3_val|count|time_cap|prev_bus_stop_id|prev_bus_stop_order|month_block|hour_block|day_block|q3_count|
+--------+-----------+-----------+--------+---------------------+-------------+-----------+-----------+------+-----+--------+----------------+-------------------+-----------+----------+---------+--------+
|    7114|        887|         29|     605|                   69|            2|          3|          6|  79.0|    1|  364.46|             888|                 28|          1|         1|        2|    79.0|
|   13340|       6091|         15|     739|                   49|            2|          1|          6|  60.0|    8|  445.18|            7509|                 14|          1|      

In [227]:
OD_entry_4_count = OD_entry_4_df.count()
OD_entry_4_count

1199116

### Ready to create OD matrices by [ month_block, day_block, hour_block ]

In [228]:
from pyspark.sql.functions import lag, col
from pyspark.sql.window import Window

In [231]:
OD_entry_5_df = OD_entry_4_df.groupBy("month_block","day_block","hour_block", 
                                    "prev_bus_stop_id", "bus_stop_id").agg({"q3_count": "sum", "count": "sum"})

In [232]:
OD_entry_5_df.show(2)

+-----------+---------+----------+----------------+-----------+-------------+----------+
|month_block|day_block|hour_block|prev_bus_stop_id|bus_stop_id|sum(q3_count)|sum(count)|
+-----------+---------+----------+----------------+-----------+-------------+----------+
|          1|        2|         1|              64|       8207|         50.0|         4|
|          1|        2|         3|            9291|       7391|    1202956.0|      5034|
+-----------+---------+----------+----------------+-----------+-------------+----------+
only showing top 2 rows



In [235]:
OD_entry_6_df = OD_entry_5_df.withColumnRenamed("sum(q3_count)", "sum_q3_count") 
OD_entry_6_df = OD_entry_6_df.withColumnRenamed("sum(count)", "sum_count") 
OD_entry_6_df = OD_entry_6_df.withColumn("avg_time_taken", col("sum_q3_count")/col("sum_count")) 

In [236]:
OD_entry_6_df = OD_entry_6_df.withColumn("avg_time_taken",
                                         col("avg_time_taken").cast(IntegerType()))

In [237]:
OD_entry_6_df.show(10)

+-----------+---------+----------+----------------+-----------+------------+---------+--------------+
|month_block|day_block|hour_block|prev_bus_stop_id|bus_stop_id|sum_q3_count|sum_count|avg_time_taken|
+-----------+---------+----------+----------------+-----------+------------+---------+--------------+
|          1|        2|         2|             362|       5561|   1331749.0|    12170|           109|
|          1|        1|         3|              28|      10287|    443558.0|     1302|           340|
|          1|        1|         4|              57|      10005|     23474.0|      204|           115|
|          1|        2|         2|             560|        281|      3372.0|       50|            67|
|          1|        1|         3|              45|         38|    207035.0|      848|           244|
|          1|        1|         2|             205|       5707|       811.0|        5|           162|
|          1|        2|         4|            5365|       7502|   1053153.0|     3

### Now construct the route_map with the OD_matrix

In [77]:
### Given that there are multiple google_time_to_travel against the same pair of stops, we average them

# route_map_gdf = route_map_df.groupBy("prev_bus_stop_id", "bus_stop_id").agg({"distance": "avg", 
#                                                                              "google_time_to_travel": "avg"})
# route_map_gdf.show(5)

# route_map_1_gdf = route_map_gdf.withColumnRenamed("avg(google_time_to_travel)", "google_time_to_travel") 
# route_map_1_gdf = route_map_1_gdf.withColumnRenamed("avg(distance)", "distance") 
# route_map_1_gdf = route_map_1_gdf.withColumn("google_time_to_travel", col("google_time_to_travel").cast(IntegerType()))
# route_map_1_gdf = route_map_1_gdf.withColumn("distance", col("distance").cast(IntegerType()))

# OD_entry_7_df = OD_entry_6_df.join(route_map_1_gdf, ["prev_bus_stop_id", "bus_stop_id"], "left_outer")

In [None]:
# Filter the volvo route_id's
volvo_route_id_list = OD_entry_4_df.select("route_id").rdd.map(lambda x:x[0]).distinct().collect()
len(volvo_route_id_list)

In [238]:
sqlContext.sql("use bmtc")

# Get the route map, not taking the variation in the selected pair of bus stops across different route id
# i.e. ignoring the distance and time to travel variation
route_map_df = sqlContext.sql("select route_id, start_bus_stop_id as prev_bus_stop_id,\
                                      end_bus_stop_id as bus_stop_id, bus_stop_order, distance, \
                                      time_to_travel as google_time_to_travel \
                                from route_map")

DataFrame[]

In [239]:
route_map_df.show(2)
route_map_df.count()

+--------+----------------+-----------+--------------+--------+---------------------+
|route_id|prev_bus_stop_id|bus_stop_id|bus_stop_order|distance|google_time_to_travel|
+--------+----------------+-----------+--------------+--------+---------------------+
|     809|             160|       5841|             1|     140|                   10|
|     809|            5841|         33|             2|     200|                   14|
+--------+----------------+-----------+--------------+--------+---------------------+
only showing top 2 rows



1043894

In [240]:
volvo_route_map_df = route_map_df.filter(col("route_id").isin(volvo_route_id_list))
volvo_route_map_df.count()

23033

In [241]:
volvo_route_map_df.select("bus_stop_id").distinct().count()

2484

In [242]:
OD_entry_6_df.show(2)

+-----------+---------+----------+----------------+-----------+------------+---------+--------------+
|month_block|day_block|hour_block|prev_bus_stop_id|bus_stop_id|sum_q3_count|sum_count|avg_time_taken|
+-----------+---------+----------+----------------+-----------+------------+---------+--------------+
|          1|        2|         2|             362|       5561|   1331749.0|    12170|           109|
|          1|        1|         3|              28|      10287|    443558.0|     1302|           340|
+-----------+---------+----------+----------------+-----------+------------+---------+--------------+
only showing top 2 rows



In [251]:
# Formulate the volvo_route_map_df with the OD matrix.
# The route_order and prev_bus_stop_order were eliminated during OD matrix creation.
# Now the bus_stop_order represents the prev_bus_stop_id
volvo_route_map_joined_df = volvo_route_map_df.join(OD_entry_6_df,['prev_bus_stop_id', 'bus_stop_id'],"left_outer")
volvo_route_map_joined_df.count() # The rows will increase based upon the selected combination of block

154327

In [252]:
volvo_route_map_joined_df.show(3)

+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+------------+---------+--------------+
|prev_bus_stop_id|bus_stop_id|route_id|bus_stop_order|distance|google_time_to_travel|month_block|day_block|hour_block|sum_q3_count|sum_count|avg_time_taken|
+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+------------+---------+--------------+
|             126|        158|   28807|             8|     764|                  215|          1|        2|         3|    573837.0|     1759|           326|
|             126|        158|   28807|             8|     764|                  215|          1|        2|         2|    476824.0|     1885|           252|
|             126|        158|   28807|             8|     764|                  215|          1|        1|         2|    185339.0|      846|           219|
+----------------+-----------+--------+--------------+----

In [253]:
# Validation for a route
volvo_route_map_joined_df.filter(col("route_id") == 23067).orderBy("bus_stop_order").show(3)

+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+------------+---------+--------------+
|prev_bus_stop_id|bus_stop_id|route_id|bus_stop_order|distance|google_time_to_travel|month_block|day_block|hour_block|sum_q3_count|sum_count|avg_time_taken|
+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+------------+---------+--------------+
|             160|       5841|   23067|             1|      80|                    5|          1|        1|         2|     51467.0|     1134|            45|
|             160|       5841|   23067|             1|      80|                    5|          1|        2|         1|    112214.0|     2786|            40|
|             160|       5841|   23067|             1|      80|                    5|          1|        2|         3|    185092.0|     5284|            35|
+----------------+-----------+--------+--------------+----

In [254]:
volvo_route_map_joined_df = volvo_route_map_joined_df.drop("sum_q3_count")

In [261]:
volvo_route_map_null_df = volvo_route_map_joined_df.filter(col("avg_time_taken").isNull())

In [263]:
# Just for exploration
volvo_route_map_null_df.count()
volvo_route_map_null_df.show(5)

1455

+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+---------+--------------+
|prev_bus_stop_id|bus_stop_id|route_id|bus_stop_order|distance|google_time_to_travel|month_block|day_block|hour_block|sum_count|avg_time_taken|
+----------------+-----------+--------+--------------+--------+---------------------+-----------+---------+----------+---------+--------------+
|             405|       6914|   28767|            35|    1935|                  411|       null|     null|      null|     null|          null|
|            1094|       1095|    9244|             3|     249|                   21|       null|     null|      null|     null|          null|
|            1094|       1095|    3359|             5|     200|                   20|       null|     null|      null|     null|          null|
|            1707|       2713|   22611|            11|     855|                  258|       null|     null|      null|     null|        

In [256]:
volvo_route_map_joined_pdf = volvo_route_map_joined_df.toPandas()
volvo_route_map_joined_pdf.head(10)

Unnamed: 0,prev_bus_stop_id,bus_stop_id,route_id,bus_stop_order,distance,google_time_to_travel,month_block,day_block,hour_block,sum_count,avg_time_taken
0,126,158,20187,8,764,209,1,1,1,1015.0,169.0
1,126,158,20187,8,764,209,1,2,1,2732.0,169.0
2,126,158,20187,8,764,209,1,1,3,1257.0,275.0
3,126,158,20187,8,764,209,1,1,2,846.0,219.0
4,126,158,20187,8,764,209,1,2,4,1297.0,309.0
5,126,158,20187,8,764,209,1,2,2,1885.0,252.0
6,126,158,20187,8,764,209,1,2,3,1759.0,326.0
7,126,158,20187,8,764,209,1,1,4,1060.0,282.0
8,126,158,23067,8,764,209,1,1,1,1015.0,169.0
9,126,158,23067,8,764,209,1,2,1,2732.0,169.0


In [271]:
sqlContext.sql("use bmtc_eta_default")
volvo_route_map_joined_df.createOrReplaceTempView("temp_volvo_route_map_joined_df") 
sqlContext.sql("create table vts_od_matrix as select * from temp_volvo_route_map_joined_df")

DataFrame[]

DataFrame[]

In [272]:
vts_od_matrix_df = sqlContext.sql("select * from vts_od_matrix")

In [273]:
vts_od_matrix_df.count()

154327

In [260]:
volvo_route_map_null_pdf = volvo_route_map_joined_pdf["avg_time_taken"].isnull()
volvo_route_map_null_pdf.head(3)

0    False
1    False
2    False
Name: avg_time_taken, dtype: bool

## Imputation

In [264]:
#volvo_route_map_joined_pdf['avg_traversal_Q3'].isnull().value_counts()

In [None]:
## Drive speed of travel in kmph
# engineered_feature1_pdf["speed"] = engineered_feature1_pdf.apply(
#                                         lambda row: 3.6*(row["distance"]/row["q3_val"]) if row["q3_val"] > 0 else None,
#                                                                 axis = 1)