In [1]:
sc

<pyspark.context.SparkContext at 0x7f9748621e10>

In [2]:
sc.applicationId

u'application_1522648856070_0187'

## Load the required packages 

In [3]:
import sys
sys.path.append("/usr/lib/python2.7/site-packages")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import col, asc, desc,log

In [4]:
# For displaying multiple outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load vehicle tracking data

In [5]:
# Make the Hive database that holds the monthly VTS tables the active one.
sqlContext.sql("use bmtcvts")

# Load the VTS data for the period Jul 2016 - Jun 2017.
# Each entry is the month tag embedded in a table name of the form vts_<tag>_parquet.
months = ["jul16", "aug16", "sep16", "oct16", "nov16", "dec16", \
          "jan17", "feb17", "mar17", "apr17", "may17", "jun17"]
# The months below are commented out, so they are not loaded in this run:
#          "jul17", "aug17", "sep17", "oct17", "nov17", "dec17"]

# Build the full table name for every month listed above; the bare
# expression on the last line displays the resulting list.
all_months = ["vts_" + mth + "_parquet" for mth in months]
all_months

DataFrame[]

['vts_jul16_parquet',
 'vts_aug16_parquet',
 'vts_sep16_parquet',
 'vts_oct16_parquet',
 'vts_nov16_parquet',
 'vts_dec16_parquet',
 'vts_jan17_parquet',
 'vts_feb17_parquet',
 'vts_mar17_parquet',
 'vts_apr17_parquet',
 'vts_may17_parquet',
 'vts_jun17_parquet']

### We are retaining the data frames month by month so we can count rows

In [6]:
# One DataFrame per monthly table, kept in the same order as all_months
# so that position i always refers to month i.
vts_df = [sqlContext.sql("select * from " + table_name) for table_name in all_months]

In [7]:
len(vts_df)
type(vts_df)
type(vts_df[0])

12

list

pyspark.sql.dataframe.DataFrame

In [8]:
# Merge the monthly DataFrames into a single DataFrame via union, and record
# the number of VTS rows per month (counter[i] is the row count of vts_df[i]).
vts_main_df = vts_df[0]
counter = [vts_main_df.count()]

# Fold the remaining months in one at a time, counting each before the union
for monthly_df in vts_df[1:]:
    counter.append(monthly_df.count())
    vts_main_df = vts_main_df.union(monthly_df)

In [9]:
# The VTS data covers one year, from July 2016 to June 2017
# vts_main_df.count()

In [10]:
# VTS row counts by month: the first index corresponds to July 2016 and the last index to June 2017
counter

[1013168409,
 1109032457,
 974045190,
 1061168099,
 1040199416,
 1095604367,
 1067855533,
 976218546,
 1080027417,
 999717643,
 1005573387,
 984483441]

## Load static data into Spark dataframes

### Load the waybill trip details for all of 2016-17

In [11]:
sqlContext.sql("use bmtcwaybill")

# Get the waybill trip details, selecting only the columns needed for this analysis
waybill_trip_details_df = sqlContext.sql("select id,waybill_id,duty_dt,device_id,\
                                          status,schedule_type_id,schedule_no,schedule_name,\
                                          service_type,service_name,trip_number,\
                                          start_point,start_bus_stop_name,end_point,end_bus_stop_name,\
                                          route_id,route_no,distance,start_time,\
                                          act_start_time,etm_start_time,end_time,act_end_time,\
                                          etm_end_time,running_time,is_dread_trip \
                                          from waybill_trip_details")

DataFrame[]

In [12]:
# Keep waybill rows whose duty date falls in June-December 2016 or anywhere in 2017.
# (An earlier variant of this cell restricted the data to January 2017 only.)
duty_year = year(waybill_trip_details_df.duty_dt)
duty_month = month(waybill_trip_details_df.duty_dt)

in_second_half_2016 = (duty_year == 2016) & (duty_month >= 6)
in_2017 = (duty_year == 2017)

waybill_trip_details_filtered_df = waybill_trip_details_df.filter(in_second_half_2016 | in_2017)

## Extract VTS data for 365R/1-All Days

### Get the Schedule No. for the Schedule Name 365R/1-All Days

In [13]:
# Get the list of all schedule names starting with 365
# schedule_365R = waybill_trip_details_filtered_df.filter(col('schedule_name').like("%365R%"))\
# .select("schedule_name").distinct().rdd.map(lambda x:x[0]).collect()

In [14]:
# schedule_365R

In [15]:
# For schedule 365R/1-All Days we see two schedule numbers, which correspond to two Form Four IDs
# waybill_trip_details_filtered_df.filter(col('schedule_name')=='365R/1-All Days')\
# .select("schedule_no").distinct().show()

In [16]:
# Get the waybill data for the 365R schedule.
# NOTE(review): 3037 is a hard-coded schedule_no; presumably it is one of the
# schedule numbers of '365R/1-All Days' found by the exploratory query in the
# previous cell -- confirm before reusing this for another schedule.
waybill_trip_details_filtered_365_df = waybill_trip_details_filtered_df.filter(col('schedule_no')==3037)

In [17]:
# Collect the distinct device IDs that appear in the 365R waybill rows
# into a plain Python list on the driver.
distinct_device_rows = waybill_trip_details_filtered_365_df.select('device_id').distinct().collect()
device_id_365R_list = [row[0] for row in distinct_device_rows]

In [18]:
# device_id_365R_list

In [19]:
len(device_id_365R_list)

58

In [20]:
# List the buses running on the schedule across the years, without the
# empty-string device ID.
# Filtering (instead of list.remove('')) keeps this cell idempotent: remove()
# raises ValueError when the cell is re-run or when no blank ID is present.
device_id_365R_list = [dev_id for dev_id in device_id_365R_list if dev_id != '']
#device_id_365R_list

## Filter Stage 1: 
From the one-year corpus, take the VTS data for all the device IDs that have run on the 365R/1-All Days schedule. 

In [21]:
# Stage 1: keep only the VTS rows emitted by devices that have run on 365R
on_365R_device = vts_main_df.device_id.isin(device_id_365R_list)
vts_filtered_df = vts_main_df.filter(on_365R_device)

In [67]:
# vts_filtered_df.count()
# Count = 112274282

112274282

### Writing the results of the stage 1 into the hive table
We must run this block across the months and append to table

In [22]:
sqlContext.sql("use bmtc_eta_default")

DataFrame[]

In [23]:
# Save the vts data for 365 to hive
# Instead of creating a persistent table using saveAsTable, make temp table and dump it as a hive table
# vts_filtered_df.createOrReplaceTempView("temp_vts_filtered_df") 

# The lifetime of this temporary table is tied to the :class:`SparkSession`

# Create a hive table to dump data from temp table
# sqlContext.sql("create table vts_365R as select * from temp_vts_filtered_df")

In [24]:
# Code to append to the existing table
# Filter VTS data for the 365R's deviceID
#vts_filtered_feb_df = vts_feb_df[vts_feb_df.device_id.isin(device_id_365R_list)]

# Save the vts data for 365 to hive
# Instead of creating a persistent table using saveAsTable, make temp table and dump it as a hive table
#vts_filtered_feb_df.createOrReplaceTempView("temp_vts_filtered_feb_df") 


# The lifetime of this temporary table is tied to the :class:`SparkSession`

# Create a hive table to dump data from temp table
# sqlContext.sql("insert into table vts_365r select * from temp_vts_filtered_feb_df")

In [22]:
sqlContext.sql("show tables").show()

+---------+-----------+
|tableName|isTemporary|
+---------+-----------+
| vts_365r|      false|
+---------+-----------+



In [None]:
# sqlContext.sql("select id, device_id, lat, longitude, ist_date from vts_365r limit 10").show()

+----------+---------+---------+---------+--------------------+
|        id|device_id|      lat|longitude|            ist_date|
+----------+---------+---------+---------+--------------------+
|5867036184|150223075| 12.93866|77.447968|2016-06-30 23:59:...|
|5867036363|150814749|12.871403|77.586349|2016-06-30 23:59:...|
|5867036504|150221823|13.028916| 77.59285|2016-06-30 23:59:...|
|5867036623|150812902|13.026287|77.637184|2016-06-30 23:59:...|
|5867036652|150221245|12.903559|77.473869|2016-06-30 23:59:...|
|5867036764|150812902|13.026287|77.637184|2016-07-01 00:00:...|
|5867036766|150218443|  13.1006|77.594955|2016-06-30 23:59:...|
|5867036831|150221245|12.903559|77.473869|2016-06-30 23:59:...|
|5867036999|150223075|12.938661|77.447968|2016-06-30 23:59:...|
|5867037004|150814749|12.871403|77.586349|2016-06-30 23:59:...|
+----------+---------+---------+---------+--------------------+



In [86]:
# sqlContext.sql("select count(*) from vts_365r").show()

+---------+
| count(1)|
+---------+
|112274282|
+---------+



## Filter Stage 2:
Building on stage 1, remove a device's rows for the days on which that bus did not run on the 365R/1-All Days schedule

### Build a dictionary [key = device id, values = days on which they plied on the schedule]

In [23]:
# Create a dictionary of days on which a device (bus) plied on the Schedule

# The results of this dictionary are saved now.
# device_date_mapping = {}

# for device_id in device_id_365R_list:
#     dates_of_plying_list = waybill_trip_details_filtered_365_df.\
#     filter(waybill_trip_details_filtered_365_df.device_id == device_id).\
#     select("duty_dt").distinct().select("duty_dt").rdd.map(lambda x: x[0]).collect()
    
#     device_date_mapping[device_id] = dates_of_plying_list

# # Save the dictionary: so the next time, we can read directly instead of running it again
# import pickle
# import os

# pickle_file = open('device_date_mapping.pickle', 'wb')
# pickle.dump(obj = device_date_mapping, file = pickle_file)
# pickle_file.close()

In [23]:
# Import os here so this cell also works on a fresh kernel; the notebook's
# other `import os` sits in a later cell.
import os

# Show the working directory, i.e. where device_date_mapping.pickle is read from
os.getcwd()

'/home/bdagr1/pubs/ETA'

In [25]:
import pickle
import os

# Load the saved mapping {device_id: [dates on which it plied the schedule]}
# instead of recomputing it from the waybill data.
# The `with` block closes the file handle once loading is done.
# NOTE: pickle.load can execute arbitrary code -- only load a pickle file that
# was produced by this notebook / a trusted source.
with open('device_date_mapping.pickle', 'rb') as pickle_file:
    device_date_mapping = pickle.load(pickle_file)

In [26]:
# Total number of (device, date) entries across all devices in the mapping.
# dict.values() is used because it exists on both Python 2 and Python 3,
# whereas dict.itervalues() is Python 2 only.
countVals = sum(len(v) for v in device_date_mapping.values())
countVals

527

In [27]:
# One list of dates per device. dict.values() works on both Python 2 and 3
# (dict.itervalues() is Python 2 only).
date_list = list(device_date_mapping.values())

In [29]:
# Run this to check if there are multiple buses attached
# to a schedule on a given day
# from itertools import chain
# date_list = list(chain.from_iterable(date_list))
# date_list.sort()
# date_list

In [28]:
# Wrap the mapping in a Spark broadcast variable; isValidForDate reads it
# through device_date_mapping_broadcast.value.
device_date_mapping_broadcast = sc.broadcast(device_date_mapping)

In [29]:
# Make use of the device_date_mapping dictionary to 
# eliminate rows from vts_filtered_df for days on
# which a device was NOT running the schedule

# We do this by iterating over the rows of vts_filtered_df
# and applying a function that validates if the 
# corresponding device_id was assigned the schedule (365R)
# on that given date

def isValidForDate(device_id, date, mapping=None):
    """Tell whether a device was assigned to the schedule on a given date.

    Parameters
    ----------
    device_id : key looked up in the mapping.
    date : value looked up in that device's collection of dates. It must be of
        the same type/format as the stored dates for the membership test to match.
    mapping : optional dict {device_id: collection of dates}. When omitted,
        the broadcast dictionary ``device_date_mapping_broadcast.value`` is
        used, which is the original behaviour. Passing it explicitly lets
        the function be exercised without a Spark context.

    Returns
    -------
    str
        '1'  if the device is in the mapping and ran on ``date``,
        '0'  if the device is in the mapping but did not run on ``date``,
        '-1' if the device is not in the mapping at all.
    """
    # `is None` (not truthiness) so that an explicitly passed empty dict is honoured
    if mapping is None:
        mapping = device_date_mapping_broadcast.value

    if device_id not in mapping:
        return '-1'

    return '1' if date in mapping[device_id] else '0'

In [31]:
# Test code
# temp_df = vts_filtered_df.limit(4)
# temp_df.withColumn("Valid", isValidForDate(temp_df.device_id, temp_df.ist_date)
# temp_df.select("ist_date").show()
# temp_df = temp_df.withColumn("ist_date_part", date_part)
# temp_df.select("ist_date","ist_date_part").show()
# temp_df = temp_df.withColumn("isValid", isValidForDateUDF(temp_df.device_id, temp_df.ist_date_part))
# temp_df.select("device_id", "ist_date","ist_date_part", "isValid").show()

In [30]:
from pyspark.sql.functions import unix_timestamp, from_unixtime
# Pattern used to parse the ist_date column (date, time and a fractional-second field)
dateTimeFmt = "yyyy-MM-dd HH:mm:ss.S"
# Pattern of the derived date-only string
dateFmt = "yyyy-MM-dd"

# Column expression: parse ist_date with dateTimeFmt, then re-format the result
# as a yyyy-MM-dd string. It is attached to the DataFrame later as "ist_date_part".
date_part = from_unixtime(unix_timestamp('ist_date', format=dateTimeFmt), format=dateFmt)

In [31]:
from pyspark.sql.functions import udf

# Wrap the Python function as a Spark UDF so it can be applied to columns.
# No return type is passed, so PySpark's default (string) applies; the
# resulting flag column is later compared against the string '1'.
isValidForDateUDF = udf(isValidForDate)

In [32]:
# Project only the columns that stage 2 needs, then preview two rows
stage2_columns = ["id", "device_id", "ign_status",
                  "acc_distance", "ist_date",
                  "lat", "longitude", "vehicle_direction"]
vts_filtered_stage2_df = vts_filtered_df.select(*stage2_columns)
vts_filtered_stage2_df.show(2)

+----------+---------+----------+------------+--------------------+---------+---------+-----------------+
|        id|device_id|ign_status|acc_distance|            ist_date|      lat|longitude|vehicle_direction|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+
|5867036184|150223075|         1|    19749869|2016-06-30 23:59:...| 12.93866|77.447968|            200.0|
|5867036363|150814749|         0|    14960243|2016-06-30 23:59:...|12.871403|77.586349|            136.0|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+
only showing top 2 rows



In [33]:
# Add the date-only string derived from ist_date (the date_part expression)
# as a new column, then preview two rows.
vts_filtered_stage2_df = vts_filtered_stage2_df.withColumn("ist_date_part", date_part)
vts_filtered_stage2_df.show(2)

+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+
|        id|device_id|ign_status|acc_distance|            ist_date|      lat|longitude|vehicle_direction|ist_date_part|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+
|5867036184|150223075|         1|    19749869|2016-06-30 23:59:...| 12.93866|77.447968|            200.0|   2016-06-30|
|5867036363|150814749|         0|    14960243|2016-06-30 23:59:...|12.871403|77.586349|            136.0|   2016-06-30|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+
only showing top 2 rows



In [34]:
# Flag every row with the result of isValidForDate:
# '1' = device was on the schedule that day, '0' = it was not,
# '-1' = device_id is missing from the mapping.
validity_flag = isValidForDateUDF(col("device_id"), col("ist_date_part"))
vts_filtered_stage2_valid_df = vts_filtered_stage2_df.withColumn("isValid", validity_flag)
vts_filtered_stage2_valid_df.show(2)

+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+
|        id|device_id|ign_status|acc_distance|            ist_date|      lat|longitude|vehicle_direction|ist_date_part|isValid|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+
|5867036184|150223075|         1|    19749869|2016-06-30 23:59:...| 12.93866|77.447968|            200.0|   2016-06-30|      0|
|5867036363|150814749|         0|    14960243|2016-06-30 23:59:...|12.871403|77.586349|            136.0|   2016-06-30|      0|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+
only showing top 2 rows



In [35]:
# Stage 2 result: keep only the rows flagged as valid ('1')
vts_filtered_stage2_results_df = vts_filtered_stage2_valid_df.where(col("isValid") == '1')
vts_filtered_stage2_results_df.show(2)

+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+
|        id|device_id|ign_status|acc_distance|            ist_date|      lat|longitude|vehicle_direction|ist_date_part|isValid|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+
|5867192541|150814730|         0|     9201134|2016-07-01 00:12:...|12.790125|77.706451|             40.0|   2016-07-01|      1|
|5867355190|150814730|         0|     9201134|2016-07-01 00:27:...|12.790125|77.706451|             40.0|   2016-07-01|      1|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+
only showing top 2 rows



In [42]:
stage2_count = vts_filtered_stage2_results_df.count()
stage2_count

2787154

In [36]:
sqlContext.sql("use bmtc_eta_default")
sqlContext.sql("show tables").show(truncate=False)

DataFrame[]

+---------+-----------+
|tableName|isTemporary|
+---------+-----------+
|vts_365r |false      |
+---------+-----------+



In [37]:
# Save the vts data for 365 to hive
# Instead of creating a persistent table using saveAsTable, make temp table and dump it as a hive table
vts_filtered_stage2_results_df.createOrReplaceTempView("temp_vts_filtered_stage2_results_df")

# To drop the temp view
# spark.catalog.dropTempView("temp_vts_filtered_stage2_results_df")

In [38]:
sqlContext.sql("show tables").show(truncate=False)

+-----------------------------------+-----------+
|tableName                          |isTemporary|
+-----------------------------------+-----------+
|vts_365r                           |false      |
|temp_vts_filtered_stage2_results_df|true       |
+-----------------------------------+-----------+



In [68]:
sqlContext.sql("select * from temp_vts_filtered_stage2_results_df limit 2").show()

+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+
|        id|device_id|ign_status|acc_distance|            ist_date|      lat|longitude|vehicle_direction|ist_date_part|isValid|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+
|5867192541|150814730|         0|     9201134|2016-07-01 00:12:...|12.790125|77.706451|             40.0|   2016-07-01|      1|
|5867355190|150814730|         0|     9201134|2016-07-01 00:27:...|12.790125|77.706451|             40.0|   2016-07-01|      1|
+----------+---------+----------+------------+--------------------+---------+---------+-----------------+-------------+-------+



In [39]:
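# The lifetime of the temporary view is tied to the :class:`SparkSession`,
# so persist its contents by creating a Hive table from it.
# NOTE(review): this is a plain "create table ... as select"; re-running the cell
# once vts_365r_filtered exists is expected to fail -- confirm, and drop the table first if a refresh is intended.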
sqlContext.sql("create table vts_365r_filtered as \
               select * from temp_vts_filtered_stage2_results_df")

DataFrame[]

In [40]:
sc.applicationId

u'application_1522648856070_0187'

In [41]:
sqlContext.sql("show tables").show(truncate = False)

+-----------------------------------+-----------+
|tableName                          |isTemporary|
+-----------------------------------+-----------+
|vts_365r                           |false      |
|vts_365r_filtered                  |false      |
|temp_vts_filtered_stage2_results_df|true       |
+-----------------------------------+-----------+

