In [1]:
# Echo the SparkContext object available in this notebook session.
sc

<pyspark.context.SparkContext at 0x7fa33008cd90>

In [2]:
# Show the Spark application id of this session.
sc.applicationId

u'application_1529929920393_0090'

## Load libraries

In [17]:
import sys
sys.path.append("/usr/lib/python2.7/site-packages")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import col, unix_timestamp,abs, log, from_unixtime, avg
from pyspark.sql.functions import count, sum, desc, date_format, lit, concat
from pyspark.sql import functions as F

In [18]:
# For displaying multiple outputs: with "all", every top-level expression
# statement in a cell is echoed, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load static data into Spark dataframes

In [19]:
# Make `bmtc` the current database; the unqualified table names in the
# queries of the next cell are looked up there.
sqlContext.sql("use bmtc")

DataFrame[]

In [20]:
# Get the route map (only the columns listed in the query).
# The backslashes sit inside the string literal, so each continuation line's
# leading spaces become part of the SQL text.
route_map_df = sqlContext.sql("select route_id,start_bus_stop_id,end_bus_stop_id,\
                                      distance,time_to_travel,bus_stop_order,status \
                               from route_map")

# Route points: (route_id, route_order, bus_stop_id) rows
route_point_df = sqlContext.sql("select route_id, route_order, bus_stop_id from route_point")

# Bus stops with their name and current coordinates
bus_stop_df = sqlContext.sql("select bus_stop_id,bus_stop_name,latitude_current,longitude_current from bus_stop")

# Discard stops whose latitude or longitude is missing
bus_stop_df = bus_stop_df.na.drop(subset=["latitude_current", "longitude_current"])

# Attach stop name and coordinates to every route point. The left outer join
# keeps route points whose stop was discarded above (stop columns are null).
route_point_joined_df = route_point_df.join(bus_stop_df, on=["bus_stop_id"], how="left_outer")

# Get the form-four records (only the columns listed in the query).
# The backslashes sit inside the string literal, so each continuation line's
# leading spaces become part of the SQL text.
form_four_df = sqlContext.sql("select form_four_id,form_four_name,schedule_number_id,\
                                      schedule_number_name,no_of_trips,start_time,\
                                      route_id,route_number,toll_zone,\
                                      area_limit,total_km,total_dead_km,\
                                      actual_km,total_running_time,total_break_time,\
                                      total_steering_time,spread_over_hours,ot_hours \
                               from form_four")

# Schedules, reduced to five columns
schedule_df = (
    sqlContext.sql('select * from schedule')
    .select("schedule_id", "schedule_number", "depot_id", "route_id", "schedule_type")
)

# Schedule details, reduced to the columns listed below
schedule_details_df = sqlContext.sql('select * from schedule_details').select(
    "schedule_details_id", "form_four_id", "schedule_number", "number_of_trips",
    "trip_number", "trip_type", "start_point", "end_point", "route_number_id",
    "route_number", "route_direction", "distance", "start_time", "end_time",
    "running_time", "break_type_id", "shift_type_id", "is_dread_trip"
)

In [21]:
sqlContext.sql("use bmtcwaybill")

# Load the waybill trip details and keep only the listed columns.
# device_id is renamed to WB_device_id so it stays distinguishable from the
# VTS device id when the two data sets are joined later.
waybill_trip_details_df = (
    sqlContext.sql("select * from waybill_trip_details")
    .select(
        "id", "waybill_id", "duty_dt",
        col("device_id").alias("WB_device_id"),
        "schedule_type_id", "schedule_no", "schedule_name",
        "service_type", "service_name",
        "trip_number",
        "start_point", "start_bus_stop_name",
        "end_point", "end_bus_stop_name",
        "route_id", "route_no", "distance",
        "start_time", "act_start_time", "etm_start_time",
        "end_time", "act_end_time", "etm_end_time",
        "running_time", "is_dread_trip"
    )
)

DataFrame[]

## Run this month-wise 
### >>> Input: Month

In [22]:
# Keep only the waybill rows whose duty date falls in the month being processed
# (year 2017, month 11).
# NOTE(review): the VTS month filter and the output table name further down
# are set by hand and must agree with this month.
waybill_trip_details_filtered_df = waybill_trip_details_df.where(
    (year(col("duty_dt")) == 2017) & (month(col("duty_dt")) == 11)
)

In [23]:
# Keep the trips whose schedule name starts with "V" (treated as the Volvo
# services in this notebook) and report how many there are.
volvo_waybill_df = waybill_trip_details_filtered_df.where(
    col("schedule_name").like("V%")
)
volvo_waybill_count = volvo_waybill_df.count()
volvo_waybill_count

236588

In [24]:
# volvo_waybill_grouped_df = volvo_waybill_df.groupby([year(volvo_waybill_df.duty_dt),
#                                                         month(volvo_waybill_df.duty_dt),
#                                                         dayofmonth(volvo_waybill_df.duty_dt)])\
#                                            .count().orderBy(desc('dayofmonth(duty_dt)'))

# volvo_waybill_grouped_df.show(31)

## Filter out invalid trips as recorded in waybill as null values in ETM start and end times

In [25]:
# Trips with a missing ETM start or end time.
# The valid/invalid split in this notebook keys on the literal string "null".
# A genuine SQL NULL is matched here as well: for such rows `col == "null"`
# evaluates to NULL (not true), so without the isNull() checks they would end
# up in neither the valid nor the invalid set.
volvo_waybill_etm_invalid_df = volvo_waybill_df.filter(
    (col("etm_start_time") == "null") | col("etm_start_time").isNull() |
    (col("etm_end_time") == "null") | col("etm_end_time").isNull())

In [26]:
# Trips for which both ETM timestamps were recorded, i.e. neither column holds
# the "null" marker. Rows with a real SQL NULL are excluded too, because the
# comparison then evaluates to NULL rather than true.
volvo_waybill_etm_valid_df = volvo_waybill_df.where(
    (col("etm_start_time") != "null")
    & (col("etm_end_time") != "null")
)

In [27]:
# These rows correspond to trips that were COMPLETED
# (both ETM timestamps present); preview the first 5.
volvo_waybill_etm_valid_df.show(5)

+--------+----------+----------+------------+----------------+-----------+------------------+------------+------------+-----------+-----------+--------------------+---------+--------------------+--------+--------+--------+----------+--------------+--------------------+--------+------------+--------------------+------------+-------------+
|      id|waybill_id|   duty_dt|WB_device_id|schedule_type_id|schedule_no|     schedule_name|service_type|service_name|trip_number|start_point| start_bus_stop_name|end_point|   end_bus_stop_name|route_id|route_no|distance|start_time|act_start_time|      etm_start_time|end_time|act_end_time|        etm_end_time|running_time|is_dread_trip|
+--------+----------+----------+------------+----------------+-----------+------------------+------------+------------+-----------+-----------+--------------------+---------+--------------------+--------+--------+--------+----------+--------------+--------------------+--------+------------+--------------------+--------

In [28]:
# These rows correspond to trips that were CANCELLED
# (an ETM start or end time is missing); preview the first 5.
volvo_waybill_etm_invalid_df.show(5)

+--------+----------+----------+------------+----------------+-----------+-------------------+------------+------------+-----------+-----------+--------------------+---------+--------------------+--------+--------+--------+----------+--------------+--------------+--------+------------+------------+------------+-------------+
|      id|waybill_id|   duty_dt|WB_device_id|schedule_type_id|schedule_no|      schedule_name|service_type|service_name|trip_number|start_point| start_bus_stop_name|end_point|   end_bus_stop_name|route_id|route_no|distance|start_time|act_start_time|etm_start_time|end_time|act_end_time|etm_end_time|running_time|is_dread_trip|
+--------+----------+----------+------------+----------------+-----------+-------------------+------------+------------+-----------+-----------+--------------------+---------+--------------------+--------+--------+--------+----------+--------------+--------------+--------+------------+------------+------------+-------------+
|61587112|    14314

In [29]:
# Number of trips that have both ETM timestamps.
volvo_waybill_etm_valid_count = volvo_waybill_etm_valid_df.count()
volvo_waybill_etm_valid_count

98676

## Feature engineering: Time Window to construct trips

In [30]:
# Narrow the valid trips down to the fields used to build per-trip time windows
volvo_waybill_filtered_df = volvo_waybill_etm_valid_df.select(
    "duty_dt", "WB_device_id",
    "schedule_no", "schedule_name",
    "service_type", "trip_number",
    "start_time", "end_time",
    "route_id", "route_no",
    "etm_start_time", "etm_end_time",
    "is_dread_trip"
)

In [31]:
# Column expressions (not Python functions) that derive date, time-of-day and
# epoch seconds from the ETM timestamp strings.
dateTimeFmt = "yyyy-MM-dd HH:mm:ss.S"
dateFmt = "yyyy-MM-dd"
timeFmt = "HH:mm:ss"

# Epoch seconds of the ETM start / end timestamps
extractStartTimestamp = unix_timestamp('etm_start_time', format=dateTimeFmt)
extractEndTimestamp = unix_timestamp('etm_end_time', format=dateTimeFmt)

# Date and time-of-day strings re-rendered from those epochs
extractDateWB = from_unixtime(extractStartTimestamp, format=dateFmt)
extractStartTime = from_unixtime(extractStartTimestamp, format=timeFmt)
extractEndTime = from_unixtime(extractEndTimestamp, format=timeFmt)
# Scheduled start / end as epoch seconds.
# start_time and end_time hold only a time of day, so each is joined to the
# trip's WB_commute_date and parsed as ONE local timestamp, the same way the
# ETM timestamps are parsed. This replaces parsing date and time separately and
# adding a hard-coded 19800 s (UTC+05:30) correction, which produced the right
# epoch only when the Spark session time zone was IST. With an IST session the
# values are unchanged.
scheduleDateTimeFmt = dateFmt + " " + timeFmt
extractScheduleStartTimestamp = unix_timestamp(
    concat(col('WB_commute_date'), lit(' '), col('start_time')),
    format=scheduleDateTimeFmt)
extractScheduleEndTimestamp = unix_timestamp(
    concat(col('WB_commute_date'), lit(' '), col('end_time')),
    format=scheduleDateTimeFmt)

In [32]:
# Add the derived trip-window columns. WB_commute_date is added first because
# the two scheduled-epoch expressions read it.
volvo_waybill_time_window_df = (
    volvo_waybill_filtered_df
    .withColumn("WB_commute_date", extractDateWB)
    .withColumn("trip_start_time", extractStartTime)
    .withColumn("trip_end_time", extractEndTime)
    .withColumn("epoch_start_time", extractStartTimestamp)
    .withColumn("epoch_end_time", extractEndTimestamp)
    .withColumn("epoch_sched_start_time", extractScheduleStartTimestamp)
    .withColumn("epoch_sched_end_time", extractScheduleEndTimestamp)
)

In [33]:
# Preview one row with the derived columns, then count the rows; adding columns
# does not change the row count.
volvo_waybill_time_window_df.show(1)
volvo_waybill_time_window_count = volvo_waybill_time_window_df.count()
volvo_waybill_time_window_count

+----------+------------+-----------+------------------+------------+-----------+----------+--------+--------+--------+--------------------+--------------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+
|   duty_dt|WB_device_id|schedule_no|     schedule_name|service_type|trip_number|start_time|end_time|route_id|route_no|      etm_start_time|        etm_end_time|is_dread_trip|WB_commute_date|trip_start_time|trip_end_time|epoch_start_time|epoch_end_time|epoch_sched_start_time|epoch_sched_end_time|
+----------+------------+-----------+------------------+------------+-----------+----------+--------+--------+--------+--------------------+--------------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+
|2017-11-01|   150218364|      34081|V-342F/10-All Days|           3|          2|  07:20:00|08:50:00|   23

98676

## If the duration of the trip per the ETM timestamps is absurd (start to end < 30% of schedule duration), then delete the corresponding rows

In [34]:
from pyspark.sql.functions import udf

def isValidETMRecordingFunc(epoch_start_time,epoch_end_time,epoch_sched_start_time,epoch_sched_end_time):
    """Flag whether a trip's ETM duration is plausible against its schedule.

    All four arguments are epoch seconds (or None when the source timestamp
    could not be parsed).

    Returns 1 when the ETM duration (end - start) is strictly greater than 30%
    of the scheduled duration, otherwise 0. A trip with any missing timestamp
    cannot be checked and is reported as 0 instead of raising a TypeError.
    """
    timestamps = (epoch_start_time, epoch_end_time,
                  epoch_sched_start_time, epoch_sched_end_time)
    if any(value is None for value in timestamps):
        return 0

    etm_running_time = epoch_end_time - epoch_start_time
    # NOTE(review): if the scheduled end is earlier than the scheduled start
    # this is negative, so the 30% threshold is negative too -- confirm how
    # such rows should be treated.
    schedule_running_time = epoch_sched_end_time - epoch_sched_start_time
    if etm_running_time > 0.3 * schedule_running_time:
        return 1
    return 0

In [35]:
# Wrap the Python predicate as a Spark UDF. No return type is passed, so the
# UDF's default return type (string) applies; the DataFrame schema printed
# further down lists the flag as `isValidETMRecording: string`.
isValidETMRecordingFuncUDF = udf(isValidETMRecordingFunc)

In [36]:
# Add the validity flag to every trip; nothing is removed yet, so the count
# stays the same as before.
volvo_waybill_time_window_valid_df = volvo_waybill_time_window_df.withColumn(
    "isValidETMRecording",
    isValidETMRecordingFuncUDF(
        col("epoch_start_time"),
        col("epoch_end_time"),
        col("epoch_sched_start_time"),
        col("epoch_sched_end_time")
    )
)
volvo_waybill_time_window_valid_count = volvo_waybill_time_window_valid_df.count()
volvo_waybill_time_window_valid_count

98676

In [37]:
# Preview two rows including the new isValidETMRecording flag.
volvo_waybill_time_window_valid_df.show(2)

+----------+------------+-----------+------------------+------------+-----------+----------+--------+--------+--------+--------------------+--------------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+-------------------+
|   duty_dt|WB_device_id|schedule_no|     schedule_name|service_type|trip_number|start_time|end_time|route_id|route_no|      etm_start_time|        etm_end_time|is_dread_trip|WB_commute_date|trip_start_time|trip_end_time|epoch_start_time|epoch_end_time|epoch_sched_start_time|epoch_sched_end_time|isValidETMRecording|
+----------+------------+-----------+------------------+------------+-----------+----------+--------+--------+--------+--------------------+--------------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+-------------------+
|2017-11-01|   150218364|      34081|V-342F/10

In [38]:
# Summary statistics of the flag; its mean is the share of trips flagged 1.
volvo_waybill_time_window_valid_df.describe("isValidETMRecording").show()

+-------+-------------------+
|summary|isValidETMRecording|
+-------+-------------------+
|  count|              98676|
|   mean|  0.856834488629454|
| stddev| 0.3502433309685543|
|    min|                  0|
|    max|                  1|
+-------+-------------------+



In [39]:
# Mark the flagged DataFrame for caching. cache() is lazy: the data is
# materialised by the next action and reused by the ones after it. The
# count/show/describe calls in the cells above ran before this point and so did
# not benefit from it.
volvo_waybill_time_window_valid_df.cache()

DataFrame[duty_dt: string, WB_device_id: string, schedule_no: bigint, schedule_name: string, service_type: int, trip_number: int, start_time: string, end_time: string, route_id: int, route_no: string, etm_start_time: string, etm_end_time: string, is_dread_trip: int, WB_commute_date: string, trip_start_time: string, trip_end_time: string, epoch_start_time: bigint, epoch_end_time: bigint, epoch_sched_start_time: bigint, epoch_sched_end_time: bigint, isValidETMRecording: string]

In [40]:
# Keep only the trips flagged as plausible. The flag column is a string, so the
# comparison with the integer 1 relies on Spark's implicit cast.
volvo_waybill_time_window_valid_1_df = volvo_waybill_time_window_valid_df.where(
    col("isValidETMRecording") == 1
)

In [41]:
# Preview two of the trips that passed the plausibility check.
volvo_waybill_time_window_valid_1_df.show(2)

+----------+------------+-----------+------------------+------------+-----------+----------+--------+--------+--------+--------------------+--------------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+-------------------+
|   duty_dt|WB_device_id|schedule_no|     schedule_name|service_type|trip_number|start_time|end_time|route_id|route_no|      etm_start_time|        etm_end_time|is_dread_trip|WB_commute_date|trip_start_time|trip_end_time|epoch_start_time|epoch_end_time|epoch_sched_start_time|epoch_sched_end_time|isValidETMRecording|
+----------+------------+-----------+------------------+------------+-----------+----------+--------+--------+--------+--------------------+--------------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+-------------------+
|2017-11-01|   150218364|      34081|V-342F/10

In [42]:
# Number of trips remaining after the plausibility filter.
volvo_waybill_time_window_valid_1_count = volvo_waybill_time_window_valid_1_df.count()
volvo_waybill_time_window_valid_1_count

84549

### Read the VTS data from Hive

In [43]:
# Switch to the database holding the VTS tables and load the 2017 table.
# The count runs over the whole table (all months), not just the month being
# processed.
sqlContext.sql("use bmtc_eta_default")
vts_volvo_df = sqlContext.sql("select * from vts_volvo_2017")
vts_volvo_count = vts_volvo_df.count()
vts_volvo_count

DataFrame[]

956091019

In [44]:
# Preview two VTS rows: id, device id, timestamp string, latitude, longitude.
vts_volvo_df.show(2)

+-----------+-------------+--------------------+---------+---------+
|         id|vts_device_id|            ist_date|      lat|longitude|
+-----------+-------------+--------------------+---------+---------+
|12139365428|    150221993|2017-01-01 08:56:...|12.914202| 77.60006|
|12139369184|    150220827|2017-01-01 08:56:...|12.919712| 77.64328|
+-----------+-------------+--------------------+---------+---------+
only showing top 2 rows



### >>> Input: Select the desired month again here (must match the month used in the waybill filter above)

In [45]:
# Keep only the VTS rows of the month being processed (year 2017, month 11).
# NOTE(review): this must be the same month as the waybill filter earlier in
# the notebook; the two are set independently by hand.
vts_volvo_filtered_df = vts_volvo_df.where(
    (year(col("ist_date")) == 2017) & (month(col("ist_date")) == 11)
)

In [46]:
# Column expressions derived from the VTS timestamp string `ist_date`:
# epoch seconds first, then the date and time-of-day strings rendered from it.
extractTimestamp = unix_timestamp('ist_date', format=dateTimeFmt)
extractDateVTS = from_unixtime(extractTimestamp, format=dateFmt)
extractTime = from_unixtime(extractTimestamp, format=timeFmt)

# Add the three derived columns to the filtered VTS rows
vts_volvo_augmented_df = (
    vts_volvo_filtered_df
    .withColumn("commute_date", extractDateVTS)
    .withColumn("ist_time", extractTime)
    .withColumn("ist_epoch", extractTimestamp)
)

In [47]:
# Preview five VTS rows with the derived commute_date, ist_time and ist_epoch.
vts_volvo_augmented_df.show(5)

+-----------+-------------+--------------------+---------+---------+------------+--------+----------+
|         id|vts_device_id|            ist_date|      lat|longitude|commute_date|ist_time| ist_epoch|
+-----------+-------------+--------------------+---------+---------+------------+--------+----------+
|22205704442|    150810061|2017-11-01 08:51:...|13.040477|77.624336|  2017-11-01|08:51:54|1509506514|
|22205705570|    150812124|2017-11-01 08:51:...|12.977475|77.725647|  2017-11-01|08:51:58|1509506518|
|22205709431|    150221065|2017-11-01 08:50:...|12.996613|77.669182|  2017-11-01|08:50:12|1509506412|
|22205709495|    150221737|2017-11-01 08:52:...|12.976693|77.726303|  2017-11-01|08:52:04|1509506524|
|22205718394|    150221760|2017-11-01 08:52:...|12.966127|77.606819|  2017-11-01|08:52:18|1509506538|
+-----------+-------------+--------------------+---------+---------+------------+--------+----------+
only showing top 5 rows



## We define functions to augment vts_parse_data with schedule_id, route_id and trip_number based upon the available device_id and ist_date

In [48]:
# Re-import udf (it was already imported above); no distance UDF is defined in the cells that follow
from pyspark.sql.functions import udf

In [49]:
# Preview one row from each data set ahead of the join.
# Note: the waybill frame shown here is the one from before the plausibility
# filter; the join below uses volvo_waybill_time_window_valid_1_df.
volvo_waybill_time_window_df.show(1)
vts_volvo_augmented_df.show(1)

+----------+------------+-----------+------------------+------------+-----------+----------+--------+--------+--------+--------------------+--------------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+
|   duty_dt|WB_device_id|schedule_no|     schedule_name|service_type|trip_number|start_time|end_time|route_id|route_no|      etm_start_time|        etm_end_time|is_dread_trip|WB_commute_date|trip_start_time|trip_end_time|epoch_start_time|epoch_end_time|epoch_sched_start_time|epoch_sched_end_time|
+----------+------------+-----------+------------------+------------+-----------+----------+--------+--------+--------+--------------------+--------------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+
|2017-11-01|   150218364|      34081|V-342F/10-All Days|           3|          2|  07:20:00|08:50:00|   23

In [50]:
# Match each VTS row to the waybill trip of the same device whose ETM window,
# widened by 300 s on both sides, contains the VTS timestamp.
# The join is a left outer join, so VTS rows without a matching trip are kept
# with null trip columns.
join_condition = [
    vts_volvo_augmented_df.vts_device_id == volvo_waybill_time_window_valid_1_df.WB_device_id,
    (volvo_waybill_time_window_valid_1_df.epoch_start_time - 300) < vts_volvo_augmented_df.ist_epoch,
    (volvo_waybill_time_window_valid_1_df.epoch_end_time + 300) > vts_volvo_augmented_df.ist_epoch,
]

vts_volvo_waybill_joined_df = vts_volvo_augmented_df.join(
    volvo_waybill_time_window_valid_1_df,
    on=join_condition,
    how="left_outer"
).drop("WB_device_id")

In [51]:
# vts_waybill_augmented_1_grouped_count = vts_waybill_augmented_1_df.groupby([year(vts_waybill_augmented_1_df.ist_date),
#                                                                  month(vts_waybill_augmented_1_df.ist_date),
#                                                                  dayofmonth(vts_waybill_augmented_1_df.ist_date)])\
#                                                        .count().orderBy(desc('dayofmonth(ist_date)'))
# vts_waybill_augmented_1_grouped_count.show(50)

In [52]:
# Preview two rows of the joined VTS/waybill data.
vts_volvo_waybill_joined_df.show(2)

+-----------+-------------+--------------------+---------+---------+------------+--------+----------+-------+-----------+-------------+------------+-----------+----------+--------+--------+--------+--------------+------------+-------------+---------------+---------------+-------------+----------------+--------------+----------------------+--------------------+-------------------+
|         id|vts_device_id|            ist_date|      lat|longitude|commute_date|ist_time| ist_epoch|duty_dt|schedule_no|schedule_name|service_type|trip_number|start_time|end_time|route_id|route_no|etm_start_time|etm_end_time|is_dread_trip|WB_commute_date|trip_start_time|trip_end_time|epoch_start_time|epoch_end_time|epoch_sched_start_time|epoch_sched_end_time|isValidETMRecording|
+-----------+-------------+--------------------+---------+---------+------------+--------+----------+-------+-----------+-------------+------------+-----------+----------+--------+--------+--------+--------------+------------+--------

In [53]:
# Row count after the left outer join (matched and unmatched VTS rows).
vts_volvo_waybill_joined_count = vts_volvo_waybill_joined_df.count()
vts_volvo_waybill_joined_count

89802371

In [54]:
# Remove exact duplicate rows and count again. In the recorded run the count
# equals the join count above, i.e. no duplicates were present.
vts_waybill_distinct_df = vts_volvo_waybill_joined_df.distinct()
vts_waybill_distinct_count = vts_waybill_distinct_df.count()
vts_waybill_distinct_count

89802371

In [55]:
# Drop rows with a null route id. This includes every VTS row that the left
# outer join could not match to a waybill trip.
vts_waybill_distinct_2_df = vts_waybill_distinct_df.where(col("route_id").isNotNull())

vts_waybill_distinct_2_count = vts_waybill_distinct_2_df.count()
vts_waybill_distinct_2_count

27017321

### Remove rows corresponding to negative time window

In [56]:
# Keep only rows whose ETM window is not reversed (end at or after start)
vts_waybill_distinct_3_df = vts_waybill_distinct_2_df.where(
    col("epoch_end_time") >= col("epoch_start_time")
)

In [57]:
# Row count after removing reversed time windows.
vts_waybill_distinct_3_count = vts_waybill_distinct_3_df.count()
vts_waybill_distinct_3_count

27017321

## Explore the data for consistency - we pick Volvo 44489 | V-365J/7-All Days

In [260]:
# vts_waybill_365J_df = vts_waybill_distinct_3_df.filter(col("schedule_no") == 44489)
# vts_waybill_365J_count = vts_waybill_365J_df.count()
# vts_waybill_365J_count

In [261]:
# vts_waybill_365J_21Oct_df = vts_waybill_365J_df.filter(dayofmonth(vts_waybill_365J_df.ist_date) == 23)
# vts_waybill_365J_21Oct_count = vts_waybill_365J_21Oct_df.count()
# vts_waybill_365J_21Oct_count

# vts_waybill_365J_21Oct_ordered_df = vts_waybill_365J_21Oct_df.orderBy("ist_epoch")

# vts_waybill_365J_21Oct_ordered_df.select("ist_date", "lat", "longitude", "ist_epoch", "trip_number", "route_id", 
#                                          "etm_start_time", "etm_end_time").show(715)

## Saving the intermediate result to Hive

In [262]:
# List the tables in the current database (first 20, names truncated).
sqlContext.sql("show tables").show()

+--------------------+-----------+
|           tableName|isTemporary|
+--------------------+-----------+
|            vts_365r|      false|
|   vts_365r_distance|      false|
| vts_365r_distance_1|      false|
|   vts_365r_filtered|      false|
|vts_bus_stop_min_...|      false|
|vts_bus_stop_min_...|      false|
|vts_bus_stop_min_...|      false|
|vts_bus_stop_min_...|      false|
|vts_full_bus_stop...|      false|
|vts_full_bus_stop...|      false|
|vts_full_bus_stop...|      false|
|vts_full_bus_stop...|      false|
|         vts_jun2017|      false|
|    vts_normal_speed|      false|
|      vts_volvo_2017|      false|
|   vts_volvo_oct2017|      false|
|vts_waybill_volvo...|      false|
|vts_waybill_volvo...|      false|
|vts_waybill_volvo...|      false|
|vts_waybill_volvo...|      false|
+--------------------+-----------+
only showing top 20 rows



### >>> Input: Month in table name (must match the month selected in both filters above)

In [263]:
# Persist the result as a Hive table: register the DataFrame as a temp view and
# create the table from it with CTAS (used here instead of saveAsTable).
# NOTE(review): the table name says dec2017 while both month filters earlier in
# this notebook are set to month 11 -- confirm the three agree before running.
# `create table` also fails if the table already exists.
vts_waybill_distinct_3_df.createOrReplaceTempView("temp_vts_waybill_distinct_3_df")
sqlContext.sql(
    "create table vts_waybill_volvo_dec2017 "
    "as select * from temp_vts_waybill_distinct_3_df"
)

DataFrame[]

In [4]:
# Switch to the output database and list up to 40 tables with untruncated
# names, to check which monthly tables exist.
sqlContext.sql("use bmtc_eta_default")
sqlContext.sql("show tables").show(40,truncate=False)

+---------------------------+-----------+
|tableName                  |isTemporary|
+---------------------------+-----------+
|vts_365r                   |false      |
|vts_365r_distance          |false      |
|vts_365r_distance_1        |false      |
|vts_365r_filtered          |false      |
|vts_bus_stop_min_dist_apr17|false      |
|vts_bus_stop_min_dist_feb17|false      |
|vts_bus_stop_min_dist_jan17|false      |
|vts_bus_stop_min_dist_mar17|false      |
|vts_full_bus_stops_apr17   |false      |
|vts_full_bus_stops_feb17   |false      |
|vts_full_bus_stops_jan17   |false      |
|vts_full_bus_stops_mar17   |false      |
|vts_jun2017                |false      |
|vts_normal_speed           |false      |
|vts_volvo_2017             |false      |
|vts_volvo_oct2017          |false      |
|vts_waybill_volvo_apr2017  |false      |
|vts_waybill_volvo_aug2017  |false      |
|vts_waybill_volvo_dec2017  |false      |
|vts_waybill_volvo_feb2017  |false      |
|vts_waybill_volvo_jan2017  |false

### Validation Part 

In [5]:
# Row count of the saved January 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_jan2017")
temp_count = temp_df.count()
temp_count

38277567

In [6]:
# Row count of the saved February 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_feb2017")
temp_count = temp_df.count()
temp_count

33598754

In [7]:
# Row count of the saved March 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_mar2017")
temp_count = temp_df.count()
temp_count

36067690

In [8]:
# Row count of the saved April 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_apr2017")
temp_count = temp_df.count()
temp_count

32082425

In [9]:
# Row count of the saved May 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_may2017")
temp_count = temp_df.count()
temp_count

31459481

In [10]:
# Row count of the saved June 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_jun2017")
temp_count = temp_df.count()
temp_count

24305551

In [11]:
# Row count of the saved July 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_jul2017")
temp_count = temp_df.count()
temp_count

24460343

In [12]:
# Row count of the saved August 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_aug2017")
temp_count = temp_df.count()
temp_count

27068360

In [13]:
# Row count of the saved September 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_sep2017")
temp_count = temp_df.count()
temp_count

23532256

In [14]:
# Row count of the saved October 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_oct2017")
temp_count = temp_df.count()
temp_count

25473694

In [15]:
# Row count of the saved November 2017 table.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_nov2017")
temp_count = temp_df.count()
temp_count

27017321

In [16]:
# Row count of the saved December 2017 table.
# NOTE(review): the recorded output is 0 -- the table exists but is empty.
# The cause is not visible in this notebook; check the December source data and
# the month filters used when the table was created.
temp_df = sqlContext.sql("select * from vts_waybill_volvo_dec2017")
temp_count = temp_df.count()
temp_count

0