In [1]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.sql import functions as F
from pyspark.mllib.stat import Statistics
from datetime import datetime, timedelta
from pyspark.sql.window import Window
from pyspark.sql.types import TimestampType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from math import floor
import time

from GPSProcessing import *
from AccProcessing import *

In [2]:
# Spark configuration options are documented at
# https://spark.apache.org/docs/latest/configuration.html

# Build the session configuration one option at a time; the keys, values and
# their order are the same as a single setAll() call with these pairs.
conf = (
    SparkConf()
    .set('spark.memory.fraction', '0.6')
    .set('spark.executor.memory', '16g')
    .set('spark.driver.memory', '16g')
    .set('spark.sql.shuffle.partitions', '20')
    .set('spark.memory.offHeap.enabled', True)
    .set('spark.memory.offHeap.size', '16g')
    .set('spark.cleaner.referenceTracking.cleanCheckpoints', True)
)
# Options that are currently disabled:
#   ('spark.driver.cores', '4')
#   ('spark.executor.cores', '4')
#   ('spark.worker.cleanup.enabled', 'true')
#   ('spark.sql.session.timeZone', 'UTC')

# Create (or reuse) a local session that uses every available core.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .master("local[*]")
    .appName("GPS+ACC")
    .getOrCreate()
)

# Keep a handle on the SparkContext, quieten logging and choose the directory
# used by the .checkpoint() calls later in the notebook.
sc = spark.sparkContext
sc.setLogLevel("ERROR")
sc.setCheckpointDir('checkpoints')

# Last expression of the cell: display the effective configuration.
sc.getConf().getAll()  # or sc._conf.getAll()

[('spark.sql.shuffle.partitions', '20'),
 ('spark.driver.host', '192.168.0.10'),
 ('spark.cleaner.referenceTracking.cleanCheckpoints', 'True'),
 ('spark.driver.port', '60389'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '16g'),
 ('spark.app.name', 'GPS+ACC'),
 ('spark.executor.memory', '16g'),
 ('spark.app.id', 'local-1573899915848'),
 ('spark.rdd.compress', 'True'),
 ('spark.memory.fraction', '0.6'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.memory.offHeap.enabled', 'True'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.memory.offHeap.size', '16g')]

In [3]:
# Display the SparkContext's defaultParallelism value.
sc.defaultParallelism

8

In [4]:
# Display the SparkContext's defaultMinPartitions value.
sc.defaultMinPartitions

2

In [5]:
# Display the SparkContext object itself (rich repr of the running context).
sc

## PALMS output

In [None]:
# Location of the PALMS output CSV.
# NOTE(review): absolute, machine-specific path -- must be adjusted before running elsewhere.
palms_path = '/Users/molinaro/Documents/GITHUB/PALMS/Calgary/PALMS_output.csv'

In [None]:
# Read the PALMS CSV (first row is the header, column types are inferred) and
# mark the DataFrame for caching. The trailing ';' suppresses the DataFrame
# repr that cache() would otherwise display as the cell output.
palms_out = spark.read.csv(palms_path, header=True, inferSchema=True, )
palms_out.cache();

In [None]:
# Display how many partitions back the PALMS DataFrame.
palms_out.rdd.getNumPartitions()

In [None]:
# Preview the first 20 rows for identifier 'A01' (position and dateTime columns
# only); the second argument False disables truncation of cell values.
palms_out.filter(palms_out['identifier']=='A01').select('identifier','lat','lon','dateTime').show(20,False)

In [None]:
# Number of PALMS rows whose identifier is 'A01'.
palms_out.filter(palms_out['identifier']=='A01').count()

In [None]:
# Print the (inferred) schema of the PALMS data; the filter does not change the schema.
palms_out.filter(palms_out['identifier']=='A01').printSchema()

In [None]:
# Lower timestamp bound used by the `df3` comparison cells in this section.
#
# The strptime pattern gets its own name on purpose: the GPS section stores
# Java-style patterns ('yyyy/MM/dd HH:mm:ss') in `date_format`, `time_format`
# and `datetime_format` and passes `datetime_format` to gen_gps_dataframe.
# Reusing those globals here for a Python '%Y-%m-%d'-style pattern would
# silently change the parser format whenever this cell is run out of order.
PALMS_STARTDATE_FORMAT = '%Y-%m-%d %H:%M:%S'
startdate = datetime.strptime('2016-08-16 18:23:25', PALMS_STARTDATE_FORMAT)

In [None]:
# PALMS rows for identifier 'A01' with activity == -2, restricted to the
# position, time and activity columns; prints up to 29000 rows.
(
    palms_out
    .where((palms_out.identifier == 'A01') & (palms_out.activity == -2))
    .select(
        'identifier', 'lat', 'lon', 'dateTime',
        'activity', 'activityIntensity', 'activityBoutNumber',
    )
    .show(29000)
)

In [None]:
# Same check on the notebook's own result: rows of `df3` with
# activityIntensity == -2 at or after `startdate`, ordered by timestamp.
# NOTE(review): `df3` is first assigned much further down (the proc_segment
# "CASE3" cell), so this cell cannot run top-to-bottom on a fresh kernel.
df3.filter((F.col('activityIntensity')==-2) & (F.col('timestamp')>=startdate)).orderBy('timestamp').show(29000)

In [None]:
# PALMS rows for identifier 'A01' that belong to activity bout number 7;
# prints up to 2000 rows.
(
    palms_out
    .where((palms_out.identifier == 'A01') & (palms_out.activityBoutNumber == 7))
    .select(
        'lat', 'lon', 'dateTime', 'activity', 'activityIntensity',
        'activityBoutNumber', 'sedentaryBoutNumber',
    )
    .show(2000)
)

In [None]:
# Rows of `df3` with activityBoutNumber == 7 at or after `startdate`, ordered by timestamp.
# NOTE(review): depends on `df3` (assigned much later in the notebook) and on
# `startdate` from an earlier cell of this section.
df3.filter((F.col('activityBoutNumber')==7) & (F.col('timestamp')>=startdate)).orderBy('timestamp').show(2000)

In [None]:
# PALMS rows for identifier 'A01' that belong to sedentary bout number 14;
# prints up to 20000 rows.
(
    palms_out
    .where((palms_out.identifier == 'A01') & (palms_out.sedentaryBoutNumber == 14))
    .select(
        'identifier', 'lat', 'lon', 'dateTime', 'activity',
        'activityBoutNumber', 'sedentaryBoutNumber',
    )
    .show(20000)
)

In [None]:
# Rows of `df3` with sedentaryBoutNumber == 14 at or after `startdate`.
# Unlike the neighbouring df3 cells, no orderBy is applied here.
# NOTE(review): depends on `df3`, which is assigned much later in the notebook.
df3.filter((F.col('sedentaryBoutNumber')==14) & (F.col('timestamp')>=startdate)).show(29000)

In [None]:
# PALMS rows for identifier 'A01' with fixTypeCode == 3; prints up to 2000 rows.
(
    palms_out
    .where((palms_out.identifier == 'A01') & (palms_out.fixTypeCode == 3))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .show(2000)
)

In [None]:
# Number of PALMS rows for identifier 'A01' with fixTypeCode == 4.
(
    palms_out
    .where((palms_out.identifier == 'A01') & (palms_out.fixTypeCode == 4))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .count()
)

In [None]:
# Rows of the notebook's own GPS result with fixTypeCode == 3, ordered by timestamp.
# NOTE(review): `gps_data` is created in the "GPS data processing" section
# further down, and the recorded outputs show the fixTypeCode column only
# after later processing steps -- this cell cannot run in top-to-bottom order.
gps_data.filter(F.col('fixTypeCode')==3).orderBy('timestamp').show(2000)

In [None]:
# Number of rows of `gps_data` with fixTypeCode == 4 (the orderBy does not affect the count).
# NOTE(review): `gps_data` is created in a later section of the notebook.
gps_data.filter(F.col('fixTypeCode')==4).orderBy('timestamp').count()

In [None]:
# All PALMS rows for identifier 'A01' (fix type, trip and activity columns);
# prints up to 20000 rows without truncating values.
(
    palms_out
    .where((palms_out.identifier == 'A01'))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .show(20000, False)
)

In [None]:
# Fix type, trip and activity columns for every identifier in the PALMS
# output; prints up to 20000 rows without truncating values.
(
    palms_out
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .show(20000, False)
)

In [None]:
# Print up to 20000 rows of `gps_data` without truncating values.
# NOTE(review): `gps_data` is created in a later section of the notebook.
gps_data.show(20000,False)

In [None]:
# PALMS rows for identifier 'A01' with tripType == 1; prints up to 2000 rows.
(
    palms_out
    .where((palms_out.identifier == 'A01') & (palms_out.tripType == 1))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .show(2000)
)

In [None]:
# Rows of `df` with tripType == 4, shown without the height/speed/heading/dow/lat/lon columns.
# NOTE(review): `df` is not assigned by any cell visible in this notebook --
# presumably leftover kernel state; confirm where it comes from.
df.drop(*['height','speed','heading','dow','lat','lon'])\
.filter((F.col('tripType')==4)).show(2000)

In [None]:
# Rows of `df` with tripType == 1 (all columns).
# NOTE(review): `df` is not assigned by any cell visible in this notebook.
df.filter(F.col('tripType')==1).show(2000)

In [None]:
# Drop every cached table/DataFrame in this session. This includes
# `palms_out`, which was cached when it was loaded and is queried again below.
spark.catalog.clearCache()

In [None]:
# Number of PALMS rows for identifier 'A02' with fixTypeCode == 5.
(
    palms_out
    .where((palms_out.identifier == 'A02') & (palms_out.fixTypeCode == 5))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .count()
)

In [None]:
# Number of PALMS rows for identifier 'A02' with fixTypeCode == 3.
(
    palms_out
    .where((palms_out.identifier == 'A02') & (palms_out.fixTypeCode == 3))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .count()
)

In [None]:
# Number of PALMS rows for identifier 'A02' with fixTypeCode == 2.
(
    palms_out
    .where((palms_out.identifier == 'A02') & (palms_out.fixTypeCode == 2))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .count()
)

In [None]:
# Number of PALMS rows for identifier 'A02' with fixTypeCode == 1.
(
    palms_out
    .where((palms_out.identifier == 'A02') & (palms_out.fixTypeCode == 1))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .count()
)

In [None]:
# Number of PALMS rows for identifier 'A02' with fixTypeCode == 4.
(
    palms_out
    .where((palms_out.identifier == 'A02') & (palms_out.fixTypeCode == 4))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .count()
)

In [None]:
# Number of PALMS rows for identifier 'A02' with fixTypeCode == 6.
(
    palms_out
    .where((palms_out.identifier == 'A02') & (palms_out.fixTypeCode == 6))
    .select(
        'lat', 'lon', 'dateTime', 'fixTypeCode', 'tripType', 'activity',
        'activityIntensity', 'activityBoutNumber',
    )
    .count()
)

In [None]:
# Total number of PALMS rows for identifier 'A01'.
palms_out.filter((palms_out.identifier=='A01')).count()

## GPS data processing

In [None]:
# Raw GPS input (NYC file 2001.csv).
# NOTE(review): this cell has no execution count and the next cell assigns
# `gps_path_raw` again, so this value is overridden when both are run in order.
# The path is absolute and machine-specific.
gps_path_raw = '/Users/molinaro/Documents/GITHUB/PALMS/data/PARC_data/NYC/GPS/2001.csv'

In [6]:
# Raw GPS input actually used by the executed cells below (test file A02.csv).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
gps_path_raw = '/Users/molinaro/Documents/GITHUB/HABITUS/test_data/gps/A02.csv'

In [7]:
# Read the raw GPS CSV (first row is the header, column types are inferred).
gps_data_raw = spark.read.csv(gps_path_raw, header=True, inferSchema=True)

In [9]:
# Date/time pattern handed to gen_gps_dataframe. These are Java-style pattern
# letters ('yyyy/MM/dd HH:mm:ss'), not Python strptime directives.
date_format = 'yyyy/MM/dd'
time_format = 'HH:mm:ss'
datetime_format = date_format + ' ' + time_format

# Build the working GPS DataFrame from the raw CSV (project helper from the
# star-imported processing modules) and mark it for caching. cache() returns
# the DataFrame, so its repr is displayed as the cell output.
gps_data = gen_gps_dataframe(gps_data_raw, datetime_format)
gps_data.cache()

DataFrame[timestamp: timestamp, dow: string, lat: double, lon: double, distance: double, height: double, speed: double]

In [10]:
# Preview the first 20 rows of the freshly built GPS DataFrame, untruncated.
gps_data.show(20,False)

+-------------------+---+---------+-----------+--------+------+-----+
|timestamp          |dow|lat      |lon        |distance|height|speed|
+-------------------+---+---------+-----------+--------+------+-----+
|2016-06-29 13:29:52|3  |51.0782  |-114.128857|0.0     |0.0   |7.306|
|2016-06-29 13:29:57|3  |51.078183|-114.128778|0.0     |1044.0|7.055|
|2016-06-29 13:30:02|3  |51.078228|-114.128685|0.0     |1037.0|6.179|
|2016-06-29 13:30:07|3  |51.078237|-114.128582|0.0     |1040.0|5.512|
|2016-06-29 13:30:12|3  |51.078243|-114.128482|0.0     |1040.0|5.024|
|2016-06-29 13:30:17|3  |51.078243|-114.128382|0.0     |1039.0|5.333|
|2016-06-29 13:30:22|3  |51.078265|-114.128283|0.0     |1038.0|5.303|
|2016-06-29 13:30:27|3  |51.078277|-114.12819 |0.0     |1038.0|5.679|
|2016-06-29 13:30:32|3  |51.0783  |-114.128093|0.0     |1038.0|5.161|
|2016-06-29 13:30:37|3  |51.078315|-114.128002|0.0     |1039.0|5.186|
|2016-06-29 13:30:42|3  |51.078325|-114.127898|0.0     |1040.0|5.272|
|2016-06-29 13:30:47

In [11]:
# date_format = '%Y-%m-%d'
# time_format = '%H:%M:%S'
# datetime_format = date_format + ' ' + time_format
# startdate = datetime.strptime('2016-06-29 13:00:00', datetime_format) 
# gps_data.filter((F.col('timestamp')>=startdate)).show(20000,False)

In [12]:
# Print the schema right after gen_gps_dataframe (before any filtering step).
gps_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- distance: double (nullable = false)
 |-- height: double (nullable = true)
 |-- speed: double (nullable = true)



In [13]:
# Row count before the filtering steps, as a baseline for the counts below.
gps_data.count()

119999

In [14]:
# Round seconds in timestamps according to the interval
# (round_timestamp is a project helper; `gps_data` is reassigned in place, so
# re-running this cell applies the step to the already-processed frame).

interval = 5 # seconds
ts_name = 'timestamp'
ws = 600 # seconds; not used in this cell, passed to set_fix_type / fill_timestamp later

print("====> align timestamps...")
start_time = time.time()
gps_data = round_timestamp(gps_data, ts_name, interval).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data.count()
elapsed_time = time.time() - start_time
# Format the elapsed seconds as HH:MM:SS.
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> align timestamps...
      time elapsed: 00:00:00


In [15]:
# Set fix type
# (set_fix_type is a project helper; the recorded outputs show a fixTypeCode
# column in `gps_data` after the later steps. `gps_data` is reassigned in place.)

ts_name = 'timestamp'
ws=600  # same value as assigned in the previous step
print("====> set fix type...")
start_time = time.time()
gps_data = set_fix_type(gps_data, ts_name, ws).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> set fix type...
      time elapsed: 00:00:01


In [16]:
# Apply filter on the velocity
# (filter_speed is a project helper called with the 'speed' column and vmax;
# `gps_data` is reassigned in place.)

vmax = 130 # km/h

print("====> apply velocity filter...")
start_time = time.time()
gps_data = filter_speed(gps_data, 'speed', vmax).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply velocity filter...
      time elapsed: 00:00:00


In [17]:
# Apply filter over max acceleration
# (filter_acceleration is a project helper that receives the speed and
# timestamp column names; `gps_data` is reassigned in place, so re-running
# this cell applies the filter to the already-filtered frame).

scol = 'speed'
tscol = 'timestamp'

# Log message spelling fixed ("accelaration" -> "acceleration").
print("====> apply acceleration filter...")
start_time = time.time()
gps_data = filter_acceleration(gps_data, scol, tscol).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply accelaration filter...
      time elapsed: 00:00:00


In [18]:
# Apply filter on the height variation
# (filter_height is a project helper called with the 'height' and 'timestamp'
# columns and dhmax; `gps_data` is reassigned in place.)

dhmax=1000  # NOTE(review): unit not stated here -- presumably metres; confirm against filter_height
print("====> apply height variation filter...")
start_time = time.time()
gps_data = filter_height(gps_data, 'height', 'timestamp', dhmax).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply height variation filter...
      time elapsed: 00:00:01


In [19]:
# Row count after the speed, acceleration and height filters.
gps_data.count()

119997

In [20]:
# Apply filter over three fixes (it also recalculates distance column)
# (filter_change_dist_3_fixes is a project helper; `gps_data` is reassigned in place.)

dcol = 'distance'
tscol = 'timestamp'
dmin = 10  # NOTE(review): unit not stated here -- presumably metres; confirm against the helper

print("====> apply three fixes filter...")
start_time = time.time()
gps_data = filter_change_dist_3_fixes(gps_data, dcol, tscol, dmin).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply three fixes filter...
      time elapsed: 00:00:02


In [21]:
# Row count after the three-fixes filter.
gps_data.count()

119967

In [22]:
# Generate missing values up to maximum signal loss
# (fill_timestamp is a project helper; it receives the timestamp and
# fixTypeCode column names plus `interval` and `ws`, both of which are
# assigned in the earlier "align timestamps" cell. `gps_data` is reassigned in place.)

print("====> fill in missing value...")
start_time = time.time()
gps_data = fill_timestamp(gps_data, 'timestamp', 'fixTypeCode', interval, ws).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> fill in missing value...
      time elapsed: 00:00:08


In [23]:
# Preview the first 20 rows after the filtering and fill-in steps, untruncated.
gps_data.show(20,False)

+-------------------+---+---------+-----------+------------------+------+-----+-----------+
|timestamp          |dow|lat      |lon        |distance          |height|speed|fixTypeCode|
+-------------------+---+---------+-----------+------------------+------+-----+-----------+
|2016-06-29 13:29:50|3  |51.0782  |-114.128857|0.0               |0.0   |7.306|2          |
|2016-06-29 13:29:55|3  |51.078183|-114.128778|5.829974470453223 |1044.0|7.055|1          |
|2016-06-29 13:30:00|3  |51.078228|-114.128685|8.195310872577657 |1037.0|6.179|1          |
|2016-06-29 13:30:05|3  |51.078237|-114.128582|7.2601936276979036|1040.0|5.512|1          |
|2016-06-29 13:30:10|3  |51.078243|-114.128482|7.013296329567071 |1040.0|5.024|1          |
|2016-06-29 13:30:15|3  |51.078243|-114.128382|6.981530106484058 |1039.0|5.333|1          |
|2016-06-29 13:30:20|3  |51.078265|-114.128283|7.331343233848628 |1038.0|5.303|1          |
|2016-06-29 13:30:25|3  |51.078277|-114.12819 |6.628342578770214 |1038.0|5.679|1

In [24]:
# Print the schema after the filtering and fill-in steps.
# NOTE(review): the recorded output lists `distance` as string, whereas the
# schema printed right after gen_gps_dataframe listed it as double -- one of
# the intermediate helpers changes the type; confirm whether that is intended.
gps_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- distance: string (nullable = true)
 |-- height: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- fixTypeCode: integer (nullable = true)



In [25]:
# Filter timestamps over given interval
# (select_gps_intervals is a project helper; `gps_data` is reassigned in place.)
INTERVAL = 5  # seconds
ts_name = 'timestamp'
print("====> filter GPS data every {} seconds...".format(str(INTERVAL)))
start_time = time.time()
# cache() + count() mirror the other pipeline steps: without an action Spark
# only builds the lazy plan and the timer below would report ~0 s regardless
# of how expensive the step really is.
gps_data = select_gps_intervals(gps_data, ts_name, INTERVAL).cache()
gps_data.count()
elapsed_time = time.time() - start_time
# Format the elapsed seconds as HH:MM:SS.
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> filter GPS data every 5 seconds...
      time elapsed: 00:00:00


In [26]:
# Keep only the first 20000 fixes for the trip-detection step below.
# limit() on an unordered DataFrame may return an arbitrary subset of rows,
# so sort by timestamp first to make the kept rows the earliest 20000 and the
# selection reproducible across runs.
gps_data = gps_data.orderBy('timestamp').limit(20000)

In [27]:
# Print the schema of the frame that is handed to detect_trips.
gps_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- distance: string (nullable = true)
 |-- height: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- fixTypeCode: integer (nullable = true)



In [28]:
##%%time  (cell magic disabled; timing is done manually below)

# Parameters passed to detect_trips (project helper).
vmax = 130 # km/h
max_dist_per_min = vmax * 1000/60 # meters  -- computed here but not passed to detect_trips below
min_dist_per_min = 25 # meters
min_pause_duration = 120 # second
max_pause_time = 180 # seconds

# Column names passed to detect_trips.
ts_name = 'timestamp'
dist_name = 'distance'
speed_name = 'speed'
fix_type_name = 'fixTypeCode'


###TEST#PARAMETERS###
#min_dist_per_min = 5 # meters
#min_pause_duration = 12 # second
#max_pause_time = 36 # seconds
####################

print("====> detect trips...")
start_time = time.time()
# The result is kept under a new name (gps_data2); `gps_data` itself is not modified here.
gps_data2 = detect_trips(gps_data, ts_name, dist_name, speed_name, fix_type_name, min_dist_per_min, 
                 min_pause_duration, max_pause_time, vmax).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data2.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

#gps_data2.show()

====> detect trips...
      time elapsed: 00:12:51


In [29]:
# Print the schema of the trip-detection result.
gps_data2.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- fixTypeCode: integer (nullable = true)
 |-- dow: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- tripType: integer (nullable = true)
 |-- cum_pause: long (nullable = true)



In [30]:
# Persist the trip-detection result as a CSV directory ("mydata.csv") holding
# a single part file with a header row; timestamps are written as
# 'yyyy-MM-dd HH:mm:ss'.
# - `inferSchema` is a reader-only option and has no effect on a writer, so it
#   is no longer set.
# - mode("overwrite") lets the cell be re-run; with the default save mode the
#   write fails once the output directory exists.
(
    gps_data2
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .option("timestampFormat", "yyyy-MM-dd HH:mm:ss")
    .csv("mydata.csv")
)

In [None]:
# Optional shortcut: reload a previously saved trip-detection result instead of
# recomputing it; this overwrites `gps_data2` with the CSV contents.
# NOTE(review): the path points at a specific part file under 'partial_5a.csv'
# on one machine, not at the 'mydata.csv' output written by the previous cell --
# confirm which file is intended.
gps_data_path = '/Users/molinaro/Documents/GITHUB/HABITUS/partial_5a.csv/part-00000-80943cd6-b7d3-427f-84b1-7988cb76cb55-c000.csv'
gps_data2 = spark.read.csv(gps_data_path, header=True, inferSchema=True)

In [31]:
#%%time  (cell magic disabled; timing is done manually below)

# Parameters passed to classify_trips (project helper).
vehicle_speed_cutoff = 35 # km/h
bicycle_speed_cutoff = 10 # km/h
walk_speed_cutoff = 1 # km/h
speed_percentile = 90
speed_segment_length = 30 # m
min_trip_length = 100 # m
min_trip_duration = 180 # sec

# Column names passed to classify_trips.
ts_name = 'timestamp'
dist_name = 'distance'
speed_name = 'speed'

print("====> classify trips...")
start_time = time.time()
# Note the argument order: min_trip_length and min_trip_duration come before
# speed_segment_length and speed_percentile.
gps_data3 = classify_trips(gps_data2, ts_name, dist_name, speed_name, 
                           vehicle_speed_cutoff, bicycle_speed_cutoff, walk_speed_cutoff, 
                           min_trip_length, min_trip_duration, speed_segment_length, speed_percentile).cache()
# count() is an action: it forces the step to execute so the timer covers the real work.
gps_data3.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> classify trips...
      time elapsed: 00:02:20


In [32]:
# Print up to 20000 rows of the classified trips without truncating values.
gps_data3.show(20000,False)

+-------------------+---+---------+-----------+-----------+----------+--------+-------+
|timestamp          |dow|lat      |lon        |fixTypeCode|tripNumber|tripType|tripMOT|
+-------------------+---+---------+-----------+-----------+----------+--------+-------+
|2016-06-29 13:29:50|3  |51.0782  |-114.128857|2          |1         |1       |1      |
|2016-06-29 13:29:55|3  |51.078183|-114.128778|1          |1         |2       |1      |
|2016-06-29 13:30:00|3  |51.078228|-114.128685|1          |1         |2       |1      |
|2016-06-29 13:30:05|3  |51.078237|-114.128582|1          |1         |2       |1      |
|2016-06-29 13:30:10|3  |51.078243|-114.128482|1          |1         |2       |1      |
|2016-06-29 13:30:15|3  |51.078243|-114.128382|1          |1         |2       |1      |
|2016-06-29 13:30:20|3  |51.078265|-114.128283|1          |1         |2       |1      |
|2016-06-29 13:30:25|3  |51.078277|-114.12819 |1          |1         |2       |1      |
|2016-06-29 13:30:30|3  |51.0783

### Process GPS data in Calgary/gps/A01r.csv

In [None]:
# segment 108

In [None]:
# Show `df` without the listed columns (up to 20000 rows, untruncated).
# NOTE(review): `df` is not assigned by any cell visible in this notebook --
# presumably an intermediate frame left in the kernel; confirm its origin.
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
# Run proc_segment on `df` with column 'i3' and trailing arguments (0, 0, "CASE3");
# the result is stored in `df2` (no checkpoint here) and shown without the listed columns.
df2 = proc_segment(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3")
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
# segment 64

In [None]:
# Apply set_pause, then check_case, to `df` with 'i1' as the column argument,
# checkpointing after each call; show the result without the listed columns.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# Run proc_segment on `df` with column 'j1' and trailing arguments (3, 2, "CASE1"),
# checkpoint the result, then show it without the listed columns.
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df` ('i1'), each checkpointed; show the result.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df` ('i1'), each checkpointed; show the result.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df` ('i1'), each checkpointed; show the result.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df` ('i1'), each checkpointed; show the result.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df` ('i1'), each checkpointed; show the result.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df` ('i1'), each checkpointed; show the result.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df` ('i1'), each checkpointed; show the result.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df` ('i1'), each checkpointed; show the result.
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# Run proc_segment on `df` with column 'j4' and trailing arguments (0, 0, "CASE4");
# the checkpointed result starts a new frame `df2`. Note the drop list here
# ends with 'j3' (not 'j4'), so the 'j4' column stays visible in the output.
df2 = proc_segment(df, 'j4', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE4").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j3']).show(20000, False)

In [None]:
# set_pause + check_case on `df2` ('i1'), each checkpointed; show the result without the listed columns.
df2 = set_pause(df2, 'i1', ts_name).checkpoint()
df2 = check_case(df2, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# Run proc_segment on `df2` with column 'j2' and trailing arguments (3, 3, "CASE2"),
# checkpoint the result, then show it without the listed columns.
df2 = proc_segment(df2, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df2` ('i1'), each checkpointed; show the result.
df2 = set_pause(df2, 'i1', ts_name).checkpoint()
df2 = check_case(df2, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df2` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df2 = proc_segment(df2, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df2` ('i1'), each checkpointed; show the result.
df2 = set_pause(df2, 'i1', ts_name).checkpoint()
df2 = check_case(df2, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# Run proc_segment on `df2` with column 'j3' and trailing arguments (0, 0, "CASE3");
# the checkpointed result starts a new frame `df3` (the frame queried by the
# comparison cells in the PALMS section). Show it without the listed columns.
df3 = proc_segment(df2, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# set_pause + check_case on `df3` ('i1'), each checkpointed; show the result without the listed columns.
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3 = check_case(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df3` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df3 = proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df3` ('i1'), each checkpointed; show the result.
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3 = check_case(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# Run proc_segment on `df3` with column 'j3' and trailing arguments (0, 0, "CASE3");
# the checkpointed result starts a new frame `df4`. Show it without the listed columns.
df4 = proc_segment(df3, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# set_pause + check_case on `df4` ('i1'), each checkpointed; show the result without the listed columns.
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df4` itself ('j3', 0, 0, "CASE3"), checkpointed; show the result.
df4 = proc_segment(df4, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df4` ('i1'), each checkpointed; show the result.
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df4` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df4 = proc_segment(df4, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Next manual iteration: set_pause + check_case on `df4` ('i1'), each checkpointed; show the result.
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df4` ('j1', 3, 2, "CASE1"), checkpointed; show the result.
df4 = proc_segment(df4, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# segment 28

In [None]:
# Apply set_pause to `df` with 'i1' as the column argument and checkpoint it;
# show the result without the listed columns (no 'j4' in this drop list).
# NOTE(review): `df` is reassigned in place, so the outcome depends on which
# cells were run before this one.
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
# Apply check_case to `df` ('i1') with the pause/distance thresholds, checkpoint, and show the result.
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; the drop list here
# hides 'i1' and 'i2' (so 'i3' stays visible in the output).
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
# Remove the 'j1'..'j4' columns from `df` before the next iteration, then
# re-apply set_pause ('i1'), checkpoint, and show the result.
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
# check_case on `df` ('i1'), checkpointed; show the result without the listed columns.
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result ('i1'/'i2' hidden).
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
# Remove the 'j1'..'j4' columns, re-apply set_pause ('i1') with a checkpoint, and show the result.
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
# check_case on `df` ('i1'), checkpointed; show the result without the listed columns.
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# proc_segment on `df` ('j1', 3, 2, "CASE1"), checkpointed; show the result ('i1'/'i2' hidden).
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i1' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1', checkpointed; display without the listed columns.
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE2" step on 'j2' (trailing args 3, 3); checkpoint, then display.
df = proc_segment(
    df, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, then set_pause and check_case on 'i1' (checkpoint after each); display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE3" step on 'j3' (trailing args 0, 0); checkpoint, then display.
# The result is stored in df2; df itself is not reassigned in this cell.
df2 = proc_segment(
    df, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3",
).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# segment 13

In [None]:
# set_pause on 'i1', checkpointed; then display.
# NOTE(review): no drop of j1..j4 before set_pause here, unlike later cells;
# confirm df is in the expected state when this cell runs.
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1', checkpointed; display without the listed columns.
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i1' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1', checkpointed; display without the listed columns.
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i1' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1', checkpointed; display without the listed columns.
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i1' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1', checkpointed; display without the listed columns.
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i1' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1', checkpointed; display without the listed columns.
df = check_case(
    df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE3" step on 'j3' (trailing args 0, 0); checkpoint, then display.
df = proc_segment(
    df, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Branch df2 off df without j1..j4, run set_pause on 'i1' and checkpoint; then display.
df2 = df.drop('j1', 'j2', 'j3', 'j4')
df2 = set_pause(df2, 'i1', ts_name).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1' for df2, checkpointed; display without the listed columns.
df2 = check_case(
    df2, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE2" step on 'j2' (trailing args 3, 3) for df2; checkpoint, then display.
df2 = proc_segment(
    df2, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2",
).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Branch df3 off df2 without j1..j4, run set_pause on 'i1' and checkpoint; then display.
df3 = df2.drop('j1', 'j2', 'j3', 'j4')
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1' for df3, checkpointed; display without the listed columns.
df3 = check_case(
    df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE2" step on 'j2' (trailing args 3, 3) for df3; checkpoint, then display.
df3 = proc_segment(
    df3, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2",
).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4 from df3, rerun set_pause on 'i1' and checkpoint; then display.
df3 = df3.drop('j1', 'j2', 'j3', 'j4')
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1' for df3, checkpointed; display without the listed columns.
df3 = check_case(
    df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2) for df3; checkpoint, then display.
df3 = proc_segment(
    df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4 from df3, rerun set_pause on 'i1' and checkpoint; then display.
df3 = df3.drop('j1', 'j2', 'j3', 'j4')
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i1' for df3, checkpointed; display without the listed columns.
df3 = check_case(
    df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i3', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2) for df3; checkpoint, then display.
df3 = proc_segment(
    df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# segment 108
# set_pause on 'i3', checkpointed; then display.
# NOTE(review): no drop of j1..j4 before set_pause here, unlike later cells;
# confirm df is in the expected state when this cell runs.
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i3', checkpointed; display without the listed columns.
df = check_case(
    df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i3' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i3', checkpointed; display without the listed columns.
df = check_case(
    df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i3' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i3', checkpointed; display without the listed columns.
df = check_case(
    df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i3' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i3', checkpointed; display without the listed columns.
df = check_case(
    df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i3' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i3', checkpointed; display without the listed columns.
df = check_case(
    df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i3' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i3', checkpointed; display without the listed columns.
df = check_case(
    df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i3' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i3', checkpointed; display without the listed columns.
df = check_case(
    df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE2" step on 'j2' (trailing args 3, 3); checkpoint, then display.
# The result is stored in df2; df itself is not reassigned in this cell.
df2 = proc_segment(
    df, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2",
).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4 from df2, rerun set_pause on 'i3' and checkpoint; then display.
df2 = df2.drop('j1', 'j2', 'j3', 'j4')
df2 = set_pause(df2, 'i3', ts_name).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i3' for df2, checkpointed; display without the listed columns.
df2 = check_case(
    df2, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE3" step on 'j3' (trailing args 0, 0), reading df2 and writing df3;
# checkpoint, then display.
df3 = proc_segment(
    df2, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3",
).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i1', 'j4',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4 from df3, rerun set_pause on 'i3' and checkpoint; then display.
df3 = df3.drop('j1', 'j2', 'j3', 'j4')
df3 = set_pause(df3, 'i3', ts_name).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i2',
).show(n=20000, truncate=False)

In [None]:
# segment 166
# set_pause on 'i2', checkpointed; then display.
# NOTE(review): no drop of j1..j4 before set_pause here, unlike later cells;
# confirm df is in the expected state when this cell runs.
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2', checkpointed; display without the listed columns.
df = check_case(
    df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i2' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2', checkpointed; display without the listed columns.
df = check_case(
    df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE2" step on 'j2' (trailing args 3, 3); checkpoint, then display.
df = proc_segment(
    df, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i2' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2', checkpointed; display without the listed columns.
df = check_case(
    df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i2' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2', checkpointed; display without the listed columns.
df = check_case(
    df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i2' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2', checkpointed; display without the listed columns.
df = check_case(
    df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i2' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2', checkpointed; display without the listed columns.
df = check_case(
    df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE1" step on 'j1' (trailing args 3, 2); checkpoint, then display.
df = proc_segment(
    df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1",
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# Discard j1..j4, rerun set_pause on 'i2' and checkpoint; then display.
df = df.drop('j1', 'j2', 'j3', 'j4')
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2', checkpointed; this display also hides 'j3'.
df = check_case(
    df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j3',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE4" step on 'j4' (trailing args 0, 0); checkpoint, then display.
# The result is stored in df2; df itself is not reassigned in this cell.
df2 = proc_segment(
    df, 'j4', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE4",
).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j3',
).show(n=20000, truncate=False)

In [None]:
# Discard only j1..j3 (j4 is kept) and run set_pause on 'j4', checkpointed; then display.
# NOTE(review): the other set_pause cells pass an i-column; confirm 'j4' is intended here.
df2 = df2.drop('j1', 'j2', 'j3')
df2 = set_pause(df2, 'j4', ts_name).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2' for df2, checkpointed; display without the listed columns.
df2 = check_case(
    df2, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4',
).show(n=20000, truncate=False)

In [None]:
# proc_segment "CASE3" step on 'j3' (trailing args 0, 0) for df2; checkpoint, then
# display with 'j4', 'j1' and 'j2' hidden as well.
df2 = proc_segment(
    df2, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3",
).checkpoint()
df2.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4', 'j1', 'j2',
).show(n=20000, truncate=False)

In [None]:
# Branch df3 off df2 without j1..j4, run set_pause on 'i2' and checkpoint; then display.
df3 = df2.drop('j1', 'j2', 'j3', 'j4')
df3 = set_pause(df3, 'i2', ts_name).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3',
).show(n=20000, truncate=False)

In [None]:
# check_case on 'i2' for df3, checkpointed; display without the listed columns.
df3 = check_case(
    df3, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time,
).checkpoint()
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec', 'height',
    'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4',
).show(n=20000, truncate=False)

In [None]:
# "CASE1" segment processing keyed on 'j1' (trailing numeric arguments 3 and 2).
df3 = (
    proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 2, "CASE1")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3'
).show(n=20000, truncate=False)

In [None]:
# Drop the j1..j4 columns, then re-apply set_pause keyed on 'i2'.
df3 = df3.drop('j1', 'j2', 'j3', 'j4')
df3 = (
    set_pause(df3, 'i2', ts_name)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3'
).show(n=20000, truncate=False)

In [None]:
# check_case pass keyed on 'i2'; checkpoint() cuts the accumulated query plan.
df3 = (
    check_case(df3, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# "CASE1" segment processing keyed on 'j1' (trailing numeric arguments 3 and 2).
df3 = (
    proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 2, "CASE1")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3'
).show(n=20000, truncate=False)

In [None]:
# Drop the j1..j4 columns, then re-apply set_pause keyed on 'i2'.
df3 = df3.drop('j1', 'j2', 'j3', 'j4')
df3 = (
    set_pause(df3, 'i2', ts_name)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3'
).show(n=20000, truncate=False)

In [None]:
# check_case pass keyed on 'i2'; checkpoint() cuts the accumulated query plan.
df3 = (
    check_case(df3, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# "CASE1" segment processing keyed on 'j1' (trailing numeric arguments 3 and 2).
df3 = (
    proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 2, "CASE1")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4',
    'j1', 'j2'
).show(n=20000, truncate=False)

In [None]:
# Display df3 again, this time hiding i2/i3 and all j* columns (so i1 stays visible).
df3.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4',
    'j1', 'j2', 'j3'
).show(n=20000, truncate=False)

In [None]:
# segment 180
# "CASE1" segment processing keyed on 'i1'; the result starts a new frame, df4.
df4 = (
    proc_segment(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 2, "CASE1")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df4.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# Drop the j1..j4 columns, then re-apply set_pause keyed on 'i1'.
df4 = df4.drop('j1', 'j2', 'j3', 'j4')
df4 = (
    set_pause(df4, 'i1', ts_name)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df4.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3'
).show(n=20000, truncate=False)

In [None]:
# check_case pass keyed on 'i1'; checkpoint() cuts the accumulated query plan.
df4 = (
    check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df4.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# "CASE1" segment processing keyed on 'j1' (trailing numeric arguments 3 and 2).
df4 = (
    proc_segment(df4, 'j1', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 2, "CASE1")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df4.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# Drop the j1..j4 columns, then re-apply set_pause keyed on 'i1'.
df4 = df4.drop('j1', 'j2', 'j3', 'j4')
df4 = (
    set_pause(df4, 'i1', ts_name)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df4.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3'
).show(n=20000, truncate=False)

In [None]:
# check_case pass keyed on 'i1'; checkpoint() cuts the accumulated query plan.
df4 = (
    check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df4.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# "CASE2" segment processing keyed on 'j2' (trailing numeric arguments 3 and 3).
df4 = (
    proc_segment(df4, 'j2', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 3, "CASE2")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df4.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3'
).show(n=20000, truncate=False)

In [None]:
# Drop the j1..j4 columns, then re-apply set_pause keyed on 'i1'.
df4 = df4.drop('j1', 'j2', 'j3', 'j4')
df4 = (
    set_pause(df4, 'i1', ts_name)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df4.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3'
).show(n=20000, truncate=False)

In [None]:
# check_case pass keyed on 'i1', reading df4 and starting a new frame, df5.
df5 = (
    check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# "CASE2" segment processing keyed on 'j2' (trailing numeric arguments 3 and 3).
df5 = (
    proc_segment(df5, 'j2', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 3, "CASE2")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3'
).show(n=20000, truncate=False)

In [None]:
# Drop the j1..j4 columns, then re-apply set_pause keyed on 'i1'.
df5 = df5.drop('j1', 'j2', 'j3', 'j4')
df5 = (
    set_pause(df5, 'i1', ts_name)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3'
).show(n=20000, truncate=False)

In [None]:
# check_case pass keyed on 'i1'; checkpoint() cuts the accumulated query plan.
df5 = (
    check_case(df5, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# "CASE2" segment processing keyed on 'j2' (trailing numeric arguments 3 and 3).
df5 = (
    proc_segment(df5, 'j2', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 3, "CASE2")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3'
).show(n=20000, truncate=False)

In [None]:
# Drop the j1..j4 columns, then re-apply set_pause keyed on 'i1'.
df5 = df5.drop('j1', 'j2', 'j3', 'j4')
df5 = (
    set_pause(df5, 'i1', ts_name)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3'
).show(n=20000, truncate=False)

In [None]:
# check_case pass keyed on 'i1'; checkpoint() cuts the accumulated query plan.
df5 = (
    check_case(df5, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# "CASE3" segment processing keyed on 'j3' (trailing numeric arguments 0 and 0).
df5 = (
    proc_segment(df5, 'j3', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 0, 0, "CASE3")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# Display df5 again, hiding i2/i3/j4 this time (so i1 stays visible).
df5.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
#segment 188
# "CASE1" segment processing keyed on 'j1'; the result starts a new frame, df6.
df6 = (
    proc_segment(df5, 'j1', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 3, 2, "CASE1")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df6.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# Drop the j1..j4 columns, then re-apply set_pause keyed on 'i1'.
df6 = df6.drop('j1', 'j2', 'j3', 'j4')
df6 = (
    set_pause(df6, 'i1', ts_name)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df6.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3'
).show(n=20000, truncate=False)

In [None]:
# check_case pass keyed on 'i1'; checkpoint() cuts the accumulated query plan.
df6 = (
    check_case(df6, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time)
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df6.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i2', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# "CASE3" segment processing keyed on 'j3' (trailing numeric arguments 0 and 0).
df6 = (
    proc_segment(df6, 'j3', ts_name, min_dist_per_min, min_pause_duration,
                 max_pause_time, 0, 0, "CASE3")
    .checkpoint()
)
# Print up to 20000 untruncated rows with the listed columns hidden.
df6.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'i1', 'i3', 'j4'
).show(n=20000, truncate=False)

In [None]:
# Display df6 with every j* column hidden (the i* columns stay visible).
df6.drop(
    'lat', 'lon', 'dow', 'distance', 'fixTypeCode', 'cum_pause', 'total_sec',
    'height', 'speed', 'heading', 'state_cp', 'tripType_cp', 'j4', 'j1', 'j2',
    'j3'
).show(n=20000, truncate=False)

In [None]:
def push_and_pop(rdd):
    """Return a new RDD with the head element moved to the tail.

    Adds two transformations (filter + union) to the lineage of ``rdd``.
    Every element equal to the head is filtered out, so duplicates of the
    head value are dropped and only one copy is appended at the end.
    """
    head = rdd.first()  # first() is an action: it evaluates the RDD
    without_head = rdd.filter(lambda item: item != head)
    tail = sc.parallelize([head])
    return without_head.union(tail)

def serialize_and_deserialize(rdd):
    """Round-trip ``rdd`` through the driver and return a fresh RDD.

    collect() is an action, so the input is fully evaluated; the returned RDD
    is built from the collected list and therefore starts a new lineage.
    """
    materialized = rdd.collect()
    return sc.parallelize(materialized)

def do_test(serialize=False):
    """Time 25 successive push_and_pop steps on a 1000-element RDD.

    Each iteration stacks another filter/union pair on top of the previous
    RDD and prints the wall-clock time of that iteration in seconds
    (three decimals).

    Args:
        serialize: when True, the RDD is additionally passed through
            serialize_and_deserialize after every step, i.e. collected to the
            driver and re-parallelized, before the time is printed.
    """
    rdd = sc.parallelize(range(1000))
    # range() rather than the Python-2-only xrange(), which raises NameError on
    # Python 3; for 25 iterations the two are equivalent on Python 2 as well.
    for _ in range(25):
        t0 = time.time()
        rdd = push_and_pop(rdd)
        if serialize:
            rdd = serialize_and_deserialize(rdd)
        print("%.3f" % (time.time() - t0))

# Run the timing loop with serialize left at its default (False).
do_test()

In [None]:
# Sum `distance` over consecutive 60-second windows (window length == slide).
tw = '{} seconds'.format(60)
sw = '{} seconds'.format(60)

# Window start offset: the seconds component of the first field of the first row.
st_sec = str(df.first()[0].second)

offset = '{} seconds'.format(st_sec)

param_name = 'distance'

(
    df.groupBy(F.window(ts_name, tw, sw, offset))
      .sum(param_name)
      .select('window', 'sum({})'.format(param_name))
      .withColumn('start', F.col('window').start)
      .withColumn('end', F.col('window').end)
      .drop('window')
      .show(20, False)
)
# The string literal below is a disabled variant of the query, kept for reference.
"""
df.groupBy(
            F.window(ts_name, tw, sw, offset)
          ).sum()\
           .sort('window.start')\
           .filter(F.col('sum({})'.format(param_name))>=min_dist_per_min)\
           .select('window')\
           .withColumn('start', F.col('window').start)\
           .withColumn('end', F.col('window').end)\
           .drop('window')\
           .show(20, False)
"""

## Accelerometer data processing

# Toy frame for the window query that follows: 16 consecutive daily readings,
# ids 1..16 dated 2015-01-01 .. 2015-01-16.
# NOTE: this rebinds the name `df`, which the cells above use for other data.
row = Row("id", "date", "value")
df = (
    sc.parallelize([
        row(day, "2015-01-{:02d}".format(day), reading)
        for day, reading in enumerate(
            [20.0, 30.0, 0.0, 30.0, 50.0, 0.0, 0.0, 0.0,
             0.0, 0.0, 20.0, 0.0, 0.0, 40.0, 8.0, 0.0],
            1)
    ])
    .toDF()
    .withColumn("date", F.col("date").cast("timestamp"))
)

df.show(20, False)

# One-day windows (1440 minutes long, sliding by 86400 seconds, no offset).
tw = '{} minutes'.format(24 * 60)
sw = '{} seconds'.format(24 * 3600)
offset = '{} seconds'.format(0)
datetime_name = 'date'
param_name = 'value'
param_value = 0

# List the start/end of every window whose average `value` equals param_value.
(
    df.groupBy(F.window(datetime_name, tw, sw, offset))
      .avg(param_name)
      .sort('window.start')
      .filter(F.col('avg({})'.format(param_name)) == param_value)
      .select('window')
      .withColumn('start', F.col('window').start)
      .withColumn('end', F.col('window').end)
      .drop('window')
      .show(20, False)
)

In [None]:
# Alternative accelerometer input; the next cell assigns acc_path_raw again.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
acc_path_raw = '/Users/molinaro/Documents/GITHUB/PALMS/data/PARC_data/NYC/accelerometer/2001_1s.csv'

In [33]:
# Accelerometer file read by the cells below.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
acc_path_raw = '/Users/molinaro/Documents/GITHUB/HABITUS/test_data/acc/A02_5sec.csv'

In [34]:
# Read the file as plain text: one row per line in a single string column `value`.
acc_data_raw = spark.read.text(acc_path_raw)
# cache() is lazy; the data is materialised by the first action that runs on it.
acc_data_raw.cache()

DataFrame[value: string]

In [35]:
# Number of raw text lines; being an action, this also fills the cache.
acc_data_raw.count()

144253

In [36]:
# gen_acc_dataframe (from AccProcessing) returns a pair: `interval` (printed as 5 below)
# and a DataFrame with columns `timestamp` and `acc_data`.
interval, acc_data = gen_acc_dataframe(acc_data_raw)

In [37]:
# Show the detected interval and the first 20 parsed rows (untruncated).
print(interval)
acc_data.show(20,False)

5
+-------------------+-----------------+
|timestamp          |acc_data         |
+-------------------+-----------------+
|2016-06-29 11:51:00|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:05|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:10|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:15|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:20|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:25|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:30|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:35|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:40|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:45|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:50|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:51:55|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:52:00|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:52:05|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:52:10|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:52:15|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:52:20|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:52:25|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:52:30|0,0,0,0,0,0,0,5,0|
|2016-06-29 11:52:35|0,0,0,0,0,0,0,5,0|
+-------------------+-----------------+
only showing top 20 rows



In [38]:
# Confirm the column types of the parsed accelerometer frame.
acc_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- acc_data: string (nullable = true)



In [39]:
# Names for the nine comma-separated fields stored in `acc_data`.
acc_columns = [
    'axis1', 'axis2', 'axis3',
    'steps', 'lux',
    'incl_off', 'incl_standing', 'incl_sitting', 'incl_lying',
]
# Split the packed string into one column per name.
acc_data_ext = split_acc_data(acc_data, acc_columns)
acc_data_ext.cache()

DataFrame[timestamp: timestamp, axis1: int, axis2: int, axis3: int, steps: int, lux: int, incl_off: int, incl_standing: int, incl_sitting: int, incl_lying: int]

In [40]:
# Select accelerometer records with select_acc_intervals.
ts_name = 'timestamp'
window = 5 #seconds

# NOTE(review): the meaning of the two trailing positional flags (False, True) is not
# visible here — confirm against select_acc_intervals in AccProcessing.
acc_data_act = select_acc_intervals(acc_data_ext, ts_name, interval, window, False, True)
# Prints up to 10000 rows, untruncated.
acc_data_act.show(10000, False)

+-------------------+-----+-----+-----+-----+----+--------+-------------+------------+----------+
|timestamp          |axis1|axis2|axis3|steps|lux |incl_off|incl_standing|incl_sitting|incl_lying|
+-------------------+-----+-----+-----+-----+----+--------+-------------+------------+----------+
|2016-06-29 11:51:00|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:05|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:10|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:15|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:20|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:25|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:30|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:35

In [41]:
# Activity cut-offs passed to activity_count. Earlier candidate sets, kept for reference:
#LightCO, ModerateCO, HardCO, VeryHardCO = (100, 1953, 5725, 99999)
#LightCO, ModerateCO, HardCO, VeryHardCO = (100, 1953, 5725, 9498)
#LightCO, ModerateCO, HardCO, VeryHardCO = (500, 2000, 3000, 4500)
LightCO = 133
ModerateCO = 193
HardCO = 233
VeryHardCO = 9999
window = 5 #seconds
# NOTE: acc_data_act is both input and output, so re-running this cell feeds the
# activity_count result back into activity_count.
acc_data_act = activity_count(
    acc_data_act, 'timestamp', window,
    LightCO, ModerateCO, HardCO, VeryHardCO,
    False,
)
acc_data_act.cache()

DataFrame[timestamp: timestamp, activity: int, activityIntensity: string]

In [42]:
# Inspect the activity/intensity classification (up to 10000 rows, untruncated).
acc_data_act.show(10000, False)

+-------------------+--------+-----------------+
|timestamp          |activity|activityIntensity|
+-------------------+--------+-----------------+
|2016-06-29 11:51:00|0       |0                |
|2016-06-29 11:51:05|0       |0                |
|2016-06-29 11:51:10|0       |0                |
|2016-06-29 11:51:15|0       |0                |
|2016-06-29 11:51:20|0       |0                |
|2016-06-29 11:51:25|0       |0                |
|2016-06-29 11:51:30|0       |0                |
|2016-06-29 11:51:35|0       |0                |
|2016-06-29 11:51:40|0       |0                |
|2016-06-29 11:51:45|0       |0                |
|2016-06-29 11:51:50|0       |0                |
|2016-06-29 11:51:55|0       |0                |
|2016-06-29 11:52:00|0       |0                |
|2016-06-29 11:52:05|0       |0                |
|2016-06-29 11:52:10|0       |0                |
|2016-06-29 11:52:15|0       |0                |
|2016-06-29 11:52:20|0       |0                |
|2016-06-29 11:52:25

In [43]:
# Inspect the per-axis columns for comparison (up to 10000 rows, untruncated).
acc_data_ext.show(10000, False)

+-------------------+-----+-----+-----+-----+----+--------+-------------+------------+----------+
|timestamp          |axis1|axis2|axis3|steps|lux |incl_off|incl_standing|incl_sitting|incl_lying|
+-------------------+-----+-----+-----+-----+----+--------+-------------+------------+----------+
|2016-06-29 11:51:00|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:05|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:10|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:15|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:20|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:25|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:30|0    |0    |0    |0    |0   |0       |0            |5           |0         |
|2016-06-29 11:51:35

In [44]:
# DETERMINE NON-WEAR PERIOD
ts_name = 'timestamp'
AC_name = 'activity'
AI_name = 'activityIntensity'
new_col = 'non_wear'  # not passed to non_wear_filter below
window = 5
# presumably the length, in minutes, of a run of zeros that counts as non-wear —
# TODO confirm against non_wear_filter in AccProcessing
minutes_zeros_row = 90

acc_data_act = non_wear_filter(acc_data_act, ts_name, AC_name, AI_name, window, minutes_zeros_row)
# Caching is left disabled here; the former bare `acc_data_act` expression did nothing
# and has been removed.
acc_data_act.count()

144243

In [45]:
# DETERMINE ACTIVITY BOUT NUMBER
# The result column is named by new_col ('activityBoutNumber').
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'activityBoutNumber'
window = 5
UP = 9999
LOW = 1953
DURATION = 10
TOL = 2

acc_data_act = activity_bout_filter(
    acc_data_act, ts_name, AC_name, new_col, window, UP, LOW, DURATION, TOL
)
acc_data_act.cache()
# count() is an action, so it also materialises the cache.
acc_data_act.count()

144243

In [54]:
# Preview the first 20 rows (untruncated), now including activityBoutNumber.
acc_data_act.show(20, False)

+-------------------+--------+-----------------+------------------+
|timestamp          |activity|activityIntensity|activityBoutNumber|
+-------------------+--------+-----------------+------------------+
|2016-06-29 11:51:00|0       |0                |0                 |
|2016-06-29 11:51:05|0       |0                |0                 |
|2016-06-29 11:51:10|0       |0                |0                 |
|2016-06-29 11:51:15|0       |0                |0                 |
|2016-06-29 11:51:20|0       |0                |0                 |
|2016-06-29 11:51:25|0       |0                |0                 |
|2016-06-29 11:51:30|0       |0                |0                 |
|2016-06-29 11:51:35|0       |0                |0                 |
|2016-06-29 11:51:40|0       |0                |0                 |
|2016-06-29 11:51:45|0       |0                |0                 |
|2016-06-29 11:51:50|0       |0                |0                 |
|2016-06-29 11:51:55|0       |0                |

In [None]:
# Disabled snippet: concatenates the CSV files under PALMS_output/ into a single
# PALMS_output.csv, writing the header line only once.
# import glob
# list_procs = sorted(glob.glob("PALMS_output/*.csv"))
# print(list_procs)
# header_saved = False
# with open('PALMS_output.csv', 'w') as fout:
#     for filename in list_procs:
#         with open(filename) as fin:
#             head = next(fin)
#             if not header_saved:
#                 fout.write(head)
#                 header_saved = True
#             for line in fin:
#                 fout.write(line)

In [46]:
# DETERMINE SEDENTARY BOUT NUMBER
# The result column is named by new_col ('sedentaryBoutNumber').
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'sedentaryBoutNumber'
window = 5
UP = 180
LOW = 0
DURATION = 30
TOL = 1

acc_data_act = sedentary_bout_filter(
    acc_data_act, ts_name, AC_name, new_col, window, UP, LOW, DURATION, TOL
)
acc_data_act.cache()
# count() is an action, so it also materialises the cache.
acc_data_act.count()

144243

In [57]:
# Preview the first 20 rows (untruncated), now including sedentaryBoutNumber.
acc_data_act.show(20,False)

+-------------------+--------+-----------------+------------------+-------------------+
|timestamp          |activity|activityIntensity|activityBoutNumber|sedentaryBoutNumber|
+-------------------+--------+-----------------+------------------+-------------------+
|2016-06-29 11:51:00|0       |0                |0                 |0                  |
|2016-06-29 11:51:05|0       |0                |0                 |0                  |
|2016-06-29 11:51:10|0       |0                |0                 |0                  |
|2016-06-29 11:51:15|0       |0                |0                 |0                  |
|2016-06-29 11:51:20|0       |0                |0                 |0                  |
|2016-06-29 11:51:25|0       |0                |0                 |0                  |
|2016-06-29 11:51:30|0       |0                |0                 |0                  |
|2016-06-29 11:51:35|0       |0                |0                 |0                  |
|2016-06-29 11:51:40|0       |0 

## Merge dataframes

In [47]:
# Left join: keep every GPS fix and attach the packed accelerometer string
# that shares its timestamp; order the result chronologically.
merge_data = (
    gps_data3
    .join(acc_data, on='timestamp', how='left')
    .orderBy('timestamp')
)
merge_data.cache()
# count() is an action, so it also materialises the cache.
merge_data.count()

20000

In [48]:
# Preview the first 20 joined rows without truncating cell values.
merge_data.show(20, False)

+-------------------+---+---------+-----------+-----------+----------+--------+-------+-----------------------+
|timestamp          |dow|lat      |lon        |fixTypeCode|tripNumber|tripType|tripMOT|acc_data               |
+-------------------+---+---------+-----------+-----------+----------+--------+-------+-----------------------+
|2016-06-29 13:29:50|3  |51.0782  |-114.128857|2          |1         |1       |1      |123,276,205,5,0,0,5,0,0|
|2016-06-29 13:29:55|3  |51.078183|-114.128778|1          |1         |2       |1      |126,289,211,5,0,0,5,0,0|
|2016-06-29 13:30:00|3  |51.078228|-114.128685|1          |1         |2       |1      |146,311,204,5,0,0,5,0,0|
|2016-06-29 13:30:05|3  |51.078237|-114.128582|1          |1         |2       |1      |131,296,204,6,0,0,5,0,0|
|2016-06-29 13:30:10|3  |51.078243|-114.128482|1          |1         |2       |1      |104,275,196,5,0,0,5,0,0|
|2016-06-29 13:30:15|3  |51.078243|-114.128382|1          |1         |2       |1      |110,302,200,5,0,0

In [49]:
# Confirm the column types of the GPS + raw accelerometer join.
merge_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- fixTypeCode: integer (nullable = true)
 |-- tripNumber: long (nullable = true)
 |-- tripType: integer (nullable = true)
 |-- tripMOT: string (nullable = true)
 |-- acc_data: string (nullable = true)



In [51]:
# Left join: keep every GPS fix and attach the activity/bout columns that
# share its timestamp; order the result chronologically.
merge_data_act = (
    gps_data3
    .join(acc_data_act, on=['timestamp'], how='left')
    .orderBy('timestamp')
)
merge_data_act.count()

20000

In [52]:
# Confirm the column types of the GPS + activity join.
merge_data_act.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- fixTypeCode: integer (nullable = true)
 |-- tripNumber: long (nullable = true)
 |-- tripType: integer (nullable = true)
 |-- tripMOT: string (nullable = true)
 |-- activity: integer (nullable = true)
 |-- activityIntensity: string (nullable = true)
 |-- activityBoutNumber: integer (nullable = true)
 |-- sedentaryBoutNumber: integer (nullable = true)



In [53]:
# Preview the first 40 rows as a pandas frame. limit() is applied on the Spark
# side so only those 40 rows are collected to the driver, not the whole join.
# NOTE: pandas dtypes are now inferred from these 40 rows only, so a nullable
# integer column may render as int here instead of float.
merge_data_act.limit(40).toPandas()

Unnamed: 0,timestamp,dow,lat,lon,fixTypeCode,tripNumber,tripType,tripMOT,activity,activityIntensity,activityBoutNumber,sedentaryBoutNumber
0,2016-06-29 13:29:50,3,51.0782,-114.128857,2,1,1,1,123,3,0,0
1,2016-06-29 13:29:55,3,51.078183,-114.128778,1,1,2,1,126,3,0,0
2,2016-06-29 13:30:00,3,51.078228,-114.128685,1,1,2,1,146,3,0,0
3,2016-06-29 13:30:05,3,51.078237,-114.128582,1,1,2,1,131,3,0,0
4,2016-06-29 13:30:10,3,51.078243,-114.128482,1,1,2,1,104,3,0,0
5,2016-06-29 13:30:15,3,51.078243,-114.128382,1,1,2,1,110,3,0,0
6,2016-06-29 13:30:20,3,51.078265,-114.128283,1,1,2,1,108,3,0,0
7,2016-06-29 13:30:25,3,51.078277,-114.12819,1,1,2,1,105,3,0,0
8,2016-06-29 13:30:30,3,51.0783,-114.128093,1,1,2,1,105,3,0,0
9,2016-06-29 13:30:35,3,51.078315,-114.128002,1,1,2,1,108,3,0,0


In [54]:
# Left join: keep every GPS fix and attach the split accelerometer columns
# that share its timestamp; order the result chronologically.
merge_data_ext = gps_data3.join(acc_data_ext, 'timestamp', how='left').orderBy('timestamp')

In [58]:
# Preview the first 20 rows as a pandas frame. limit() is applied on the Spark
# side so only those 20 rows are collected to the driver, not the whole join.
# NOTE: pandas dtypes are now inferred from these 20 rows only, so a nullable
# integer column may render as int here instead of float.
merge_data_ext.limit(20).toPandas()

Unnamed: 0,timestamp,dow,lat,lon,fixTypeCode,tripNumber,tripType,tripMOT,axis1,axis2,axis3,steps,lux,incl_off,incl_standing,incl_sitting,incl_lying
0,2016-06-29 13:29:50,3,51.0782,-114.128857,2,1,1,1,123,276,205,5,0,0,5,0,0
1,2016-06-29 13:29:55,3,51.078183,-114.128778,1,1,2,1,126,289,211,5,0,0,5,0,0
2,2016-06-29 13:30:00,3,51.078228,-114.128685,1,1,2,1,146,311,204,5,0,0,5,0,0
3,2016-06-29 13:30:05,3,51.078237,-114.128582,1,1,2,1,131,296,204,6,0,0,5,0,0
4,2016-06-29 13:30:10,3,51.078243,-114.128482,1,1,2,1,104,275,196,5,0,0,5,0,0
5,2016-06-29 13:30:15,3,51.078243,-114.128382,1,1,2,1,110,302,200,5,0,0,5,0,0
6,2016-06-29 13:30:20,3,51.078265,-114.128283,1,1,2,1,108,299,194,5,0,0,5,0,0
7,2016-06-29 13:30:25,3,51.078277,-114.12819,1,1,2,1,105,297,211,6,0,0,5,0,0
8,2016-06-29 13:30:30,3,51.0783,-114.128093,1,1,2,1,105,277,207,5,0,0,5,0,0
9,2016-06-29 13:30:35,3,51.078315,-114.128002,1,1,2,1,108,280,189,5,0,0,5,0,0


In [76]:
# Reverse direction: keep every accelerometer record and attach the GPS fix
# that shares its timestamp (GPS columns are null where there is no fix).
merge_data2 = (
    acc_data
    .join(gps_data3, on='timestamp', how='left')
    .orderBy('timestamp')
)
merge_data2.cache()
# count() is an action, so it also materialises the cache.
merge_data2.count()

144245

In [77]:
# Print up to 20000 rows; cell values are truncated (show's default).
merge_data2.show(20000)

+-------------------+--------------------+----+---------+-----------+-----------+----------+--------+-------+
|          timestamp|            acc_data| dow|      lat|        lon|fixTypeCode|tripNumber|tripType|tripMOT|
+-------------------+--------------------+----+---------+-----------+-----------+----------+--------+-------+
|2016-06-29 11:51:00|   0,0,0,0,0,0,0,5,0|null|     null|       null|       null|      null|    null|   null|
|2016-06-29 11:51:05|   0,0,0,0,0,0,0,5,0|null|     null|       null|       null|      null|    null|   null|
|2016-06-29 11:51:10|   0,0,0,0,0,0,0,5,0|null|     null|       null|       null|      null|    null|   null|
|2016-06-29 11:51:15|   0,0,0,0,0,0,0,5,0|null|     null|       null|       null|      null|    null|   null|
|2016-06-29 11:51:20|   0,0,0,0,0,0,0,5,0|null|     null|       null|       null|      null|    null|   null|
|2016-06-29 11:51:25|   0,0,0,0,0,0,0,5,0|null|     null|       null|       null|      null|    null|   null|
|2016-06-2

In [104]:
# merge_data2.coalesce(1).write.format("com.databricks.spark.csv").option("header", "true").save("myfile.csv")

In [138]:
# Collect the whole joined frame to the driver as a pandas DataFrame.
merge_data2_pd = merge_data2.toPandas()

In [139]:
# Check dtypes: the integer GPS columns come out as float64 in the output below.
merge_data2_pd.dtypes

timestamp      datetime64[ns]
acc_data               object
dow                    object
lat                   float64
lon                   float64
fixTypeCode           float64
tripNumber            float64
tripType              float64
tripMOT                object
dtype: object

In [148]:
# Convert the three GPS code columns to pandas' nullable 'Int64' dtype.
merge_data2_pd['fixTypeCode'] = merge_data2_pd['fixTypeCode'].astype('Int64')
merge_data2_pd['tripNumber'] = merge_data2_pd['tripNumber'].astype('Int64')
merge_data2_pd['tripType'] = merge_data2_pd['tripType'].astype('Int64')

In [141]:
# Re-check dtypes after the Int64 conversion.
merge_data2_pd.dtypes

timestamp      datetime64[ns]
acc_data               object
dow                    object
lat                   float64
lon                   float64
fixTypeCode             Int64
tripNumber              Int64
tripType                Int64
tripMOT                object
dtype: object

In [145]:
# Keep only the rows whose tripNumber is non-negative.
merge_data2_pd.loc[merge_data2_pd['tripNumber'] >= 0]

Unnamed: 0,timestamp,acc_data,dow,lat,lon,fixTypeCode,tripNumber,tripType,tripMOT
1186,2016-06-29 13:29:50,123276205500500,3,51.078200,-114.128857,2,1,1,1
1187,2016-06-29 13:29:55,126289211500500,3,51.078183,-114.128778,1,1,2,1
1188,2016-06-29 13:30:00,146311204500500,3,51.078228,-114.128685,1,1,2,1
1189,2016-06-29 13:30:05,131296204600500,3,51.078237,-114.128582,1,1,2,1
1190,2016-06-29 13:30:10,104275196500500,3,51.078243,-114.128482,1,1,2,1
1191,2016-06-29 13:30:15,110302200500500,3,51.078243,-114.128382,1,1,2,1
1192,2016-06-29 13:30:20,108299194500500,3,51.078265,-114.128283,1,1,2,1
1193,2016-06-29 13:30:25,105297211600500,3,51.078277,-114.128190,1,1,2,1
1194,2016-06-29 13:30:30,105277207500500,3,51.078300,-114.128093,1,1,2,1
1195,2016-06-29 13:30:35,108280189500500,3,51.078315,-114.128002,1,1,2,1
