In [None]:
import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.sql import functions as F
from pyspark.mllib.stat import Statistics
from datetime import datetime, timedelta
from pyspark.sql.window import Window
from pyspark.sql.types import TimestampType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
from pyspark.sql.types import FloatType
from pyspark.sql.types import DoubleType
from math import floor
import time

from GPSProcessing import *
from AccProcessing import *

In [None]:
# REFERENCE: https://spark.apache.org/docs/latest/configuration.html

# Spark settings for the local session. Every value is written as a string so the
# list is uniform; the two boolean flags used to be Python bools, which reach Spark
# as the text 'True' rather than the documented 'true'.
SPARK_SETTINGS = [
    ('spark.memory.fraction', '0.6'),
    ('spark.executor.memory', '16g'),
    ('spark.driver.memory', '16g'),
    ('spark.sql.shuffle.partitions', '20'),
    ('spark.memory.offHeap.enabled', 'true'),
    ('spark.memory.offHeap.size', '16g'),
    ('spark.cleaner.referenceTracking.cleanCheckpoints', 'true'),
    # Currently disabled options, kept for reference:
    # ('spark.driver.cores', '4'),
    # ('spark.executor.cores', '4'),
    # ('spark.worker.cleanup.enabled', 'true'),
    # ('spark.sql.session.timeZone', 'UTC'),
]

conf = SparkConf().setAll(SPARK_SETTINGS)

# Local session using all available cores; getOrCreate() reuses a running session.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .master("local[*]")
    .appName("GPS+ACC")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("ERROR")
# Directory (relative to the working directory) used by the .checkpoint() calls below.
sc.setCheckpointDir('checkpoints')
# Display the effective configuration as the cell output.
sc.getConf().getAll()  # or sc._conf.getAll()

In [None]:
# Inspect the default parallelism of the SparkContext created above.
sc.defaultParallelism

In [None]:
# Inspect the context's default minimum number of partitions.
sc.defaultMinPartitions

In [None]:
# Display the SparkContext object itself as the cell output.
sc

## PALMS output

In [None]:
palms_path = '/Users/molinaro/Documents/GITHUB/PALMS/Calgary/PALMS_output.csv'

In [None]:
palms_out = spark.read.csv(palms_path, header=True, inferSchema=True, )
palms_out.cache();

In [None]:
palms_out.rdd.getNumPartitions()

In [None]:
palms_out.filter(palms_out['identifier']=='A01').select('identifier','lat','lon','dateTime').show(20,False)

In [None]:
palms_out.filter(palms_out['identifier']=='A01').count()

In [None]:
palms_out.filter(palms_out['identifier']=='A01').printSchema()

In [None]:
# Python strptime patterns used to parse the reference timestamp below.
date_format, time_format = '%Y-%m-%d', '%H:%M:%S'
datetime_format = ' '.join([date_format, time_format])
# Lower time bound used by the comparison cells that filter on `timestamp`.
startdate = datetime.strptime('2016-08-16 18:23:25', datetime_format)

In [None]:
# PALMS reference rows for participant A01 where activity == -2.
palms_out.filter((palms_out.identifier=='A01') & (palms_out.activity==-2))\
.select('identifier','lat','lon','dateTime','activity','activityIntensity','activityBoutNumber').show(29000)

In [None]:
# Same check on the pipeline side, from `startdate` onwards and ordered by time.
# NOTE(review): df3 is only assigned much further down in this notebook (result of a
# proc_segment "CASE3" step), so this cell fails on a fresh top-to-bottom run unless
# the name is provided by one of the star imports.
df3.filter((F.col('activityIntensity')==-2) & (F.col('timestamp')>=startdate)).orderBy('timestamp').show(29000)

In [None]:
# PALMS reference rows of A01 belonging to activity bout 7.
palms_out.filter((palms_out.identifier=='A01') & (palms_out.activityBoutNumber==7))\
.select('lat','lon','dateTime','activity','activityIntensity',
        'activityBoutNumber','sedentaryBoutNumber').show(2000)

In [None]:
# Activity bout 7 in df3, from `startdate` onwards (df3: see the ordering note above).
df3.filter((F.col('activityBoutNumber')==7) & (F.col('timestamp')>=startdate)).orderBy('timestamp').show(2000)

In [None]:
# PALMS reference rows of A01 belonging to sedentary bout 14.
palms_out.filter((palms_out.identifier=='A01') & (palms_out.sedentaryBoutNumber == 14))\
.select('identifier','lat','lon','dateTime','activity',
        'activityBoutNumber','sedentaryBoutNumber').show(20000)

In [None]:
# Sedentary bout 14 in df3, from `startdate` onwards; unlike the cells above this one is not ordered.
df3.filter((F.col('sedentaryBoutNumber')==14) & (F.col('timestamp')>=startdate)).show(29000)

In [None]:
# PALMS reference rows of A01 with fixTypeCode 3.
palms_out.filter((palms_out.identifier=='A01') & (palms_out.fixTypeCode==3))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').show(2000)

In [None]:
# Number of PALMS reference rows of A01 with fixTypeCode 4.
palms_out.filter((palms_out.identifier=='A01') & (palms_out.fixTypeCode==4))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').count()

In [None]:
# Pipeline-side rows with fixTypeCode 3.
# NOTE(review): gps_data is first assigned in the "GPS data processing" section below,
# so this cell and the other gps_data cells in this section depend on that section
# having been run first.
gps_data.filter(F.col('fixTypeCode')==3).orderBy('timestamp').show(2000)

In [None]:
# Number of pipeline-side rows with fixTypeCode 4 (the orderBy does not affect the count).
gps_data.filter(F.col('fixTypeCode')==4).orderBy('timestamp').count()

In [None]:
# All PALMS reference rows of A01, untruncated.
palms_out.filter((palms_out.identifier=='A01') )\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').show(20000,False)

In [None]:
# Same columns for every participant in the PALMS output (no identifier filter).
palms_out.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').show(20000,False)

In [None]:
# Full pipeline-side frame, untruncated (gps_data: see the ordering note above).
gps_data.show(20000,False)

In [None]:
# PALMS reference rows of A01 with tripType 1.
palms_out.filter((palms_out.identifier=='A01') & (palms_out.tripType==1))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').show(2000)

In [None]:
# Rows of `df` with tripType 4, with some columns hidden.
# NOTE(review): `df` is not assigned anywhere in the visible part of this notebook;
# it presumably comes from a deleted cell or from the star imports - confirm.
df.drop(*['height','speed','heading','dow','lat','lon'])\
.filter((F.col('tripType')==4)).show(2000)

In [None]:
# Rows of `df` with tripType 1 (same caveat about where `df` comes from).
df.filter(F.col('tripType')==1).show(2000)

In [None]:
spark.catalog.clearCache()

In [None]:
palms_out.filter((palms_out.identifier=='A02') & (palms_out.fixTypeCode==5))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').count()

In [None]:
palms_out.filter((palms_out.identifier=='A02') & (palms_out.fixTypeCode==3))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').count()

In [None]:
palms_out.filter((palms_out.identifier=='A02') & (palms_out.fixTypeCode==2))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').count()

In [None]:
palms_out.filter((palms_out.identifier=='A02') & (palms_out.fixTypeCode==1))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').count()

In [None]:
palms_out.filter((palms_out.identifier=='A02') & (palms_out.fixTypeCode==4))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').count()

In [None]:
palms_out.filter((palms_out.identifier=='A02') & (palms_out.fixTypeCode==6))\
.select('lat','lon','dateTime','fixTypeCode','tripType','activity',
        'activityIntensity','activityBoutNumber').count()

In [None]:
palms_out.filter((palms_out.identifier=='A01')).count()

## GPS data processing

In [None]:
# First candidate input file.
# NOTE(review): when the notebook is run top to bottom this value is immediately
# replaced by the assignment in the next cell; run only one of the two cells to choose.
gps_path_raw = '/Users/molinaro/Documents/GITHUB/PALMS/data/PARC_data/NYC/GPS/2001.csv'

In [None]:
# Second candidate input file; this is the one in effect after a linear run.
gps_path_raw = '/Users/molinaro/Documents/GITHUB/HABITUS/test_data/gps/A02.csv'

In [None]:
# Load the raw GPS CSV with a header row and inferred column types.
gps_data_raw = spark.read.csv(gps_path_raw, header=True, inferSchema=True)

In [None]:
# NOTE: these three names were bound to Python strptime patterns ('%Y-%m-%d ...') in
# the PALMS section; from here on they hold 'yyyy/MM/dd HH:mm:ss'-style patterns instead.
date_format = 'yyyy/MM/dd'
time_format = 'HH:mm:ss'
datetime_format = date_format + ' ' + time_format

# Build the working GPS frame from the raw CSV (gen_gps_dataframe comes from the
# project's star imports) and mark it for caching.
gps_data = gen_gps_dataframe(gps_data_raw, datetime_format)
gps_data.cache()

In [None]:
# First 20 rows, untruncated.
gps_data.show(20,False)

In [None]:
# Disabled check: show the rows from a given start time onwards.
# date_format = '%Y-%m-%d'
# time_format = '%H:%M:%S'
# datetime_format = date_format + ' ' + time_format
# startdate = datetime.strptime('2016-06-29 13:00:00', datetime_format) 
# gps_data.filter((F.col('timestamp')>=startdate)).show(20000,False)

In [None]:
# Schema of the generated frame.
gps_data.printSchema()

In [None]:
# Row count before any filtering stage is applied.
gps_data.count()

In [None]:
# Round seconds in timestamps according to the interval

interval = 5 # seconds
ts_name = 'timestamp'
ws = 600 # seconds

print("====> align timestamps...")
start_time = time.time()
gps_data = round_timestamp(gps_data, ts_name, interval).cache()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Set fix type

ts_name = 'timestamp'
ws=600
print("====> set fix type...")
start_time = time.time()
gps_data = set_fix_type(gps_data, ts_name, ws).cache()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Apply filter on the velocity

vmax = 130 # km/h

print("====> apply velocity filter...")
start_time = time.time()
gps_data = filter_speed(gps_data, 'speed', vmax).cache()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Apply filter over max acceleration
                                             
scol = 'speed'
tscol = 'timestamp'

print("====> apply accelaration filter...")
start_time = time.time()
gps_data = filter_acceleration(gps_data, scol, tscol).cache()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Apply filter on the height variation

dhmax=1000
print("====> apply height variation filter...")
start_time = time.time()
gps_data = filter_height(gps_data, 'height', 'timestamp', dhmax).cache()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
gps_data.count()

In [None]:
# Apply filter over three fixes (it also recalculates distance column)

dcol = 'distance'
tscol = 'timestamp'
dmin = 10

print("====> apply three fixes filter...")
start_time = time.time()
gps_data = filter_change_dist_3_fixes(gps_data, dcol, tscol, dmin).cache()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
gps_data.count()

In [None]:
# Generate missing values up to maximum signal loss

print("====> fill in missing value...")
start_time = time.time()
gps_data = fill_timestamp(gps_data, 'timestamp', 'fixTypeCode', interval, ws).cache()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
gps_data.show(20,False)

In [None]:
gps_data.printSchema()

In [None]:
# Filter timestamps over given interval
INTERVAL = 5
ts_name = 'timestamp'
print("====> filter GPS data every {} seconds...".format(str(INTERVAL)))
start_time = time.time()
gps_data = select_gps_intervals(gps_data, ts_name, INTERVAL)
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
gps_data = gps_data.limit(20000)

In [None]:
gps_data.printSchema()

In [None]:
vmax = 130 # km/h
max_dist_per_min = vmax * 1000/60 # meters
min_dist_per_min = 25 # meters
min_pause_duration = 120 # second
max_pause_time = 180 # seconds

ts_name = 'timestamp'
dist_name = 'distance'
speed_name = 'speed'
fix_type_name = 'fixTypeCode'

print("====> detect trips...")
start_time = time.time()
gps_data2 = detect_trips(gps_data, ts_name, dist_name, speed_name, fix_type_name, min_dist_per_min, 
                 min_pause_duration, max_pause_time, vmax).cache()
gps_data2.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
gps_data2.printSchema()

In [None]:
vehicle_speed_cutoff = 35 # km/h
bicycle_speed_cutoff = 10 # km/h 
walk_speed_cutoff = 1 # km/h
speed_percentile = 90
speed_segment_length = 30 # m
min_trip_length = 100 # m
min_trip_duration = 180 # sec

ts_name = 'timestamp'
dist_name = 'distance'
speed_name = 'speed'

print("====> classify trips...")
start_time = time.time()
gps_data3 = classify_trips(gps_data2, ts_name, dist_name, speed_name, 
                           vehicle_speed_cutoff, bicycle_speed_cutoff, walk_speed_cutoff, 
                           min_trip_length, min_trip_duration, speed_segment_length, speed_percentile).cache()
gps_data3.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
gps_data3.printSchema()

### Process GPS data in Calgary/gps/A01r.csv

In [None]:
# segment 108

In [None]:
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
df2 = proc_segment(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3")
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
# segment 64

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df2 = proc_segment(df, 'j4', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE4").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j3']).show(20000, False)

In [None]:
df2 = set_pause(df2, 'i1', ts_name).checkpoint()
df2 = check_case(df2, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df2 = proc_segment(df2, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df2 = set_pause(df2, 'i1', ts_name).checkpoint()
df2 = check_case(df2, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df2 = proc_segment(df2, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df2 = set_pause(df2, 'i1', ts_name).checkpoint()
df2 = check_case(df2, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df3 = proc_segment(df2, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3 = check_case(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df3 = proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3 = check_case(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df4 = proc_segment(df3, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df4 = proc_segment(df4, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df4 = proc_segment(df4, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df4 = proc_segment(df4, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# segment 28

In [None]:
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
# Manual, cell-by-cell unrolling of the pause-classification loop on `df`
# (this stretch continues a segment started in earlier cells).
# One iteration = two cells:
#   1. drop the j1..j4 columns, then re-run set_pause and check_case on index column 'i1';
#   2. run proc_segment on one of the j* columns with the matching "CASEn" label.
# NOTE(review): set_pause / check_case / proc_segment are presumably provided by the
# star-imported GPSProcessing module -- their semantics are not visible in this notebook.
# Every step reassigns `df` and checkpoints it, so the cells are order-dependent and not
# idempotent: re-running a cell applies the step a second time.
# The `df.drop(...).show(20000, False)` calls only hide columns for display (up to 20000
# rows, untruncated values); their result is never assigned back.
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# First iteration in this stretch that handles 'j2' / "CASE2" (with 3, 3 instead of 3, 2).
df = proc_segment(df, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# The "CASE3" result is stored in df2; `df` itself keeps the check_case output
# (including j1..j4) from the previous cell.
df2 = proc_segment(df, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
# segment 13
# Step-by-step trace on `df` with index column 'i1'. Each iteration is split over three
# cells (set_pause -> check_case -> proc_segment), each followed by a checkpoint and a
# display-only `df.drop(...).show(20000, False)`.
# NOTE(review): set_pause / check_case / proc_segment presumably come from the
# star-imported GPSProcessing module; their behaviour is not visible here.

In [None]:
# NOTE(review): unlike the later iterations, j1..j4 are not dropped before this first
# set_pause, and `df` is whatever the preceding cells left behind -- confirm that this is
# the intended starting frame.
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i1', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
# This iteration handles 'j3' / "CASE3" (with 0, 0) instead of 'j1' / "CASE1".
df = proc_segment(df, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
# From here the trace continues on copies: results go to df2 (and later df3), so `df`
# stays at the state produced by the last `df = ...` cell that was executed.
# Same three-step iteration as on `df` (set_pause -> check_case -> proc_segment on 'i1').
df2 = df.drop(*['j1','j2','j3','j4'])
df2 = set_pause(df2, 'i1', ts_name).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df2 = check_case(df2, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df2 = proc_segment(df2, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
# Second branch: df3 starts from df2, leaving df2 at the "CASE2" result.
df3 = df2.drop(*['j1','j2','j3','j4'])
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df3 = check_case(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df3 = proc_segment(df3, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df3 = df3.drop(*['j1','j2','j3','j4'])
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df3 = check_case(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df3 = proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df3 = df3.drop(*['j1','j2','j3','j4'])
df3 = set_pause(df3, 'i1', ts_name).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2']).show(20000, False)

In [None]:
df3 = check_case(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i3','i2','j4']).show(20000, False)

In [None]:
df3 = proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
# segment 108
# Same three-cell iteration (set_pause -> check_case -> proc_segment), now on `df` with
# index column 'i3'; the displays hide 'i1' and 'i2' instead.
# NOTE(review): `df` is whatever the last executed `df = ...` cell produced, and j1..j4
# are not dropped before this first set_pause -- confirm the starting frame.
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i3', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
df = check_case(df, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
# Branch: the "CASE2" result goes to df2 (overwriting any earlier df2), so `df` keeps the
# check_case output it had before this cell.
df2 = proc_segment(df, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df2 = df2.drop(*['j1','j2','j3','j4'])
df2 = set_pause(df2, 'i3', ts_name).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
df2 = check_case(df2, 'i3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2','j4']).show(20000, False)

In [None]:
# Second branch: the "CASE3" result goes to df3 (overwriting any earlier df3).
df3 = proc_segment(df2, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i1','j4']).show(20000, False)

In [None]:
df3 = df3.drop(*['j1','j2','j3','j4'])
df3 = set_pause(df3, 'i3', ts_name).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i2']).show(20000, False)

In [None]:
# segment 166
# Same three-cell iteration (set_pause -> check_case -> proc_segment), now on `df` with
# index column 'i2'; the displays hide 'i1' and 'i3' and keep the j* columns visible.
# NOTE(review): `df` is whatever the last executed `df = ...` cell produced, and j1..j4
# are not dropped before this first set_pause -- confirm the starting frame.
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = check_case(df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = check_case(df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = proc_segment(df, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = check_case(df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = check_case(df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = check_case(df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = check_case(df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = proc_segment(df, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = df.drop(*['j1','j2','j3','j4'])
df = set_pause(df, 'i2', ts_name).checkpoint()
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df = check_case(df, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
# This display additionally hides 'j3'.
df.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j3']).show(20000, False)

In [None]:
# Branch: the 'j4' / "CASE4" result goes to df2 (overwriting any earlier df2); `df`
# keeps the check_case output it had before this cell.
df2 = proc_segment(df, 'j4', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE4").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j3']).show(20000, False)

In [None]:
# NOTE(review): only j1..j3 are dropped here and 'j4' is passed to set_pause as the index
# column, whereas every other set_pause call in this section uses 'i2' -- confirm that
# this is intentional and not a typo.
df2 = df2.drop(*['j1','j2','j3'])
df2 = set_pause(df2, 'j4', ts_name).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j3']).show(20000, False)

In [None]:
df2 = check_case(df2, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4']).show(20000, False)

In [None]:
df2 = proc_segment(df2, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df2.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4','j1','j2']).show(20000, False)

In [None]:
# Second branch: df3 starts from df2 (overwriting any earlier df3).
df3 = df2.drop(*['j1','j2','j3','j4'])
df3 = set_pause(df3, 'i2', ts_name).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df3 = check_case(df3, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4']).show(20000, False)

In [None]:
df3 = proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df3 = df3.drop(*['j1','j2','j3','j4'])
df3 = set_pause(df3, 'i2', ts_name).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df3 = check_case(df3, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4']).show(20000, False)

In [None]:
df3 = proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df3 = df3.drop(*['j1','j2','j3','j4'])
df3 = set_pause(df3, 'i2', ts_name).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df3 = check_case(df3, 'i2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4']).show(20000, False)

In [None]:
df3 = proc_segment(df3, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4','j1','j2']).show(20000, False)

In [None]:
# Display only: same df3, this time hiding 'i2' and all j* columns (so 'i1' stays visible).
df3.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4','j1','j2','j3']).show(20000, False)

In [None]:
# segment 180
# Trace continues on df4 / df5 with index column 'i1'.
# NOTE(review): this first call takes df3 straight from the preceding proc_segment cell
# (no drop / set_pause / check_case in between) and passes 'i1' where every other
# proc_segment call in this notebook section passes a j* column -- confirm intent.
df4 = proc_segment(df3, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df4 = df4.drop(*['j1','j2','j3','j4'])
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3']).show(20000, False)

In [None]:
df4 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df4 = proc_segment(df4, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df4 = df4.drop(*['j1','j2','j3','j4'])
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3']).show(20000, False)

In [None]:
df4 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df4 = proc_segment(df4, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
# This display hides 'i1' and 'i3' (not the 'i1' index's siblings), leaving 'i2' visible.
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df4 = df4.drop(*['j1','j2','j3','j4'])
df4 = set_pause(df4, 'i1', ts_name).checkpoint()
df4.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3']).show(20000, False)

In [None]:
# Branch: the check_case result goes to df5, so df4 stays at the set_pause output above.
df5 = check_case(df4, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df5 = proc_segment(df5, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df5 = df5.drop(*['j1','j2','j3','j4'])
df5 = set_pause(df5, 'i1', ts_name).checkpoint()
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3']).show(20000, False)

In [None]:
df5 = check_case(df5, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df5 = proc_segment(df5, 'j2', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 3, "CASE2").checkpoint()
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3']).show(20000, False)

In [None]:
df5 = df5.drop(*['j1','j2','j3','j4'])
df5 = set_pause(df5, 'i1', ts_name).checkpoint()
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3']).show(20000, False)

In [None]:
df5 = check_case(df5, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
df5 = proc_segment(df5, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4']).show(20000, False)

In [None]:
# Display only: same df5 with 'i2' hidden instead of 'i1'.
df5.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Author's note: segment 188.
# Start a new frame df6 from df5: process segments flagged in column 'j1',
# labelled "CASE1", and checkpoint. The integer arguments (3, 2) are passed
# straight to proc_segment - TODO document their meaning once confirmed.
df6 = proc_segment(df5, 'j1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 3, 2, "CASE1").checkpoint()
# Inspection only: hide the listed columns and print up to 20000 untruncated rows.
df6.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4']).show(20000, False)

In [None]:
# Remove the per-case working columns j1..j4 before recomputing pauses on df6.
df6 = df6.drop(*['j1','j2','j3','j4'])
# Recompute pauses on 'i1' and checkpoint to cut the lineage.
df6 = set_pause(df6, 'i1', ts_name).checkpoint()
# Inspection only: hide the listed columns and print up to 20000 untruncated rows.
df6.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3']).show(20000, False)

In [None]:
# Apply check_case to column 'i1' of df6 with the notebook-level thresholds,
# then checkpoint the result.
df6 = check_case(df6, 'i1', ts_name, min_dist_per_min, min_pause_duration, max_pause_time).checkpoint()
# Inspection only: hide the listed columns and print up to 20000 untruncated rows.
df6.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i2','i3','j4']).show(20000, False)

In [None]:
# Process segments flagged in column 'j3' of df6, labelled "CASE3" (integer
# arguments 0, 0 - meaning defined by proc_segment), and checkpoint.
df6 = proc_segment(df6, 'j3', ts_name, min_dist_per_min, min_pause_duration, max_pause_time, 0, 0, "CASE3").checkpoint()
# Inspection only: hide the listed columns and print up to 20000 untruncated rows.
df6.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','i1','i3','j4']).show(20000, False)

In [None]:
# Inspection only (df6 is not reassigned): hide the raw GPS columns and all
# j1..j4 working columns, then print up to 20000 untruncated rows.
df6.drop(*['lat','lon','dow','distance','fixTypeCode',
         'cum_pause','total_sec','height','speed','heading','state_cp','tripType_cp','j4','j1','j2','j3']).show(20000, False)

In [None]:
def push_and_pop(rdd):
    """Return a new RDD with the head element moved to the tail.

    Built from two lazy transformations (filter + union) on top of one
    ``first()`` action. Every element equal to the head is filtered out and a
    single copy of the head is appended through the global SparkContext ``sc``.
    """
    head = rdd.first()
    remainder = rdd.filter(lambda item: item != head)
    tail = sc.parallelize([head])
    return remainder.union(tail)

def serialize_and_deserialize(rdd):
    """Evaluate ``rdd`` with a collect() action and rebuild it as a new RDD.

    The collected elements are re-distributed through the global SparkContext
    ``sc``, so the returned RDD is a fresh instance created from driver data.
    """
    materialized = rdd.collect()
    return sc.parallelize(materialized)

def do_test(serialize=False):
    """Time 25 successive push_and_pop rounds on a 1000-element RDD.

    For each round, prints the elapsed wall-clock seconds (three decimals)
    taken to apply push_and_pop and, optionally, to rebuild the RDD.

    Args:
        serialize: when True, each round additionally passes the RDD through
            serialize_and_deserialize (collect + parallelize) before timing stops.
    """
    rdd = sc.parallelize(range(1000))
    # range() instead of the Python-2-only xrange(): works on both Python 2
    # and Python 3 kernels; the loop index itself is not used.
    for _ in range(25):
        t0 = time.time()
        rdd = push_and_pop(rdd)
        if serialize:
            rdd = serialize_and_deserialize(rdd)
        print("%.3f" % (time.time() - t0))

# Run the timing experiment with the default serialize=False (no collect/parallelize per round).
do_test()

In [None]:
# Window duration (tw) and slide duration (sw) are both 60 seconds, so the
# windows passed to F.window below do not overlap.
tw = str(60) + ' seconds'
sw = str(60) + ' seconds'

# Seconds component of the first value in the first row of df.
# NOTE(review): assumes column 0 of df holds the timestamp - confirm.
st_sec = str(df.first()[0].second)

# Start-time offset for F.window, built from the seconds value above.
offset = st_sec + ' seconds' 

# Column that is summed per window.
param_name = 'distance'

# Sum `param_name` per time window, flatten the window struct into 'start' and
# 'end' columns, and print the first 20 untruncated rows.
df.groupBy(
            F.window(ts_name, tw, sw, offset)
          ).sum(param_name)\
           .select('window','sum({})'.format(param_name))\
           .withColumn('start', F.col('window').start)\
           .withColumn('end', F.col('window').end)\
           .drop('window')\
           .show(20, False)
# The bare string literal below holds a disabled variant that keeps only the
# windows whose summed value is >= min_dist_per_min. It is a string expression,
# not a comment, and it is the last expression of the cell.
"""
df.groupBy(
            F.window(ts_name, tw, sw, offset)
          ).sum()\
           .sort('window.start')\
           .filter(F.col('sum({})'.format(param_name))>=min_dist_per_min)\
           .select('window')\
           .withColumn('start', F.col('window').start)\
           .withColumn('end', F.col('window').end)\
           .drop('window')\
           .show(20, False)
"""

## Accelerometer data processing

In [None]:
# Raw accelerometer CSV (file name suggests 1-second epochs).
# NOTE(review): hard-coded absolute local path; the next cell rebinds acc_path_raw.
acc_path_raw = '/Users/molinaro/Documents/GITHUB/PALMS/data/PARC_data/NYC/accelerometer/2001_1s.csv'

In [None]:
# Alternative raw accelerometer CSV (file name suggests 5-second epochs); when
# cells run top to bottom this assignment overrides the previous one.
# NOTE(review): hard-coded absolute local path.
acc_path_raw = '/Users/molinaro/Documents/GITHUB/HABITUS/test_data/acc/A02_5sec.csv'

In [None]:
# Read the file as plain text (one row per line) rather than as parsed CSV;
# parsing is delegated to gen_acc_dataframe in a later cell.
acc_data_raw = spark.read.text(acc_path_raw)
# Mark for caching; the cache is populated by the first action on the frame.
acc_data_raw.cache()

In [None]:
# Number of text lines read; being an action, this also materializes the cache.
acc_data_raw.count()

In [None]:
# gen_acc_dataframe returns a pair: `interval` (passed to select_acc_intervals
# in a later cell) and the accelerometer DataFrame built from the raw text lines.
interval, acc_data = gen_acc_dataframe(acc_data_raw)

In [None]:
# Print the column names and types of the accelerometer frame.
acc_data.printSchema()

In [None]:
# Column names handed to split_acc_data.
# NOTE(review): presumably these name the fields split out of each raw record - confirm in AccProcessing.
acc_columns = ['axis1','axis2','axis3','steps','lux','incl_off','incl_standing','incl_sitting','incl_lying']
acc_data_ext = split_acc_data(acc_data, acc_columns)
# Mark the extended frame for caching; it is reused by later cells.
acc_data_ext.cache()

In [None]:
# Name of the timestamp column and the target window length.
ts_name = 'timestamp'
window = 5 #seconds

# The two trailing boolean flags (False, True) are passed straight to
# select_acc_intervals; their meaning is defined by that helper - TODO document.
acc_data_act = select_acc_intervals(acc_data_ext, ts_name, interval, window, False, True)
# Preview the first 20 rows without truncating cell values.
acc_data_act.show(20, False)

In [None]:
# Cut-off values passed to activity_count (names suggest light / moderate /
# hard / very hard intensity thresholds - units not shown here, confirm).
LightCO, ModerateCO, HardCO, VeryHardCO = (133, 193, 233, 9999)
window = 5 #seconds
# The trailing False flag is passed straight to activity_count - TODO document.
acc_data_act = activity_count(acc_data_act, 'timestamp', window, LightCO, ModerateCO, HardCO, VeryHardCO, False)
# Mark for caching; later cells reuse this frame repeatedly.
acc_data_act.cache()

In [None]:
# Print the column names and types of the activity frame. The PySpark DataFrame
# method is printSchema(), as used elsewhere in this notebook; print_schema()
# does not exist and raises AttributeError.
acc_data_act.printSchema()

In [None]:
# Inspection only: print up to 10000 rows of the extended frame without truncation.
acc_data_ext.show(10000, False)

In [None]:
# DETERMINE NON-WEAR PERIOD
# Column names and parameters handed to non_wear_filter.
ts_name = 'timestamp'
AC_name = 'activity'
AI_name = 'activityIntensity'
# NOTE(review): new_col is assigned here but not passed to non_wear_filter below - confirm whether it is needed.
new_col = 'non_wear'
window = 5
# Presumably the number of consecutive zero-activity minutes that marks non-wear - confirm in AccProcessing.
minutes_zeros_row = 90

acc_data_act = non_wear_filter(acc_data_act, ts_name, AC_name, AI_name, window, minutes_zeros_row)
# Cache and immediately count: the count() action materializes the cached result.
acc_data_act.cache()
acc_data_act.count()

In [None]:
# DETERMINE ACTIVITY BOUT NUMBER
# Column names and parameters handed to activity_bout_filter; the result is
# written under the column name given by new_col.
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'activityBoutNumber'
window = 5
# Presumably upper/lower activity bounds, bout duration and tolerance - units
# are not shown in this notebook; confirm in AccProcessing.
UP=9999
LOW=1953
DURATION=10
TOL=2

acc_data_act = activity_bout_filter(acc_data_act, ts_name, AC_name, new_col, window,
                                    UP, LOW, DURATION, TOL)
# Cache and immediately count: the count() action materializes the cached result.
acc_data_act.cache()
acc_data_act.count()

In [None]:
# Preview the first 20 rows (untruncated) after the activity-bout step.
acc_data_act.show(20, False)

In [None]:
# DETERMINE SEDENTARY BOUT NUMBER
# Column names and parameters handed to sedentary_bout_filter; the result is
# written under the column name given by new_col.
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'sedentaryBoutNumber'
window = 5
# Presumably upper/lower activity bounds, bout duration and tolerance - units
# are not shown in this notebook; confirm in AccProcessing.
UP=180
LOW=0
DURATION=30
TOL=1

acc_data_act = sedentary_bout_filter(acc_data_act, ts_name, AC_name, new_col, window,
                                     UP, LOW, DURATION, TOL)
# Cache and immediately count: the count() action materializes the cached result.
acc_data_act.cache()
acc_data_act.count()

In [None]:
# Preview the first 20 rows (untruncated) after the sedentary-bout step.
acc_data_act.show(20,False)

## Merge dataframes

In [None]:
# Left-join the GPS frame with accelerometer data on 'timestamp' (every GPS row
# is kept; unmatched accelerometer columns are null), ordered by time.
# NOTE(review): this joins acc_data (output of gen_acc_dataframe), not the
# processed acc_data_act with non-wear/bout columns - confirm this is intended.
merge_data = gps_data3.join(acc_data, 'timestamp', how='left' ).orderBy('timestamp')
# Cache and immediately count: the count() action materializes the cached result.
merge_data.cache()
merge_data.count()

In [None]:
# Print the column names and types of the merged GPS + accelerometer frame.
merge_data.printSchema()

In [None]:
# Preview the first 20 merged rows without truncating cell values.
merge_data.show(20, False)