In [1]:
# Widen the notebook container to 95% of the browser window.
# `display`/`HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
%%HTML
<!-- Render text inside CodeMirror <pre> elements at 120% font size. -->
<style> .CodeMirror pre { font-size: 120% !important; } </style>

In [3]:
import os
import time
from datetime import datetime, timedelta
from math import floor

import pyspark
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession, SQLContext, Row
from pyspark.sql import functions as F

from src.GPSProcessing import *
from src.AccProcessing import *

# Load the line_profiler IPython extension (provides the %lprun magic).
%load_ext line_profiler

In [4]:
# Spark configuration for this local session.
# REFERENCE: https://spark.apache.org/docs/latest/configuration.html
spark_settings = [
    ('spark.memory.fraction', '0.6'),
    ('spark.executor.memory', '32g'),
    ('spark.driver.memory', '32g'),
    ('spark.sql.shuffle.partitions', '20'),
    ('spark.memory.offHeap.enabled', True),
    ('spark.memory.offHeap.size', '16g'),
    ('spark.cleaner.referenceTracking.cleanCheckpoints', True),
    ('spark.driver.host', '127.0.0.1'),
    ('spark.scheduler.listenerbus.eventqueue.capacity', '50000'),
    # Options currently left disabled:
    # ('spark.driver.cores', '4'),
    # ('spark.executor.cores', '4'),
    # ('spark.worker.cleanup.enabled','true'),
    # ('spark.sql.session.timeZone', 'UTC'),
]
conf = SparkConf().setAll(spark_settings)

# Build (or reuse) a local session that runs on all available cores.
builder = SparkSession.builder.config(conf=conf).master("local[*]").appName("test")
spark = builder.getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")
# Directory used by the DataFrame.checkpoint() calls later in the notebook.
sc.setCheckpointDir('checkpoints')

# Show the effective configuration as the cell output.
sc.getConf().getAll()

[('spark.sql.shuffle.partitions', '20'),
 ('spark.app.name', 'test'),
 ('spark.driver.memory', '32g'),
 ('spark.driver.host', '127.0.0.1'),
 ('spark.cleaner.referenceTracking.cleanCheckpoints', 'True'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'local-1575660989603'),
 ('spark.executor.memory', '32g'),
 ('spark.scheduler.listenerbus.eventqueue.capacity', '50000'),
 ('spark.rdd.compress', 'True'),
 ('spark.memory.fraction', '0.6'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.memory.offHeap.enabled', 'True'),
 ('spark.driver.port', '62306'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.memory.offHeap.size', '16g')]

In [5]:
# Show the default parallelism level of this SparkContext.
sc.defaultParallelism

8

In [6]:
# Show the default minimum number of partitions of this SparkContext.
sc.defaultMinPartitions

2

In [7]:
# Display the SparkContext summary.
sc

In [5]:
# Shutting the context down here would make every later cell fail under
# "Restart & Run All", because they all use `spark`/`sc`. The shutdown is
# therefore opt-in: set STOP_SPARK to True and run this cell manually when the
# session is no longer needed.
STOP_SPARK = False
if STOP_SPARK:
    sc.stop()

## GPS data processing

In [8]:
# Location of the raw GPS CSV export. Set the HABITUS_GPS_PATH environment
# variable to run the notebook on another machine; when it is not set, the
# original local path is used.
gps_path_raw = os.environ.get(
    'HABITUS_GPS_PATH',
    '/Users/molinaro/Documents/GITHUB/HABITUS/notebooks/data/PP001_GPS_T0.csv')

In [9]:
# Read the raw GPS CSV: the first line is the header and column types are inferred.
csv_options = dict(header=True, inferSchema=True)
gps_data_raw = spark.read.csv(gps_path_raw, **csv_options)

In [10]:
# Preview the raw GPS rows (show() prints the first 20 rows by default).
gps_data_raw.show()

+---+------+-----------+---------+-----------+-----------+---+-----------+----+----------+----+----------+-----------+
|RCR| INDEX|   UTC DATE| UTC TIME| LOCAL DATE| LOCAL TIME| MS|   LATITUDE| N/S| LONGITUDE| E/W|    HEIGHT|      SPEED|
+---+------+-----------+---------+-----------+-----------+---+-----------+----+----------+----+----------+-----------+
|RCR|   3.0| 2016/07/04| 13:18:59| 2016/07/04|   14:18:59|0.0|50.85629377|   N|5.71624049|   E| 149.978 M| 0.155 km/h|
|RCR|   4.0| 2016/07/04| 13:19:09| 2016/07/04|   14:19:09|0.0|50.85632719|   N|5.71641579|   E| 150.017 M| 0.578 km/h|
|RCR|   5.0| 2016/07/04| 13:19:19| 2016/07/04|   14:19:19|0.0|50.85633267|   N|5.71632168|   E| 150.014 M| 0.722 km/h|
|RCR|   6.0| 2016/07/04| 13:19:29| 2016/07/04|   14:19:29|0.0|50.85609586|   N|5.71436013|   E| 132.444 M| 0.807 km/h|
|RCR|   7.0| 2016/07/04| 13:19:39| 2016/07/04|   14:19:39|0.0|50.85608287|   N|5.71434262|   E| 131.921 M| 0.350 km/h|
|RCR|   8.0| 2016/07/04| 13:19:49| 2016/07/04|  

In [11]:
# Timestamp pattern: date part and time part joined by a single space.
date_format, time_format = 'yyyy/MM/dd', 'HH:mm:ss'
datetime_format = ' '.join((date_format, time_format))

# Column names reused by the GPS processing cells below.
ts_name, dist_name, speed_name = 'timestamp', 'distance', 'speed'
height_name, fix_type_name = 'height', 'fixTypeCode'

# Build the working GPS dataframe from the raw CSV rows.
gps_data = gen_gps_dataframe(gps_data_raw, ts_name, datetime_format)

In [None]:
# Print the column names and types of the GPS dataframe.
gps_data.printSchema()

In [13]:
# Number of rows in gps_data at this point.
gps_data.count()

51333

In [12]:
# Preview the first 20 rows of the GPS dataframe.
gps_data.show()

+-------------------+---+-----------+----------+--------+-------+-----+
|          timestamp|dow|        lat|       lon|distance| height|speed|
+-------------------+---+-----------+----------+--------+-------+-----+
|2016-07-04 15:18:59|  1|50.85629377|5.71624049|     0.0|149.978|0.155|
|2016-07-04 15:19:09|  1|50.85632719|5.71641579|     0.0|150.017|0.578|
|2016-07-04 15:19:19|  1|50.85633267|5.71632168|     0.0|150.014|0.722|
|2016-07-04 15:19:29|  1|50.85609586|5.71436013|     0.0|132.444|0.807|
|2016-07-04 15:19:39|  1|50.85608287|5.71434262|     0.0|131.921| 0.35|
|2016-07-04 15:19:49|  1|50.85608319|5.71434473|     0.0|131.829|0.078|
|2016-07-04 15:19:59|  1|50.85608475|5.71435562|     0.0|131.727|0.511|
|2016-07-04 15:20:09|  1|50.85608899|5.71437457|     0.0|131.511|0.134|
|2016-07-04 15:20:19|  1|50.85607096|5.71436563|     0.0|124.778| 0.18|
|2016-07-04 15:20:29|  1|50.85606151|5.71434868|     0.0|123.459|0.063|
|2016-07-04 15:20:39|  1|50.85605709|5.71432674|     0.0| 123.08

In [13]:
# Filter timestamps over the given interval (reported in seconds by the message below)
INTERVAL = 60

print("====> filter GPS data every {} seconds...".format(INTERVAL))
start_time = time.time()
gps_data = select_gps_intervals(gps_data, ts_name, INTERVAL)
elapsed_time = time.time() - start_time
# Format the wall-clock duration as HH:MM:SS for the progress message.
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

====> filter GPS data every 60 seconds...
      time elapsed: 00:00:01


In [14]:
# Set fix type

# Passed to set_fix_type here and to fill_timestamp later in the notebook.
ws = 600

print("====> set fix type...")
start_time = time.time()
gps_data = set_fix_type(gps_data, ts_name, fix_type_name, ws).cache()
# DataFrame.checkpoint() returns a new, checkpointed DataFrame and leaves the
# receiver unchanged, so the result must be re-bound; otherwise gps_data keeps
# its full lineage and the checkpoint is written but never used.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> set fix type...
      time elapsed: 00:00:03


In [15]:
# Preview the first 20 rows after the fix-type step.
gps_data.show()

+-------------------+---+-----------+----------+--------+-------+-----+-----------+
|          timestamp|dow|        lat|       lon|distance| height|speed|fixTypeCode|
+-------------------+---+-----------+----------+--------+-------+-----+-----------+
|2016-07-04 15:18:59|  1|50.85629377|5.71624049|     0.0|149.978|0.155|          2|
|2016-07-04 15:19:59|  1|50.85608475|5.71435562|     0.0|131.727|0.511|          1|
|2016-07-04 15:20:59|  1|50.85604772|5.71433521|     0.0|123.092|0.442|          1|
|2016-07-04 15:21:59|  1|50.85598331| 5.7140342|     0.0| 93.506|0.193|          1|
|2016-07-04 15:22:59|  1|50.85605236|5.71394756|     0.0| 93.143|0.799|          1|
|2016-07-04 15:23:59|  1|50.85612686|5.71396679|     0.0| 91.268|0.466|          1|
|2016-07-04 15:24:59|  1|50.85604035|5.71402272|     0.0| 91.493|1.392|          1|
|2016-07-04 15:25:59|  1|50.85599181|5.71415213|     0.0| 96.562|0.952|          1|
|2016-07-04 15:26:59|  1|50.85603135|5.71424691|     0.0|102.231|1.056|     

In [16]:
# Apply filter on the velocity

vmax = 130 # km/h

print("====> apply velocity filter...")
start_time = time.time()
gps_data = filter_speed(gps_data, speed_name, fix_type_name, vmax).cache()
# checkpoint() returns the checkpointed DataFrame instead of modifying gps_data
# in place; re-bind it so the following steps start from the truncated lineage.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply velocity filter...
      time elapsed: 00:00:00


In [17]:
# Apply filter over max acceleration

print("====> apply accelaration filter...")
start_time = time.time()
gps_data = filter_acceleration(gps_data, speed_name, ts_name, fix_type_name).cache()
# checkpoint() returns the checkpointed DataFrame instead of modifying gps_data
# in place; re-bind it so the following steps start from the truncated lineage.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply accelaration filter...
      time elapsed: 00:00:00


In [18]:
# Apply filter on the height variation

# Threshold handed to filter_height as its last argument.
dhmax=1000

print("====> apply height variation filter...")
start_time = time.time()
gps_data = filter_height(gps_data, height_name, ts_name, fix_type_name, dhmax).cache()
# checkpoint() returns the checkpointed DataFrame instead of modifying gps_data
# in place; re-bind it so the following steps start from the truncated lineage.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply height variation filter...
      time elapsed: 00:00:01


In [19]:
# Number of rows left in gps_data after the filters applied so far.
gps_data.count()

8547

In [20]:
# Apply filter over three fixes (it also recalculates distance column)

dmin = 10
# NOTE(review): dcol is not used by the call below, which passes dist_name.
dcol = 'distance'

print("====> apply three fixes filter...")
start_time = time.time()
gps_data = filter_change_dist_3_fixes(gps_data, dist_name, ts_name, fix_type_name, dmin).cache()
# checkpoint() returns the checkpointed DataFrame instead of modifying gps_data
# in place; re-bind it so the following steps start from the truncated lineage.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply three fixes filter...
      time elapsed: 00:00:01


In [21]:
# Number of rows left in gps_data after the three-fixes filter.
gps_data.count()

8352

In [22]:
# Print the column names and types of the filtered GPS dataframe.
gps_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- distance: string (nullable = true)
 |-- height: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- fixTypeCode: integer (nullable = false)



In [23]:
# Round seconds in timestamps according to the interval
interval = INTERVAL # seconds
ts_name = 'timestamp'

print("====> align timestamps...")
start_time = time.time()
gps_data = round_timestamp(gps_data, ts_name, interval).cache()
# checkpoint() returns the checkpointed DataFrame instead of modifying gps_data
# in place; re-bind it so the following steps start from the truncated lineage.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> align timestamps...
      time elapsed: 00:00:00


In [24]:
# Print the schema after the timestamp alignment step.
gps_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- distance: string (nullable = true)
 |-- height: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- fixTypeCode: integer (nullable = false)



In [25]:
# Preview the first 20 rows after the timestamp alignment step.
gps_data.show()

+-------------------+---+-----------+----------+-------------------+-------+-----+-----------+
|          timestamp|dow|        lat|       lon|           distance| height|speed|fixTypeCode|
+-------------------+---+-----------+----------+-------------------+-------+-----+-----------+
|2016-07-04 15:18:00|  1|50.85629377|5.71624049|                0.0|149.978|0.155|          2|
|2016-07-04 15:19:00|  1|50.85608475|5.71435562| 134.24798041134827|131.727|0.511|          1|
|2016-07-04 15:20:00|  1|50.85604772|5.71433521| 4.3569327100030275|123.092|0.442|          1|
|2016-07-04 15:21:00|  1|50.85598331| 5.7140342|  22.29599246518401| 93.506|0.193|          1|
|2016-07-04 15:22:00|  1|50.85605236|5.71394756|  9.788643400583524| 93.143|0.799|          1|
|2016-07-04 15:23:00|  1|50.85612686|5.71396679|   8.38800543260042| 91.268|0.466|          1|
|2016-07-04 15:24:00|  1|50.85604035|5.71402272| 10.383249732586508| 91.493|1.392|          1|
|2016-07-04 15:25:00|  1|50.85599181|5.71415213| 1

In [26]:
# gps_data = gps_data.limit(1000)

In [27]:
# Parameters passed to detect_trips further down in the notebook.
vmax = 130 # km/h
min_dist_per_min = 25 # meters
min_pause_duration = 120 # seconds
max_pause_time = 180 # seconds

In [28]:
# %lprun -f detect_trips detect_trips(gps_data, ts_name, dist_name, speed_name, fix_type_name, min_dist_per_min, min_pause_duration, max_pause_time, vmax)

In [29]:
# Run trip detection on the filtered GPS fixes and report the wall-clock time.
print("====> detect trips...")
start_time = time.time()
gps_data2 = detect_trips(
    gps_data,
    ts_name,
    dist_name,
    speed_name,
    fix_type_name,
    min_dist_per_min,
    min_pause_duration,
    max_pause_time,
    vmax,
)
gps_data2 = gps_data2.cache()
# count() is an action, so it forces the computation that is being timed.
gps_data2.count()
elapsed_time = time.time() - start_time
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

====> detect trips...
      time elapsed: 00:25:27


In [30]:
# Print the column names and types of the detect_trips result.
gps_data2.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- fixTypeCode: integer (nullable = false)
 |-- dow: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- speed: double (nullable = true)
 |-- duration: integer (nullable = true)
 |-- tripType: integer (nullable = true)
 |-- cum_pause: long (nullable = true)



In [31]:
# Print up to 10000 rows of the detect_trips result.
# NOTE(review): this dumps a very large table into the notebook output; a smaller n may be enough.
gps_data2.show(10000)

+-------------------+-----------+----------+-----------+---+--------------------+-------+--------+--------+---------+
|          timestamp|        lat|       lon|fixTypeCode|dow|            distance|  speed|duration|tripType|cum_pause|
+-------------------+-----------+----------+-----------+---+--------------------+-------+--------+--------+---------+
|2016-07-04 15:18:00|50.85629377|5.71624049|          2|  1|                 0.0|  0.155|       0|       1|        0|
|2016-07-04 15:19:00|50.85608475|5.71435562|          1|  1|  134.24798041134827|  0.511|      60|       2|       60|
|2016-07-04 15:20:00|50.85604772|5.71433521|          1|  1|  4.3569327100030275|  0.442|      60|       3|      120|
|2016-07-04 15:21:00|50.85598331| 5.7140342|          1|  1|   22.29599246518401|  0.193|      60|       3|      180|
|2016-07-04 15:22:00|50.85605236|5.71394756|          1|  1|   9.788643400583524|  0.799|      60|       2|      240|
|2016-07-04 15:23:00|50.85612686|5.71396679|          1|

In [32]:
# Parameters passed to classify_trips below.
vehicle_speed_cutoff = 35 # km/h
bicycle_speed_cutoff = 10 # km/h
walk_speed_cutoff = 1 # km/h
speed_percentile = 90
speed_segment_length = 30 # m
min_trip_length = 100 # m
min_trip_duration = 120 # sec

# Column names used by the classification step.
ts_name, dist_name, speed_name = 'timestamp', 'distance', 'speed'

print("====> classify trips...")
start_time = time.time()
gps_data3 = classify_trips(
    gps_data2,
    ts_name,
    dist_name,
    speed_name,
    fix_type_name,
    vehicle_speed_cutoff,
    bicycle_speed_cutoff,
    walk_speed_cutoff,
    min_trip_length,
    min_trip_duration,
    speed_segment_length,
    speed_percentile,
)
gps_data3 = gps_data3.cache()
# count() is an action, so it forces the computation that is being timed.
gps_data3.count()
elapsed_time = time.time() - start_time
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

====> classify trips...
      time elapsed: 00:01:00


In [33]:
# Print the column names and types of the classify_trips result.
gps_data3.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- fixTypeCode: integer (nullable = false)
 |-- tripNumber: long (nullable = true)
 |-- tripType: integer (nullable = true)
 |-- tripMOT: string (nullable = true)



In [34]:
# Print up to 10000 rows of the classify_trips result.
# NOTE(review): this dumps a very large table into the notebook output; a smaller n may be enough.
gps_data3.show(10000)

+-------------------+---+-----------+----------+-----------+----------+--------+-------+
|          timestamp|dow|        lat|       lon|fixTypeCode|tripNumber|tripType|tripMOT|
+-------------------+---+-----------+----------+-----------+----------+--------+-------+
|2016-07-04 15:18:00|  1|50.85629377|5.71624049|          2|         1|       1|      1|
|2016-07-04 15:19:00|  1|50.85608475|5.71435562|          1|         1|       2|      1|
|2016-07-04 15:20:00|  1|50.85604772|5.71433521|          1|         1|       3|      0|
|2016-07-04 15:21:00|  1|50.85598331| 5.7140342|          1|         1|       3|      0|
|2016-07-04 15:22:00|  1|50.85605236|5.71394756|          1|         1|       2|      1|
|2016-07-04 15:23:00|  1|50.85612686|5.71396679|          1|         1|       2|      1|
|2016-07-04 15:24:00|  1|50.85604035|5.71402272|          1|         1|       4|      1|
|2016-07-04 15:25:00|  1|50.85599181|5.71415213|          1|         0|       0|      0|
|2016-07-04 15:26:00|

In [35]:
# Generate missing values up to maximum signal loss

print("====> fill in missing value...")
start_time = time.time()
gps_data3 = fill_timestamp(gps_data3, ts_name, fix_type_name, interval, ws).cache()
# checkpoint() returns the checkpointed DataFrame instead of modifying gps_data3
# in place; re-bind it so later cells (e.g. the merge) use the truncated lineage.
gps_data3 = gps_data3.checkpoint()
gps_data3.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> fill in missing value...
      time elapsed: 00:00:08


In [36]:
# Print up to 10000 rows after the fill_timestamp step.
# NOTE(review): this dumps a very large table into the notebook output; a smaller n may be enough.
gps_data3.show(10000)

+-------------------+---+-----------+----------+-----------+----------+--------+-------+
|          timestamp|dow|        lat|       lon|fixTypeCode|tripNumber|tripType|tripMOT|
+-------------------+---+-----------+----------+-----------+----------+--------+-------+
|2016-07-04 15:18:00|  1|50.85629377|5.71624049|          2|         1|       1|      1|
|2016-07-04 15:19:00|  1|50.85608475|5.71435562|          1|         1|       2|      1|
|2016-07-04 15:20:00|  1|50.85604772|5.71433521|          1|         1|       3|      0|
|2016-07-04 15:21:00|  1|50.85598331| 5.7140342|          1|         1|       3|      0|
|2016-07-04 15:22:00|  1|50.85605236|5.71394756|          1|         1|       2|      1|
|2016-07-04 15:23:00|  1|50.85612686|5.71396679|          1|         1|       2|      1|
|2016-07-04 15:24:00|  1|50.85604035|5.71402272|          1|         1|       4|      1|
|2016-07-04 15:25:00|  1|50.85599181|5.71415213|          1|         0|       0|      0|
|2016-07-04 15:26:00|

## Accelerometer data processing

In [37]:
# Location of the raw accelerometer export. Set the HABITUS_ACC_PATH environment
# variable to run the notebook on another machine; when it is not set, the
# original local path is used.
acc_path_raw = os.environ.get(
    'HABITUS_ACC_PATH',
    '/Users/molinaro/Documents/GITHUB/HABITUS/notebooks/data/PP001_actigraph_10.csv')

In [38]:
# Read the accelerometer file as plain text lines (a single `value` column).
acc_data_raw = spark.read.text(acc_path_raw)
acc_data_raw.cache()
# checkpoint() returns a new checkpointed DataFrame; re-bind it, otherwise the
# checkpoint is written but acc_data_raw keeps referring to the original plan.
acc_data_raw = acc_data_raw.checkpoint()
# Keep the DataFrame repr as the cell output.
acc_data_raw

DataFrame[value: string]

In [39]:
# Number of text lines read from the accelerometer file.
acc_data_raw.count()

80056

In [40]:
ts_name = 'timestamp'

# gen_acc_dataframe returns a pair; the first element is bound to `interval`,
# which rebinds the notebook-level `interval` set earlier from INTERVAL.
interval, acc_data = gen_acc_dataframe(acc_data_raw, ts_name)

In [41]:
# Print the column names and types of the accelerometer dataframe.
acc_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- acc_data: string (nullable = true)



In [42]:
# Names given to the individual accelerometer fields by split_acc_data.
acc_columns = [
    'axis1', 'axis2', 'axis3',
    'steps', 'lux',
    'incl_off', 'incl_standing', 'incl_sitting', 'incl_lying',
]

acc_data_ext = split_acc_data(acc_data, acc_columns)
# cache() marks the DataFrame for caching and returns it, so it doubles as the cell output.
acc_data_ext.cache()

DataFrame[timestamp: timestamp, axis1: int, axis2: int, axis3: int, steps: int, lux: int, incl_off: int, incl_standing: int, incl_sitting: int, incl_lying: int]

In [44]:
ts_name = 'timestamp'
INTERVAL = 60 #seconds
window = INTERVAL

# `interval` is the value returned by gen_acc_dataframe; `window` is the target window.
acc_data_act = select_acc_intervals(
    acc_data_ext, ts_name, interval, window, False, True
)
# Print the first 20 rows without truncating cell contents.
acc_data_act.show(n=20, truncate=False)

+-------------------+-----+-----+-----+-----+---+--------+-------------+------------+----------+
|timestamp          |axis1|axis2|axis3|steps|lux|incl_off|incl_standing|incl_sitting|incl_lying|
+-------------------+-----+-----+-----+-----+---+--------+-------------+------------+----------+
|2016-07-04 09:00:00|0    |0    |0    |0    |0  |null    |null         |null        |null      |
|2016-07-04 09:01:00|0    |0    |0    |0    |0  |null    |null         |null        |null      |
|2016-07-04 09:02:00|0    |0    |0    |0    |0  |null    |null         |null        |null      |
|2016-07-04 09:03:00|0    |0    |0    |0    |0  |null    |null         |null        |null      |
|2016-07-04 09:04:00|0    |0    |0    |0    |0  |null    |null         |null        |null      |
|2016-07-04 09:05:00|0    |0    |0    |0    |0  |null    |null         |null        |null      |
|2016-07-04 09:06:00|0    |0    |0    |0    |0  |null    |null         |null        |null      |
|2016-07-04 09:07:00|0    |0  

In [45]:
# Cut-off values passed to activity_count.
LightCO, ModerateCO, HardCO, VeryHardCO = (100, 1953, 5725, 9498)
window = INTERVAL
acc_data_act = activity_count(acc_data_act, 'timestamp', window, LightCO, ModerateCO, HardCO, VeryHardCO, False)
acc_data_act.cache()
# checkpoint() returns a new checkpointed DataFrame; re-bind it, otherwise the
# checkpoint is written but acc_data_act keeps its full lineage.
acc_data_act = acc_data_act.checkpoint()
# Keep the DataFrame repr as the cell output.
acc_data_act

DataFrame[timestamp: timestamp, activity: bigint, activityIntensity: string]

In [46]:
# Print the column names and types of the activity_count result.
acc_data_act.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- activity: long (nullable = true)
 |-- activityIntensity: string (nullable = true)



In [47]:
# Preview the first 20 rows of the activity_count result.
acc_data_act.show()

+-------------------+--------+-----------------+
|          timestamp|activity|activityIntensity|
+-------------------+--------+-----------------+
|2016-07-04 09:00:00|       0|                0|
|2016-07-04 09:01:00|       0|                0|
|2016-07-04 09:02:00|       0|                0|
|2016-07-04 09:03:00|       0|                0|
|2016-07-04 09:04:00|       0|                0|
|2016-07-04 09:05:00|       0|                0|
|2016-07-04 09:06:00|       0|                0|
|2016-07-04 09:07:00|       0|                0|
|2016-07-04 09:08:00|       0|                0|
|2016-07-04 09:09:00|       0|                0|
|2016-07-04 09:10:00|       0|                0|
|2016-07-04 09:11:00|       0|                0|
|2016-07-04 09:12:00|       0|                0|
|2016-07-04 09:13:00|       0|                0|
|2016-07-04 09:14:00|       0|                0|
|2016-07-04 09:15:00|       0|                0|
|2016-07-04 09:16:00|       0|                0|
|2016-07-04 09:17:00

In [48]:
# DETERMINE NON-WEAR PERIOD
ts_name = 'timestamp'
AC_name = 'activity'
AI_name = 'activityIntensity'
# NOTE(review): new_col is not passed to non_wear_filter below.
new_col = 'non_wear'
window = INTERVAL
minutes_zeros_row = 60

acc_data_act = non_wear_filter(acc_data_act, ts_name, AC_name, AI_name, window, minutes_zeros_row)
acc_data_act.cache()
# checkpoint() returns a new checkpointed DataFrame; re-bind it, otherwise the
# checkpoint is written but acc_data_act keeps its full lineage.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

13341

In [49]:
# DETERMINE ACTIVITY BOUT NUMBER
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'activityBoutNumber'
window = INTERVAL
# Bounds, duration and tolerance passed to activity_bout_filter.
UP = 9999
LOW = 1953
DURATION = 10
TOL = 2

acc_data_act = activity_bout_filter(acc_data_act, ts_name, AC_name, new_col, window, UP, LOW, DURATION, TOL)
acc_data_act.cache()
# checkpoint() returns a new checkpointed DataFrame; re-bind it, otherwise the
# checkpoint is written but acc_data_act keeps its full lineage.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

13341

In [50]:
# Print the first 20 rows without truncating cell contents.
acc_data_act.show(20, False)

+-------------------+--------+-----------------+------------------+
|timestamp          |activity|activityIntensity|activityBoutNumber|
+-------------------+--------+-----------------+------------------+
|2016-07-04 09:00:00|-2      |-2               |0                 |
|2016-07-04 09:01:00|-2      |-2               |0                 |
|2016-07-04 09:02:00|-2      |-2               |0                 |
|2016-07-04 09:03:00|-2      |-2               |0                 |
|2016-07-04 09:04:00|-2      |-2               |0                 |
|2016-07-04 09:05:00|-2      |-2               |0                 |
|2016-07-04 09:06:00|-2      |-2               |0                 |
|2016-07-04 09:07:00|-2      |-2               |0                 |
|2016-07-04 09:08:00|-2      |-2               |0                 |
|2016-07-04 09:09:00|-2      |-2               |0                 |
|2016-07-04 09:10:00|-2      |-2               |0                 |
|2016-07-04 09:11:00|-2      |-2               |

In [51]:
# DETERMINE SEDENTARY BOUT NUMBER
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'sedentaryBoutNumber'
# NOTE(review): window is 5 here while the other accelerometer cells use
# INTERVAL — confirm this is intended.
window = 5
# Bounds, duration and tolerance passed to sedentary_bout_filter.
UP=180
LOW=0
DURATION=30
TOL=1

acc_data_act = sedentary_bout_filter(acc_data_act, ts_name, AC_name, new_col, window, UP, LOW, DURATION, TOL)
acc_data_act.cache()
# checkpoint() returns a new checkpointed DataFrame; re-bind it, otherwise the
# checkpoint is written but acc_data_act keeps its full lineage.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

13341

In [52]:
# Print the first 20 rows without truncating cell contents.
acc_data_act.show(20,False)

+-------------------+--------+-----------------+------------------+-------------------+
|timestamp          |activity|activityIntensity|activityBoutNumber|sedentaryBoutNumber|
+-------------------+--------+-----------------+------------------+-------------------+
|2016-07-04 09:00:00|-2      |-2               |0                 |0                  |
|2016-07-04 09:01:00|-2      |-2               |0                 |0                  |
|2016-07-04 09:02:00|-2      |-2               |0                 |0                  |
|2016-07-04 09:03:00|-2      |-2               |0                 |0                  |
|2016-07-04 09:04:00|-2      |-2               |0                 |0                  |
|2016-07-04 09:05:00|-2      |-2               |0                 |0                  |
|2016-07-04 09:06:00|-2      |-2               |0                 |0                  |
|2016-07-04 09:07:00|-2      |-2               |0                 |0                  |
|2016-07-04 09:08:00|-2      |-2

In [None]:
# acc_data_act.toPandas().to_csv('PP001_out_' + str(INTERVAL) + '.csv')

## Merge dataframes

In [53]:
# Left-join the accelerometer results onto the GPS rows by timestamp,
# keeping every GPS row, and sort the result chronologically.
merge_data = (
    gps_data3
    .join(acc_data_act, on='timestamp', how='left')
    .orderBy('timestamp')
    .cache()
)
# Materialize the cached join and show the number of merged rows.
merge_data.count()

8740

In [54]:
# Print the column names and types of the merged dataframe.
merge_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- fixTypeCode: integer (nullable = true)
 |-- tripNumber: long (nullable = true)
 |-- tripType: integer (nullable = true)
 |-- tripMOT: string (nullable = true)
 |-- activity: long (nullable = true)
 |-- activityIntensity: string (nullable = true)
 |-- activityBoutNumber: integer (nullable = true)
 |-- sedentaryBoutNumber: integer (nullable = true)



In [55]:
# Print the first 20 merged rows without truncating cell contents.
merge_data.show(20, False)

+-------------------+---+-----------+----------+-----------+----------+--------+-------+--------+-----------------+------------------+-------------------+
|timestamp          |dow|lat        |lon       |fixTypeCode|tripNumber|tripType|tripMOT|activity|activityIntensity|activityBoutNumber|sedentaryBoutNumber|
+-------------------+---+-----------+----------+-----------+----------+--------+-------+--------+-----------------+------------------+-------------------+
|2016-07-04 15:18:00|1  |50.85629377|5.71624049|2          |1         |1       |1      |44      |0                |0                 |0                  |
|2016-07-04 15:19:00|1  |50.85608475|5.71435562|1          |1         |2       |1      |0       |0                |0                 |0                  |
|2016-07-04 15:20:00|1  |50.85604772|5.71433521|1          |1         |3       |0      |130     |1                |0                 |0                  |
|2016-07-04 15:21:00|1  |50.85598331|5.7140342 |1          |1         