In [2]:
import time

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from src.GPSProcessing import *
from src.AccProcessing import *

In [3]:
# REFERENCE: https://spark.apache.org/docs/latest/configuration.html

# Every property value is written as a string. getAll() reports all values as
# strings anyway (Python booleans came back as 'True'), so the boolean flags
# are spelled in the lowercase 'true' form instead of relying on str(True).
conf = SparkConf().setAll([
    ('spark.memory.fraction', '0.6'),
    ('spark.executor.memory', '16g'),
    ('spark.driver.memory', '16g'),
    ('spark.sql.shuffle.partitions', '20'),
    ('spark.memory.offHeap.enabled', 'true'),
    ('spark.memory.offHeap.size', '16g'),
    ('spark.cleaner.referenceTracking.cleanCheckpoints', 'true'),
    ('spark.driver.host', '127.0.0.1'),
    ('spark.scheduler.listenerbus.eventqueue.capacity', '50000'),
])

# Local session using all available cores ("local[*]")
spark = SparkSession.builder.config(conf=conf).master("local[*]").appName("test").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")
# Directory used by the DataFrame.checkpoint() calls further down
sc.setCheckpointDir('checkpoints')
# Display the effective configuration as the cell output
sc.getConf().getAll()

[('spark.sql.shuffle.partitions', '20'),
 ('spark.app.name', 'test'),
 ('spark.driver.host', '127.0.0.1'),
 ('spark.app.id', 'local-1637835521851'),
 ('spark.cleaner.referenceTracking.cleanCheckpoints', 'True'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '16g'),
 ('spark.driver.port', '62262'),
 ('spark.executor.memory', '16g'),
 ('spark.scheduler.listenerbus.eventqueue.capacity', '50000'),
 ('spark.rdd.compress', 'True'),
 ('spark.memory.fraction', '0.6'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.memory.offHeap.enabled', 'True'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.memory.offHeap.size', '16g')]

In [4]:
sc.defaultParallelism

8

In [5]:
sc.defaultMinPartitions

2

In [6]:
sc

## GPS data processing

In [7]:
gps_path_raw = '../tests/raw-data/gps/1.csv'

In [8]:
# Load the raw GPS CSV: the first row is used as the header and Spark infers the column types
gps_data_raw = spark.read.csv(gps_path_raw, header=True, inferSchema=True)

In [9]:
gps_data_raw.show()

+-----+----+---------+--------+----------+----------+---+-----+---------+---+----------+---+------+-----+----------+----+----+----+----+----+----+----+---------+-------+----+--------+
|INDEX| RCR| UTC DATE|UTC TIME|LOCAL DATE|LOCAL TIME| MS|VALID| LATITUDE|N/S| LONGITUDE|E/W|HEIGHT|SPEED|   HEADING|DSTA|DAGE|PDOP|HDOP|VDOP|NSAT| SID|Elevation|Azimuth| SNR|Distance|
+-----+----+---------+--------+----------+----------+---+-----+---------+---+----------+---+------+-----+----------+----+----+----+----+----+----+----+---------+-------+----+--------+
|    1|null|2016/8/17| 0:23:29| 2016/8/17|   0:23:29|  0| null| 51.01609|  N|114.099305|  W|     0|0.121|113.437589|   0|   0|   0|   0|   0|   0|null|     null|   null|null|     0.0|
|    2|null|2016/8/17| 0:23:34| 2016/8/17|   0:23:34|  0| null|51.016075|  N| 114.09925|  W|  1084|2.127|291.672465|   0|   0|   0|   0|   0|   0|null|     null|   null|null|    4.19|
|    3|null|2016/8/17| 0:23:39| 2016/8/17|   0:23:39|  0| null|51.016078|  N|114

In [10]:
# Date/time patterns; the combined pattern is what gen_gps_dataframe receives
date_format = 'yyyy/MM/dd'
time_format = 'HH:mm:ss'
datetime_format = ' '.join([date_format, time_format])

# Column names passed to the GPS processing functions in the cells below
ts_name, dist_name, speed_name = 'timestamp', 'distance', 'speed'
height_name, fix_type_name = 'height', 'fixTypeCode'

gps_data = gen_gps_dataframe(gps_data_raw, ts_name, datetime_format)

In [11]:
gps_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- distance: double (nullable = false)
 |-- height: double (nullable = true)
 |-- speed: double (nullable = true)



In [12]:
gps_data.count()

115135

In [13]:
gps_data.show()

+-------------------+---+---------+-----------+--------+------+-----+
|          timestamp|dow|      lat|        lon|distance|height|speed|
+-------------------+---+---------+-----------+--------+------+-----+
|2016-08-16 18:23:29|  2| 51.01609|-114.099305|     0.0|   0.0|0.121|
|2016-08-16 18:23:34|  2|51.016075| -114.09925|     0.0|1084.0|2.127|
|2016-08-16 18:23:39|  2|51.016078|-114.099263|     0.0|1076.0|0.312|
|2016-08-16 18:23:44|  2|51.016072| -114.09925|     0.0|1077.0|0.544|
|2016-08-16 18:23:49|  2|51.016083|-114.099242|     0.0|1077.0|0.061|
|2016-08-16 18:23:54|  2|51.016085| -114.09924|     0.0|1078.0|0.538|
|2016-08-16 18:23:59|  2|51.016088|-114.099247|     0.0|1077.0|0.515|
|2016-08-16 18:24:04|  2|51.016088|-114.099247|     0.0|1077.0|0.174|
|2016-08-16 18:24:09|  2|51.016092|-114.099217|     0.0|1076.0|4.102|
|2016-08-16 18:24:14|  2|51.016152|-114.099205|     0.0|1077.0|5.964|
|2016-08-16 18:24:19|  2|51.016205|-114.099188|     0.0|1077.0|4.868|
|2016-08-16 18:24:24

In [14]:
# Filter the GPS timestamps over the given interval
# (in seconds, per the progress message printed below)
INTERVAL = 5

print("====> filter GPS data every {} seconds...".format(INTERVAL))
start_time = time.time()
gps_data = select_gps_intervals(gps_data, ts_name, INTERVAL)
elapsed_time = time.time() - start_time
# Format the wall-clock duration as HH:MM:SS before printing it
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

====> filter GPS data every 5 seconds...
      time elapsed: 00:00:02


In [15]:
# Set fix type

# Passed to set_fix_type here and to fill_timestamp later on.
# NOTE(review): presumably a window / maximum signal loss in seconds — confirm.
ws = 600

print("====> set fix type...")
start_time = time.time()
gps_data = set_fix_type(gps_data, ts_name, fix_type_name, ws).cache()
# DataFrame.checkpoint() returns the checkpointed DataFrame instead of
# modifying gps_data in place, so the result must be re-bound; otherwise the
# checkpoint is written but never used and the lineage keeps growing.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> set fix type...
      time elapsed: 00:00:05


In [16]:
gps_data.show()

+-------------------+---+---------+-----------+--------+------+-----+-----------+
|          timestamp|dow|      lat|        lon|distance|height|speed|fixTypeCode|
+-------------------+---+---------+-----------+--------+------+-----+-----------+
|2016-08-16 18:23:29|  2| 51.01609|-114.099305|     0.0|   0.0|0.121|          2|
|2016-08-16 18:23:34|  2|51.016075| -114.09925|     0.0|1084.0|2.127|          1|
|2016-08-16 18:23:39|  2|51.016078|-114.099263|     0.0|1076.0|0.312|          1|
|2016-08-16 18:23:44|  2|51.016072| -114.09925|     0.0|1077.0|0.544|          1|
|2016-08-16 18:23:49|  2|51.016083|-114.099242|     0.0|1077.0|0.061|          1|
|2016-08-16 18:23:54|  2|51.016085| -114.09924|     0.0|1078.0|0.538|          1|
|2016-08-16 18:23:59|  2|51.016088|-114.099247|     0.0|1077.0|0.515|          1|
|2016-08-16 18:24:04|  2|51.016088|-114.099247|     0.0|1077.0|0.174|          1|
|2016-08-16 18:24:09|  2|51.016092|-114.099217|     0.0|1076.0|4.102|          1|
|2016-08-16 18:2

In [17]:
# Apply filter on the velocity

vmax = 130 # km/h

print("====> apply velocity filter...")
start_time = time.time()
gps_data = filter_speed(gps_data, speed_name, fix_type_name, vmax).cache()
# checkpoint() returns a new DataFrame; re-bind it so later steps actually
# build on the checkpointed (lineage-truncated) data.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply velocity filter...
      time elapsed: 00:00:00


In [18]:
# Apply filter over max acceleration

print("====> apply accelaration filter...")
start_time = time.time()
gps_data = filter_acceleration(gps_data, speed_name, ts_name, fix_type_name).cache()
# checkpoint() returns a new DataFrame; re-bind it so later steps actually
# build on the checkpointed (lineage-truncated) data.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply accelaration filter...
      time elapsed: 00:00:00


In [19]:
# Apply filter on the height variation

# Threshold handed to filter_height.
# NOTE(review): presumably the maximum allowed height change — confirm units.
dhmax=1000

print("====> apply height variation filter...")
start_time = time.time()
gps_data = filter_height(gps_data, height_name, ts_name, fix_type_name, dhmax).cache()
# checkpoint() returns a new DataFrame; re-bind it so later steps actually
# build on the checkpointed (lineage-truncated) data.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> apply height variation filter...
      time elapsed: 00:00:01


In [20]:
gps_data.count()

115013

In [21]:
# Apply filter over three fixes (it also recalculates distance column)

dmin = 10
# Kept for compatibility; no visible cell references dcol (dist_name is what
# gets passed to the filter below).
dcol = 'distance'

print("====> apply three fixes filter...")
start_time = time.time()
gps_data = filter_change_dist_3_fixes(gps_data, dist_name, ts_name, fix_type_name, dmin).cache()
# checkpoint() returns a new DataFrame; re-bind it so later steps actually
# build on the checkpointed (lineage-truncated) data.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
# NOTE(review): the schema printed after this step reports `distance` as
# string, whereas it was double right after gen_gps_dataframe — confirm
# whether the column should be cast back to double.

====> apply three fixes filter...
      time elapsed: 00:00:02


In [22]:
gps_data.count()

114936

In [23]:
gps_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- distance: string (nullable = true)
 |-- height: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- fixTypeCode: integer (nullable = false)



In [24]:
# Round seconds in timestamps according to the interval
interval = INTERVAL # seconds
ts_name = 'timestamp'

print("====> align timestamps...")
start_time = time.time()
gps_data = round_timestamp(gps_data, ts_name, interval).cache()
# checkpoint() returns a new DataFrame; re-bind it so later steps actually
# build on the checkpointed (lineage-truncated) data.
gps_data = gps_data.checkpoint()
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

====> align timestamps...
      time elapsed: 00:00:00


In [25]:
gps_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- dow: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- distance: string (nullable = true)
 |-- height: double (nullable = true)
 |-- speed: double (nullable = true)
 |-- fixTypeCode: integer (nullable = false)



In [26]:
gps_data.show()

+-------------------+---+---------+-----------+-------------------+------+-----+-----------+
|          timestamp|dow|      lat|        lon|           distance|height|speed|fixTypeCode|
+-------------------+---+---------+-----------+-------------------+------+-----+-----------+
|2016-08-16 18:23:25|  2| 51.01609|-114.099305|                0.0|   0.0|0.121|          2|
|2016-08-16 18:23:30|  2|51.016075| -114.09925|  4.190762261814708|1084.0|2.127|          1|
|2016-08-16 18:23:35|  2|51.016078|-114.099263| 0.9680335753755167|1076.0|0.312|          1|
|2016-08-16 18:23:40|  2|51.016072| -114.09925| 1.1271674272734382|1077.0|0.544|          1|
|2016-08-16 18:23:45|  2|51.016083|-114.099242| 1.3442430707892563|1077.0|0.061|          1|
|2016-08-16 18:23:50|  2|51.016085| -114.09924|0.26257240275150484|1078.0|0.538|          1|
|2016-08-16 18:23:55|  2|51.016088|-114.099247|  0.592128034196194|1077.0|0.515|          1|
|2016-08-16 18:24:00|  2|51.016088|-114.099247|                0.0|107

In [27]:
# Restrict gps_data to at most 1000 rows; every later GPS step (trip detection,
# classification, merge) therefore runs on this subset only.
# NOTE(review): presumably a development shortcut to keep detect_trips fast —
# confirm and remove before processing a full file.
gps_data = gps_data.limit(1000)

In [28]:
# Thresholds handed to detect_trips: the distance is in meters, both times in seconds
min_dist_per_min, min_pause_duration, max_pause_time = 25, 120, 180

In [29]:
# %lprun -f detect_trips detect_trips(gps_data, ts_name, dist_name, speed_name, fix_type_name, min_dist_per_min, min_pause_duration, max_pause_time, vmax)

In [30]:
print("====> detect trips...")
start_time = time.time()
# Run trip detection on the (already filtered) GPS fixes and keep the result cached
gps_data2 = detect_trips(
    gps_data, ts_name, dist_name, speed_name, fix_type_name,
    min_dist_per_min, min_pause_duration, max_pause_time, vmax,
)
gps_data2 = gps_data2.cache()
# count() forces evaluation so the timing below covers the actual computation
gps_data2.count()
elapsed_time = time.time() - start_time
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

====> detect trips...
      time elapsed: 00:02:15


In [None]:
gps_data2.printSchema()

In [None]:
gps_data2.show(10000)

In [None]:
# Speed cutoffs (km/h) passed to classify_trips
vehicle_speed_cutoff, bicycle_speed_cutoff, walk_speed_cutoff = 35, 10, 1
speed_percentile = 90
speed_segment_length = 30  # m
# Minimum trip length (m) and duration (sec)
min_trip_length, min_trip_duration = 100, 120

# Column names used by the classifier
ts_name, dist_name, speed_name = 'timestamp', 'distance', 'speed'

print("====> classify trips...")
start_time = time.time()
gps_data3 = classify_trips(
    gps_data2, ts_name, dist_name, speed_name, fix_type_name,
    vehicle_speed_cutoff, bicycle_speed_cutoff, walk_speed_cutoff,
    min_trip_length, min_trip_duration, speed_segment_length, speed_percentile,
)
gps_data3 = gps_data3.cache()
# count() forces evaluation so the timing below covers the actual computation
gps_data3.count()
elapsed_time = time.time() - start_time
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

In [None]:
gps_data3.printSchema()

In [None]:
gps_data3.show(10000)

In [None]:
# Generate missing values up to maximum signal loss

print("====> fill in missing value...")
start_time = time.time()
# INTERVAL is used instead of `interval`: the accelerometer section rebinds
# `interval` to the value returned by gen_acc_dataframe, so if those cells have
# already run (as the recorded execution counts suggest) `interval` would no
# longer be the GPS interval. On a top-to-bottom run both hold the same value.
gps_data3 = fill_timestamp(gps_data3, ts_name, fix_type_name, INTERVAL, ws).cache()
# checkpoint() returns a new DataFrame; re-bind it so later steps actually
# build on the checkpointed (lineage-truncated) data.
gps_data3 = gps_data3.checkpoint()
gps_data3.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
gps_data3.show(10000)

In [None]:
# gps_data3.toPandas().to_csv('A01_gpu_out_' + str(INTERVAL) + '.csv')

## Accelerometer data processing

In [31]:
acc_path_raw = '../tests/raw-data/acc/1.csv'

In [32]:
# Read the accelerometer file as raw text lines (a single `value` string column)
acc_data_raw = spark.read.text(acc_path_raw).cache()
# checkpoint() returns a new DataFrame rather than changing acc_data_raw in
# place; re-bind it so the checkpointed data is what the next cells use.
acc_data_raw = acc_data_raw.checkpoint()
# Show the DataFrame repr as the cell output
acc_data_raw

DataFrame[value: string]

In [33]:
acc_data_raw.count()

149341

In [34]:
ts_name = 'timestamp'

# gen_acc_dataframe returns a pair: an interval value and the accelerometer DataFrame.
# NOTE: this rebinds the notebook-level `interval`, which the GPS section
# assigns from INTERVAL — cell execution order therefore matters for any code
# that reads `interval`.
interval, acc_data = gen_acc_dataframe(acc_data_raw, ts_name)

In [35]:
acc_data.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- acc_data: string (nullable = true)



In [36]:
# Names for the fields that split_acc_data extracts from the raw `acc_data` column
acc_columns = [
    'axis1', 'axis2', 'axis3',
    'steps', 'lux',
    'incl_off', 'incl_standing', 'incl_sitting', 'incl_lying',
]
acc_data_ext = split_acc_data(acc_data, acc_columns).cache()
acc_data_ext.show(20)

+-------------------+-----+-----+-----+-----+---+--------+-------------+------------+----------+
|          timestamp|axis1|axis2|axis3|steps|lux|incl_off|incl_standing|incl_sitting|incl_lying|
+-------------------+-----+-----+-----+-----+---+--------+-------------+------------+----------+
|2016-08-15 21:35:00|  325|   85|  176|    2|  0|       0|            5|           0|         0|
|2016-08-15 21:35:05|  181|  116|   91|    1|  0|       0|            5|           0|         0|
|2016-08-15 21:35:10|    4|   47|  153|    0|  0|       0|            0|           0|         5|
|2016-08-15 21:35:15|  246|  258|  306|    2|  0|       0|            5|           0|         0|
|2016-08-15 21:35:20|  131|   81|   31|    1|  0|       0|            5|           0|         0|
|2016-08-15 21:35:25|    0|    0|    0|    0|  0|       0|            0|           0|         5|
|2016-08-15 21:35:30|    0|    0|    0|    0|  0|       0|            0|           0|         5|
|2016-08-15 21:35:35|    0|   

In [37]:
ts_name = 'timestamp'
# The window equals the target interval, in seconds
INTERVAL = window = 5

# `interval` is the value returned by gen_acc_dataframe
acc_data_act = select_acc_intervals(acc_data_ext, ts_name, interval, window, False, True)
acc_data_act.show(20, truncate=False)

+-------------------+-----+-----+-----+-----+---+--------+-------------+------------+----------+
|timestamp          |axis1|axis2|axis3|steps|lux|incl_off|incl_standing|incl_sitting|incl_lying|
+-------------------+-----+-----+-----+-----+---+--------+-------------+------------+----------+
|2016-08-15 21:35:00|325  |85   |176  |2    |0  |0       |5            |0           |0         |
|2016-08-15 21:35:05|181  |116  |91   |1    |0  |0       |5            |0           |0         |
|2016-08-15 21:35:10|4    |47   |153  |0    |0  |0       |0            |0           |5         |
|2016-08-15 21:35:15|246  |258  |306  |2    |0  |0       |5            |0           |0         |
|2016-08-15 21:35:20|131  |81   |31   |1    |0  |0       |5            |0           |0         |
|2016-08-15 21:35:25|0    |0    |0    |0    |0  |0       |0            |0           |5         |
|2016-08-15 21:35:30|0    |0    |0    |0    |0  |0       |0            |0           |5         |
|2016-08-15 21:35:35|0    |0  

In [38]:
# Activity-count cutoffs passed to activity_count
LightCO, ModerateCO, HardCO, VeryHardCO = (100, 1953, 5725, 9498)
window = INTERVAL
acc_data_act = activity_count(acc_data_act, 'timestamp', window, LightCO, ModerateCO, HardCO, VeryHardCO, False)
acc_data_act = acc_data_act.cache()
# checkpoint() returns a new DataFrame rather than changing acc_data_act in
# place; re-bind it so the checkpointed data is what the next cells use.
acc_data_act = acc_data_act.checkpoint()
# Show the DataFrame repr as the cell output
acc_data_act

DataFrame[timestamp: timestamp, activity: int, activityIntensity: string]

In [39]:
acc_data_act.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- activity: integer (nullable = true)
 |-- activityIntensity: string (nullable = true)



In [40]:
acc_data_act.show()

+-------------------+--------+-----------------+
|          timestamp|activity|activityIntensity|
+-------------------+--------+-----------------+
|2016-08-15 21:35:00|     325|                2|
|2016-08-15 21:35:05|     181|                2|
|2016-08-15 21:35:10|       4|                0|
|2016-08-15 21:35:15|     246|                2|
|2016-08-15 21:35:20|     131|                1|
|2016-08-15 21:35:25|       0|                0|
|2016-08-15 21:35:30|       0|                0|
|2016-08-15 21:35:35|       0|                0|
|2016-08-15 21:35:40|       0|                0|
|2016-08-15 21:35:45|       0|                0|
|2016-08-15 21:35:50|       0|                0|
|2016-08-15 21:35:55|       0|                0|
|2016-08-15 21:36:00|       0|                0|
|2016-08-15 21:36:05|       0|                0|
|2016-08-15 21:36:10|       0|                0|
|2016-08-15 21:36:15|       0|                0|
|2016-08-15 21:36:20|       0|                0|
|2016-08-15 21:36:25

In [41]:
# DETERMINE NON-WEAR PERIOD
ts_name = 'timestamp'
AC_name = 'activity'
AI_name = 'activityIntensity'
# new_col is assigned here but is not passed to non_wear_filter in this cell
new_col = 'non_wear'
window = INTERVAL
minutes_zeros_row = 90

acc_data_act = non_wear_filter(acc_data_act, ts_name, AC_name, AI_name, window, minutes_zeros_row)
acc_data_act = acc_data_act.cache()
# checkpoint() returns a new DataFrame rather than changing acc_data_act in
# place; re-bind it so the checkpointed data is what the next cells use.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

149331

In [42]:
# DETERMINE ACTIVITY BOUT NUMBER
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'activityBoutNumber'
window = INTERVAL
# Bounds, duration and tolerance handed to activity_bout_filter.
# NOTE(review): units of DURATION/TOL are not visible here — confirm against the function.
UP = 9999
LOW = 1953
DURATION = 10
TOL = 2

acc_data_act = activity_bout_filter(acc_data_act, ts_name, AC_name, new_col, window, UP, LOW, DURATION, TOL)
acc_data_act = acc_data_act.cache()
# checkpoint() returns a new DataFrame rather than changing acc_data_act in
# place; re-bind it so the checkpointed data is what the next cells use.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

149331

In [43]:
# DETERMINE SEDENTARY BOUT NUMBER
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'sedentaryBoutNumber'
window = INTERVAL
# Bounds, duration and tolerance handed to sedentary_bout_filter.
# NOTE(review): units of DURATION/TOL are not visible here — confirm against the function.
UP=180
LOW=0
DURATION=30
TOL=1

acc_data_act = sedentary_bout_filter(acc_data_act, ts_name, AC_name, new_col, window, UP, LOW, DURATION, TOL)
acc_data_act = acc_data_act.cache()
# checkpoint() returns a new DataFrame rather than changing acc_data_act in
# place; re-bind it so the checkpointed data is what the next cells use.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

149331

In [44]:
acc_data_act.show(20,False)

+-------------------+--------+-----------------+------------------+-------------------+
|timestamp          |activity|activityIntensity|activityBoutNumber|sedentaryBoutNumber|
+-------------------+--------+-----------------+------------------+-------------------+
|2016-08-15 21:35:00|325     |2                |0                 |0                  |
|2016-08-15 21:35:05|181     |2                |0                 |0                  |
|2016-08-15 21:35:10|4       |0                |0                 |1                  |
|2016-08-15 21:35:15|246     |2                |0                 |1                  |
|2016-08-15 21:35:20|131     |1                |0                 |1                  |
|2016-08-15 21:35:25|0       |0                |0                 |1                  |
|2016-08-15 21:35:30|0       |0                |0                 |1                  |
|2016-08-15 21:35:35|0       |0                |0                 |1                  |
|2016-08-15 21:35:40|0       |0 

In [None]:
# acc_data_act.toPandas().to_csv('A01_acc_out_' + str(INTERVAL) + '.csv')

## Merge dataframes

In [None]:
# Left join on timestamp: every GPS row is kept, accelerometer columns are
# attached where a matching timestamp exists
gps_with_acc = gps_data3.join(acc_data_act, on='timestamp', how='left')
merge_data = gps_with_acc.orderBy('timestamp').cache()
merge_data.count()

In [None]:
merge_data.printSchema()

In [None]:
merge_data.show(20, False)

In [None]:
# Left join on timestamp in the other direction: every accelerometer row is
# kept, GPS columns are attached where a matching timestamp exists
acc_with_gps = acc_data_act.join(gps_data3, on='timestamp', how='left')
merge_data2 = acc_with_gps.orderBy('timestamp').cache()
merge_data2.count()

In [None]:
merge_data2.printSchema()

In [None]:
merge_data2.show(2000, False)

In [None]:
# merge_data2.toPandas().to_csv('A01_merged_out_' + str(INTERVAL) + '.csv')