In [2]:
import time

from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from src.GPSProcessing import *
from src.AccProcessing import *

In [None]:
# REFERENCE: https://spark.apache.org/docs/latest/configuration.html

# Spark settings for this notebook. Every value is passed as a string, the
# form used in the Spark configuration reference (booleans as 'true'/'false'),
# instead of mixing strings with Python booleans.
conf = SparkConf().setAll([
    ('spark.memory.fraction', '0.6'),
    ('spark.executor.memory', '16g'),
    ('spark.driver.memory', '16g'),
    ('spark.sql.shuffle.partitions', '20'),
    ('spark.memory.offHeap.enabled', 'true'),
    ('spark.memory.offHeap.size', '16g'),
    ('spark.cleaner.referenceTracking.cleanCheckpoints', 'true'),
    ('spark.driver.host', '127.0.0.1'),
    ('spark.scheduler.listenerbus.eventqueue.capacity', '50000'),
])

# Local-mode session; getOrCreate() reuses an already running session if any.
spark = SparkSession.builder.config(conf=conf).master("local[*]").appName("test").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")
# Directory that receives the data written by the checkpoint() calls below.
sc.setCheckpointDir('checkpoints')
# Show the effective configuration as the cell output.
sc.getConf().getAll()

In [None]:
# Inspect the default parallelism of the SparkContext.
sc.defaultParallelism

In [None]:
# Inspect the default minimum number of partitions of the SparkContext.
sc.defaultMinPartitions

In [None]:
# Display the SparkContext summary.
sc

## GPS data processing

In [7]:
# Raw GPS input file (relative path).
gps_path_raw = '../tests/raw-data/gps/1.csv'

In [8]:
# Load the raw GPS CSV: the first line is a header and column types are inferred.
gps_data_raw = spark.read.csv(gps_path_raw, header=True, inferSchema=True)

In [None]:
# Preview the raw GPS rows.
gps_data_raw.show()

In [10]:
# Date/time pattern handed to gen_gps_dataframe
date_format, time_format = 'yyyy/MM/dd', 'HH:mm:ss'
datetime_format = ' '.join((date_format, time_format))

# Column names shared by the GPS processing steps below
ts_name, dist_name, speed_name = 'timestamp', 'distance', 'speed'
height_name, fix_type_name = 'height', 'fixTypeCode'

# Build the working GPS DataFrame from the raw CSV rows
gps_data = gen_gps_dataframe(gps_data_raw, ts_name, datetime_format)

In [None]:
# Schema of the GPS DataFrame produced by gen_gps_dataframe.
gps_data.printSchema()

In [None]:
# Number of GPS rows before any filtering.
gps_data.count()

In [None]:
# Preview the GPS rows before any filtering.
gps_data.show()

In [None]:
# Filter the GPS timestamps with select_gps_intervals at the chosen interval
INTERVAL = 5  # seconds (see the progress message)

banner = "====> filter GPS data every {} seconds..."
print(banner.format(INTERVAL))
start_time = time.time()
gps_data = select_gps_intervals(gps_data, ts_name, INTERVAL)
elapsed_time = time.time() - start_time
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

In [None]:
# Set fix type

ws = 600  # also passed to fill_timestamp further down

print("====> set fix type...")
start_time = time.time()
gps_data = set_fix_type(gps_data, ts_name, fix_type_name, ws).cache()
# checkpoint() returns a NEW, checkpointed DataFrame and leaves the receiver
# unchanged, so the result has to be re-bound; otherwise the checkpoint is
# written but gps_data keeps its full lineage.
gps_data = gps_data.checkpoint()
# Action included in the timing.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Preview the GPS rows after the fix type has been set.
gps_data.show()

In [None]:
# Apply filter on the velocity

vmax = 130  # km/h; also passed to detect_trips further down

print("====> apply velocity filter...")
start_time = time.time()
gps_data = filter_speed(gps_data, speed_name, fix_type_name, vmax).cache()
# checkpoint() returns a new DataFrame; re-bind it so the truncated lineage
# is the one used by the following steps.
gps_data = gps_data.checkpoint()
# Action included in the timing.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Apply filter over max acceleration

print("====> apply acceleration filter...")
start_time = time.time()
gps_data = filter_acceleration(gps_data, speed_name, ts_name, fix_type_name).cache()
# checkpoint() returns a new DataFrame; re-bind it so the truncated lineage
# is the one used by the following steps.
gps_data = gps_data.checkpoint()
# Action included in the timing.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Apply filter on the height variation

dhmax = 1000  # threshold passed to filter_height

print("====> apply height variation filter...")
start_time = time.time()
gps_data = filter_height(gps_data, height_name, ts_name, fix_type_name, dhmax).cache()
# checkpoint() returns a new DataFrame; re-bind it so the truncated lineage
# is the one used by the following steps.
gps_data = gps_data.checkpoint()
# Action included in the timing.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Number of GPS rows after the velocity, acceleration and height filters.
gps_data.count()

In [None]:
# Apply filter over three fixes (it also recalculates distance column)

dmin = 10  # threshold passed to filter_change_dist_3_fixes

print("====> apply three fixes filter...")
start_time = time.time()
# The distance column is addressed through dist_name (defined with the other
# column names); the former duplicate `dcol` variable was never used.
gps_data = filter_change_dist_3_fixes(gps_data, dist_name, ts_name, fix_type_name, dmin).cache()
# checkpoint() returns a new DataFrame; re-bind it so the truncated lineage
# is the one used by the following steps.
gps_data = gps_data.checkpoint()
# Action included in the timing.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Number of GPS rows after the three-fixes filter.
gps_data.count()

In [None]:
# Schema after the filtering steps.
gps_data.printSchema()

In [None]:
# Round seconds in timestamps according to the interval
interval = INTERVAL  # seconds; reused later by fill_timestamp
ts_name = 'timestamp'

print("====> align timestamps...")
start_time = time.time()
gps_data = round_timestamp(gps_data, ts_name, interval).cache()
# checkpoint() returns a new DataFrame; re-bind it so the truncated lineage
# is the one used by the following steps.
gps_data = gps_data.checkpoint()
# Action included in the timing.
gps_data.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Schema after the timestamps have been rounded.
gps_data.printSchema()

In [None]:
# Preview the GPS rows after the timestamps have been rounded.
gps_data.show()

In [27]:
# Keep at most 1000 GPS rows: trip detection, classification, gap filling and
# the final merges all run on this subset.
# NOTE(review): looks like a development-time cap — confirm before a full run.
gps_data = gps_data.limit(1000)

In [28]:
# Parameters passed to detect_trips.
min_dist_per_min = 25 # meters
min_pause_duration = 120 # seconds
max_pause_time = 180 # seconds

In [29]:
# %lprun -f detect_trips detect_trips(gps_data, ts_name, dist_name, speed_name, fix_type_name, min_dist_per_min, min_pause_duration, max_pause_time, vmax)

In [None]:
# Run trip detection on the GPS data and time it
print("====> detect trips...")
start_time = time.time()
# Positional arguments, in the order detect_trips receives them after the DataFrame
trip_args = (
    ts_name, dist_name, speed_name, fix_type_name,
    min_dist_per_min, min_pause_duration, max_pause_time, vmax,
)
gps_data2 = detect_trips(gps_data, *trip_args).cache()
# Action included in the timing
gps_data2.count()
elapsed_time = time.time() - start_time
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

In [None]:
# Schema after trip detection.
gps_data2.printSchema()

In [None]:
# Print up to 10000 rows of the trip-detection result.
gps_data2.show(10000)

In [None]:
# Parameters passed to classify_trips
vehicle_speed_cutoff = 35  # km/h
bicycle_speed_cutoff = 10  # km/h
walk_speed_cutoff = 1  # km/h
speed_percentile = 90
speed_segment_length = 30  # m
min_trip_length = 100  # m
min_trip_duration = 120  # sec

# Column names used by the classifier
ts_name, dist_name, speed_name = 'timestamp', 'distance', 'speed'

print("====> classify trips...")
start_time = time.time()
# Positional arguments, in the order classify_trips receives them after the DataFrame
classify_args = (
    ts_name, dist_name, speed_name, fix_type_name,
    vehicle_speed_cutoff, bicycle_speed_cutoff, walk_speed_cutoff,
    min_trip_length, min_trip_duration, speed_segment_length, speed_percentile,
)
gps_data3 = classify_trips(gps_data2, *classify_args).cache()
# Action included in the timing
gps_data3.count()
elapsed_time = time.time() - start_time
elapsed_hms = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
print("      time elapsed: {}".format(elapsed_hms))

In [None]:
# Schema after trip classification.
gps_data3.printSchema()

In [None]:
# Print up to 10000 rows of the classified trips.
gps_data3.show(10000)

In [None]:
# Generate missing values up to maximum signal loss

print("====> fill in missing value...")
start_time = time.time()
# `interval` and `ws` are the values set in the timestamp-rounding and
# fix-type cells above.
gps_data3 = fill_timestamp(gps_data3, ts_name, fix_type_name, interval, ws).cache()
# checkpoint() returns a new DataFrame; re-bind it so the truncated lineage
# is the one used by the merge cells.
gps_data3 = gps_data3.checkpoint()
# Action included in the timing.
gps_data3.count()
elapsed_time = time.time() - start_time
print("      time elapsed: {}".format(time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))

In [None]:
# Print up to 10000 rows after the missing timestamps have been filled in.
gps_data3.show(10000)

In [None]:
# gps_data3.toPandas().to_csv('1_gpu_out_' + str(INTERVAL) + '.csv')

## Accelerometer data processing

In [31]:
# Raw accelerometer input file (relative path).
acc_path_raw = '../tests/raw-data/acc/1.csv'

In [None]:
# Read the accelerometer file as plain text lines; it is parsed later by
# gen_acc_dataframe.
acc_data_raw = spark.read.text(acc_path_raw)
acc_data_raw.cache()
# checkpoint() returns a new DataFrame and does not modify the receiver, so
# the result must be re-bound for the checkpoint to be used.
acc_data_raw = acc_data_raw.checkpoint()

In [None]:
# Number of text lines read from the accelerometer file.
acc_data_raw.count()

In [34]:
ts_name = 'timestamp'

# gen_acc_dataframe returns a pair. Note that `interval` is re-bound here, so
# it no longer holds the GPS alignment interval assigned earlier in the notebook.
interval, acc_data = gen_acc_dataframe(acc_data_raw, ts_name)

In [None]:
# Schema of the accelerometer DataFrame produced by gen_acc_dataframe.
acc_data.printSchema()

In [None]:
# Column names handed to split_acc_data
acc_columns = [
    'axis1', 'axis2', 'axis3',
    'steps', 'lux',
    'incl_off', 'incl_standing', 'incl_sitting', 'incl_lying',
]
# cache() returns the same DataFrame, so it can be chained onto the call
acc_data_ext = split_acc_data(acc_data, acc_columns).cache()
acc_data_ext.show(n=20)

In [None]:
# Window settings for the accelerometer data.
# NOTE(review): INTERVAL is assigned again here; keep it equal to the value
# used in the GPS section, since both tables are later joined on 'timestamp'.
ts_name = 'timestamp'
INTERVAL = 5  # seconds
window = INTERVAL

# `interval` is the value returned by gen_acc_dataframe
acc_data_act = select_acc_intervals(acc_data_ext, ts_name, interval, window, False, True)
acc_data_act.show(n=20, truncate=False)

In [None]:
# Cut-off values passed to activity_count.
LightCO, ModerateCO, HardCO, VeryHardCO = (100, 1953, 5725, 9498)
window = INTERVAL
acc_data_act = activity_count(acc_data_act, 'timestamp', window, LightCO, ModerateCO, HardCO, VeryHardCO, False)
acc_data_act.cache()
# checkpoint() returns a new DataFrame and does not modify the receiver, so
# the result must be re-bound for the lineage truncation to take effect.
acc_data_act = acc_data_act.checkpoint()

In [None]:
# Schema after the activity count step.
acc_data_act.printSchema()

In [None]:
# Preview the rows after the activity count step.
acc_data_act.show()

In [None]:
# DETERMINE NON-WEAR PERIOD
ts_name = 'timestamp'
AC_name = 'activity'
AI_name = 'activityIntensity'
window = INTERVAL
minutes_zeros_row = 90

# non_wear_filter takes no output-column argument, so the previously assigned
# (and unused) `new_col` variable is not set in this cell.
acc_data_act = non_wear_filter(acc_data_act, ts_name, AC_name, AI_name, window, minutes_zeros_row)
acc_data_act.cache()
# checkpoint() returns a new DataFrame and does not modify the receiver, so
# the result must be re-bound for the lineage truncation to take effect.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

In [None]:
# DETERMINE ACTIVITY BOUT NUMBER
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'activityBoutNumber'
window = INTERVAL
# Bounds, duration and tolerance passed to activity_bout_filter.
UP = 9999
LOW = 1953
DURATION = 10
TOL = 2

acc_data_act = activity_bout_filter(acc_data_act, ts_name, AC_name, new_col, window, UP, LOW, DURATION, TOL)
acc_data_act.cache()
# checkpoint() returns a new DataFrame and does not modify the receiver, so
# the result must be re-bound for the lineage truncation to take effect.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

In [None]:
# DETERMINE SEDENTARY BOUT NUMBER
ts_name = 'timestamp'
AC_name = 'activity'
new_col = 'sedentaryBoutNumber'
window = INTERVAL
# Bounds, duration and tolerance passed to sedentary_bout_filter.
UP = 180
LOW = 0
DURATION = 30
TOL = 1

acc_data_act = sedentary_bout_filter(acc_data_act, ts_name, AC_name, new_col, window, UP, LOW, DURATION, TOL)
acc_data_act.cache()
# checkpoint() returns a new DataFrame and does not modify the receiver, so
# the result must be re-bound for the lineage truncation to take effect.
acc_data_act = acc_data_act.checkpoint()
acc_data_act.count()

In [None]:
# Preview 20 rows of the processed accelerometer data without truncating values.
acc_data_act.show(20,False)

In [None]:
# acc_data_act.toPandas().to_csv('1_acc_out_' + str(INTERVAL) + '.csv')

## Merge dataframes

In [None]:
# Left join: keep every GPS row and attach the accelerometer columns that
# share the same timestamp; result sorted by timestamp.
merge_data = (
    gps_data3
    .join(acc_data_act, on='timestamp', how='left')
    .orderBy('timestamp')
    .cache()  # cache() returns the same DataFrame
)
merge_data.count()

In [None]:
# Schema of the GPS-driven merge.
merge_data.printSchema()

In [None]:
# Preview 20 rows of the GPS-driven merge without truncating values.
merge_data.show(20, False)

In [None]:
# Left join in the other direction: keep every accelerometer row and attach
# the GPS columns that share the same timestamp; result sorted by timestamp.
merge_data2 = (
    acc_data_act
    .join(gps_data3, on='timestamp', how='left')
    .orderBy('timestamp')
    .cache()  # cache() returns the same DataFrame
)
merge_data2.count()

In [None]:
# Schema of the accelerometer-driven merge.
merge_data2.printSchema()

In [None]:
# Print up to 2000 rows of the accelerometer-driven merge without truncating values.
merge_data2.show(2000, False)

In [None]:
# merge_data2.toPandas().to_csv('1_merged_out_' + str(INTERVAL) + '.csv')