In [2]:
# Configuration properties of Apache Spark
#sc.stop()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, from_unixtime

# Application name and master URL handed to Spark.
APP_NAME = 'pyspark_python'
MASTER = 'local[*]'

# Build the configuration in a single chained expression, then obtain the
# session (getOrCreate() returns the already-active session if there is one)
# and keep a handle on its underlying SparkContext.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [3]:
%load_ext autoreload
%autoreload 2
# load my own functions
from utils.utilities import *

In [4]:
import pyspark.sql.functions as psf
from pyspark.sql import Window
from datetime import datetime, timedelta
from itertools import product

In [5]:
# Sample measurements, one tuple per row: (person, timestamp, weight).
data = (
    (1, datetime(2019, 12, 2, 14, 54, 17), 49.94),
    (1, datetime(2019, 12, 3, 8, 58, 39), 50.49),
    (1, datetime(2019, 12, 6, 10, 44, 1), 50.24),
    (2, datetime(2019, 12, 2, 8, 58, 39), 62.32),
    (2, datetime(2019, 12, 4, 10, 44, 1), 65.64),
)
df = spark.createDataFrame(data, schema=("person", "timestamp", "weight"))

# Earliest and latest timestamp over the whole DataFrame, fetched as one Row,
# then reduced to calendar dates.
min_max_timestamps = df.agg(psf.min("timestamp"), psf.max("timestamp")).head()
first_date = min_max_timestamps[0].date()
last_date = min_max_timestamps[1].date()

# Every calendar day from first_date to last_date, both ends included.
all_days_in_range = [
    first_date + timedelta(days=offset)
    for offset in range((last_date - first_date).days + 1)
]

# Distinct person ids, collected to the driver.
people = [row["person"] for row in df.select("person").distinct().collect()]

# Cartesian product person x day: one row per person for each day in the range.
dates_by_person = spark.createDataFrame(
    list(product(people, all_days_in_range)),
    schema=("person", "date"),
)

In [6]:
# Print the raw measurements (columns: person, timestamp, weight).
df.show()

+------+-------------------+------+
|person|          timestamp|weight|
+------+-------------------+------+
|     1|2019-12-02 14:54:17| 49.94|
|     1|2019-12-03 08:58:39| 50.49|
|     1|2019-12-06 10:44:01| 50.24|
|     2|2019-12-02 08:58:39| 62.32|
|     2|2019-12-04 10:44:01| 65.64|
+------+-------------------+------+



In [7]:
# Add a row for every (person, date) pair of dates_by_person that has no
# measurement in df. Judging by the tables displayed in this notebook, the
# added rows carry a midnight timestamp and repeat the person's most recent
# earlier weight.
# NOTE(review): complete_missing_days is defined outside this notebook
# (presumably in utils/utilities.py) -- confirm the exact fill rule there.
hh = complete_missing_days(df, dates_by_person)

In [8]:
# Print the result returned by complete_missing_days.
hh.show()

+------+-------------------+------+
|person|          timestamp|weight|
+------+-------------------+------+
|     1|2019-12-02 14:54:17| 49.94|
|     1|2019-12-03 08:58:39| 50.49|
|     1|2019-12-04 00:00:00| 50.49|
|     1|2019-12-05 00:00:00| 50.49|
|     1|2019-12-06 10:44:01| 50.24|
|     2|2019-12-02 08:58:39| 62.32|
|     2|2019-12-03 00:00:00| 62.32|
|     2|2019-12-04 10:44:01| 65.64|
|     2|2019-12-05 00:00:00| 65.64|
|     2|2019-12-06 00:00:00| 65.64|
+------+-------------------+------+

