In [12]:
# Configuration properties of Apache Spark
#sc.stop()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, from_unixtime

# Settings for the Spark session: the application name and the master URL.
APP_NAME = 'pyspark_python'
MASTER = 'local[*]'

# Assemble the configuration in a single chained expression (each setter
# returns the same SparkConf), then reuse an existing session or start one.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Handle on the underlying SparkContext, used by later cells.
sc = spark.sparkContext

In [13]:
# Register the helper module file with the SparkContext so that it is shipped
# along with the Spark job (path is relative to the notebook's working directory).
sc.addFile("utils/complete_missing_dates.py")

In [14]:
%load_ext autoreload
%autoreload 2
# load my own functions
from utils.complete_missing_dates import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import pyspark.sql.functions as psf
from pyspark.sql import Window
from datetime import datetime, timedelta
from itertools import product

### Create the data for the example

In [16]:
# Example measurements, one tuple per row: (person, timestamp, weight).
data = (
    (1, datetime(year=2019, month=12, day=2, hour=14, minute=54, second=17), 49.94),
    (1, datetime(year=2019, month=12, day=3, hour=8, minute=58, second=39), 50.49),
    (1, datetime(year=2019, month=12, day=6, hour=10, minute=44, second=1), 50.24),
    (2, datetime(year=2019, month=12, day=2, hour=8, minute=58, second=39), 62.32),
    (2, datetime(year=2019, month=12, day=4, hour=10, minute=44, second=1), 65.64),
)

# Turn the rows into a Spark DataFrame with named columns.
df = spark.createDataFrame(data, schema=("person", "timestamp", "weight"))

In [17]:
# Display the example DataFrame that was just created.
df.show()

+------+-------------------+------+
|person|          timestamp|weight|
+------+-------------------+------+
|     1|2019-12-02 14:54:17| 49.94|
|     1|2019-12-03 08:58:39| 50.49|
|     1|2019-12-06 10:44:01| 50.24|
|     2|2019-12-02 08:58:39| 62.32|
|     2|2019-12-04 10:44:01| 65.64|
+------+-------------------+------+



### Example of application

In [18]:
# Column holding the measurement time, and column identifying each series.
time_col     = "timestamp"
# NOTE: "referece_col" is a misspelling of "reference_col"; the name is kept
# because a later cell reads it.
referece_col = "person"

# Replace the time column with the output of daily_date_YYmmdd. Use time_col
# instead of repeating the literal column name, so changing the setting above
# is enough to point this cell at a different column.
df = df.withColumn(time_col, daily_date_YYmmdd(time_col))
df.show()

+------+-------------------+------+
|person|          timestamp|weight|
+------+-------------------+------+
|     1|2019-12-02 00:00:00| 49.94|
|     1|2019-12-03 00:00:00| 50.49|
|     1|2019-12-06 00:00:00| 50.24|
|     2|2019-12-02 00:00:00| 62.32|
|     2|2019-12-04 00:00:00| 65.64|
+------+-------------------+------+



In [19]:
# Run the helper that completes the missing days, passing the DataFrame, the
# time column name, the reference (person) column name and the Spark session.
# Judging by the output shown further down, the result has one row per person
# and per day over the overall date range, with a null weight on days that had
# no measurement -- verify against the helper's implementation.
hh = complete_missing_days(df, time_col, referece_col, spark)

In [20]:
# Show the completed DataFrame ordered by person and then by timestamp, so each
# person's days read chronologically.
hh.sort('person','timestamp').show()

+-------------------+------+------+
|          timestamp|person|weight|
+-------------------+------+------+
|2019-12-02 00:00:00|     1| 49.94|
|2019-12-03 00:00:00|     1| 50.49|
|2019-12-04 00:00:00|     1|  null|
|2019-12-05 00:00:00|     1|  null|
|2019-12-06 00:00:00|     1| 50.24|
|2019-12-02 00:00:00|     2| 62.32|
|2019-12-03 00:00:00|     2|  null|
|2019-12-04 00:00:00|     2| 65.64|
|2019-12-05 00:00:00|     2|  null|
|2019-12-06 00:00:00|     2|  null|
+-------------------+------+------+



In [21]:
## Count the number of nulls in each column over the whole DataFrame
## (the output shown reports the person and weight columns)
count_nulls_by_column(hh).show()

+------+------+
|person|weight|
+------+------+
|     0|     5|
+------+------+



In [22]:
## Count the number of nulls per column, grouped by person
## NOTE(review): the output shows two "person" columns -- presumably the
## grouping key followed by the null count of the person column itself; confirm
## in the helper.
count_nulls_by_column(hh, 'person').show()

+------+------+------+
|person|person|weight|
+------+------+------+
|     1|     0|     2|
|     2|     0|     3|
+------+------+------+

