In [1]:
# Configuration properties of Apache Spark
#sc.stop()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, from_unixtime

# Application name and master URL used for the Spark session.
APP_NAME = 'pyspark_python'
MASTER = 'local[*]'

# Build the configuration in a single chained expression, then create (or
# reuse) the session and expose its SparkContext as `sc`.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [2]:
%load_ext autoreload
%autoreload 2
# Load the project's own helper functions. The star imports place every public
# name of both modules into the notebook namespace.
# NOTE(review): later cells call complete_missing_months, list_intermediate_months,
# create_partitions_from_df and montly_partition_YYmmdd without defining them, so
# they presumably come from these two modules; importing them by name would make
# the origin of each helper explicit.
from utils.complete_missing_partitions import *
from utils.partitions import *

In [3]:
import pyspark.sql.functions as psf
from pyspark.sql import Window
from datetime import datetime, timedelta
from itertools import product

### Create the data for the example

We use monthly partitions.

In [4]:
# Sample measurements, one tuple per row: (person, timestamp, weight).
data = (
    (1, datetime(2019, 12, 2, 14, 54, 17), 49.94),
    (1, datetime(2019, 11, 3, 8, 58, 39), 50.49),
    (1, datetime(2019, 8, 6, 10, 44, 1), 50.24),
    (2, datetime(2019, 8, 2, 8, 58, 39), 62.32),
    (2, datetime(2019, 5, 4, 10, 44, 1), 65.64),
)
df = spark.createDataFrame(
    data,
    schema=("person", "timestamp", "weight"),
)

In [5]:
# Display the sample rows exactly as they were created.
df.show()

+------+-------------------+------+
|person|          timestamp|weight|
+------+-------------------+------+
|     1|2019-12-02 14:54:17| 49.94|
|     1|2019-11-03 08:58:39| 50.49|
|     1|2019-08-06 10:44:01| 50.24|
|     2|2019-08-02 08:58:39| 62.32|
|     2|2019-05-04 10:44:01| 65.64|
+------+-------------------+------+



In [6]:
# Print the column types of the sample DataFrame (only column names were given at creation).
df.printSchema()

root
 |-- person: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- weight: double (nullable = true)



### Example of application

In [7]:
# Column identifying the entity, and column carrying the time information.
referece_col, time_col = 'person', 'timestamp'

In [8]:
# Add a row for every month that is missing per person. In the output shown
# further down, both persons span the same months (2019-05 to 2019-12), the
# timestamps sit on the first day of the month, and the added rows have a
# null weight.
df = complete_missing_months(df, time_col, referece_col, spark)

In [9]:
# Order the rows by person, then chronologically.
df = df.orderBy('person', 'timestamp')

In [10]:
# Derive a `partition_id` column from the timestamp. In the output below the
# values look like the last day of the month written as yyyyMMdd (e.g. 20190531).
df = df.withColumn('partition_id', create_partitions_from_df('timestamp'))

In [11]:
# Display the completed frame ordered by person and partition.
df.orderBy('person', 'partition_id').show()

+-------------------+------+------+------------+
|          timestamp|person|weight|partition_id|
+-------------------+------+------+------------+
|2019-05-01 00:00:00|     1|  null|    20190531|
|2019-06-01 00:00:00|     1|  null|    20190630|
|2019-07-01 00:00:00|     1|  null|    20190731|
|2019-08-01 00:00:00|     1| 50.24|    20190831|
|2019-09-01 00:00:00|     1|  null|    20190930|
|2019-10-01 00:00:00|     1|  null|    20191031|
|2019-11-01 00:00:00|     1| 50.49|    20191130|
|2019-12-01 00:00:00|     1| 49.94|    20191231|
|2019-05-01 00:00:00|     2| 65.64|    20190531|
|2019-06-01 00:00:00|     2|  null|    20190630|
|2019-07-01 00:00:00|     2|  null|    20190731|
|2019-08-01 00:00:00|     2| 62.32|    20190831|
|2019-09-01 00:00:00|     2|  null|    20190930|
|2019-10-01 00:00:00|     2|  null|    20191031|
|2019-11-01 00:00:00|     2|  null|    20191130|
|2019-12-01 00:00:00|     2|  null|    20191231|
+-------------------+------+------+------------+



## With partitions

In [12]:
# Keep only person, weight and partition_id, so that partition_id is the sole time column.
df_part = df.drop('timestamp')

In [13]:
# Display the partition-keyed frame.
df_part.show()

+------+------+------------+
|person|weight|partition_id|
+------+------+------------+
|     1|  null|    20190531|
|     1|  null|    20190630|
|     1|  null|    20190731|
|     1| 50.24|    20190831|
|     1|  null|    20190930|
|     1|  null|    20191031|
|     1| 50.49|    20191130|
|     1| 49.94|    20191231|
|     2| 65.64|    20190531|
|     2|  null|    20190630|
|     2|  null|    20190731|
|     2| 62.32|    20190831|
|     2|  null|    20190930|
|     2|  null|    20191031|
|     2|  null|    20191130|
|     2|  null|    20191231|
+------+------+------------+



In [14]:
# Same entity column as before; the time information now lives in partition_id.
referece_col, time_col = 'person', 'partition_id'

In [15]:
# Overwrite the partition key with the result of montly_partition_YYmmdd;
# the schema printed next shows that the column becomes a timestamp.
df_part = df_part.withColumn(time_col, montly_partition_YYmmdd(time_col))

In [16]:
# Check the column types after converting the partition key.
df_part.printSchema()

root
 |-- person: long (nullable = true)
 |-- weight: double (nullable = true)
 |-- partition_id: timestamp (nullable = true)



In [17]:
# List the months covered by the data. The result below is a list of
# datetime.date values, one month-end per month from 2019-05 to 2019-12.
list_intermediate_months(df_part, time_col) 

[datetime.date(2019, 5, 31),
 datetime.date(2019, 6, 30),
 datetime.date(2019, 7, 31),
 datetime.date(2019, 8, 31),
 datetime.date(2019, 9, 30),
 datetime.date(2019, 10, 31),
 datetime.date(2019, 11, 30),
 datetime.date(2019, 12, 31)]

In [18]:
# Complete the missing months again, this time keyed on partition_id. In the
# output below, partition_id holds first-of-month timestamps and months with
# no measurement have a null weight.
df_part = complete_missing_months(df_part, time_col, referece_col, spark)

In [19]:
# Display the completed partition-keyed frame ordered by person and partition.
df_part.orderBy('person', 'partition_id').show()

+-------------------+------+------+
|       partition_id|person|weight|
+-------------------+------+------+
|2019-05-01 00:00:00|     1|  null|
|2019-06-01 00:00:00|     1|  null|
|2019-07-01 00:00:00|     1|  null|
|2019-08-01 00:00:00|     1| 50.24|
|2019-09-01 00:00:00|     1|  null|
|2019-10-01 00:00:00|     1|  null|
|2019-11-01 00:00:00|     1| 50.49|
|2019-12-01 00:00:00|     1| 49.94|
|2019-05-01 00:00:00|     2| 65.64|
|2019-06-01 00:00:00|     2|  null|
|2019-07-01 00:00:00|     2|  null|
|2019-08-01 00:00:00|     2| 62.32|
|2019-09-01 00:00:00|     2|  null|
|2019-10-01 00:00:00|     2|  null|
|2019-11-01 00:00:00|     2|  null|
|2019-12-01 00:00:00|     2|  null|
+-------------------+------+------+



## Other examples

Dates stored as integers (`yyyyMMdd`)

In [28]:
# Sample measurements, one tuple per row: (person, timestamp, weight).
# Here the timestamp is an integer in yyyyMMdd form.
data = (
    (1, 20191231, 49.94),
    (1, 20191130, 50.49),
    (1, 20191031, 50.24),
    (1, 20190531, 55.24),
    (2, 20190831, 62.32),
    (2, 20190131, 65.64),
)
df = spark.createDataFrame(
    data,
    schema=("person", "timestamp", "weight"),
)

In [29]:
# Convert the integer yyyyMMdd keys into 'yyyy-MM-dd' date strings:
# integer -> string -> unix timestamp -> timestamp -> date -> formatted string.
# Uses `psf`, the alias under which this notebook imports pyspark.sql.functions;
# the previous `sf` alias is never imported here and only resolved if it
# happened to leak in through a star import.
df = df.withColumn(
    "timestamp",
    psf.date_format(
        psf.to_date(
            psf.unix_timestamp(df["timestamp"].cast("string"), "yyyyMMdd").cast("timestamp")
        ),
        "yyyy-MM-dd",
    ),
)


In [30]:
# Display the rows after the integer keys were turned into date strings.
df.show()

+------+----------+------+
|person| timestamp|weight|
+------+----------+------+
|     1|2019-12-31| 49.94|
|     1|2019-11-30| 50.49|
|     1|2019-10-31| 50.24|
|     1|2019-05-31| 55.24|
|     2|2019-08-31| 62.32|
|     2|2019-01-31| 65.64|
+------+----------+------+



In [31]:
# Entity column and time column for the integer-date example.
referece_col, time_col = 'person', 'timestamp'

In [32]:
# Complete the missing months for the integer-date example. In the output
# below the timestamps are first-of-month values starting at 2019-01 for both
# persons, and the added rows have a null weight.
df = complete_missing_months(df, time_col, referece_col, spark)

In [35]:
# Display the completed frame ordered by person, then chronologically.
df.orderBy('person', 'timestamp').show()

+-------------------+------+------+
|          timestamp|person|weight|
+-------------------+------+------+
|2019-01-01 00:00:00|     1|  null|
|2019-02-01 00:00:00|     1|  null|
|2019-03-01 00:00:00|     1|  null|
|2019-04-01 00:00:00|     1|  null|
|2019-05-01 00:00:00|     1| 55.24|
|2019-06-01 00:00:00|     1|  null|
|2019-07-01 00:00:00|     1|  null|
|2019-08-01 00:00:00|     1|  null|
|2019-09-01 00:00:00|     1|  null|
|2019-10-01 00:00:00|     1| 50.24|
|2019-11-01 00:00:00|     1| 50.49|
|2019-12-01 00:00:00|     1| 49.94|
|2019-01-01 00:00:00|     2| 65.64|
|2019-02-01 00:00:00|     2|  null|
|2019-03-01 00:00:00|     2|  null|
|2019-04-01 00:00:00|     2|  null|
|2019-05-01 00:00:00|     2|  null|
|2019-06-01 00:00:00|     2|  null|
|2019-07-01 00:00:00|     2|  null|
|2019-08-01 00:00:00|     2| 62.32|
+-------------------+------+------+
only showing top 20 rows

