#### Partition functions

In [1]:
# Configuration properties of Apache Spark
#sc.stop()
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import unix_timestamp, from_unixtime

# Spark session settings: the application name and the master URL
# ('local[*]' runs Spark locally).
APP_NAME = 'pyspark_python'
MASTER = 'local[*]'

# Build the configuration in a single chain, then reuse (or create) the
# session and expose its SparkContext as `sc`.
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

In [2]:
%load_ext autoreload
%autoreload 2
# load my own functions
from utils.partitions import *

In [3]:
import pyspark.sql.functions as psf
from pyspark.sql import Window
from datetime import datetime, timedelta
from itertools import product

In [4]:
# Sample rows used to rebuild the demo DataFrame: (person, timestamp, weight).
schema_cols = ("person", "timestamp", "weight")
data = (
    (1, datetime(2019, 12, 2, 14, 54, 17), 49.94),
    (1, datetime(2019, 12, 3, 8, 58, 39), 50.49),
    (1, datetime(2019, 12, 6, 10, 44, 1), 50.24),
    (2, datetime(2019, 12, 2, 8, 58, 39), 62.32),
    (2, datetime(2019, 12, 4, 10, 44, 1), 65.64),
)
df = spark.createDataFrame(data, schema=schema_cols)
df.show()

+------+-------------------+------+
|person|          timestamp|weight|
+------+-------------------+------+
|     1|2019-12-02 14:54:17| 49.94|
|     1|2019-12-03 08:58:39| 50.49|
|     1|2019-12-06 10:44:01| 50.24|
|     2|2019-12-02 08:58:39| 62.32|
|     2|2019-12-04 10:44:01| 65.64|
+------+-------------------+------+



In [5]:
# Derive the partition column from 'timestamp' with the project helper, then
# attach it as 'partition_id'. In the saved output every December 2019 row
# gets 20191231.
partition_col = create_partitions_from_df('timestamp')
df = df.withColumn('partition_id', partition_col)
df.show()

+------+-------------------+------+------------+
|person|          timestamp|weight|partition_id|
+------+-------------------+------+------------+
|     1|2019-12-02 14:54:17| 49.94|    20191231|
|     1|2019-12-03 08:58:39| 50.49|    20191231|
|     1|2019-12-06 10:44:01| 50.24|    20191231|
|     2|2019-12-02 08:58:39| 62.32|    20191231|
|     2|2019-12-04 10:44:01| 65.64|    20191231|
+------+-------------------+------+------------+



Subtract N partitions

In [6]:
# Number of months to step back from the reference partition.
data_depth_months = 11
# Per the saved output, going back 11 months from '20191231' gives '20190131'.
sustract_month_partition('20191231', data_depth_months)

'20190131'

First day of the partition

In [7]:
# Per the saved output, '20191231' maps to '20191201' (day reset to the 1st).
init_day_partition('20191231')

'20191201'

Combination of the above

In [8]:
# Move to the first day of the partition, then step back `data_depth_months`
# months; the saved output for '20191231' is '20190101'.
sustract_month_partition(init_day_partition('20191231'), data_depth_months)

'20190101'

In [9]:
# Per the saved output, stepping back 5 days from '20191231' gives '20191226'.
sustract_days_partition('20191231', data_depth_days = 5)

'20191226'

## Duplicates

In [16]:
# Sample rows for the duplicates section: (person, timestamp, weight), with
# midnight timestamps and several repeated weight values.
schema_cols = ("person", "timestamp", "weight")
data = (
    (1, datetime(2019, 12, 2, 0, 0, 0), 49.94),
    (1, datetime(2019, 12, 3, 0, 0, 0), 62.32),
    (1, datetime(2019, 12, 6, 0, 0, 0), 62.32),
    (2, datetime(2019, 12, 3, 0, 0, 0), 62.32),
    (2, datetime(2019, 12, 2, 0, 0, 0), 62.32),
)
df = spark.createDataFrame(data, schema=schema_cols)
df.show()

+------+-------------------+------+
|person|          timestamp|weight|
+------+-------------------+------+
|     1|2019-12-02 00:00:00| 49.94|
|     1|2019-12-03 00:00:00| 62.32|
|     1|2019-12-06 00:00:00| 62.32|
|     2|2019-12-03 00:00:00| 62.32|
|     2|2019-12-02 00:00:00| 62.32|
+------+-------------------+------+



In [9]:
# Show up to 20 rows ordered by timestamp, without truncating cell contents.
# NOTE(review): the saved output below does not match the `df` built in the
# previous cell (different rows) - re-run this cell to refresh it.
df.orderBy('timestamp').show(n=20, truncate=False)

+------+-------------------+------+
|person|timestamp          |weight|
+------+-------------------+------+
|1     |2019-12-02 00:00:00|62.32 |
|1     |2019-12-02 00:00:00|62.32 |
|1     |2019-12-02 00:00:00|49.94 |
|1     |2019-12-03 00:00:00|50.49 |
|1     |2019-12-06 00:00:00|50.24 |
+------+-------------------+------+



In [27]:
# Import only the names this cell needs. The wildcard imports
# (`from pyspark.sql.functions import *`) would shadow Python builtins such as
# `sum`, `min`, `max`, `abs` and `round` for every cell run afterwards.
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number

# Number each person's rows from the most recent timestamp (rn = 1) to the oldest.
w = Window.partitionBy("person").orderBy(col("timestamp").desc())

# To keep only the latest row per person, filter on rn == 1 and drop the helper:
#   df1.filter(col("rn") == 1).drop("rn")
df1 = df.withColumn("rn", row_number().over(w))

df1.show()

+------+-------------------+------+---+
|person|          timestamp|weight| rn|
+------+-------------------+------+---+
|     1|2019-12-06 00:00:00| 62.32|  1|
|     1|2019-12-03 00:00:00| 62.32|  2|
|     1|2019-12-02 00:00:00| 49.94|  3|
|     2|2019-12-03 00:00:00| 62.32|  1|
|     2|2019-12-02 00:00:00| 62.32|  2|
+------+-------------------+------+---+

