In [1]:
import multiprocessing
from datetime import datetime

import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.conf import SparkConf

In [2]:
# Notebook-level switches for the local Spark session.
STOP_SPARK_CONTEXT = True  # stop an already-bound `spark` before building a new one
NUM_CORES = 8  # upper bound on local worker threads (capped by the CPU count below)

# Maven coordinates (delta-core and hadoop-aws).
# NOTE(review): this list is not handed to the session builder anywhere in
# this cell -- confirm whether it was meant to be wired into the config.
maven_artifacts = [
    "io.delta:delta-core_2.12:1.0.0",
    "org.apache.hadoop:hadoop-aws:3.2.0",
]

spark_config = {
    # Session time zone
    "spark.sql.session.timeZone": "UTC",

    # Small shuffle / discovery parallelism and uncompressed parquet output
    "spark.sql.shuffle.partitions": "4",
    "spark.sql.parquet.compression.codec": "uncompressed",
    "spark.sql.sources.parallelPartitionDiscovery.parallelism": "4",
}

# When this cell is re-run, shut down the session bound to `spark` by the previous run.
if "spark" in locals() and STOP_SPARK_CONTEXT:
    locals()["spark"].stop()
    print("Spark Context stopped")

# Never ask for more local threads than the machine has cores.
local_threads = min(NUM_CORES, multiprocessing.cpu_count())
session_conf = SparkConf().setAll(spark_config.items())

spark = (
    SparkSession.builder
    .appName("jupyter_pyspark")
    .master(f"local[{local_threads}]")
    .config(conf=session_conf)
    .getOrCreate()
)

# Bare expression so the notebook displays the session.
spark

22/02/10 15:11:21 WARN Utils: Your hostname, emif-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.18 instead (on interface en0)
22/02/10 15:11:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/10 15:11:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/10 15:11:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere without editing this line.
source_json_path = "/Users/emif/Downloads/test_dataset.json"
sdf = spark.read.json(source_json_path)

                                                                                

In [None]:
# Print the schema of the raw JSON frame.
sdf.printSchema()

In [None]:
# Print rows one field per line (vertical) without truncating long values.
sdf.show(vertical=True, truncate=False)

In [None]:
# Distinct values of the `date` column, newest first.
sdf.select("date").distinct().orderBy(F.col("date").desc()).show()

In [261]:
# dates = ("2022-01-20",  "2022-02-10")

# F.col('date').between(*dates)

Column<'((date >= 2022-01-20) AND (date <= 2022-02-10))'>

In [263]:
# sdf_sliced2 = sdf.filter(F.expr("date between '2022-01-20' AND '2022-02-10'"))

In [269]:
# min_date2, max_date2 = sdf_sliced2.selectExpr("min(date)", "max(date)").first()
# print(min_date2, max_date2)

2022-01-20 2022-02-09


In [10]:
# Inclusive lower/upper bounds of the slice (`between` includes both ends).
dates = ("2022-01-20",  "2022-02-10")

# Cast `date` to a date type first, then keep only rows inside the bounds.
sdf_sliced = (
    sdf
    .withColumn("date", F.col("date").cast("date"))
    .filter(F.col("date").between(*dates))
)

In [None]:
# Sliced rows, newest first, one field per line.
sdf_sliced.sort(F.col("date").desc()).show(vertical=True)

In [None]:
# Print the schema of the sliced frame (after the `date` cast).
sdf_sliced.printSchema()

### Logic: build a continuous date range and fill missing days

In [24]:
# Timestamp taken when this cell runs.
# NOTE(review): `end_date` is not referenced by any later cell visible here.
end_date = datetime.now()

In [77]:
# Earliest and latest `date` present in the slice, from a single aggregate row.
min_date, max_date = sdf_sliced.agg(F.min("date"), F.max("date")).first()
print(min_date, max_date, sep="\n")

2022-01-20
2022-02-09


In [80]:
# Build one array holding every day from min_date to max_date, then explode
# it into one row per day.
date_range_query = f"SELECT sequence(to_date('{min_date}'), to_date('{max_date}'), interval 1 day) as date"
sdf_date_range = spark.sql(date_range_query).select(F.explode("date").alias("date"))

In [120]:
# Generated calendar, newest first (up to 25 rows).
sdf_date_range.orderBy(F.col("date").desc()).show(n=25)

+----------+
|      date|
+----------+
|2022-02-09|
|2022-02-08|
|2022-02-07|
|2022-02-06|
|2022-02-05|
|2022-02-04|
|2022-02-03|
|2022-02-02|
|2022-02-01|
|2022-01-31|
|2022-01-30|
|2022-01-29|
|2022-01-28|
|2022-01-27|
|2022-01-26|
|2022-01-25|
|2022-01-24|
|2022-01-23|
|2022-01-22|
|2022-01-21|
|2022-01-20|
+----------+



In [122]:
# Print the schema of the generated date range.
sdf_date_range.printSchema()

root
 |-- date: date (nullable = false)



In [270]:
# Left join from the date range: every generated day is kept, and days with
# no matching row in the slice carry nulls in the other columns.
joined_sdf = sdf_date_range.join(
    other=sdf_sliced,
    on="date",
    how="left",
)

In [None]:
# Print the schema of the joined frame.
joined_sdf.printSchema()

In [None]:
# Joined rows, newest first, one field per line.
joined_sdf.orderBy(F.desc("date")).show(vertical=True)

In [180]:
# # Fill empty values example
# win = Window.partitionBy().orderBy("date")
# filled_df = new_joined_sdf.select(
#     "date", "usd", F.last("usd", ignorenulls=True).over(win).alias("usd_fill")
# )
# filled_df.orderBy("date", ascending=False).show()

22/02/10 18:16:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/10 18:16:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/10 18:16:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/10 18:16:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/02/10 18:16:02 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+------+--------+
|      date|   usd|usd_fill|
+----------+------+--------+
|2022-02-09|1.1435|  1.1435|
|2022-02-08|1.1408|  1.1408|
|2022-02-07|1.1447|  1.1447|
|2022-02-06|  null|  1.1464|
|2022-02-05|  null|  1.1464|
|2022-02-04|1.1464|  1.1464|
|2022-02-03|1.1286|  1.1286|
|2022-02-02|1.1323|  1.1323|
|2022-02-01| 1.126|   1.126|
|2022-01-31|1.1156|  1.1156|
|2022-01-30|  null|  1.1138|
|2022-01-29|  null|  1.1138|
|2022-01-28|1.1138|  1.1138|
|2022-01-27| 1.116|   1.116|
|2022-01-26|1.1277|  1.1277|
|2022-01-25|1.1268|  1.1268|
|2022-01-24|1.1304|  1.1304|
|2022-01-23|  null|  1.1348|
|2022-01-22|  null|  1.1348|
|2022-01-21|1.1348|  1.1348|
+----------+------+--------+
only showing top 20 rows



22/02/11 02:11:32 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1056057 ms exceeds timeout 120000 ms
22/02/11 02:11:32 WARN SparkContext: Killing executors is not supported by current scheduler.


In [230]:
# Fill empty values
# win = Window.partitionBy().orderBy("date")

# filled_df = new_joined_sdf

# for col in filled_df.columns:
#     filled_df = filled_df.withColumn(f"{col}_filled", F.last(col, ignorenulls=True).over(win))


In [None]:
# filled_df.orderBy("date", ascending=False).show(vertical=True)

In [199]:
# order_by_col = "date"
# win = Window.partitionBy().orderBy(order_by_col)

# for col in filled_df.columns:
#     if col != order_by_col:
#         filled_df.withColumn(f"{col}_filled", F.last(col, ignorenulls=True).over(win))

In [None]:
# filled_df.orderBy("date", ascending=False).show(vertical=True)

In [248]:
order_by_col = "date"
# Empty partitionBy(): one window over the whole frame, ordered by date.
win = Window.partitionBy().orderBy(order_by_col)

# Every column except the ordering column gets a fill expression: the last
# non-null value over the date-ordered window, aliased back to its own name.
fill_targets = [name for name in joined_sdf.columns if name != order_by_col]
result_col_list = [
    F.last(name, ignorenulls=True).over(win).alias(name)
    for name in fill_targets
]

In [None]:
# Bare expression so the notebook displays the list of fill expressions.
result_col_list

In [259]:
# Columns passed through untouched, followed by the fill expressions.
non_fill_cols = ["date"]

output_columns = [*non_fill_cols, *result_col_list]
output_df = joined_sdf.select(*output_columns)

In [None]:
# Final filled frame, newest first, one field per line.
output_df.orderBy(F.col("date").desc()).show(vertical=True)