In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already
# active. The "local[*]" master runs Spark inside this process.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('EOD data load')
    .getOrCreate()
)

In [30]:
# Read the partition=T and partition=Q directories under ingest_data into a
# single DataFrame, then show its schema and (as the cell result) its row count.
common_df = spark.read.parquet(
    "ingest_data/partition=T",
    "ingest_data/partition=Q",
)
common_df.printSchema()
common_df.count()

root
 |-- trade_dt: date (nullable = true)
 |-- rec_type: string (nullable = true)
 |-- symbol: string (nullable = true)
 |-- exchange: string (nullable = true)
 |-- event_tm: timestamp (nullable = true)
 |-- event_seq_nb: integer (nullable = true)
 |-- arrival_tm: timestamp (nullable = true)
 |-- trade_pr: decimal(17,14) (nullable = true)
 |-- bid_pr: decimal(17,14) (nullable = true)
 |-- bid_size: integer (nullable = true)
 |-- ask_pr: decimal(17,14) (nullable = true)
 |-- ask_size: integer (nullable = true)



1200

In [6]:
# Build the trade-only projection of the combined frame.
# common_df is read from both the T and Q partitions; rows with rec_type 'Q'
# have a null trade_pr, so restrict to rec_type 'T' before projecting
# (rec_type itself is not part of the projected columns).
trade = (
    common_df
    .where("rec_type = 'T'")
    .select(
        "trade_dt", "symbol", "exchange", "event_tm",
        "event_seq_nb", "arrival_tm", "trade_pr",
    )
)
trade.show(10)

+----------+------+--------+--------------------+------------+-------------------+-----------------+
|  trade_dt|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|         trade_pr|
+----------+------+--------+--------------------+------------+-------------------+-----------------+
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 10:42:...|          10|2020-08-06 09:30:00|78.93245610745132|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 12:00:...|          20|2020-08-06 09:30:00|77.09670488777135|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 13:09:...|          30|2020-08-06 09:30:00|78.31461997164219|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 14:27:...|          40|2020-08-06 09:30:00|75.84401002785360|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 15:39:...|          50|2020-08-06 09:30:00|77.62613181984351|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 16:43:...|          60|2020-08-06 09:30:00|77.57371021517118|
|2020-08-06|  SYMA|  NASDAQ|2020-08-06 18:01:...|          70|2020-08-06 09:30:00|76.977468

In [33]:
# Expose common_df to Spark SQL under the view name 'data'.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
common_df.createOrReplaceTempView('data')

In [40]:
# Rank rows that share the same key
# (trade_dt, rec_type, symbol, exchange, event_tm, event_seq_nb).
# Within each key, rows are numbered by arrival_tm descending, so rn = 1 is
# the row with the latest arrival_tm for that key.
row_ranked_data = spark.sql("""
    SELECT 
        *
        , ROW_NUMBER() over (
            PARTITION BY trade_dt, rec_type, symbol, exchange, event_tm, event_seq_nb 
            ORDER BY arrival_tm desc) as rn
    FROM data
""")

# Register the ranked frame as a SQL view so later queries can select from it.
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
row_ranked_data.createOrReplaceTempView('row_ranked_data')
row_ranked_data.show()

+----------+--------+------+--------+--------------------+------------+-------------------+------------------+------------------+--------+------------------+--------+---+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|          trade_pr|            bid_pr|bid_size|            ask_pr|ask_size| rn|
+----------+--------+------+--------+--------------------+------------+-------------------+------------------+------------------+--------+------------------+--------+---+
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 11:34:...|          17|2020-08-05 09:30:00|              null| 78.06235033524703|     100| 78.36705025855920|     100|  1|
|2020-08-05|       Q|  SYMA|    NYSE|2020-08-05 14:16:...|          39|2020-08-05 09:30:00|              null| 74.63142350491319|     100| 75.82283182627648|     100|  1|
|2020-08-05|       Q|  SYMB|  NASDAQ|2020-08-05 11:22:...|          16|2020-08-05 09:30:00|              null| 33.62173563866898|     100| 35.166

In [42]:
# Keep only the rn = 1 row of every key from row_ranked_data and drop the
# helper rn column by listing the original columns explicitly.
deduped_data = spark.sql("""
    SELECT
        trade_dt
        , rec_type
        , symbol
        , exchange
        , event_tm
        , event_seq_nb
        , arrival_tm
        , trade_pr
        , bid_pr
        , bid_size
        , ask_pr
        , ask_size
    FROM row_ranked_data
    WHERE rn = 1
""")
# A bare expression in the middle of a cell is evaluated but never displayed,
# so print the row count explicitly instead of discarding it.
print(deduped_data.count())
deduped_data.show()

+----------+--------+------+--------+--------------------+------------+-------------------+------------------+------------------+--------+------------------+--------+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|          trade_pr|            bid_pr|bid_size|            ask_pr|ask_size|
+----------+--------+------+--------+--------------------+------------+-------------------+------------------+------------------+--------+------------------+--------+
|2020-08-05|       Q|  SYMA|  NASDAQ|2020-08-05 11:34:...|          17|2020-08-05 09:30:00|              null| 78.06235033524703|     100| 78.36705025855920|     100|
|2020-08-05|       Q|  SYMA|    NYSE|2020-08-05 14:16:...|          39|2020-08-05 09:30:00|              null| 74.63142350491319|     100| 75.82283182627648|     100|
|2020-08-05|       Q|  SYMB|  NASDAQ|2020-08-05 11:22:...|          16|2020-08-05 09:30:00|              null| 33.62173563866898|     100| 35.16696898182639|     100

In [46]:
# Inspect the ranked rows whose rec_type is 'T' (rn column included).
(
    spark
    .sql("""SELECT * FROM row_ranked_data WHERE rec_type = 'T'""")
    .show()
)

+----------+--------+------+--------+--------------------+------------+-------------------+------------------+------+--------+------+--------+---+
|  trade_dt|rec_type|symbol|exchange|            event_tm|event_seq_nb|         arrival_tm|          trade_pr|bid_pr|bid_size|ask_pr|ask_size| rn|
+----------+--------+------+--------+--------------------+------------+-------------------+------------------+------+--------+------+--------+---+
|2020-08-06|       T|  SYMC|  NASDAQ|2020-08-06 20:09:...|          90|2020-08-06 09:30:00|160.98419316231528|  null|    null|  null|    null|  1|
|2020-08-05|       T|  SYMB|  NASDAQ|2020-08-05 10:40:...|          10|2020-08-05 09:30:00| 32.15344380416123|  null|    null|  null|    null|  1|
|2020-08-05|       T|  SYMB|    NYSE|2020-08-05 13:10:...|          30|2020-08-05 09:30:00| 34.18391410021153|  null|    null|  null|    null|  1|
|2020-08-05|       T|  SYMA|    NYSE|2020-08-05 14:24:...|          40|2020-08-05 09:30:00| 78.43095407886027|  null| 