In [2]:
import pathlib
from datetime import datetime
from typing import List, Tuple, Union, Dict

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Column

from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import Window


In [3]:
# Run Spark in local mode, capped at 4 worker threads.
spark = SparkSession.builder.master("local[4]").getOrCreate()

# Keep the number of shuffle partitions small for this local session.
# Note the plural "partitions": Spark accepts unknown keys without raising,
# so a misspelled key would silently leave the default value in place.
spark.conf.set("spark.sql.shuffle.partitions", 4)

# Use UTC as the session time zone.
spark.conf.set("spark.sql.session.timeZone", "UTC")


23/07/31 11:49:17 WARN Utils: Your hostname, Emilianos-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.130 instead (on interface en0)
23/07/31 11:49:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/07/31 11:49:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Location of the parquet data inspected in this notebook.
path = "/Users/emilianofrigo/Downloads/test_read/"

# Load it through the generic reader, naming the format explicitly.
df = spark.read.format("parquet").load(path)


                                                                                

In [5]:
# Print the column names, types and nullability of df as a tree.
df.printSchema()

root
 |-- region_id: string (nullable = true)
 |-- account_id: string (nullable = true)
 |-- marketplace_id: string (nullable = true)
 |-- report_id: string (nullable = true)
 |-- posted_dt: date (nullable = true)
 |-- raw_data: struct (nullable = true)
 |    |-- AdjustmentAmount: struct (nullable = true)
 |    |    |-- CurrencyAmount: double (nullable = true)
 |    |    |-- CurrencyCode: string (nullable = true)
 |    |-- AdjustmentItemList: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- PerUnitAmount: struct (nullable = true)
 |    |    |    |    |-- CurrencyAmount: double (nullable = true)
 |    |    |    |    |-- CurrencyCode: string (nullable = true)
 |    |    |    |-- ProductDescription: string (nullable = true)
 |    |    |    |-- Quantity: string (nullable = true)
 |    |    |    |-- SellerSKU: string (nullable = true)
 |    |    |    |-- TotalAmount: struct (nullable = true)
 |    |    |    |    |-- CurrencyAmount: double (nul

In [6]:
# Preview the first rows with show()'s default formatting (long values are truncated).
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---------+------------+--------------+--------------------+----------+--------------------+--------------------+-------------------+--------------------+
|region_id|  account_id|marketplace_id|           report_id| posted_dt|            raw_data|   filename_adjusted|         aud_upd_ts|  report_ingested_ts|
+---------+------------+--------------+--------------------+----------+--------------------+--------------------+-------------------+--------------------+
|       EU|OLIVER_JAMES|            UK|fb2d63a6-9fd4-4d5...|2023-06-15|{{14.08, EUR}, [{...|s3://heroes-dl-ra...|2023-07-26 09:52:43|2023-07-26 09:10:...|
|       EU|OLIVER_JAMES|            UK|fb2d63a6-9fd4-4d5...|2023-06-15|{{-10.35, GBP}, [...|s3://heroes-dl-ra...|2023-07-26 09:52:43|2023-07-26 09:10:...|
|       EU|OLIVER_JAMES|            UK|fb2d63a6-9fd4-4d5...|2023-06-15|{{19.55, EUR}, [{...|s3://heroes-dl-ra...|2023-07-26 09:52:43|2023-07-26 09:10:...|
|       EU|OLIVER_JAMES|            UK|fb2d63a6-9fd4-4d5...|2023-06-15

                                                                                

In [7]:
# Total number of rows in df.
df.count()

364

In [8]:
# Number of rows left once exact full-row duplicates are collapsed.
df.distinct().count()

                                                                                

363

In [9]:
# Method 2: count rows per (report_id, posted_dt) pair and keep only the
# pairs that occur more than once.
duplicates = (
    df
    .groupBy("report_id", "posted_dt")
    .count()
    .where(F.col("count") > 1)
)

duplicates.show(truncate=False)

+------------------------------------+----------+-----+
|report_id                           |posted_dt |count|
+------------------------------------+----------+-----+
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-21|15   |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-02|3    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-26|2    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-17|5    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-25|7    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-25|6    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-03|2    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-15|7    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-27|27   |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-22|3    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-18|4    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-24|5    |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-07|15   |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-23|11   |
|fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-07-06

In [10]:
# Rows of one report posted on 2023-06-21, shown without truncation.
(
    df
    .where(F.col("report_id") == "fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0")
    .where(F.col("posted_dt") == "2023-06-21")
    .show(truncate=False)
)

+---------+------------+--------------+------------------------------------+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------+-----------------------+
|region_id|account_id  |marketplace_id|report_id                           |posted_dt |raw_data                                                                                                                                                                                                                                                                                       

In [11]:
# Same report/day slice, narrowed to the key columns plus the timestamp
# nested inside raw_data.
(
    df
    .where(F.col("report_id") == "fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0")
    .where(F.col("posted_dt") == "2023-06-21")
    .select("region_id", "account_id", "marketplace_id", "report_id", "posted_dt", "raw_data.posteddate")
    .show(truncate=False)
)

+---------+------------+--------------+------------------------------------+----------+--------------------+
|region_id|account_id  |marketplace_id|report_id                           |posted_dt |posteddate          |
+---------+------------+--------------+------------------------------------+----------+--------------------+
|EU       |OLIVER_JAMES|UK            |fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-21|2023-06-21T00:13:06Z|
|EU       |OLIVER_JAMES|UK            |fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-21|2023-06-21T00:16:47Z|
|EU       |OLIVER_JAMES|UK            |fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-21|2023-06-21T10:41:01Z|
|EU       |OLIVER_JAMES|UK            |fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-21|2023-06-21T10:41:01Z|
|EU       |OLIVER_JAMES|UK            |fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-21|2023-06-21T11:54:24Z|
|EU       |OLIVER_JAMES|UK            |fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0|2023-06-21|2023-06-21T17:01:46Z|
|EU       |OLIVER_J

In [17]:
# Row count of the raw_data projection on its own. Insert .distinct() before
# .count() to count unique payloads, or end with .show(truncate=False) to
# inspect the values instead.
df.select("raw_data").count()

364

In [18]:
# Groups that share report_id, posted_dt and the whole raw_data payload and
# occur more than once.
duplicates = (
    df
    .groupBy("report_id", "posted_dt", "raw_data")
    .count()
    .where(F.col("count") > 1)
)

In [20]:
# Display the duplicated groups without truncating wide cell values.
duplicates.show(truncate=False)



+------------------------------------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|report_id                           |posted_dt |raw_data                                                                                                                                                                                                                                                                                                                        |count|
+------------------------------------+----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

In [None]:
"{{-10.84, GBP}, [{{-10.84, GBP}, Vibrator Sex Toy, Adult Sex Toys for Women - Powerful Electric Wand Massager Vibrator, G Spot Clitoris Stimulation, Dildo, Vibrators - Water-Resistant, Wireless - 20 Vibration Modes &amp; 8 Speeds (Black), 1, LVW-SMW-WW-BK-NA, {-10.84, GBP}}], COMPENSATED_CLAWBACK, 2023-07-11T20:27:10Z}"

In [22]:
# Rows of the same report posted on 2023-07-11, with show()'s default formatting.
(
    df
    .where(F.col("report_id") == "fb2d63a6-9fd4-4d52-8191-bf3e677a2ed0")
    .where(F.col("posted_dt") == "2023-07-11")
    .show()
)

+---------+------------+--------------+--------------------+----------+--------------------+--------------------+-------------------+--------------------+
|region_id|  account_id|marketplace_id|           report_id| posted_dt|            raw_data|   filename_adjusted|         aud_upd_ts|  report_ingested_ts|
+---------+------------+--------------+--------------------+----------+--------------------+--------------------+-------------------+--------------------+
|       EU|OLIVER_JAMES|            UK|fb2d63a6-9fd4-4d5...|2023-07-11|{{-9.35, GBP}, [{...|s3://heroes-dl-ra...|2023-07-26 09:52:43|2023-07-26 09:12:...|
|       EU|OLIVER_JAMES|            UK|fb2d63a6-9fd4-4d5...|2023-07-11|{{-10.03, GBP}, [...|s3://heroes-dl-ra...|2023-07-26 09:52:43|2023-07-26 09:12:...|
|       EU|OLIVER_JAMES|            UK|fb2d63a6-9fd4-4d5...|2023-07-11|{{-9.35, GBP}, [{...|s3://heroes-dl-ra...|2023-07-26 09:52:43|2023-07-26 09:12:...|
|       EU|OLIVER_JAMES|            UK|fb2d63a6-9fd4-4d5...|2023-07-11

23/07/31 14:23:32 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 670315 ms exceeds timeout 120000 ms
23/07/31 14:23:32 WARN SparkContext: Killing executors is not supported by current scheduler.
