## Data Cleaning & Transformation

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [2]:
# Obtain the Spark session for this notebook (an already-running session is reused).
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("brazilian-ecommerce")
    .getOrCreate()
)

25/06/12 12:42:57 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# DDL-formatted schema strings ("name TYPE, ...") describing the expected columns of
# each source table. Date/time columns are declared as STRING here and are parsed
# into timestamp/date types further down in the notebook.
# The misspelled "lenght" product columns are kept as-is at this point; they are
# renamed in the Products section.
schema_customers = 'customer_id STRING, customer_unique_id STRING, customer_zip_code_prefix INT, customer_city STRING, customer_state STRING'
schema_geolocation = 'geolocation_zip_code_prefix INT, geolocation_lat DOUBLE, geolocation_lng DOUBLE, geolocation_city STRING, geolocation_state STRING'
schema_order_items = 'order_id STRING, order_item_id INT, product_id STRING, seller_id STRING, shipping_limit_date STRING, price DOUBLE, freight_value DOUBLE'
schema_order_payments = 'order_id STRING, payment_sequential INT, payment_type STRING, payment_installments INT, payment_value DOUBLE'
schema_order_reviews = 'review_id STRING, order_id STRING, review_score INT, review_comment_title STRING, review_comment_message STRING, review_creation_date STRING, review_answer_timestamp STRING'
schema_orders = 'order_id STRING, customer_id STRING, order_status STRING, order_purchase_timestamp STRING, order_approved_at STRING, order_delivered_carrier_date STRING, order_delivered_customer_date STRING, order_estimated_delivery_date STRING'
schema_products = 'product_id STRING, product_category_name STRING, product_name_lenght INT, product_description_lenght INT, product_photos_qty INT, product_weight_g INT, product_length_cm INT, product_height_cm INT, product_width_cm INT'
schema_sellers = 'seller_id STRING, seller_zip_code_prefix INT, seller_city STRING, seller_state STRING'
category_translation = 'product_category_name STRING, product_category_name_english STRING'

In [5]:
# Base location of the dataset; every table path below is built from it.
path_head = "gs://bucket-name/ecommerce_real/"

# Parquet files are self-describing: column names and types come from the file
# metadata. `DataFrameReader.parquet` has no `header` option and does not accept a
# `schema=` keyword (unrecognised keyword options are dropped silently), so neither
# is passed here -- passing them only suggested a schema enforcement that never
# happened.
df_customers = spark.read.parquet(f"{path_head}olist_customers")
df_geolocation = spark.read.parquet(f"{path_head}olist_geolocation")
df_order_items = spark.read.parquet(f"{path_head}olist_order_items")
df_order_payments = spark.read.parquet(f"{path_head}olist_order_payments")
df_order_reviews = spark.read.parquet(f"{path_head}olist_order_reviews")
df_orders = spark.read.parquet(f"{path_head}olist_orders")
df_products = spark.read.parquet(f"{path_head}olist_products")
df_sellers = spark.read.parquet(f"{path_head}olist_sellers")

In [6]:
# The category translation table is a CSV with a header row; its schema is supplied explicitly.
df_cat_trans = (
    spark.read
    .schema(category_translation)
    .option("header", True)
    .csv(f"{path_head}product_category_name_translation.csv")
)

### Customers

In [7]:
# Convert customer city names to title case, then preview the first rows.
df_customers = df_customers.withColumn(
    "customer_city",
    initcap(df_customers["customer_city"]),
)
df_customers.show(n=5)

                                                                                

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              Franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|Sao Bernardo Do C...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           Sao Paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     Mogi Das Cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            Campinas|            SP|
+--------------------+--------------------+------------------------+--------------------+--------------+
only showing top 5 rows



In [8]:
# Inspect the column names and types of the customers table.
df_customers.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [9]:
# Persist the cleaned customers table as parquet, replacing any existing output.
df_customers.write.parquet(f"{path_head}cleaned/customers/", mode="overwrite")

                                                                                

### Sellers

In [10]:
# Convert seller city names to title case, then preview the first rows.
df_sellers = df_sellers.withColumn(
    "seller_city",
    initcap(df_sellers["seller_city"]),
)
df_sellers.show(n=5)

[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+----------------------+-----------------+------------+
|           seller_id|seller_zip_code_prefix|      seller_city|seller_state|
+--------------------+----------------------+-----------------+------------+
|3442f8959a84dea7e...|                 13023|         Campinas|          SP|
|d1b65fc7debc3361e...|                 13844|       Mogi Guacu|          SP|
|ce3ad9de960102d06...|                 20031|   Rio De Janeiro|          RJ|
|c0f3eea2e14555b6f...|                  4195|        Sao Paulo|          SP|
|51a04a8a6bdcb23de...|                 12914|Braganca Paulista|          SP|
+--------------------+----------------------+-----------------+------------+
only showing top 5 rows



                                                                                

In [11]:
# Persist the cleaned sellers table as parquet, replacing any existing output.
df_sellers.write.parquet(f"{path_head}cleaned/sellers/", mode="overwrite")

                                                                                

### Products

In [12]:
# Correct the misspelled "lenght" suffix in two product column names.
df_products = (
    df_products
    .withColumnRenamed("product_name_lenght", "product_name_length")
    .withColumnRenamed("product_description_lenght", "product_description_length")
)

In [14]:
# A product row is anomalous when at least one descriptive attribute is NULL,
# i.e. when NOT every attribute is present.
anomali_products_df = df_products.filter(
    ~(
        col("product_category_name").isNotNull()
        & col("product_name_length").isNotNull()
        & col("product_description_length").isNotNull()
        & col("product_photos_qty").isNotNull()
        & col("product_weight_g").isNotNull()
        & col("product_length_cm").isNotNull()
        & col("product_height_cm").isNotNull()
        & col("product_width_cm").isNotNull()
    )
)

# Store the anomalous rows in their own parquet location.
anomali_products_df.write.parquet(f"{path_head}anomalies/anomaly_products/", mode="overwrite")

                                                                                

In [15]:
# Keep only the products for which every descriptive attribute is present.
df_products = df_products.dropna(
    how="any",
    subset=[
        "product_category_name",
        "product_name_length",
        "product_description_length",
        "product_photos_qty",
        "product_weight_g",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
    ],
)

# Sanity check: count the NULLs left in each column.
df_products.select(
    [count(when(col(name).isNull(), 1)).alias(name) for name in df_products.columns]
).show()

[Stage 21:>                                                         (0 + 1) / 1]

+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|product_id|product_category_name|product_name_length|product_description_length|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|         0|                    0|                  0|                         0|                 0|               0|                0|                0|               0|
+----------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+



                                                                                

The folder .../anomalies/anomaly_products/ contains rows that might be corrupted.

It holds 611 rows in total: 609 rows where [product_category_name, product_name_length, product_description_length] are NULL, 1 row where [product_weight_g, product_length_cm, product_height_cm, product_width_cm] are NULL, and 1 row where every column except product_id is NULL.

In [16]:
# Attach the English category name to each product; products whose category has
# no translation are kept (left join) with a NULL English name.
df_products = df_products.join(
    df_cat_trans,
    on="product_category_name",
    how="left",
)

In [17]:
# df_cat_trans has no entry for two categories, so their English names are
# assigned by hand:
#   'pc_gamer' -> 'computers' (the translation file already has a computers category)
#   'portateis_cozinha_e_preparadores_de_alimentos' -> 'portable_kitchen_food_preparators'
# Every other row keeps the English name it already has.
df_products = df_products.withColumn(
    "product_category_name_english",
    when(col("product_category_name") == "pc_gamer", lit("computers"))
    .when(
        col("product_category_name") == "portateis_cozinha_e_preparadores_de_alimentos",
        lit("portable_kitchen_food_preparators"),
    )
    .otherwise(col("product_category_name_english")),
)

In [18]:
# Fix the column order so the English category name sits next to the original one.
df_products = df_products.select([
    "product_id",
    "product_category_name",
    "product_category_name_english",
    "product_name_length",
    "product_description_length",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
])

In [21]:
# Derive the weight in kilograms and the box volume in cubic centimetres, then
# place each derived column right after the columns it was computed from.
df_products = (
    df_products
    .withColumn("product_weight_kg", col("product_weight_g") / 1000)
    .withColumn(
        "product_volume_cm3",
        col("product_length_cm") * col("product_width_cm") * col("product_height_cm"),
    )
    .select([
        "product_id",
        "product_category_name",
        "product_category_name_english",
        "product_name_length",
        "product_description_length",
        "product_photos_qty",
        "product_weight_g",
        "product_weight_kg",
        "product_length_cm",
        "product_height_cm",
        "product_width_cm",
        "product_volume_cm3",
    ])
)

In [22]:
# Compare the number of distinct product_id values with the total row count;
# equal numbers mean product_id is unique.
df_products.agg(countDistinct("product_id"), count("*")).show()

[Stage 29:>                                                         (0 + 1) / 1]

+--------------------------+--------+
|count(DISTINCT product_id)|count(1)|
+--------------------------+--------+
|                     32340|   32340|
+--------------------------+--------+



                                                                                

In [23]:
# Preview the first rows of the products table.
df_products.show(5)

+--------------------+---------------------+-----------------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+-----------------+----------------+------------------+
|          product_id|product_category_name|product_category_name_english|product_name_length|product_description_length|product_photos_qty|product_weight_g|product_weight_kg|product_length_cm|product_height_cm|product_width_cm|product_volume_cm3|
+--------------------+---------------------+-----------------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+-----------------+----------------+------------------+
|1e9e8ef04dbcff454...|           perfumaria|                    perfumery|                 40|                       287|                 1|             225|            0.225|               16|               10|              14|              2240|
|3aa0711

In [24]:
# Persist the cleaned products table as parquet, replacing any existing output.
df_products.write.parquet(f"{path_head}cleaned/products/", mode="overwrite")

                                                                                

### Geolocation

In [25]:
# Convert geolocation city names to title case, then preview the first rows.
df_geolocation = df_geolocation.withColumn(
    "geolocation_city",
    initcap(df_geolocation["geolocation_city"]),
)
df_geolocation.show(n=5)

+---------------------------+-------------------+------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|   geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+------------------+----------------+-----------------+
|                       1037| -23.54562128115268|-46.63929204800168|       Sao Paulo|               SP|
|                       1046|-23.546081127035535|-46.64482029837157|       Sao Paulo|               SP|
|                       1046| -23.54612896641469|-46.64295148361138|       Sao Paulo|               SP|
|                       1041|  -23.5443921648681|-46.63949930627844|       Sao Paulo|               SP|
|                       1035|-23.541577961711493|-46.64160722329613|       Sao Paulo|               SP|
+---------------------------+-------------------+------------------+----------------+-----------------+
only showing top 5 rows



In [26]:
# Collapse the table to one row per zip code prefix.
# Coordinates: the average latitude/longitude of all points sharing the prefix.
geo_centroids = df_geolocation.groupBy("geolocation_zip_code_prefix").agg(
    avg("geolocation_lat").alias("geolocation_lat"),
    avg("geolocation_lng").alias("geolocation_lng"),
)

# City/state label: the (city, state) pair that occurs most often for the prefix,
# with ties broken alphabetically. first() was used here before, but after a
# groupBy it returns whichever row happens to arrive first, so the label could
# change from run to run.
geo_label_rank = Window.partitionBy("geolocation_zip_code_prefix").orderBy(
    desc("n_rows"), "geolocation_city", "geolocation_state"
)
geo_labels = (
    df_geolocation
    .groupBy("geolocation_zip_code_prefix", "geolocation_city", "geolocation_state")
    .agg(count("*").alias("n_rows"))
    .withColumn("rn", row_number().over(geo_label_rank))
    .filter(col("rn") == 1)
    .drop("rn", "n_rows")
)

# Resulting column order: prefix, lat, lng, city, state.
df_geolocation = geo_centroids.join(geo_labels, "geolocation_zip_code_prefix", "left")

df_geolocation.count()

                                                                                

19015

In [27]:
# Check for duplicates: count the zip code prefixes that occur in more than one row.
df_geolocation.groupBy("geolocation_zip_code_prefix").count().filter("count > 1").count()

                                                                                

0

In [28]:
# Persist the de-duplicated geolocation table as parquet, replacing any existing output.
df_geolocation.write.parquet(f"{path_head}cleaned/geolocation/", mode="overwrite")

                                                                                

### Orders

In [30]:
# Keep the order columns used in this section, in a fixed order.
df_orders = df_orders.select([
    "order_id",
    "customer_id",
    "order_status",
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date",
])

In [31]:
# Preview the first rows of the orders table.
df_orders.show(5)

[Stage 55:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

                                                                                

In [33]:
# All timestamp columns share the yyyy-MM-dd HH:mm:ss layout, so they are parsed
# from string to timestamp in a single pass; the other columns pass through untouched.
timestamp_cols = [
    "order_purchase_timestamp",
    "order_approved_at",
    "order_delivered_carrier_date",
    "order_delivered_customer_date",
    "order_estimated_delivery_date"
]

df_orders = df_orders.select([
    to_timestamp(col(name), "yyyy-MM-dd HH:mm:ss").alias(name)
    if name in timestamp_cols
    else col(name)
    for name in df_orders.columns
])

# The time part of order_estimated_delivery_date is always 00:00:00, so the column
# is stored as a date instead of a timestamp.
df_orders = df_orders.withColumn(
    "order_estimated_delivery_date",
    to_date(col("order_estimated_delivery_date"), "yyyy-MM-dd"),
)

df_orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: date (nullable = true)



In [34]:
# Delivery time analysis: number of days between the purchase and the delivery to
# the customer (NULL when either date is missing).
# No sort is applied here: ordering the base DataFrame would add a global sort to
# every downstream action, and the cells that display results order their own output.
df_orders = df_orders.withColumn(
    "delivery_time",
    datediff(col("order_delivered_customer_date"), col("order_purchase_timestamp")),
)

In [35]:
# Number of orders per delivery time, with one column per order status.
(
    df_orders
    .filter(col("delivery_time").isNotNull())
    .groupBy("delivery_time")
    .pivot("order_status")
    .count()
    .orderBy("delivery_time")
    .show()
)



+-------------+--------+---------+
|delivery_time|canceled|delivered|
+-------------+--------+---------+
|            0|    NULL|        1|
|            1|    NULL|      629|
|            2|    NULL|     2874|
|            3|    NULL|     3448|
|            4|    NULL|     4497|
|            5|    NULL|     5258|
|            6|    NULL|     6507|
|            7|       2|     7480|
|            8|    NULL|     7123|
|            9|    NULL|     6256|
|           10|    NULL|     5793|
|           11|       1|     5260|
|           12|    NULL|     4937|
|           13|    NULL|     4509|
|           14|    NULL|     4101|
|           15|    NULL|     3330|
|           16|    NULL|     2816|
|           17|    NULL|     2494|
|           18|    NULL|     2280|
|           19|    NULL|     1934|
+-------------+--------+---------+
only showing top 20 rows



                                                                                

In [36]:
# Same pivot as before, restricted to delivery times that include at least one
# canceled order (a canceled order that still has a customer delivery date).
(
    df_orders
    .filter(col("delivery_time").isNotNull())
    .groupBy("delivery_time")
    .pivot("order_status")
    .count()
    .orderBy("delivery_time")
    .filter(col("canceled").isNotNull())
    .show()
)



+-------------+--------+---------+
|delivery_time|canceled|delivered|
+-------------+--------+---------+
|            7|       2|     7480|
|           11|       1|     5260|
|           30|       1|      435|
|           31|       1|      400|
|           36|       1|      258|
+-------------+--------+---------+



                                                                                

Anomaly in orders

1. order_status = "canceled" tapi delivery_time ≠ NULL (ada order_delivered_customer_date)
2. order_status = "delivered" tapi order_approved_at = NULL
3. order_status = "delivered" tapi order_delivered_carrier_date = NULL
4. order_status = "delivered" tapi order_delivered_customer_date = NULL
5. order_id count beda antara orders (99441) dan order_items (98666) -> Artinya ada order yang tidak punya item. Itu tidak valid secara logika.


In [37]:
# Distinct order ids that appear in order_items.
orderid_orderitems = df_order_items.select("order_id").distinct()

# Left-join the orders against those ids: an order without a match has no items
# and gets flag_missing_items = True (NULL otherwise). The helper key from the
# right-hand side is dropped afterwards.
df_orders_flagged = (
    df_orders.alias("x")
    .join(
        orderid_orderitems.alias("y"),
        on=col("x.order_id") == col("y.order_id"),
        how="left",
    )
    .withColumn("flag_missing_items", when(col("y.order_id").isNull(), lit(True)))
    .drop(col("y.order_id"))
)

In [38]:
# Status predicates shared by the rules below.
is_canceled = col("order_status") == "canceled"
is_delivered = col("order_status") == "delivered"

# Each flag is True when its rule is violated and NULL otherwise.
df_orders_flagged = (
    df_orders_flagged
    .withColumn(
        "flag_delivered_customer_but_canceled",
        when(col("order_delivered_customer_date").isNotNull() & is_canceled, lit(True)),
    )
    .withColumn(
        "flag_missing_approvaldate_on_delivered",
        when(col("order_approved_at").isNull() & is_delivered, lit(True)),
    )
    .withColumn(
        "flag_missing_carrierdate_on_delivered",
        when(col("order_delivered_carrier_date").isNull() & is_delivered, lit(True)),
    )
    .withColumn(
        "flag_missing_delivcustdate_on_delivered",
        when(col("order_delivered_customer_date").isNull() & is_delivered, lit(True)),
    )
)

In [39]:
# orders_valid     -> rows that carry no flag at all
# orders_anomalies -> rows that carry at least one flag
# Flags are either True or NULL, so "no flag" means every flag column is NULL.
# isNull() never evaluates to NULL itself, which makes the negation below the exact
# complement of the valid set.
orders_has_no_flag = (
    col("flag_delivered_customer_but_canceled").isNull()
    & col("flag_missing_approvaldate_on_delivered").isNull()
    & col("flag_missing_carrierdate_on_delivered").isNull()
    & col("flag_missing_delivcustdate_on_delivered").isNull()
    & col("flag_missing_items").isNull()
)

df_orders_valid = df_orders_flagged.filter(orders_has_no_flag)

# Complementary filter instead of subtract(): subtract is EXCEPT DISTINCT, which
# compares whole rows and silently de-duplicates, at the cost of an extra shuffle.
df_orders_anomalies = df_orders_flagged.filter(~orders_has_no_flag)

In [40]:
# Count how many anomalous orders raised each flag; the flags are numbered in the
# same order as the anomaly list above (a flag cast to int is 1 when set, NULL otherwise).
df_orders_anomalies.select([
    sum(col(flag).cast("int")).alias(f"anomaly_{position}")
    for position, flag in enumerate(
        [
            "flag_delivered_customer_but_canceled",
            "flag_missing_approvaldate_on_delivered",
            "flag_missing_carrierdate_on_delivered",
            "flag_missing_delivcustdate_on_delivered",
            "flag_missing_items",
        ],
        start=1,
    )
]).show()

25/06/12 13:01:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+---------+---------+---------+---------+---------+
|anomaly_1|anomaly_2|anomaly_3|anomaly_4|anomaly_5|
+---------+---------+---------+---------+---------+
|        6|       14|        2|        8|      775|
+---------+---------+---------+---------+---------+



                                                                                

In [41]:
# Persist the three order outputs as parquet, replacing any existing output:
# the valid orders ...
df_orders_valid.write.parquet(f"{path_head}cleaned/orders/", mode="overwrite")

# ... the anomalous orders ...
df_orders_anomalies.write.parquet(f"{path_head}anomalies/anomaly_orders/", mode="overwrite")

# ... and the full table with all flag columns.
df_orders_flagged.write.parquet(f"{path_head}flagged/anomaly_orders_full/", mode="overwrite")

                                                                                

In [42]:
# Number of distinct order ids that remain in the valid orders.
df_orders_valid.agg(countDistinct("order_id")).show()



+------------------------+
|count(DISTINCT order_id)|
+------------------------+
|                   98637|
+------------------------+



                                                                                

In [43]:
# Count the NULL values in every column of the valid orders
# (the flag columns are expected to be entirely NULL here, since valid rows carry no flag).
df_orders_valid.select([count(when(col(c).isNull(), 1)).alias(c) for c in df_orders_valid.columns]).show()



+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+-------------+------------------+------------------------------------+--------------------------------------+-------------------------------------+---------------------------------------+
|order_id|customer_id|order_status|order_purchase_timestamp|order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|delivery_time|flag_missing_items|flag_delivered_customer_but_canceled|flag_missing_approvaldate_on_delivered|flag_missing_carrierdate_on_delivered|flag_missing_delivcustdate_on_delivered|
+--------+-----------+------------+------------------------+-----------------+----------------------------+-----------------------------+-----------------------------+-------------+------------------+------------------------------------+--------------------------------------+--------------

                                                                                

### Order Items

In [44]:
# Parse shipping_limit_date (yyyy-MM-dd HH:mm:ss) into a timestamp.
df_order_items = df_order_items.withColumn(
    "shipping_limit_date",
    to_timestamp(df_order_items["shipping_limit_date"], "yyyy-MM-dd HH:mm:ss"),
)

In [45]:
# One row per order: total price, total freight, their sum, and the number of items.
df_order_summary = (
    df_order_items
    .select("order_id", "order_item_id", "product_id", "price", "freight_value")
    .groupBy("order_id")
    .agg(
        round(sum("price"), 2).alias("total_price"),
        round(sum("freight_value"), 2).alias("total_freight"),
        round(sum("price") + sum("freight_value"), 2).alias("total_order_value"),
        count("order_item_id").alias("total_qty"),
    )
)

In [46]:
# Quantity of each product within an order (one order_items row per unit).
df_product_qty = (
    df_order_items
    .groupBy("order_id", "product_id")
    .agg(count("*").alias("qty"))
)

# Collapse to one row per order holding a list of (product_id, qty) structs.
df_product_struct = (
    df_product_qty
    .groupBy("order_id")
    .agg(collect_list(struct("product_id", "qty")).alias("product_qty_list"))
)

In [47]:
# Combine the per-order totals with the per-order product list and preview the result.
df_order_items_final = df_order_summary.join(
    df_product_struct,
    on="order_id",
    how="left",
)
df_order_items_final.show(n=5, truncate=False)

                                                                                

+--------------------------------+-----------+-------------+-----------------+---------+---------------------------------------+
|order_id                        |total_price|total_freight|total_order_value|total_qty|product_qty_list                       |
+--------------------------------+-----------+-------------+-----------------+---------+---------------------------------------+
|019886de8f385a39b75bedbb726fd4ef|159.9      |28.5         |188.4            |1        |[{e9a69340883a438c3f91739d14d3a56d, 1}]|
|01a6ad782455876aa89081449d49c452|34.99      |15.1         |50.09            |1        |[{036734b5a58d5d4f46b0616ddc047ced, 1}]|
|05bef443b850685058070c9e781988e8|55.0       |24.52        |79.52            |1        |[{44e086c4a977f37a888627b43880586c, 1}]|
|077700dcf4e3bb4128459fc825a4056c|199.9      |15.15        |215.05           |1        |[{6cd0d08f09a8b32450fd32de16265a74, 1}]|
|09e90e3936db197d43b9bde1291c307d|72.9       |14.09        |86.99            |1        |[{e4c7ed7

In [48]:
# Row count of the per-order summary (one row per order_id that has items).
df_order_items_final.count()

                                                                                

98666

In [49]:
# Persist the per-order items summary as parquet, replacing any existing output.
df_order_items_final.write.parquet(f"{path_head}cleaned/order_items/", mode="overwrite")

                                                                                

### Order Payments

In [50]:
# Spot check on one order that was paid in several parts.
# An orderBy() placed before groupBy() does not survive the aggregation shuffle, so
# collect_list could return the payments in any order. Instead, the payments are
# collected as structs led by payment_sequential and sorted inside the aggregate;
# the type and installments arrays are then read from the sorted structs, which
# also keeps both arrays aligned position by position.
(
    df_order_payments
    .filter("order_id = '1d9a9731b9c10fc9cba74e6f74782e8b'")
    .groupBy("order_id")
    .agg(
        round(sum("payment_value"), 2).alias("total_payment"),
        sort_array(
            collect_list(struct("payment_sequential", "payment_type", "payment_installments"))
        ).alias("payments"),
    )
    .select(
        "order_id",
        "total_payment",
        col("payments.payment_type").alias("payment_type"),
        col("payments.payment_installments").alias("payment_installments"),
    )
    .show()
)

[Stage 150:>                                                        (0 + 1) / 1]

+--------------------+-------------+--------------------+--------------------+
|            order_id|total_payment|        payment_type|payment_installments|
+--------------------+-------------+--------------------+--------------------+
|1d9a9731b9c10fc9c...|        63.58|[credit_card, vou...|[1, 1, 1, 1, 1, 1...|
+--------------------+-------------+--------------------+--------------------+



                                                                                

In [51]:
# One row per order: total amount paid, plus the payment types and installment
# counts listed in payment_sequential order.
# An orderBy() placed before groupBy() does not survive the aggregation shuffle, so
# collect_list could return the payments in any order. Instead, the payments are
# collected as structs led by payment_sequential and sorted inside the aggregate;
# reading both arrays from the same sorted structs also keeps them aligned.
# Note: a NULL payment_type / payment_installments now appears as a NULL array
# element instead of being skipped by collect_list.
df_payments_summary = (
    df_order_payments
    .groupBy("order_id")
    .agg(
        round(sum("payment_value"), 2).alias("total_payment"),
        sort_array(
            collect_list(struct("payment_sequential", "payment_type", "payment_installments"))
        ).alias("payments"),
    )
    .select(
        "order_id",
        "total_payment",
        col("payments.payment_type").alias("payment_type"),
        col("payments.payment_installments").alias("payment_installments"),
    )
)

In [52]:
# Preview the per-order payment summary without truncating the array columns.
df_payments_summary.show(5, truncate=False)

[Stage 157:>                                                        (0 + 1) / 1]

+--------------------------------+-------------+-------------+--------------------+
|order_id                        |total_payment|payment_type |payment_installments|
+--------------------------------+-------------+-------------+--------------------+
|000229ec398224ef6ca0657da4fc703e|216.87       |[credit_card]|[5]                 |
|00054e8431b9d7675808bcb819fb4a32|31.75        |[credit_card]|[1]                 |
|000576fe39319847cbb9d288c5617fa6|880.75       |[credit_card]|[10]                |
|0005a1a1728c9d785b8e2b08b904576c|157.6        |[credit_card]|[3]                 |
|0005f50442cb953dcd1d21e1fb923495|65.39        |[credit_card]|[1]                 |
+--------------------------------+-------------+-------------+--------------------+
only showing top 5 rows



                                                                                

In [53]:
# Row count of the payment summary (one row per order_id).
df_payments_summary.count()

                                                                                

99441

In [54]:
# Persist the per-order payment summary as parquet, replacing any existing output.
df_payments_summary.write.parquet(f"{path_head}cleaned/order_payments/", mode="overwrite")

                                                                                

### Order Reviews

In [55]:
# Parse review_answer_timestamp (yyyy-MM-dd HH:mm:ss) into a timestamp.
df_order_reviews = df_order_reviews.withColumn(
    "review_answer_timestamp",
    to_timestamp(df_order_reviews["review_answer_timestamp"], "yyyy-MM-dd HH:mm:ss"),
)

In [56]:
# Parse review_creation_date as a timestamp first, then keep only its date part.
df_order_reviews = df_order_reviews.withColumn(
    "review_creation_date",
    to_timestamp(col("review_creation_date"), "yyyy-MM-dd HH:mm:ss").cast("date"),
)

In [57]:
# Inspect the column names and types of the reviews table after the date parsing.
df_order_reviews.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: date (nullable = true)
 |-- review_answer_timestamp: timestamp (nullable = true)
 |-- review_creation_date_is_valid: boolean (nullable = true)
 |-- review_answer_timestamp_is_valid: boolean (nullable = true)



Anomaly in order_reviews

1. review_id or order_id or review_score is NULL
2. review_score = 0 (tidak masuk akal)
3. order_id yang formatnya tidak valid (bukan 32 karakter heksadesimal)
4. duplikat > 2 masuk ke nomor 3
   duplikat == 2 -> kemungkinan update reviews

In [59]:
# Add one flag column per anomaly rule; a flag is True when the rule is violated
# and NULL otherwise.
df_order_reviews_flagged = (
    df_order_reviews
    # Rule 1: at least one of review_id / order_id / review_score is missing.
    .withColumn(
        "flag_missing_reviewid_orderid_score",
        when(
            ~(
                col("review_id").isNotNull()
                & col("order_id").isNotNull()
                & col("review_score").isNotNull()
            ),
            lit(True),
        ),
    )
    # Rule 2: a review score of 0.
    .withColumn("flag_review_score_0", when(col("review_score") == 0, lit(True)))
    # Rule 3: order_id is not exactly 32 lower-case hexadecimal characters.
    .withColumn(
        "flag_orderid_not_valid",
        when(~col("order_id").rlike("^[a-f0-9]{32}$"), lit(True)),
    )
)

In [60]:
# Count the reviews sharing each order_id and flag every row whose order_id
# occurs more than once.
window_orderid = Window.partitionBy("order_id")
df_order_reviews_flagged = (
    df_order_reviews_flagged
    .withColumn("order_id_count", count("order_id").over(window_orderid))
    .withColumn("flag_duplicate", when(col("order_id_count") > 1, lit(True)))
)

In [61]:
# Remove rows with a NULL in any key column. A usable review needs a review_id,
# an order_id AND a review_score, so the three checks are combined with `&`.
# (With `|`, a row was kept as soon as one of the three was present, which let
# rows with a NULL review_id through.)
df_order_reviews_clean = df_order_reviews_flagged.filter(
    col("review_id").isNotNull() &
    col("order_id").isNotNull() &
    col("review_score").isNotNull()
)

In [62]:
# Remove rows whose order_id is not a valid id (exactly 32 lower-case hexadecimal characters).
df_order_reviews_clean = df_order_reviews_clean.where(
    df_order_reviews_clean["order_id"].rlike("^[a-f0-9]{32}$")
)

In [63]:
# Remove reviews with a score of 0, which is not a meaningful rating.
df_order_reviews_clean = df_order_reviews_clean.where(
    df_order_reviews_clean["review_score"] > 0
)

In [64]:
# Drop duplicate reviews per order, keeping the most recent one.
# review_creation_date is a DATE, so several reviews of the same order can tie on
# it and row_number() would then pick one arbitrarily. The answer timestamp
# (latest first) and the review_id are added as tie-breakers so the surviving row
# is the same on every run.
window = Window.partitionBy("order_id").orderBy(
    col("review_creation_date").desc(),
    col("review_answer_timestamp").desc(),
    col("review_id"),
)

df_order_reviews_clean = df_order_reviews_clean.withColumn("rn", row_number().over(window)) \
                                   .filter(col("rn") == 1) \
                                   .drop("rn")

In [65]:
# The helper flag/count columns are not part of the cleaned output; remove them.
df_order_reviews_clean = df_order_reviews_clean.drop(*[
    "flag_missing_reviewid_orderid_score",
    "flag_review_score_0",
    "flag_orderid_not_valid",
    "order_id_count",
    "flag_duplicate",
])

In [66]:
# Verify the de-duplication: list any order_id that still has more than one review
# (an empty result means none is left).
df_order_reviews_clean.groupBy("order_id").count().filter('count > 1').show()

[Stage 178:>                                                        (0 + 1) / 1]

+--------+-----+
|order_id|count|
+--------+-----+
+--------+-----+



                                                                                

In [67]:
# Distinct order ids vs. total rows in the reviews table before cleaning.
df_order_reviews.agg(countDistinct("order_id"), count("*")).show()



+------------------------+--------+
|count(DISTINCT order_id)|count(1)|
+------------------------+--------+
|                   99742|  104162|
+------------------------+--------+



                                                                                

In [68]:
# Distinct order ids vs. total rows after cleaning; equal numbers mean one review per order.
df_order_reviews_clean.agg(countDistinct("order_id"), count("*")).show()



+------------------------+--------+
|count(DISTINCT order_id)|count(1)|
+------------------------+--------+
|                   98673|   98673|
+------------------------+--------+



                                                                                

In [69]:
# A review is anomalous when at least one of its flags is set. The flags are
# boolean columns (True or NULL), so they can be used as predicates directly.
df_order_reviews_anomalies = df_order_reviews_flagged.filter(
    col("flag_missing_reviewid_orderid_score")
    | col("flag_review_score_0")
    | col("flag_orderid_not_valid")
    | col("flag_duplicate")
)

# Count how many anomalous reviews raised each flag.
df_order_reviews_anomalies.select([
    sum(col(flag).cast("int")).alias(f"anomaly_{position}")
    for position, flag in enumerate(
        [
            "flag_missing_reviewid_orderid_score",
            "flag_review_score_0",
            "flag_orderid_not_valid",
            "flag_duplicate",
        ],
        start=1,
    )
]).show()



+---------+---------+---------+---------+
|anomaly_1|anomaly_2|anomaly_3|anomaly_4|
+---------+---------+---------+---------+
|     4937|        1|     2702|     3134|
+---------+---------+---------+---------+



                                                                                

In [70]:
# Number of anomalous reviews next to the total number of flagged-table rows.
df_order_reviews_anomalies.count(), df_order_reviews_flagged.count()

                                                                                

(6036, 104162)

In [71]:
# Persist the three review outputs as parquet, replacing any existing output:
# the cleaned reviews ...
df_order_reviews_clean.write.parquet(f"{path_head}cleaned/order_reviews/", mode="overwrite")

# ... the anomalous reviews ...
df_order_reviews_anomalies.write.parquet(f"{path_head}anomalies/anomaly_order_reviews/", mode="overwrite")

# ... and the full table with all flag columns.
df_order_reviews_flagged.write.parquet(f"{path_head}flagged/anomaly_orderreviews_full/", mode="overwrite")


                                                                                