In [8]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("olistData") \
    .getOrCreate()


In [9]:
hdfs_path='/data/olist'

In [23]:
customers_df = spark.read.csv(hdfs_path + "/olist_customers_dataset.csv", header=True, inferSchema=True)
geolocation_df = spark.read.csv(hdfs_path + "/olist_geolocation_dataset.csv", header=True, inferSchema=True)
order_items_df = spark.read.csv(hdfs_path + "/olist_order_items_dataset.csv", header=True, inferSchema=True)
payments_df = spark.read.csv(hdfs_path + "/olist_order_payments_dataset.csv", header=True, inferSchema=True)
reviews_df = spark.read.csv(hdfs_path + "/olist_order_reviews_dataset.csv", header=True, inferSchema=True)
orders_df = spark.read.csv(hdfs_path + "/olist_orders_dataset.csv", header=True, inferSchema=True)
products_df = spark.read.csv(hdfs_path + "/olist_products_dataset.csv", header=True, inferSchema=True)
sellers_df = spark.read.csv(hdfs_path + "/olist_sellers_dataset.csv", header=True, inferSchema=True)
category_translation_df = spark.read.csv(hdfs_path + "/product_category_name_translation.csv", header=True, inferSchema=True)

                                                                                

In [None]:
#handle missing values

In [None]:
#  case 
# a drop missing values
# fill missing values


In [24]:
# Dropping rows where any of the specified columns have NULL values
orders_df_cleaned = orders_df.na.drop(subset=["order_id", "customer_id", "order_status"])

# Show the updated DataFrame
orders_df_cleaned.show()


+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [25]:
from pyspark.sql.functions import lit

# Creating a new DataFrame by filling NULL values
orders_df_filled = orders_df.fillna({
    "order_id": "000000",     # Fill order_id NULLs with "UNKNOWN_ORDER"
    "customer_id": "00", # Fill customer_id NULLs with "UNKNOWN_CUSTOMER"
    "order_status": "PENDING"        # Fill order_status NULLs with "PENDING"
})

# Show the updated DataFrame
orders_df_filled.show()


+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [13]:
def print_schema(df,df_name):
    print(f'schema of {df_name}:')
    df.printSchema()

In [14]:
print_schema(orders_df,'order')

schema of order:
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)



In [15]:
print_schema(payments_df,'payments')

schema of payments:
root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)



In [36]:
from pyspark.sql.functions import when, col

# Create a new DataFrame with 'boleto' replaced by 'Bank Transfer' while keeping others unchanged
payments_df_cleaned = payments_df.withColumn("payment_type",when(col("payment_type") == "boleto", "Bank Transfer").otherwise(col("payment_type"))
)

# Show the new DataFrame
payments_df_cleaned.show()
payments_df_cleaned.printSchema()


+--------------------+------------------+-------------+--------------------+-------------+
|            order_id|payment_sequential| payment_type|payment_installments|payment_value|
+--------------------+------------------+-------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1|  credit_card|                   8|        99.33|
|a9810da82917af2d9...|                 1|  credit_card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1|  credit_card|                   1|        65.71|
|ba78997921bbcdc13...|                 1|  credit_card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1|  credit_card|                   2|       128.45|
|298fcdf1f73eb413e...|                 1|  credit_card|                   2|        96.12|
|771ee386b001f0620...|                 1|  credit_card|                   1|        81.16|
|3d7239c394a212faa...|                 1|  credit_card|                   3|        51.84|

In [16]:
customers_df.show()

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|06b8999e2fba1a1fb...|861eff4711a542e4b...|                   14409|              franca|            SP|
|18955e83d337fd6b2...|290c77bc529b7ac93...|                    9790|sao bernardo do c...|            SP|
|4e7b3e00288586ebd...|060e732b5b29e8181...|                    1151|           sao paulo|            SP|
|b2b6027bc5c5109e5...|259dac757896d24d7...|                    8775|     mogi das cruzes|            SP|
|4f2d8ab171c80ec83...|345ecd01c38d18a90...|                   13056|            campinas|            SP|
|879864dab9bc30475...|4c93744516667ad3b...|                   89254|      jaragua do sul|            SC|
|fd826e7cf63160e53...|addec96d2e059c80c...|            

In [18]:
customers_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: integer (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [27]:
from pyspark.sql.functions import col

# Cast 'customer_zip_code_prefix' to String type
customers_df_cleaned = customers_df.withColumn(
    "customer_zip_code_prefix", col("customer_zip_code_prefix").cast("string")
)

# Show the updated DataFrame
customers_df_cleaned.printSchema()


root
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)



In [28]:
customers_df_cleaned=customers_df_cleaned.dropDuplicates(['customer_id'])

In [21]:
customers_df_cleaned.show()

[Stage 25:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------------------+--------------------+--------------+
|         customer_id|  customer_unique_id|customer_zip_code_prefix|       customer_city|customer_state|
+--------------------+--------------------+------------------------+--------------------+--------------+
|00012a2ce6f8dcda2...|248ffe10d632bebe4...|                    6273|              osasco|            SP|
|000161a058600d590...|b0015e09bb4b6e47c...|                   35550|         itapecerica|            MG|
|000379cdec6255224...|0b83f73b19c2019e1...|                    4841|           sao paulo|            SP|
|0004164d20a9e969a...|104bdb7e6a6cdceaa...|                   13272|            valinhos|            SP|
|000419c5494106c30...|14843983d4a159080...|                   24220|             niteroi|            RJ|
|00050bf6e01e69d5c...|e3cf594a99e810f58...|                   98700|                ijui|            RS|
|00072d033fe2e5906...|b7c13491fd2aecd93...|            

                                                                                

In [12]:
orders_df.show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [29]:
from pyspark.sql.functions import to_date, col

# Convert 'order_purchase_timestamp' to date format (removing time part)
order_df_cleaned = orders_df.withColumn(
    "order_purchase_date", to_date(col("order_purchase_timestamp"))
)

# Show the updated DataFrame
order_df_cleaned.show()

# Verify schema
order_df_cleaned.printSchema()


+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_purchase_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|         2017-10-02|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27

In [34]:
order_with_details = order_df_cleaned.join(order_items_df, on="order_id", how="left")

# Join order_df_details with payments_df_cleaned (Left Join on order_id)
order_with_details = order_with_details.join(payments_df_cleaned, on="order_id", how="left")

# Join order_df_details with customer_df_cleaned (Left Join on customer_id)
order_with_details = order_with_details.join(customers_df_cleaned, on="customer_id", how="left")

# Show the final DataFrame
order_with_details.show(5)

[Stage 62:>                                                         (0 + 1) / 1]

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+------------------+-------------+--------------------+-------------+--------------------+------------------------+-------------+--------------+
|         customer_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_purchase_date|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|payment_sequential| payment_type|payment_installments|payment_value|  customer_unique_id|customer_zip_code_prefix|customer_city|customer_state|
+--------------------+--------------------+------------+------------------------+-------------------

                                                                                

In [40]:
from pyspark.sql.functions import sum

order_with_total_values = order_with_details.groupBy('order_id').agg(sum('payment_value').alias('total_order_value'))


In [42]:
order_with_total_values.show(5)



+--------------------+-----------------+
|            order_id|total_order_value|
+--------------------+-----------------+
|118045506e1c1dda0...|           1802.0|
|f44cb69655f8e4d13...|           164.32|
|edcc6b79e8394346b...|           162.63|
|9f98d6530155e3b38...|           316.76|
|949280c70c6d62ec9...|            49.42|
+--------------------+-----------------+
only showing top 5 rows



                                                                                

In [44]:
order_items_df.show()

+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35|  58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13| 239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30| 199.0|        17.87|
|00024acbcdf0a6daa...|            1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18| 12.99|        12.79|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51| 199.9|        18.14|
|00048cc3ae777c65d...|            1|ef92

In [45]:
# Calculate 1st percentile (0.01) and 99th percentile (0.99)
quantiles = order_items_df.approxQuantile("price", [0.01, 0.99], 0.0)
lower_bound, upper_bound = quantiles

print(f"Lower Bound (1st percentile): {lower_bound}")
print(f"Upper Bound (99th percentile): {upper_bound}")


                                                                                

Lower Bound (1st percentile): 9.99
Upper Bound (99th percentile): 890.0


In [47]:
from pyspark.sql.functions import col

# Filter rows within the quantile range
order_items_df_cleaned= order_items_df.filter(
    (col("price") >= lower_bound) & (col("price") <= upper_bound)
)

# Show the cleaned DataFrame
order_items_df_cleaned.show()


+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date| price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+------+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35|  58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13| 239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30| 199.0|        17.87|
|00024acbcdf0a6daa...|            1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18| 12.99|        12.79|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51| 199.9|        18.14|
|00048cc3ae777c65d...|            1|ef92

In [48]:
products_df.show()

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|
|96bd76ec8810374ed...|        esporte_lazer|                 46|                       250|    

In [49]:
from pyspark.sql.functions import when, col

products_df_cleaned = products_df.withColumn(
    'product_size_category',
    when(col('product_weight_g') < 500, 'Small')
    .when((col('product_weight_g') >= 500) & (col('product_weight_g') <= 2000), 'Medium')
    .otherwise('Large')
)

# Show the updated DataFrame
products_df_cleaned.show()


+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_size_category|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|                Small|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|        

In [50]:
from pyspark.sql.functions import sum

# Group by seller_id and calculate total price
seller_price_df = order_items_df.groupBy("seller_id").agg(sum("price").alias("total_revenue"))

# Show the new DataFrame
seller_price_df.show()


+--------------------+------------------+
|           seller_id|     total_revenue|
+--------------------+------------------+
|8e6cc767478edae94...| 6830.580000000001|
|4d600e08ecbe08258...|           4465.34|
|9b1050e85becf3ae9...|             85.14|
|cb5ff1b9715e99589...|              85.0|
|038b75b729c8a9a04...|             467.0|
|64c9a1db4e73e19aa...|             439.0|
|acadd4d36859671cb...|            2381.0|
|33ab10be054370c25...|213.20000000000002|
|bec568278124768c4...|             219.9|
|b76dba6c951ab00dc...|2574.7800000000016|
|33cbbec1e7e1044aa...| 730.6700000000001|
|e9b6c33b71b677376...|             119.9|
|7a67c85e85bb2ce85...|141745.53000000078|
|3d8fa2f5b647373c8...|3571.7299999999996|
|e5c84227854980f1d...|             72.81|
|9d213f303afae4983...|              47.7|
|a435b009cd956ea60...|            361.18|
|ca77545ca4d2dfd14...|             760.0|
|ee2fbacc2fc3794e6...|            464.97|
|c13ef0cfbe42f1907...|             390.0|
+--------------------+------------

In [59]:
!hadoop fs -ls /data/olist_proc

Found 2 items
drwxr-xr-x   - root hadoop          0 2025-04-01 17:17 /data/olist_proc/cleaned_data.parquet
drwxr-xr-x   - root hadoop          0 2025-04-01 17:27 /data/olist_proc/product_df_cleaned.parquet


In [53]:
!hadoop fs -mkdir /data/olist_proc

In [54]:
order_with_details.write.mode("overwrite").parquet("hdfs:///data/olist_proc/cleaned_data.parquet")


                                                                                

In [56]:
order_with_details.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_purchase_date: date (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- cust

In [57]:
products_df_cleaned.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- product_name_lenght: integer (nullable = true)
 |-- product_description_lenght: integer (nullable = true)
 |-- product_photos_qty: integer (nullable = true)
 |-- product_weight_g: integer (nullable = true)
 |-- product_length_cm: integer (nullable = true)
 |-- product_height_cm: integer (nullable = true)
 |-- product_width_cm: integer (nullable = true)
 |-- product_size_category: string (nullable = false)



In [58]:
products_df_cleaned.write.mode("overwrite").parquet("hdfs:///data/olist_proc/product_df_cleaned.parquet")


                                                                                

In [62]:
spark.sql("""
    CREATE EXTERNAL TABLE IF NOT EXISTS cleaned_payments (
        product_id STRING,
        product_category_name STRING,
        product_name_lenght INT,
        product_description_lenght INT,
        product_photos_qty INT,
        product_weight_g INT,
        product_length_cm INT,
        product_height_cm INT,
        product_width_cm INT,
        product_size_category STRING
    )
    STORED AS PARQUET
    LOCATION 'hdfs:///data/olist_proc/product_df_cleaned.parquet'
""")


DataFrame[]

In [63]:
spark.sql(""" select * from cleaned_payments""")

DataFrame[product_id: string, product_category_name: string, product_name_lenght: int, product_description_lenght: int, product_photos_qty: int, product_weight_g: int, product_length_cm: int, product_height_cm: int, product_width_cm: int, product_size_category: string]

In [5]:
spark.sql("SELECT COUNT(*) FROM cleaned_payments").show()


Py4JJavaError: An error occurred while calling o70.sql.
: java.lang.IllegalStateException: LiveListenerBus is stopped.
	at org.apache.spark.scheduler.LiveListenerBus.addToQueue(LiveListenerBus.scala:101)
	at org.apache.spark.scheduler.LiveListenerBus.addToStatusQueue(LiveListenerBus.scala:80)
	at org.apache.spark.sql.internal.SharedState.<init>(SharedState.scala:115)
	at org.apache.spark.sql.SparkSession.$anonfun$sharedState$1(SparkSession.scala:143)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession.sharedState$lzycompute(SparkSession.scala:143)
	at org.apache.spark.sql.SparkSession.sharedState(SparkSession.scala:142)
	at org.apache.spark.sql.SparkSession.$anonfun$sessionState$2(SparkSession.scala:162)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.sql.SparkSession.sessionState$lzycompute(SparkSession.scala:160)
	at org.apache.spark.sql.SparkSession.sessionState(SparkSession.scala:157)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$2(SparkSession.scala:631)
	at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:138)
	at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:630)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:629)
	at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:659)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [64]:
spark.sql("SHOW TABLES").show()


+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|  default|  cleaned_orders|      false|
|  default|cleaned_payments|      false|
|  default| customers_500mb|      false|
+---------+----------------+-----------+



In [66]:
spark.sql("SELECT * FROM cleaned_payments LIMIT 10").show()


+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|product_size_category|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+---------------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|                Small|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|        

In [4]:
from pyspark.sql import SparkSession

# Stop the existing Spark session if it exists
if "spark" in locals() or "spark" in globals():
    spark.stop()