In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
from pyspark.storagelevel import StorageLevel
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

#####Folder creation

In [0]:
dbutils.fs.rm('dbfs:/FileStore/Spotify',recurse=True)

Out[47]: True

In [0]:
dbutils.fs.mkdirs('dbfs:/FileStore/Spotify')


Out[48]: True

In [0]:
dbutils.fs.mkdirs('dbfs:/FileStore/Spotify/raw')

Out[49]: True

In [0]:
dbutils.fs.mkdirs('dbfs:/FileStore/Spotify/output')

Out[50]: True

### 1.Partitioning

#####Read the csv file

In [0]:
options={
    'header':True,
    'inferSchema':True
}

spot_listn_df=(
    spark.read.format('csv').options(**options).load('dbfs:/FileStore/Spotify/raw/Spotify_Listening_Activity.csv')
)
spot_listn_df.show(10,truncate=False)

+-----------+-------+--------------------------+---------------+
|activity_id|song_id|listen_date               |listen_duration|
+-----------+-------+--------------------------+---------------+
|1          |12     |2023-06-27 10:15:47.008867|69             |
|2          |44     |2023-06-27 10:15:47.008867|300            |
|3          |75     |2023-06-27 10:15:47.008867|73             |
|4          |48     |2023-06-27 10:15:47.008867|105            |
|5          |10     |2023-06-27 10:15:47.008867|229            |
|6          |82     |2023-06-27 10:15:47.008867|35             |
|7          |64     |2023-06-27 10:15:47.008867|249            |
|8          |96     |2023-06-27 10:15:47.008867|211            |
|9          |52     |2023-06-27 10:15:47.008867|99             |
|10         |21     |2023-06-27 10:15:47.008867|181            |
+-----------+-------+--------------------------+---------------+
only showing top 10 rows



In [0]:
spot_listn_df.printSchema()

root
 |-- activity_id: integer (nullable = true)
 |-- song_id: integer (nullable = true)
 |-- listen_date: timestamp (nullable = true)
 |-- listen_duration: integer (nullable = true)



In [0]:
spot_listn_df_1=(
    spot_listn_df.withColumn('actual_date', F.to_date(F.col('listen_date'),'yyyy-MM-dd hh:mm:ss SSSSSS'))
                 .withColumn('actual_hr', F.hour(F.col('listen_date')))
)
spot_listn_df_1.show(3)

+-----------+-------+--------------------+---------------+-----------+---------+
|activity_id|song_id|         listen_date|listen_duration|actual_date|actual_hr|
+-----------+-------+--------------------+---------------+-----------+---------+
|          1|     12|2023-06-27 10:15:...|             69| 2023-06-27|       10|
|          2|     44|2023-06-27 10:15:...|            300| 2023-06-27|       10|
|          3|     75|2023-06-27 10:15:...|             73| 2023-06-27|       10|
+-----------+-------+--------------------+---------------+-----------+---------+
only showing top 3 rows



In [0]:
spot_listn_df_1.count()

Out[54]: 11779

##### Write the df partion by listen_date

In [0]:
try:
    print('Write Started')
    (
            spot_listn_df_1.write.partitionBy('actual_date')
                            .format('parquet')
                            .mode('overwrite')
                            .save('dbfs:/FileStore/Spotify/output')
    )
    print('Write completed')
except Exception as e:
    print(f"Error while writing-{str(e)}")


Write Started
Error while writing-name 'spot_listn_df_1' is not defined


##### Write the df partion by listen_date and hour

In [0]:
try:
    print('Write Started')
    (
            spot_listn_df_1.write.partitionBy('actual_date','actual_hr')
                            .format('parquet')
                            .mode('overwrite')
                            .save('dbfs:/FileStore/Spotify/output')
    )
    print('Write completed')
except Exception as e:
    print(f"Error while writing-{str(e)}")



##### Write the df partion by listen_date and 3 files in each partition

In [0]:
try:
    print('Write Started')
    (
            spot_listn_df_1.repartition(3).write
                           .partitionBy('actual_date')
                            .format('parquet')
                            .mode('overwrite')
                            .save('dbfs:/FileStore/Spotify/output')
    )
    print('Write completed')
except Exception as e:
    print(f"Error while writing-{str(e)}")

Write Started
Error while writing-name 'spot_listn_df_1' is not defined


### 2.Bucketing

In [0]:
dbutils.fs.mkdirs('dbfs:/FileStore/bucketing')

Out[7]: True

####Read files

In [0]:
options={
    'header':True,
    'inferSchema':True
}
order_df=(
    spark.read.format('csv')
    .options(**options)
    .load('dbfs:/FileStore/bucketing/orders.csv')
)

In [0]:
order_df.count()

Out[11]: 1000

In [0]:
order_df.show(5)

+--------+----------+-----------+--------+----------+------------+
|order_id|product_id|customer_id|quantity|order_date|total_amount|
+--------+----------+-----------+--------+----------+------------+
|       1|        80|         10|       4|2023-03-20|        1003|
|       2|        69|         30|       3|2023-12-11|         780|
|       3|        61|         20|       4|2023-04-26|        1218|
|       4|        62|         44|       3|2023-08-26|        2022|
|       5|        78|         46|       4|2023-08-05|        1291|
+--------+----------+-----------+--------+----------+------------+
only showing top 5 rows



In [0]:
options={
    'header':True,
    'inferSchema':True
}
products_df=(
    spark.read.format('csv')
    .options(**options)
    .load('dbfs:/FileStore/bucketing/products.csv')
)

In [0]:
products_df.count()

Out[13]: 100

In [0]:
products_df.show(5)

+----------+------------+-----------+-------+-----+-----+
|product_id|product_name|   category|  brand|price|stock|
+----------+------------+-----------+-------+-----+-----+
|         1|   Product_1|Electronics|Brand_4|   26|  505|
|         2|   Product_2|    Apparel|Brand_4|  489|   15|
|         3|   Product_3|    Apparel|Brand_4|  102|  370|
|         4|   Product_4|  Groceries|Brand_1|   47|  433|
|         5|   Product_5|  Groceries|Brand_3|  244|  902|
+----------+------------+-----------+-------+-----+-----+
only showing top 5 rows



In [0]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold",-1)
spark.conf.set("spark.sql.adaptive.enabled", "false")

**Join without bucketing**

In [0]:
joined_df=(
    order_df.join(products_df,order_df.product_id== products_df.product_id,'left_outer')
)

In [0]:
joined_df.write.format('noop').mode('overwrite').option('header','true').save('dbfs:/FileStore/bucketing/output')

In [0]:
joined_df.explain(True)

== Parsed Logical Plan ==
Join LeftOuter, (product_id#47 = product_id#159)
:- Relation [order_id#46,product_id#47,customer_id#48,quantity#49,order_date#50,total_amount#51] csv
+- Relation [product_id#159,product_name#160,category#161,brand#162,price#163,stock#164] csv

== Analyzed Logical Plan ==
order_id: int, product_id: int, customer_id: int, quantity: int, order_date: date, total_amount: int, product_id: int, product_name: string, category: string, brand: string, price: int, stock: int
Join LeftOuter, (product_id#47 = product_id#159)
:- Relation [order_id#46,product_id#47,customer_id#48,quantity#49,order_date#50,total_amount#51] csv
+- Relation [product_id#159,product_name#160,category#161,brand#162,price#163,stock#164] csv

== Optimized Logical Plan ==
Join LeftOuter, (product_id#47 = product_id#159)
:- Relation [order_id#46,product_id#47,customer_id#48,quantity#49,order_date#50,total_amount#51] csv
+- Filter isnotnull(product_id#159)
   +- Relation [product_id#159,product_name#16

**Join with bucketing**

In [0]:
(
    order_df.write.bucketBy(3,"product_id")
    .format('csv')
    .mode('overwrite')
    .option('header','true')
    .option('path','dbfs:/FileStore/bucketing/order_bucketed')
    .saveAsTable('order_bucketed')
)

In [0]:
(
    products_df.write.bucketBy(3,"product_id")
    .format('csv')
    .mode('overwrite')
    .option('header','true')
    .option('path','dbfs:/FileStore/bucketing/product_bucketed')
    .saveAsTable('product_bucketed')
)