In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.master("local[*]").appName("Execution Job").getOrCreate()

In [3]:
products_path = 'data_examples/products_parquet/'
sales_path = 'data_examples/sales_parquet/'
sellers_path = 'data_examples/sellers_parquet/'
paths = [products_path,sales_path,sellers_path]

## Parquet File format

In [4]:
dataframes = []
for p in paths:
  dataframes.append(spark.read.parquet(p)) 

In [5]:
products = dataframes[0]
sales = dataframes[1]
sellers = dataframes[2]

In [7]:
sales.show()

+--------+----------+---------+----------+---------------+--------------------+
|order_id|product_id|seller_id|      date|num_pieces_sold|       bill_raw_text|
+--------+----------+---------+----------+---------------+--------------------+
|       1|         0|        0|2020-07-10|             62|esctmitgntlqljxnC...|
|       2|         0|        0|2020-07-07|             38|cufiduzyskiviokju...|
|       3|         0|        0|2020-07-05|             13|hscngebsortzolelf...|
|       4|         0|        0|2020-07-06|             94|jxhvzoobncxwzkpdl...|
|       5|         0|        0|2020-07-02|             41|nqazvvrqffccuwzpr...|
|       6|         0|        0|2020-07-07|             72|auesyqwlzglbecnmn...|
|       7|         0|        0|2020-07-04|             58|wymwvtmlsrirflpne...|
|       8|         0|        0|2020-07-08|             97|xzsadhvwyzhiboqIu...|
|       9|         0|        0|2020-07-08|             53|gvbzspbwezmfjwmuz...|
|      10|         0|        0|2020-07-0

In [28]:
from pyspark.sql.functions import *
df = sales \
.filter(col('product_id') == 0).groupBy('date').agg(sum('num_pieces_sold').alias('total'))

In [32]:
df.show()

+----------+--------+
|      date|   total|
+----------+--------+
|2020-07-03|966312.0|
|2020-07-07|959833.0|
|2020-07-01|966041.0|
|2020-07-08|972484.0|
|2020-07-04|959798.0|
|2020-07-10|946825.0|
|2020-07-09|950340.0|
|2020-07-06|967997.0|
|2020-07-02|958383.0|
|2020-07-05|960707.0|
+----------+--------+



In [34]:
df.explain()

== Physical Plan ==
*(2) HashAggregate(keys=[date#33], functions=[sum(cast(num_pieces_sold#34 as double))])
+- Exchange hashpartitioning(date#33, 200), true, [id=#288]
   +- *(1) HashAggregate(keys=[date#33], functions=[partial_sum(cast(num_pieces_sold#34 as double))])
      +- *(1) Project [date#33, num_pieces_sold#34]
         +- *(1) Filter (isnotnull(product_id#31) AND (cast(product_id#31 as int) = 0))
            +- *(1) ColumnarToRow
               +- FileScan parquet [product_id#31,date#33,num_pieces_sold#34] Batched: true, DataFilters: [isnotnull(product_id#31), (cast(product_id#31 as int) = 0)], Format: Parquet, Location: InMemoryFileIndex[file:/home/jovyan/work/data_examples/sales_parquet], PartitionFilters: [], PushedFilters: [IsNotNull(product_id)], ReadSchema: struct<product_id:string,date:string,num_pieces_sold:string>




## Partitioning

In [37]:
sales.write.mode("overwrite").partitionBy('date').parquet('data_examples/sales_parquet_partitioned/')

In [60]:
sales1 = spark.read.parquet('data_examples/sales_parquet_partitioned/')

In [61]:
grouped  = sales1.filter(col('date') == '2020-07-07').groupBy('product_id').agg(sum('num_pieces_sold').alias('total'))

In [62]:
grouped.show()

+----------+-----+
|product_id|total|
+----------+-----+
|     17677| 30.0|
|     42638| 48.0|
|     26730| 34.0|
|     34842| 63.0|
|     59799|  3.0|
|     46162| 65.0|
|     65991| 27.0|
|      6548| 73.0|
|     21638| 46.0|
|     25467| 48.0|
|     67162| 23.0|
|     16378| 29.0|
|     67896| 37.0|
|     57556| 98.0|
|      8520| 38.0|
|      4117| 62.0|
|      7235| 10.0|
|     37483| 29.0|
|     15891| 90.0|
|     12852| 55.0|
+----------+-----+
only showing top 20 rows



In [63]:
grouped.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[product_id#610], functions=[sum(cast(num_pieces_sold#612 as double))])
   +- Exchange hashpartitioning(product_id#610, 4), true, [id=#594]
      +- HashAggregate(keys=[product_id#610], functions=[partial_sum(cast(num_pieces_sold#612 as double))])
         +- Project [product_id#610, num_pieces_sold#612]
            +- FileScan parquet [product_id#610,num_pieces_sold#612,date#614] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex[file:/home/jovyan/work/data_examples/sales_parquet_partitioned], PartitionFilters: [isnotnull(date#614), (date#614 = 18450)], PushedFilters: [], ReadSchema: struct<product_id:string,num_pieces_sold:string>




In [58]:
spark.conf.get('spark.sql.shuffle.partitions')

'200'

In [59]:
spark.conf.set('spark.sql.shuffle.partitions','4')

## Bucketing 

In [6]:
movies1 = spark.read.option('Header','true').csv('movies1.csv')

In [7]:
movies2 = spark.read.option('Header','true').csv('movies2.csv')

In [8]:
movies = movies1.join(movies2,'title')

data =movies.groupBy('title').count()

data.show()

data.explain()

+--------------------+-----+
|               title|count|
+--------------------+-----+
|      Jumanji (1995)|    1|
|Father of the Bri...|    1|
|      Copycat (1995)|    1|
|Waiting to Exhale...|    1|
|  Money Train (1995)|    1|
|       Casino (1995)|    1|
|Sense and Sensibi...|    1|
|    GoldenEye (1995)|    1|
|Ace Ventura: When...|    1|
|Grumpier Old Men ...|    1|
| Tom and Huck (1995)|    1|
|      Sabrina (1995)|    1|
|   Get Shorty (1995)|    1|
|American Presiden...|    1|
|         Heat (1995)|    1|
|    Toy Story (1995)|    1|
|Cutthroat Island ...|    1|
| Sudden Death (1995)|    1|
|        Balto (1995)|    1|
|   Four Rooms (1995)|    1|
+--------------------+-----+
only showing top 20 rows

== Physical Plan ==
*(3) HashAggregate(keys=[title#41], functions=[count(1)])
+- Exchange hashpartitioning(title#41, 200), true, [id=#147]
   +- *(2) HashAggregate(keys=[title#41], functions=[partial_count(1)])
      +- *(2) Project [title#41]
         +- *(2) BroadcastHashJoin

In [9]:
movies1.write\
    .bucketBy(16, 'title') \
    .sortBy('title') \
    .saveAsTable('movies1_bucket', format='parquet')

In [10]:
movies2.write\
    .bucketBy(16, 'title') \
    .sortBy('title') \
    .saveAsTable('movies2_bucket', format='parquet')

In [11]:
m1 = spark.table('movies1_bucket')
m2 = spark.table('movies2_bucket')

In [21]:
m = m1.join(m2) \
.filter(m1['movieId'] == 20)
m.show()
m.explain()

+-------+------------------+--------------------+-------+--------------------+--------------------+
|movieId|             title|              genres|movieId|               title|              genres|
+-------+------------------+--------------------+-------+--------------------+--------------------+
|     20|Money Train (1995)|Action|Comedy|Cri...|     23|    Assassins (1995)|Action|Crime|Thri...|
|     20|Money Train (1995)|Action|Comedy|Cri...|     40|Cry, the Beloved ...|               Drama|
|     20|Money Train (1995)|Action|Comedy|Cri...|     31|Dangerous Minds (...|               Drama|
|     20|Money Train (1995)|Action|Comedy|Cri...|     63|Don't Be a Menace...|        Comedy|Crime|
|     20|Money Train (1995)|Action|Comedy|Cri...|     71|    Fair Game (1995)|              Action|
|     20|Money Train (1995)|Action|Comedy|Cri...|    118| If Lucy Fell (1996)|      Comedy|Romance|
|     20|Money Train (1995)|Action|Comedy|Cri...|     25|Leaving Las Vegas...|       Drama|Romance|
