In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
sc = pyspark.SparkContext()
spark = SparkSession(sc)

In [3]:
total_users = 100
total_orders = 1000000

states = ['WA', 'CA', 'NY', 'MI', 'AZ', 'CO']
products = ['ITEM1', 'ITEM2', 'ITEM3', 'ITEM4', 'ITEM5']

import random
user_df = sc.parallelize(range(total_users)) \
    .map(lambda id: (id, 'user_' + str(id), states[random.randrange(len(states))])) \
    .toDF(['user_id', 'user_name', 'user_state'])

In [4]:
order_df = sc.parallelize(range(total_orders)) \
    .map(lambda id: (id, random.randrange(100), random.randrange(len(products)), random.randrange(total_users))) \
    .toDF(['order_id', 'qty', 'product_id', 'user_id'])

In [5]:
user_df.show()
order_df.show()

+-------+---------+----------+
|user_id|user_name|user_state|
+-------+---------+----------+
|      0|   user_0|        MI|
|      1|   user_1|        CA|
|      2|   user_2|        WA|
|      3|   user_3|        CO|
|      4|   user_4|        CO|
|      5|   user_5|        CO|
|      6|   user_6|        MI|
|      7|   user_7|        MI|
|      8|   user_8|        MI|
|      9|   user_9|        NY|
|     10|  user_10|        CO|
|     11|  user_11|        NY|
|     12|  user_12|        WA|
|     13|  user_13|        CA|
|     14|  user_14|        AZ|
|     15|  user_15|        CO|
|     16|  user_16|        MI|
|     17|  user_17|        CO|
|     18|  user_18|        CA|
|     19|  user_19|        CA|
+-------+---------+----------+
only showing top 20 rows

+--------+---+----------+-------+
|order_id|qty|product_id|user_id|
+--------+---+----------+-------+
|       0| 32|         0|     79|
|       1| 82|         0|     48|
|       2| 44|         2|     37|
|       3| 21|         1| 

In [7]:
print('Number of partitions: {}'.format(order_df.rdd.getNumPartitions()))
#print('Partition structure: {}'.format(order_df.rdd.glom().collect()))


Number of partitions: 6


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
order_df.write.format('csv').option('header', 'true').save('orders')

In [8]:
# explicitly told the number of partitions at repartitioning
# order_df_partitioned.repartition(5)

order_df_partitioned = order_df.repartition(2, 'product_id')

In [9]:
order_df_partitioned.rdd.getNumPartitions()

2

In [10]:
order_df_partitioned.rdd.glom().take(10)

[[Row(order_id=3, qty=50, product_id=2, user_id=94),
  Row(order_id=10, qty=41, product_id=2, user_id=81),
  Row(order_id=11, qty=7, product_id=4, user_id=82),
  Row(order_id=13, qty=82, product_id=2, user_id=37),
  Row(order_id=16, qty=6, product_id=4, user_id=21),
  Row(order_id=19, qty=92, product_id=2, user_id=19),
  Row(order_id=20, qty=8, product_id=4, user_id=96),
  Row(order_id=21, qty=5, product_id=4, user_id=57),
  Row(order_id=23, qty=93, product_id=2, user_id=10),
  Row(order_id=24, qty=60, product_id=2, user_id=13),
  Row(order_id=25, qty=30, product_id=4, user_id=88),
  Row(order_id=26, qty=57, product_id=4, user_id=6),
  Row(order_id=30, qty=43, product_id=4, user_id=26),
  Row(order_id=33, qty=51, product_id=4, user_id=59),
  Row(order_id=34, qty=20, product_id=2, user_id=76),
  Row(order_id=36, qty=54, product_id=2, user_id=57),
  Row(order_id=37, qty=90, product_id=2, user_id=56),
  Row(order_id=41, qty=4, product_id=4, user_id=56),
  Row(order_id=42, qty=73, product_

In [12]:
df2 = user_df.repartition(2, 'user_state')
df2.rdd.glom().collect()

[[Row(user_id=4, user_name='user_4', user_state='AZ'),
  Row(user_id=5, user_name='user_5', user_state='MI'),
  Row(user_id=11, user_name='user_11', user_state='MI'),
  Row(user_id=12, user_name='user_12', user_state='MI'),
  Row(user_id=13, user_name='user_13', user_state='MI'),
  Row(user_id=14, user_name='user_14', user_state='NY'),
  Row(user_id=20, user_name='user_20', user_state='AZ'),
  Row(user_id=21, user_name='user_21', user_state='MI'),
  Row(user_id=27, user_name='user_27', user_state='MI'),
  Row(user_id=28, user_name='user_28', user_state='MI'),
  Row(user_id=29, user_name='user_29', user_state='MI'),
  Row(user_id=30, user_name='user_30', user_state='NY'),
  Row(user_id=32, user_name='user_32', user_state='NY'),
  Row(user_id=37, user_name='user_37', user_state='AZ'),
  Row(user_id=38, user_name='user_38', user_state='MI'),
  Row(user_id=44, user_name='user_44', user_state='MI'),
  Row(user_id=45, user_name='user_45', user_state='MI'),
  Row(user_id=46, user_name='user_4

In [None]:
order_df_reduced = order_df_partitioned.coalesce(1)
order_df_reduced.rdd.getNumPartitions()

In [None]:
order_per_prod = order_df.groupby('product_id').count()
order_per_prod.show()

In [None]:
order_per_prod.rdd.getNumPartitions()

In [None]:
order_per_prod.rdd.glom().collect()

In [None]:
d = order_per_prod.coalesce(1)

In [None]:
d.rdd.getNumPartitions()

In [13]:
print(spark.conf.get('spark.sql.shuffle.partitions'))
#spark.conf.set('spark.sql.shuffle.partitions', 100)
#print(spark.conf.get('spark.sql.shuffle.partitions'))

200


In [None]:
order_per_prod = order_df.groupby('product_id').count()
print(order_per_prod.rdd.getNumPartitions())
order_per_prod.show()