# Bucketing

In this notebook you will save the data in a bucketed storage layout and register it as a table in the metastore.

In [None]:
from pyspark.sql import SparkSession
import os

In [None]:
# Reuse the running session if there is one, otherwise start a new one.
# Hive support is enabled so that tables saved with saveAsTable are registered in the metastore.
spark = SparkSession.builder.appName('Bucketing I').enableHiveSupport().getOrCreate()

In [None]:
# Working directory of the notebook kernel.
base_path = os.getcwd()

# Project root: three directory levels above the working directory.
# Navigating with os.path (instead of splitting on a hard-coded '/') keeps this
# correct on platforms with a different separator and guarantees an absolute
# path even when the working directory is fewer than three levels deep.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Location of the users input data.
users_input_path = os.path.join(project_path, 'data/users')

# Location where the bucketed users table is written.
output_path = os.path.join(project_path, 'output/users-bucketed')

# Task

* Create a metastore table for users with the data bucketed by `user_id` into 10 buckets. Make sure you end up with 10 files (one file per bucket).

In [None]:
# Load the users data set; no format is given, so the session's default data source is used.
usersDF = spark.read.load(users_input_path)

#### Save the data

Hint:
* repartition by the same column that you use for bucketing (`user_id`)
    * choose the same number of partitions as you want buckets (10)
* use [bucketBy](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrameWriter.bucketBy.html#pyspark.sql.DataFrameWriter.bucketBy) with 10 buckets

In [None]:
# The repartition and the bucketing use the same column and the same count,
# which is what the task relies on to end up with one file per bucket.
bucket_count = 10
bucket_column = 'user_id'

users_by_bucket = usersDF.repartition(bucket_count, bucket_column)

# Write the bucketed, sorted data to output_path and register it as table `users`.
(
    users_by_bucket
    .write
    .mode('overwrite')
    .bucketBy(bucket_count, bucket_column)
    .sortBy(bucket_column)
    .option('path', output_path)
    .saveAsTable('users')
)

<b>Check the metastore table:</b>

Hint:
* use SQL
    * `show tables`
    * `describe extended table_name`

In [None]:
# Read the table back through the metastore and count its rows.
users_table = spark.table('users')
users_table.count()

In [None]:
# List the tables known to the metastore in the current database.
tables_df = spark.sql("show tables")
tables_df.show()

In [None]:
# Print the extended table description: up to 50 rows, cell values cut at 60 characters.
users_description = spark.sql("describe extended users")
users_description.show(n=50, truncate=60)

In [None]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()