# Bucketing

In [None]:
# findspark locates the local Spark installation and adds pyspark to the
# import path; init() is called here, ahead of the pyspark imports that follow.
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, year, round, rand
)

import os

In [None]:
# Build (or reuse) the notebook's Spark session with Hive support enabled,
# which the metastore table operations further down rely on.
session_builder = SparkSession.builder.appName('Bucketing I')
session_builder = session_builder.enableHiveSupport()
spark = session_builder.getOrCreate()

In [None]:
# Resolve the project directories relative to the current working directory.
base_path = os.getcwd()

# The project root sits two levels above the working directory.
# os.path.dirname uses the platform's path rules and never collapses to an
# empty string (it bottoms out at the filesystem root), so the joined paths
# below always stay absolute -- unlike splitting on a hard-coded '/'.
project_path = os.path.dirname(os.path.dirname(base_path))

# Directory holding the input users dataset.
users_input_path = os.path.join(project_path, 'data/users')

# Directory that receives the files of the bucketed users table.
output_path = os.path.join(project_path, 'output/users-bucketed')

# Task

* Create a metastore table for users, with the data bucketed by `user_id` into 10 buckets.

In [None]:
# Load the users dataset from users_input_path. No explicit format is set,
# so the reader uses the session's default data source.
users_reader = spark.read.option('path', users_input_path)
usersDF = users_reader.load()

In [None]:
# Repartition by user_id into 10 partitions first, then save the result as
# the metastore table 'users', bucketed (10 buckets) and sorted by user_id,
# with its files stored under output_path. Existing data is overwritten.
users_by_id = usersDF.repartition(10, 'user_id')

bucketed_writer = (
    users_by_id.write
    .mode('overwrite')
    .option('path', output_path)
    .bucketBy(10, 'user_id')
    .sortBy('user_id')
)
bucketed_writer.saveAsTable('users')

<b>Check the metastore table:</b>

In [None]:
# Read the table back through the metastore and count its rows.
users_table = spark.table('users')
users_table.count()

In [None]:
# List the tables known to the session's catalog.
tables_df = spark.sql("show tables")
tables_df.show()

In [None]:
# Print the extended description of the users table: up to 50 rows,
# with each cell truncated to 60 characters.
table_details = spark.sql("describe extended users")
table_details.show(n=50, truncate=60)