# Bucketing

In [1]:
import findspark
# Run init() before the pyspark imports in the next cell; presumably this makes
# the local Spark installation importable — confirm against the findspark docs.
findspark.init()

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, year, round, rand
)

import os

In [2]:
# Create (or reuse) the session for this notebook with Hive support enabled,
# since the task below saves a table by name with saveAsTable.
spark = SparkSession.builder.appName('Bucketing I').enableHiveSupport().getOrCreate()

In [4]:
# Resolve the project root as the directory two levels above the notebook's
# working directory. os.path is used instead of splitting/joining on '/' so
# the logic does not depend on the platform's path separator.
base_path = os.getcwd()

project_path = os.path.dirname(os.path.dirname(base_path))

# Location of the input users data set.
users_input_path = os.path.join(project_path, 'data', 'users')

# Location where the bucketed table files are written.
output_path = os.path.join(project_path, 'output', 'users-bucketed')

# Task

* Create a metastore table for users, with the data bucketed by `user_id` into 10 buckets.

In [5]:
# Load the users data set from users_input_path; no format is specified, so
# the session's default data source is used.
usersDF = spark.read.load(users_input_path)

In [6]:
# The bucket count and bucketing column are defined once so that the
# in-memory repartitioning and the on-disk bucketing cannot drift apart.
num_buckets = 10
bucket_column = 'user_id'

(
    usersDF
    # Repartition on the bucketing column with the same partition count as
    # buckets before writing — presumably to limit the number of files
    # produced per bucket; verify on the written output directory.
    .repartition(num_buckets, bucket_column)
    .write
    # Replace any table data left over from a previous run.
    .mode('overwrite')
    .bucketBy(num_buckets, bucket_column)
    .sortBy(bucket_column)
    # Store the table files at an explicit location.
    .option('path', output_path)
    .saveAsTable('users')
)

<b>Check the metastore table:</b>

In [4]:
# Look up the saved table by name and count its rows as a sanity check.
spark.table('users').count()

153439

In [3]:
# List the tables visible in the current database.
spark.sql("show tables").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| default|    users|      false|
+--------+---------+-----------+



In [6]:
# Print the extended description of the users table: at most 50 rows, with
# each cell truncated to 60 characters.
spark.sql("describe extended users").show(n=50, truncate=60)

+----------------------------+------------------------------------------------------------+-------+
|                    col_name|                                                   data_type|comment|
+----------------------------+------------------------------------------------------------+-------+
|                     user_id|                                                      bigint|   null|
|                display_name|                                                      string|   null|
|                       about|                                                      string|   null|
|                    location|                                                      string|   null|
|                   downvotes|                                                      bigint|   null|
|                     upvotes|                                                      bigint|   null|
|                  reputation|                                                      bigint|   null|
