# Partitioning

In this notebook you will partition data in the storage layout and see how to steer the number of generated files.

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, round, rand

import os

In [3]:
# Start a Spark session for this notebook, or attach to one that already exists.
session_builder = SparkSession.builder.appName('Partitioning I')
spark = session_builder.getOrCreate()

In [4]:
# Working directory of the notebook kernel; the project root is taken to be
# three directory levels above it.
base_path = os.getcwd()

# Walk up with os.path instead of splitting on a hard-coded '/', so the result
# is also correct on platforms whose path separator is not '/' (e.g. Windows),
# where the split-based version silently produced an empty project path.
project_path = os.path.normpath(
    os.path.join(base_path, os.pardir, os.pardir, os.pardir)
)

# Location of the transformed questions data read by this notebook.
questions_input_path = os.path.join(project_path, 'output/questions-transformed')

# Destinations for the partitioned output of Task I and Task II respectively.
output_path_I = os.path.join(project_path, 'output/1/questions-partitioned')
output_path_II = os.path.join(project_path, 'output/2/questions-partitioned')

# Task I

* partition questions by `year` (derived from `creation_date`) and make one file per folder

In [5]:
# Load the questions dataset (using the session's default data source format)
# and derive a `year` column from `creation_date`.
questions_reader = spark.read.option('path', questions_input_path)
questionsDF = questions_reader.load().withColumn(
    'year',
    year(col('creation_date')),
)

#### Save the data:

Hint:
* [repartition](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.repartition) data by year to achieve one file per folder
* call [partitionBy](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameWriter.partitionBy) on DataFrameWriter

In [6]:
# Shuffle so that all rows of a given year end up in the same in-memory
# partition; the writer then emits a single file into each year=<value> folder.
questions_by_year = questionsDF.repartition('year')

year_writer = questions_by_year.write.mode('overwrite').partitionBy('year')
year_writer.option('path', output_path_I).save()

# Task II

Partition questions by year (derived from creation_date) and make five files per folder

Hint:
* repartition data by year and a random expression that generates a random integer from the interval [0, 4]
    * use [rand](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.rand)
    * use modulo operator %

In [13]:
# Salt column: rand(12) is a seeded uniform value in [0, 1); scaling by 100,
# truncating to int and taking modulo 5 yields an integer in {0, 1, 2, 3, 4}.
file_bucket = (rand(12) * 100).cast('int') % 5

# Repartitioning by (year, salt) spreads each year's rows over the salt values,
# so the year folders written below are split across several files.
# NOTE(review): distinct (year, salt) pairs could hash to the same shuffle
# partition, so five files per folder is presumably an upper bound - confirm.
questions_salted = questionsDF.repartition('year', file_bucket)

salted_writer = questions_salted.write.mode('overwrite').partitionBy('year')
salted_writer.option('path', output_path_II).save()

In [14]:
# Stop the Spark session created at the top of the notebook; this is the last cell.
spark.stop()