## Pivot & Unpivot

##### Prepare dataframe

In [0]:
# Column names for the sample records below, in tuple order.
columns = ['Product', 'Amount', 'Country']

# Sample records as (Product, Amount, Country) tuples.
# Note that ('Orange', 'Peru') occurs twice, so the pivot has something to sum.
data = [
  ('Apple', 1200, 'Peru'),
  ('Avocado', 1100, 'Peru'),
  ('Banana', 1500, 'Peru'),
  ('Orange', 2300, 'Peru'),
  ('Orange', 2200, 'Peru'),
  ('Apple', 1400, 'Brazil'),
  ('Avocado', 1500, 'Brazil'),
  ('Banana', 1100, 'Brazil'),
  ('Orange', 3900, 'Brazil'),
  ('Apple', 2100, 'Columbia'),
  ('Avocado', 1800, 'Columbia'),
  ('Banana', 1400, 'Mexico'),
]

# Only column names are supplied as the schema; the schema Spark ends up
# with is printed right below.
df = spark.createDataFrame(data, schema=columns)
df.printSchema()
df.show(truncate=False)

#### Pivot

In [0]:
# One row per Product, one column per distinct Country;
# each cell holds the summed Amount for that (Product, Country) pair.
pivotDF = (
  df
  .groupBy('Product')
  .pivot('Country')
  .sum('Amount')
)
pivotDF.printSchema()
pivotDF.show(truncate=False)

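It is recommended to pass the list of distinct pivot values as the second argument to `pivot()`; otherwise Spark first has to compute the distinct values itself, which is less efficient.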

In [0]:
# Collect the distinct pivot values up front so that pivot() does not have to
# compute them itself. The values are ordered in Spark because distinct()
# returns rows in no guaranteed order, and the order of this list determines
# the order of the pivoted columns.
countries = [
  row['Country']
  for row in df.select('Country').distinct().orderBy('Country').collect()
]

# Pivot on the explicit value list: one output column per entry in `countries`.
pivotDF = df.groupBy('Product').pivot('Country', countries).sum('Amount')
pivotDF.show(truncate=False)

In [0]:
# Phase 1: pre-aggregate to one row per (Product, Country).
# The aggregated column is named 'sum(Amount)' by Spark.
productCountryTotals = df.groupBy('Product', 'Country').sum('Amount')

# Phase 2: pivot the pre-aggregated rows, summing the phase-1 totals.
pivotDF = (
  productCountryTotals
  .groupBy('Product')
  .pivot('Country')
  .sum('sum(Amount)')
)
pivotDF.show(truncate=False)

#### Unpivot

In [0]:
from pyspark.sql.functions import expr

# stack(n, label1, col1, label2, col2, ...) emits n rows per input row,
# here one (Country, Total) pair for each of the four pivoted country columns.
unpivotExpr = "stack(4, 'Brazil', Brazil, 'Columbia', Columbia, 'Mexico', Mexico, 'Peru', Peru) as (Country,Total)"

# Expand every Product row into (Product, Country, Total) rows ...
stackedDF = pivotDF.select('Product', expr(unpivotExpr))
# ... and drop the combinations that had no data in the pivoted table.
unPivotDF = stackedDF.filter('Total is not null')
unPivotDF.show(truncate=False)

Alternatively, construct the unpivot expression dynamically from the list of countries:

In [0]:
# Build the "'<label>', `<column>`" argument pairs for stack().
# The column reference is quoted with backticks so that country names which
# are not plain SQL identifiers (e.g. containing a space or a hyphen) still
# resolve to the pivoted column instead of producing a parse error.
cntry = ', '.join(f"'{country}', `{country}`" for country in countries)
unpivotExpr = f'stack({len(countries)}, {cntry}) as (Country, Total)'

# Expand each Product row into (Product, Country, Total) rows and drop the
# combinations that had no data in the pivoted table.
unPivotDF = pivotDF.select('Product', expr(unpivotExpr)).where('Total is not null')
unPivotDF.show(truncate=False)

###### Unpivot with reduce, lambda and union

In [0]:
from functools import reduce
from pyspark.sql.functions import lit, col
# All pivoted value columns, i.e. every column except the 'Product' key.
# The loop variable is deliberately not called `col`, so it cannot be confused
# with the `col` function imported from pyspark.sql.functions and used below.
cols = [name for name in pivotDF.columns if name != 'Product']

# For each pivoted column build a (Product, Country, Total) dataframe:
#   - the column name becomes the value of the new column `Country`
#   - the column value becomes the value of the new column `Total`
# Rows with no total for that column are dropped,
# then all of these per-column dataframes are unioned into one.
unPivotDF = reduce(
  lambda df1, df2: df1.union(df2),
  [
    pivotDF.select(
      'Product',
      lit(c).alias('Country'),
      col(c).alias('Total')
    ).where('Total is not null')
    for c in cols
  ]
)

unPivotDF.show(truncate=False)