In [1]:
import pyspark.sql.functions as sf

In [2]:
from pyspark.sql import SparkSession

# Only build a session when no `spark` variable exists in this namespace yet;
# the new session uses a local master and 4G of driver memory.
if 'spark' not in locals():
    spark = (
        SparkSession.builder
        .master("local[*]")
        .config("spark.driver.memory", "4G")
        .getOrCreate()
    )

spark

# Watson Sales Product Sample Data

In this example, we want to have a look at the pivoting capabilities of Spark. Since pivoting is commonly used with sales data containing information for different product categories or countries, we will use a data set called "Watson Sales Product Sample Data" which was downloaded from https://www.ibm.com/communities/analytics/watson-analytics-blog/sales-products-sample-data/

# 1 Load and inspect data

First we load the data. It is provided as a single CSV file, a format that is well supported by Apache Spark.

In [None]:
# Base location of the training data; the CSV path read below is built from it.
# NOTE(review): depending on the S3 connector available to Spark, the scheme
# may need to be s3a:// instead of s3:// — confirm for your environment.
basedir = "s3://dimajix-training/data"

In [32]:
# Read the sales CSV, taking column names from the header row and letting
# Spark infer the column types.
data = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv(basedir + "/watson-sales-products/WA_Sales_Products_2012-14.csv")
)

### Inspect schema

Since we used the existing header information and also let Spark infer appropriate data types, let us inspect the schema now.

In [33]:
# Show the column names together with the data types inferred while reading.
data.printSchema()

root
 |-- Retailer country: string (nullable = true)
 |-- Order method type: string (nullable = true)
 |-- Retailer type: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Product type: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Gross margin: double (nullable = true)



### Inspect pivoting candidates

Now let us find some good candidates for a pivoting column. A pivoting column shouldn't have too many distinct entries, otherwise the result probably doesn't make too much sense and doesn't help the business expert in interpretation.

We can either use
```
data.select("Retailer type").distinct().count()
```
which will give us the number of distinct values for a single column, or we can use the Spark aggregate function `countDistinct` which allows us to retrieve information for multiple columns within a single `select`.

In [37]:
# Number of distinct values per pivot candidate, all within one select.
result = data.select(*[
    sf.countDistinct(column_name)
    for column_name in (
        "Retailer country",
        "Retailer type",
        "Product line",
        "Product type",
        "Quarter",
    )
])

result.toPandas()

Unnamed: 0,count(DISTINCT Retailer country),count(DISTINCT Retailer type),count(DISTINCT Product line),count(DISTINCT Product type),count(DISTINCT Quarter)
0,21,8,5,21,11


# 2 Pivoting by Product Line

The first example pivots by the product line, since it has only five distinct values.

In [39]:
# Rows are grouped by quarter and country; each product line becomes its own
# column holding the summed revenue.
revenue_per_product_line = (
    data
    .groupBy("Quarter", "Retailer Country")
    .pivot("Product line")
    .agg(sf.sum("Revenue"))
)
revenue_per_product_line.toPandas()

Unnamed: 0,Quarter,Retailer Country,Camping Equipment,Golf Equipment,Mountaineering Equipment,Outdoor Protection,Personal Accessories
0,Q3 2013,Sweden,1433530.62,1250510.97,794786.44,48039.27,3577140.77
1,Q4 2012,Spain,3264717.34,1593436.54,954726.68,211146.20,3991933.57
2,Q2 2013,Italy,5873795.00,2924732.03,1966086.80,111329.82,5921051.85
3,Q3 2012,United States,15847275.46,6085923.58,4055966.08,914399.41,22332252.19
4,Q1 2014,Switzerland,3966205.47,2157061.01,1640871.99,53438.20,5615068.69
5,Q2 2012,Germany,5315912.78,2059216.40,1310993.61,358070.12,6708060.03
6,Q1 2013,China,7745789.40,3746186.99,2269686.55,153304.19,8133065.76
7,Q2 2014,Austria,4221897.72,1989929.50,1669665.67,55040.42,7147750.02
8,Q1 2014,China,9213181.55,5195907.57,3799226.38,121771.71,11733720.68
9,Q4 2012,France,6286894.57,2582900.85,1641672.61,449593.78,8153053.17


## 2.1 Exercise

Create an aggregated table with
* Country and Product Line in Rows
* The quantity for each quarter in different columns

In [None]:
# YOUR CODE HERE

# 3 Unpivoting again

Sometimes you just need the opposite operation: you have a data set in pivoted format and want to unpivot it. Older Spark versions provide no simple built-in function for this (Spark 3.4 added `DataFrame.unpivot`), but you can construct the unpivoted table as follows:
* For every pivoted column:
  * Project data frame onto non-pivot columns
  * Add a new column with an appropriate name containing the name of the pivot column as its value
  * Add a new column with an appropriate name containing the values of the pivot column
* Union together all these data frames

## 3.1 Specific Example

Now let us perform these steps for the pivoted table above

In [42]:
# Manual unpivot: for each product-line column, keep the two grouping columns,
# add the product line name as a literal and expose that column's values as
# "Revenue". All five frames share the same column layout.
revenue_camping = revenue_per_product_line.select(
    "Quarter",
    "Retailer Country",
    sf.lit("Camping Equipment").alias("Product line"),
    revenue_per_product_line["Camping Equipment"].alias("Revenue"),
)

revenue_golf = revenue_per_product_line.select(
    "Quarter",
    "Retailer Country",
    sf.lit("Golf Equipment").alias("Product line"),
    revenue_per_product_line["Golf Equipment"].alias("Revenue"),
)

revenue_mountaineering = revenue_per_product_line.select(
    "Quarter",
    "Retailer Country",
    sf.lit("Mountaineering Equipment").alias("Product line"),
    revenue_per_product_line["Mountaineering Equipment"].alias("Revenue"),
)

revenue_outdoor = revenue_per_product_line.select(
    "Quarter",
    "Retailer Country",
    sf.lit("Outdoor Protection").alias("Product line"),
    revenue_per_product_line["Outdoor Protection"].alias("Revenue"),
)

revenue_personal = revenue_per_product_line.select(
    "Quarter",
    "Retailer Country",
    sf.lit("Personal Accessories").alias("Product line"),
    revenue_per_product_line["Personal Accessories"].alias("Revenue"),
)

# Stack the per-product-line frames on top of each other.
result = (
    revenue_camping
    .union(revenue_golf)
    .union(revenue_mountaineering)
    .union(revenue_outdoor)
    .union(revenue_personal)
)

result.limit(10).toPandas()

Unnamed: 0,Quarter,Retailer Country,Product line,Revenue
0,Q3 2013,Sweden,Camping Equipment,1433530.62
1,Q4 2012,Spain,Camping Equipment,3264717.34
2,Q2 2013,Italy,Camping Equipment,5873795.0
3,Q3 2012,United States,Camping Equipment,15847275.46
4,Q1 2014,Switzerland,Camping Equipment,3966205.47
5,Q2 2012,Germany,Camping Equipment,5315912.78
6,Q1 2013,China,Camping Equipment,7745789.4
7,Q2 2014,Austria,Camping Equipment,4221897.72
8,Q1 2014,China,Camping Equipment,9213181.55
9,Q4 2012,France,Camping Equipment,6286894.57


## 3.2 Generic Approach

Of course manually unpivoting is somewhat tedious, but we already see a pattern:
* Select all non-pivot columns
* Create a new column containing the pivot column name
* Create a new column containing the pivot column values
* Union together everything

This can be done by writing some small Python functions as follows:

In [46]:
import functools

# Unpivot a single column, thereby creating one data frame
def unpivot_column(df, other, pivot_column, pivot_value, result_column):
    """
    Build the unpivoted data frame for one pivoted column.

    df - input data frame
    other - names of the columns that are carried over unchanged
    pivot_column - name of the new column holding pivot_value as a literal
    pivot_value - name of the pivoted column to unpivot
    result_column - name of the new column holding the values of that column
    """
    carried_over = [df[name] for name in other]
    label = sf.lit(pivot_value).alias(pivot_column)
    values = df[pivot_value].alias(result_column)
    return df.select(*carried_over, label, values)

# Unpivot multiple columns by using the above method
def unpivot(df, pivot_column, pivot_values, result_column):
    """
    Turn a set of pivoted columns back into rows.

    df - input data frame
    pivot_column - the name of the new column containing each pivot column name
    pivot_values - the pivoted column names (any iterable of strings)
    result_column - the name of the column containing the values of the pivot columns

    Returns the union of one data frame per entry in pivot_values. Each of
    them holds all columns of df that are not listed in pivot_values, followed
    by pivot_column and result_column.

    Raises ValueError if pivot_values is empty.
    """
    # Materialize once: the values are used for a membership test and are then
    # iterated again, which would silently exhaust a generator or iterator.
    pivot_values = list(pivot_values)
    if not pivot_values:
        # Without this guard functools.reduce fails with an unrelated-looking
        # "reduce() of empty sequence" TypeError.
        raise ValueError("pivot_values must contain at least one column name")

    common_columns = [f.name for f in df.schema.fields if f.name not in pivot_values]
    unpivot_dfs = [unpivot_column(df, common_columns, pivot_column, v, result_column) for v in pivot_values]
    return functools.reduce(lambda x, y: x.union(y), unpivot_dfs)


Let's test the function

In [47]:
# The pivoted columns to fold back into a "Product Line" / "Revenue" pair.
product_lines = [
    "Camping Equipment",
    "Golf Equipment",
    "Mountaineering Equipment",
    "Outdoor Protection",
    "Personal Accessories",
]
result_per_product_line = unpivot(
    revenue_per_product_line,
    "Product Line",
    product_lines,
    "Revenue",
)
result_per_product_line.toPandas()

Unnamed: 0,Quarter,Retailer Country,Product Line,Revenue
0,Q3 2013,Sweden,Camping Equipment,1433530.62
1,Q4 2012,Spain,Camping Equipment,3264717.34
2,Q2 2013,Italy,Camping Equipment,5873795.00
3,Q3 2012,United States,Camping Equipment,15847275.46
4,Q1 2014,Switzerland,Camping Equipment,3966205.47
5,Q2 2012,Germany,Camping Equipment,5315912.78
6,Q1 2013,China,Camping Equipment,7745789.40
7,Q2 2014,Austria,Camping Equipment,4221897.72
8,Q1 2014,China,Camping Equipment,9213181.55
9,Q4 2012,France,Camping Equipment,6286894.57


## 3.3 Exercise

Now unpivot the result of exercise 2.1. You can do that either manually or try using the generic function defined above.