In [2]:
import pyspark.sql.functions as sf

from pyspark.sql.window import Window

In [None]:
from pyspark.sql import SparkSession

if not 'spark' in locals():
    spark = SparkSession.builder \
        .master("local[*]") \
        .config("spark.driver.memory","4G") \
        .getOrCreate()

spark

# Sales Data Example

Window functions are commonly used together with sales data. In this notebook we will be using a data set called "Watson Sales Product Sample Data" which was downloaded from https://www.ibm.com/communities/analytics/watson-analytics-blog/sales-products-sample-data/

## 1 Watson Sales Product Sample Data

First we load the data, which is provided as a single CSV file, which again is well supported by Apache Spark

In [None]:
basedir = "s3://dimajix-training/data"

In [70]:
data = spark.read\
    .option("header", True) \
    .option("inferSchema", True) \
    .csv(basedir + "/watson-sales-products/WA_Sales_Products_2012-14.csv")

### Inspect schema

Since we used the existing header information and also let Spark infer appropriate data types, let us inspect the schema now.


In [71]:
data.printSchema()

root
 |-- Retailer country: string (nullable = true)
 |-- Order method type: string (nullable = true)
 |-- Retailer type: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Product type: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Gross margin: double (nullable = true)



### Preaggregate data

Since we are not interested in all details, we preaggregate the data into the following columns:
* Retailer country
* Retailer type
* Product line
* Quarter

In [108]:
aggregated_data = data.groupBy(
    "Retailer country",
    "Retailer type",
    "Product line",
    "Quarter"
).agg(
    sf.sum("Revenue").alias("Revenue"),
    sf.sum("Quantity").alias("Quantity")
)

aggregated_data.printSchema()

root
 |-- Retailer country: string (nullable = true)
 |-- Retailer type: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Quantity: long (nullable = true)



# 2 Find Difference to Average

In the first example, we try to find the difference of the revenue of each quarter to the average revenue for each retailer country and retailer type over all quarters. This can be done either using a grouped aggregated followed by a join or by using window functions.

## 2.1 Self Join

Just for the sake of completeness, let us start with the aggragetion and join approach. It will turn out later that this is much more complicated than using a window function, but nevertheless we implement this approach such that we can compare both approaches.

### Step 1: Extarct year and quarter

Technically the first step is not required, but in order to provide some meaningful sorting, we extract the quarter (Q1, Q2, Q3 and Q4) and the year from the incoming column `Quarter`. Otherwise sorting wouldn't work, since that column is formatted as `'Q'q YYYY` which doesn't provide a chronologically ordering if sorted alphabetically.

In [109]:
extended_data = aggregated_data.select(
    sf.col("*"),
    sf.substring(aggregated_data["Quarter"],1,2).alias("q"),
    sf.substring(aggregated_data["Quarter"],3,8).alias("y")
)

extended_data.printSchema()

root
 |-- Retailer country: string (nullable = true)
 |-- Retailer type: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- q: string (nullable = true)
 |-- y: string (nullable = true)



### Step 2: Calculate average revenue

Now we calculate the average revenue per retailer country, retailer type and product line.

In [110]:
avg_data = extended_data.groupBy(
        "Retailer country",
        "Retailer type",
        "Product line"
    ).agg(
        sf.avg("Revenue").alias("avg_revenue")
    )

avg_data.printSchema()

root
 |-- Retailer country: string (nullable = true)
 |-- Retailer type: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- avg_revenue: double (nullable = true)



### Step 3: Join and calculate

Now we join the average revenue with the original data set, such that we can calculate the difference of the revenue and the average revenue.

In [111]:
result = extended_data.join(
        avg_data,
        (extended_data["Retailer country"] == avg_data["Retailer country"]) &
        (extended_data["Retailer type"] == avg_data["Retailer type"]) &
        (extended_data["Product line"] == avg_data["Product line"])
    ).select(
        extended_data["*"],
        (extended_data["Revenue"] - avg_data["avg_revenue"]).alias("revenue_delta")
    )

sorted_result = result \
    .orderBy("Retailer Country", "Retailer Type", "Product line", "y", "q") \
    .drop("q", "y")

sorted_result.limit(10).toPandas()

Unnamed: 0,Retailer country,Retailer type,Product line,Quarter,Revenue,Quantity,revenue_delta
0,Australia,Department Store,Camping Equipment,Q1 2012,780636.36,23378,-654659.6
1,Australia,Department Store,Camping Equipment,Q2 2012,1022203.02,15407,-413092.9
2,Australia,Department Store,Camping Equipment,Q3 2012,904059.77,18591,-531236.1
3,Australia,Department Store,Camping Equipment,Q4 2012,1074038.59,24115,-361257.3
4,Australia,Department Store,Camping Equipment,Q1 2013,1324342.98,35357,-110952.9
5,Australia,Department Store,Camping Equipment,Q2 2013,2545450.2,49263,1110154.0
6,Australia,Department Store,Camping Equipment,Q3 2013,2116205.77,34150,680909.9
7,Australia,Department Store,Camping Equipment,Q4 2013,1515953.18,38685,80657.27
8,Australia,Department Store,Camping Equipment,Q1 2014,1568783.04,35739,133487.1
9,Australia,Department Store,Camping Equipment,Q2 2014,2606957.75,41056,1171662.0


### Analyse Exeuction Plan

Let us have a look at the execution plan using the `explain` method

In [112]:
result.explain(True)

== Parsed Logical Plan ==
'Project [ResolvedStar(Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, Revenue#11064, Quantity#11066L, q#11073, y#11074), (Revenue#11064 - avg_revenue#11092) AS revenue_delta#11180]
+- Join Inner, (((Retailer country#7476 = Retailer country#11097) && (Retailer type#7478 = Retailer type#11099)) && (Product line#7479 = Product line#11100))
   :- Project [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, Revenue#11064, Quantity#11066L, substring(Quarter#7483, 1, 2) AS q#11073, substring(Quarter#7483, 3, 8) AS y#11074]
   :  +- Aggregate [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483], [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, sum(Revenue#7484) AS Revenue#11064, sum(cast(Quantity#7485 as bigint)) AS Quantity#11066L]
   :     +- Relation[Retailer country#7476,Order method type#7477,Retailer type#7478,Product line#7479,Product type#7480,Product#7481,

## 2.2 Better use Windowing

Now let us perform the very same analysis, but using windowed aggregation instead of aggregation and joining. A *window* aggregates groups of records, but this grouping and aggregation will be performed (conceptionally) individually for every input record and the result will be attached to each input record. Therefore a windowed aggregation works like a normal aggregation followed by a join.

In Spark we always need to specify how this aggregation window is to be constructed. It always has up to three components:
* Partitioning - controls which records will be considered for each window
* Sorting - sorts all records in a window
* Range - controls how many records in the sorted list should be aggregated

### Aggregazion functions
After the window has been created, you can use any conventional aggregation function like `sum`, `avg` etc. In addition Spark also provides some special window functions which make use of the ordering (which is not available in normal aggregations). The most important window aggregation functions are:
* `rank()`
* `dense_rank()`
* `row_number()`
* `lag(column, n)` and `lead(column, n)`


### Step 1: Extarct year and quarter

Technically the first step is not required, but in order to provide some meaningful sorting, we extract the quarter (Q1, Q2, Q3 and Q4) and the year from the incoming column Quarter. Otherwise sorting wouldn't work, since that column is formatted as 'Q'q YYYY which doesn't provide a chronologically ordering if sorted alphabetically.

In [113]:
extended_data = aggregated_data.select(
    sf.col("*"),
    sf.substring(aggregated_data["Quarter"],1,2).alias("q"),
    sf.substring(aggregated_data["Quarter"],3,8).alias("y")
)

### Step 2: Define window

This time we use a windowed aggregation to calculate the average price. As the first step we need to construct a *window*. In this case it contains the following ingredients:
* A definition of partitions (i.e. which rows should be averages together)
* A definition of the window size in rows (i.e. which rows within each partition should take part for each average)

In [114]:
# Define a window
avg_window = Window\
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) \
    .partitionBy(
        "Retailer country",
        "Retailer type",
        "Product line"
    )

### Step 3: Perform analysis

In [115]:
# Perform analysis
result = extended_data.select(
        sf.col("*"),
        sf.avg(extended_data["Revenue"]).over(avg_window).alias("avg_revenue"),
    ) \
    .select(
        sf.col("*"),
        (sf.col("Revenue") - sf.col("avg_revenue")).alias("revenue_diff")
    )

# Sort result for nicer output
sorted_result = result \
    .orderBy("Retailer Country", "Retailer Type", "Product line", "y", "q") \
    .drop("q", "y")

sorted_result.limit(10).toPandas()

Unnamed: 0,Retailer country,Retailer type,Product line,Quarter,Revenue,Quantity,avg_revenue,revenue_diff
0,Australia,Department Store,Camping Equipment,Q1 2012,780636.36,23378,1435296.0,-654659.6
1,Australia,Department Store,Camping Equipment,Q2 2012,1022203.02,15407,1435296.0,-413092.9
2,Australia,Department Store,Camping Equipment,Q3 2012,904059.77,18591,1435296.0,-531236.1
3,Australia,Department Store,Camping Equipment,Q4 2012,1074038.59,24115,1435296.0,-361257.3
4,Australia,Department Store,Camping Equipment,Q1 2013,1324342.98,35357,1435296.0,-110952.9
5,Australia,Department Store,Camping Equipment,Q2 2013,2545450.2,49263,1435296.0,1110154.0
6,Australia,Department Store,Camping Equipment,Q3 2013,2116205.77,34150,1435296.0,680909.9
7,Australia,Department Store,Camping Equipment,Q4 2013,1515953.18,38685,1435296.0,80657.27
8,Australia,Department Store,Camping Equipment,Q1 2014,1568783.04,35739,1435296.0,133487.1
9,Australia,Department Store,Camping Equipment,Q2 2014,2606957.75,41056,1435296.0,1171662.0


### Execution Plan

Again let us have a look at the execution plan, which is now much simpler.

In [116]:
result.explain(True)

== Parsed Logical Plan ==
'Project [*, ('Revenue - 'avg_revenue) AS revenue_diff#11239]
+- Project [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, Revenue#11064, Quantity#11066L, q#11218, y#11219, avg_revenue#11229]
   +- Project [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, Revenue#11064, Quantity#11066L, q#11218, y#11219, avg_revenue#11229, avg_revenue#11229]
      +- Window [avg(Revenue#11064) windowspecdefinition(Retailer country#7476, Retailer type#7478, Product line#7479, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS avg_revenue#11229], [Retailer country#7476, Retailer type#7478, Product line#7479]
         +- Project [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, Revenue#11064, Quantity#11066L, q#11218, y#11219]
            +- Project [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, Revenue#11064, Quantity#11066L, substring(Quar

## 2.3 Exercise

Perform a similar analysis, which compares the quantity per product line and quarter with the average quantity per product line and quarter

In [None]:
# YOUR CODE HERE

# 3 Best Quarter

Another interesting question would be, which quarter was the best one in each country for each retailer type and product line. This would be already much harder to do with a join, since the join key would probably need to contain the maximum revenue, which is a double (never join on floating point values, it might not work).

## 3.1 Using windowing

### Step 2: Perform analysis

Again we need to define a window, and within each window partition we want to sort the rows by the `Revenue` column and add the sorted position as a new column. This then allows us to trivially simply select the top most row in each window, which contains the best revenue. 

This time the window again needs to be partitioned and sorted by revenue, such that we can easily pick the top most revenue.

In [98]:
# Define a ranking window
rank_window = Window\
    .orderBy(extended_data["Revenue"].desc())\
    .partitionBy(
        "Retailer country",
        "Retailer type",
        "Product line"
    )

### Step 3: Perform analysis

By using this window, we can easily perform the analysis be calculating the position of each record within its window by using the `row_number` function and then select the top most record by filtering the row number to be 1.

In [99]:
# Perform analysis using the "row_number" window function
ranked_data = extended_data.select(
        sf.col("*"),
        sf.row_number().over(rank_window).alias("rank")
    )

# Pick the top entry of every window by filtering on the row number
result = ranked_data.filter(sf.col("rank") == 1)

# Sort result, just to improve output
sorted_result = result \
    .orderBy("Retailer Country", "Retailer Type", "Product line", "y", "q") \
    .drop("q", "y", "rank")

sorted_result.limit(10).toPandas()

Unnamed: 0,Retailer country,Retailer type,Product line,Quarter,Revenue
0,Australia,Department Store,Camping Equipment,Q2 2014,2606957.75
1,Australia,Department Store,Golf Equipment,Q2 2014,677267.68
2,Australia,Department Store,Outdoor Protection,Q4 2012,78711.97
3,Australia,Department Store,Personal Accessories,Q2 2014,819106.85
4,Australia,Direct Marketing,Camping Equipment,Q1 2013,588239.09
5,Australia,Direct Marketing,Golf Equipment,Q1 2014,24586.64
6,Australia,Direct Marketing,Outdoor Protection,Q3 2013,49510.36
7,Australia,Direct Marketing,Personal Accessories,Q3 2013,160893.21
8,Australia,Equipment Rental Store,Camping Equipment,Q3 2013,187676.43
9,Australia,Equipment Rental Store,Golf Equipment,Q3 2013,107589.55


## 3.2 Exercise

Using a similar approach now calculate the country with the largest quanitity per product line and retailer type

In [None]:
# YOUR CODE HERE

# 4 Difference between Quarters

Another common example where windowing will greatly simplify processing is accessing different rows in a single query. This cannot be done in Spark without using some trick, since Spark normally processes all rows independently. In a simple `select` you can access any number of columns, but you only have access to a single row.

As an example, we'd like to calculate the difference in revenue of two consecutive quarters. Obviously we need to access the revenue of two quarters to calulcate the difference. Again we use two different approaches, the first using a `join` operation and the second using a windowed aggregation.

## 4.1 Self Join

The first approach will join the data set to itself, such that two different quarters of the same retailer country, retailer type and product type are put together into a single row. Then a simple subtraction will provide the result.

### Step 1: Calculate previous quarter

As a first step, we need to create a small helper function for calculating the previous quarter from a given quarter using the provided format `Qq YYYY`. With this function we can generate the join key required for joining the same dataset on the previous quarter.

We will write a small Python UDF to perform the desired operation.

In [63]:
def prev_quarter(quarter):
    q = int(quarter[1:2])
    y = int(quarter[3:8])
    
    prev_q = q - 1
    if (prev_q <= 0):
        prev_y = y - 1
        prev_q = 4
    else:
        prev_y = y
    
    return "Q" + str(prev_q) + " " + str(prev_y)
    
print(prev_quarter("Q1 2012"))
print(prev_quarter("Q4 2012"))

Q4 2011
Q3 2012


In [64]:
import pyspark.sql.types as st

prev_quarter_udf = sf.udf(prev_quarter, st.StringType())

Now we apply the `prev_quarter` UDF to the data set to create a new column containing the previous quarter.

In [117]:
extended_data = aggregated_data.select(
    sf.col("*"),
    prev_quarter_udf(aggregated_data["Quarter"]).alias("prev_quarter")
)

extended_data.printSchema()

root
 |-- Retailer country: string (nullable = true)
 |-- Retailer type: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- prev_quarter: string (nullable = true)



### Step 2: Join current and previous Quarter

Now we need to join the current quarter with the last quarter using the newly created column `prev_quarter`

In [118]:
joined_data = extended_data.alias("current").join(
        extended_data.alias("prev"),
        (sf.col("current.Quarter") == sf.col("prev.prev_quarter")) &
        (sf.col("current.Retailer country") == sf.col("prev.Retailer country")) &
        (sf.col("current.Retailer type") == sf.col("prev.Retailer type")) &
        (sf.col("current.Product Line") == sf.col("prev.Product Line")),
        "left"
    )

joined_data.printSchema()

root
 |-- Retailer country: string (nullable = true)
 |-- Retailer type: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- prev_quarter: string (nullable = true)
 |-- Retailer country: string (nullable = true)
 |-- Retailer type: string (nullable = true)
 |-- Product line: string (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- prev_quarter: string (nullable = true)



Note that most columns are present twice now, but by using the data frame aliases `current` and `prev` we still can distinguish between the two original sources. We need that capability in the next step.

### Step 3: Calculate difference

Now that we have the current revenue and the previous revenue joined together in a single data frame, we finally can now calculate the difference and keep only the columns from the `current` data frame.

In [119]:
joined_data.select(
        sf.col("current.*"),
        (sf.col("current.Revenue") - sf.col("prev.Revenue")).alias("revenue_delta")
    )

result.limit(10).toPandas()

Unnamed: 0,Retailer country,Retailer type,Product line,Quarter,Revenue,Quantity,q,y,avg_revenue,revenue_diff
0,Switzerland,Department Store,Golf Equipment,Q3 2013,102315.47,1656,Q3,2013,118721.772222,-16406.302222
1,Switzerland,Department Store,Golf Equipment,Q2 2012,41009.9,422,Q2,2012,118721.772222,-77711.872222
2,Switzerland,Department Store,Golf Equipment,Q4 2013,97670.56,784,Q4,2013,118721.772222,-21051.212222
3,Switzerland,Department Store,Golf Equipment,Q2 2014,145322.53,383,Q2,2014,118721.772222,26600.757778
4,Switzerland,Department Store,Golf Equipment,Q1 2014,144023.04,1778,Q1,2014,118721.772222,25301.267778
5,Switzerland,Department Store,Golf Equipment,Q3 2012,121081.88,1949,Q3,2012,118721.772222,2360.107778
6,Switzerland,Department Store,Golf Equipment,Q4 2012,156496.44,943,Q4,2012,118721.772222,37774.667778
7,Switzerland,Department Store,Golf Equipment,Q2 2013,148097.38,374,Q2,2013,118721.772222,29375.607778
8,Switzerland,Department Store,Golf Equipment,Q1 2013,112478.75,957,Q1,2013,118721.772222,-6243.022222
9,United Kingdom,Sports Store,Outdoor Protection,Q1 2013,37301.84,6111,Q1,2013,45800.049091,-8498.209091


## 4.2 Use Windows

Now that we saw how to solve the problem with a join (and a UDF for calculating the previous quarter), let us get to a different approach using a windowed aggregation. 

In [120]:
extended_data = aggregated_data.select(
    sf.col("*"),
    sf.substring(aggregated_data["Quarter"],1,2).alias("q"),
    sf.substring(aggregated_data["Quarter"],3,8).alias("y")
)

### Step 1: Define Window

What we essentially want to do is to access values from *different rows* for calculating the difference between quarters. So what we need is something like follows:
* Create window per retailer country, retailer type and product line
* Sort by quarter
* Pick previous row

The last step is the interesting one. This is done by using the `lag` window aggregation function which allows you to access some preceeding record within the window. Note that the window actually has to contain exactly one record, otherwise you'll get an error by Spark.

In [121]:
prev_window = Window \
    .orderBy(extended_data["y"].asc(),extended_data["q"].asc())\
    .rowsBetween(-1, -1) \
    .partitionBy(
        "Retailer country",
        "Retailer type",
        "Product line"
    )

### Step 2: Perform analysis

Now we can use the window in the following simple select statement:

In [97]:
result = extended_data.select(
        sf.col("*"),
        (extended_data["Revenue"] - sf.lag(extended_data["Revenue"], 1).over(prev_window)).alias("revenue_delta")
    )

sorted_result = result \
    .orderBy("Retailer Country", "Retailer Type", "Product line", "y", "q") \
    .drop("q", "y")

sorted_result.limit(10).toPandas()

Unnamed: 0,Retailer country,Retailer type,Product line,Quarter,Revenue,revenue_delta
0,Australia,Department Store,Camping Equipment,Q1 2012,780636.36,
1,Australia,Department Store,Camping Equipment,Q2 2012,1022203.02,241566.66
2,Australia,Department Store,Camping Equipment,Q3 2012,904059.77,-118143.25
3,Australia,Department Store,Camping Equipment,Q4 2012,1074038.59,169978.82
4,Australia,Department Store,Camping Equipment,Q1 2013,1324342.98,250304.39
5,Australia,Department Store,Camping Equipment,Q2 2013,2545450.2,1221107.22
6,Australia,Department Store,Camping Equipment,Q3 2013,2116205.77,-429244.43
7,Australia,Department Store,Camping Equipment,Q4 2013,1515953.18,-600252.59
8,Australia,Department Store,Camping Equipment,Q1 2014,1568783.04,52829.86
9,Australia,Department Store,Camping Equipment,Q2 2014,2606957.75,1038174.71


## 4.3 Exercise

Now calculate the difference in sold quantities between two consecutive quarters per retailer country, retailer type and product line.

In [None]:
# YOUR CODE HERE

# 5 Putting it all together

Of course you can also use different window aggregations with different windows in a single query as follows:

In [105]:
rank_window = Window\
    .orderBy(extended_data["Revenue"].desc())\
    .partitionBy(
        "Retailer country",
        "Retailer type",
        "Product line"
    )
avg_window = Window\
    .orderBy(extended_data["Revenue"].desc())\
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) \
    .partitionBy(
        "Retailer country",
        "Retailer type",
        "Product line"
    )

prev_window = Window \
    .orderBy(extended_data["y"].asc(),extended_data["q"].asc())\
    .rowsBetween(-1, -1) \
    .partitionBy(
        "Retailer country",
        "Retailer type",
        "Product line"
    )

result = extended_data.select(
        sf.col("*"),
        sf.row_number().over(rank_window).alias("rank"),
        sf.avg(extended_data["Revenue"]).over(avg_window).alias("avg_revenue"),
        (extended_data["Revenue"] - sf.lag(extended_data["Revenue"], 1).over(prev_window)).alias("revenue_delta")
    )

sorted_result = result\
    .orderBy("Retailer Country", "Retailer Type", "Product line", "y", "q") \
    .drop("q", "y")

result.limit(10).toPandas()

Unnamed: 0,Retailer country,Retailer type,Product line,Quarter,Revenue,q,y,rank,avg_revenue,revenue_delta
0,Switzerland,Department Store,Golf Equipment,Q4 2012,156496.44,Q4,2012,1,118721.772222,35414.56
1,Switzerland,Department Store,Golf Equipment,Q2 2013,148097.38,Q2,2013,2,118721.772222,35618.63
2,Switzerland,Department Store,Golf Equipment,Q2 2014,145322.53,Q2,2014,3,118721.772222,1299.49
3,Switzerland,Department Store,Golf Equipment,Q1 2014,144023.04,Q1,2014,4,118721.772222,46352.48
4,Switzerland,Department Store,Golf Equipment,Q3 2012,121081.88,Q3,2012,5,118721.772222,80071.98
5,Switzerland,Department Store,Golf Equipment,Q1 2013,112478.75,Q1,2013,6,118721.772222,-44017.69
6,Switzerland,Department Store,Golf Equipment,Q3 2013,102315.47,Q3,2013,7,118721.772222,-45781.91
7,Switzerland,Department Store,Golf Equipment,Q4 2013,97670.56,Q4,2013,8,118721.772222,-4644.91
8,Switzerland,Department Store,Golf Equipment,Q2 2012,41009.9,Q2,2012,9,118721.772222,
9,United Kingdom,Sports Store,Outdoor Protection,Q2 2012,127100.3,Q2,2012,1,45800.049091,38105.77


### Inspect execution plan

In [106]:
result.explain(True)

== Parsed Logical Plan ==
'Project [*, row_number() windowspecdefinition('Retailer country, 'Retailer type, 'Product line, Revenue#7510 DESC NULLS LAST, unspecifiedframe$()) AS rank#11022, avg(Revenue#7510) windowspecdefinition('Retailer country, 'Retailer type, 'Product line, Revenue#7510 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS avg_revenue#11024, (Revenue#7510 - lag(Revenue#7510, 1, null) windowspecdefinition('Retailer country, 'Retailer type, 'Product line, y#10533 ASC NULLS FIRST, q#10532 ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1))) AS revenue_delta#11025]
+- Project [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, Revenue#7510, substring(Quarter#7483, 1, 2) AS q#10532, substring(Quarter#7483, 3, 8) AS y#10533]
   +- Aggregate [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483], [Retailer country#7476, Retailer type#7478, Product line#7479, Quarter#7483, sum(Re