In [103]:
from pyspark.sql import Row
     
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

order_items = spark.read.json("order_items.csv")

orders = spark.read.json("orders.csv")

[Stage 167:>                                                        (0 + 1) / 1]                                                                                

In [113]:
order_items.show(5)

+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_product_price|order_item_quantity|order_item_subtotal|
+-------------+-------------------+---------------------+------------------------+-------------------+-------------------+
|            1|                  1|                  957|                  299.98|                  1|             299.98|
|            2|                  2|                 1073|                  199.99|                  1|             199.99|
|            3|                  2|                  502|                    50.0|                  5|              250.0|
|            4|                  2|                  403|                  129.99|                  1|             129.99|
|            5|                  4|                  897|                   24.99|                  2|              49.98|
+-------------+-

In [21]:
orders.printSchema()

root
 |-- order_item_id: long (nullable = true)
 |-- order_item_order_id: long (nullable = true)
 |-- order_item_product_id: long (nullable = true)
 |-- order_item_product_price: double (nullable = true)
 |-- order_item_quantity: long (nullable = true)
 |-- order_item_subtotal: double (nullable = true)



## Aggregate functions 

In [14]:
# The common aggregate functions that are available as part of pyspark.sql.functions

# count
# sum
# min
# max
# avg

In [15]:
from pyspark.sql.functions import count

In [16]:
help(count)

Help on function count in module pyspark.sql.functions:

count(col)
    Aggregate function: returns the number of items in a group.
    
    .. versionadded:: 1.3



In [18]:
orders.select(count("*")).show()

+--------+
|count(1)|
+--------+
|  172198|
+--------+



In [112]:
order_items.groupBy("order_item_id").agg(count("*")).show(5)

+-------------+--------+
|order_item_id|count(1)|
+-------------+--------+
|           26|       1|
|           29|       1|
|          474|       1|
|          964|       1|
|         1677|       1|
+-------------+--------+
only showing top 5 rows



# Get number of records in order_items.

In [89]:

# Function count on data frame is action. It will trigger execution.

order_items.count() # it is called action as it immediately triggers execution
     

order_items.count()

print(type(order_items.count()))


# or by sql type syntax

order_items.select(count("*")).show()

<class 'int'>
+--------+
|count(1)|
+--------+
|  172198|
+--------+



In [33]:
## or


# count is transformation (wide).
# Execution will be triggered when we perform actions such as show

order_items.select(count("*")).show()

+--------+
|count(1)|
+--------+
|  172198|
+--------+



## tOTAL Aggregations on Spark

In [34]:
order_items.dtypes

[('order_item_id', 'bigint'),
 ('order_item_order_id', 'bigint'),
 ('order_item_product_id', 'bigint'),
 ('order_item_product_price', 'double'),
 ('order_item_quantity', 'bigint'),
 ('order_item_subtotal', 'double')]

In [35]:
## Get revence using order_item_subtotal for a given order_item_order_id

In [39]:
from pyspark.sql.functions import sum

In [110]:
order_items.groupBy("order_item_order_id").\
        agg(sum("order_item_subtotal").alias("sum")).\
        sort("order_item_order_id",ascending =1).show(5)

+-------------------+------------------+
|order_item_order_id|               sum|
+-------------------+------------------+
|                  1|            299.98|
|                  2|            579.98|
|                  4|            699.85|
|                  5|1129.8600000000001|
|                  7| 579.9200000000001|
+-------------------+------------------+
only showing top 5 rows



In [46]:
order_items.printSchema()

root
 |-- order_item_id: long (nullable = true)
 |-- order_item_order_id: long (nullable = true)
 |-- order_item_product_id: long (nullable = true)
 |-- order_item_product_price: double (nullable = true)
 |-- order_item_quantity: long (nullable = true)
 |-- order_item_subtotal: double (nullable = true)



In [44]:
## for a order_id =2, Get number of item, total quantity as well as revenue for given order item , order id 


    # Number of items can be computed using count on order_item_quantity
    # Total quantity can be computed using sum on order_item_quantity
    # Total Revenue can be computed using sum on order_item_subtotal


In [70]:
# a

# count(*) is same as count(col("col_name"))

# SQL Type syntax

order_items.\
    filter("order_item_order_id = 2").\
    select(count("*"),sum("order_item_quantity").alias("sum"),sum("order_item_subtotal")
          
          ).show()

# or

from pyspark.sql.functions import col

order_items.\
    filter(col("order_item_order_id") == 2).\
    select(count("*"),sum("order_item_quantity").alias("sum"),sum("order_item_subtotal")
          
          ).show()

+--------+---+------------------------+
|count(1)|sum|sum(order_item_subtotal)|
+--------+---+------------------------+
|       3|  7|                  579.98|
+--------+---+------------------------+

+--------+---+------------------------+
|count(1)|sum|sum(order_item_subtotal)|
+--------+---+------------------------+
|       3|  7|                  579.98|
+--------+---+------------------------+



## GroupBy

In [91]:
help(order_items.groupBy)

Help on method groupBy in module pyspark.sql.dataframe:

groupBy(*cols) method of pyspark.sql.dataframe.DataFrame instance
    Groups the :class:`DataFrame` using the specified columns,
    so we can run aggregation on them. See :class:`GroupedData`
    for all the available aggregate functions.
    
    :func:`groupby` is an alias for :func:`groupBy`.
    
    .. versionadded:: 1.3.0
    
    Parameters
    ----------
    cols : list, str or :class:`Column`
        columns to group by.
        Each element should be a column name (string) or an expression (:class:`Column`).
    
    Examples
    --------
    >>> df.groupBy().avg().collect()
    [Row(avg(age)=3.5)]
    >>> sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(df.name).avg().collect())
    [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
    >>> sorted(df.groupBy(['name', df.age]).count().collect())
    [Ro

In [76]:
order_items.groupBy().min()

# it calculates min of all columns

DataFrame[min(order_item_id): bigint, min(order_item_order_id): bigint, min(order_item_product_id): bigint, min(order_item_product_price): double, min(order_item_quantity): bigint, min(order_item_subtotal): double]

In [75]:
order_items.groupBy().min().show()

+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+
|min(order_item_id)|min(order_item_order_id)|min(order_item_product_id)|min(order_item_product_price)|min(order_item_quantity)|min(order_item_subtotal)|
+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+
|                 1|                       1|                        19|                         9.99|                       1|                    9.99|
+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+



In [109]:
order_items_grouped = order_items.groupBy("order_item_id")

# it creates a dataframe type groupedData

print(type(order_items_grouped))

order_items_grouped.count().show(5)

<class 'pyspark.sql.group.GroupedData'>
+-------------+-----+
|order_item_id|count|
+-------------+-----+
|           26|    1|
|           29|    1|
|          474|    1|
|          964|    1|
|         1677|    1|
+-------------+-----+
only showing top 5 rows



In [84]:
help(order_items_grouped.count)

Help on method count in module pyspark.sql.group:

count() method of pyspark.sql.group.GroupedData instance
    Counts the number of records for each group.
    
    .. versionadded:: 1.3.0
    
    Examples
    --------
    >>> sorted(df.groupBy(df.age).count().collect())
    [Row(age=2, count=1), Row(age=5, count=1)]



In [105]:
## for non-numeric columns - min ,max will not be used on those columns

order.groupBy().min().show()

[Stage 169:>                                                        (0 + 1) / 1]

++
||
++
||
++



                                                                                

## Perform Grouped Aggregations using direct functions on Spark DataFrame

In [121]:
order_items_grouped = order_items.groupBy("order_item_order_id")

# it creates a dataframe type groupedData

type(order_items_grouped)


pyspark.sql.group.GroupedData

In [122]:

# The aggregate functions will be worked on the groups of that specific column

# count on one column or more columns, the value will be sane

order_items_grouped.count().show(5)


# or

order_items_grouped.\
        count().\
        withColumnRenamed("count","order_count").\
        show(5)



+-------------------+-----+
|order_item_order_id|count|
+-------------------+-----+
|                 29|    5|
|                474|    5|
|                964|    4|
|               1677|    5|
|               1806|    3|
+-------------------+-----+
only showing top 5 rows

+-------------------+-----------+
|order_item_order_id|order_count|
+-------------------+-----------+
|                 29|          5|
|                474|          5|
|                964|          4|
|               1677|          5|
|               1806|          3|
+-------------------+-----------+
only showing top 5 rows



In [126]:
order_items_grouped.\
        sum().\
        show(5)

# we get sum of all columns

+-------------------+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_id)|sum(order_item_order_id)|sum(order_item_product_id)|sum(order_item_product_price)|sum(order_item_quantity)|sum(order_item_subtotal)|
+-------------------+------------------+------------------------+--------------------------+-----------------------------+------------------------+------------------------+
|                 29|               425|                     145|                      3897|            909.9300000000001|                       9|                 1109.85|
|                474|              5815|                    2370|                      4508|           374.94000000000005|                      13|       774.8199999999999|
|                964|              9586|                    3856|                      2964|           499.95000000000005|             

In [127]:
# to get sum of specific columns

order_items_grouped.\
        sum("order_item_quantity","order_item_subtotal").\
        show(5)

+-------------------+------------------------+------------------------+
|order_item_order_id|sum(order_item_quantity)|sum(order_item_subtotal)|
+-------------------+------------------------+------------------------+
|                 29|                       9|                 1109.85|
|                474|                      13|       774.8199999999999|
|                964|                      11|       739.8800000000001|
|               1677|                      14|       649.9200000000001|
|               1806|                       8|                  789.94|
+-------------------+------------------------+------------------------+
only showing top 5 rows



In [139]:
# in order to get custom column names, use toDF

order_items_grouped.\
        sum("order_item_quantity","order_item_subtotal").\
        toDF("order_item_order_id","order_item_quantity","order_item_subtotal")\
        .show(5)

# Use Round to round the order_item_subtotal

from pyspark.sql.functions import round

order_items_grouped.\
        sum("order_item_quantity","order_item_subtotal").\
        toDF("order_item_order_id","order_quantity","order_subtotal").\
        withColumn("order_subtotal",round("order_subtotal",2))\
        .show(5)

# # keep in mind that , we can have only one aggregate function on top of grouped dataframe

+-------------------+-------------------+-------------------+
|order_item_order_id|order_item_quantity|order_item_subtotal|
+-------------------+-------------------+-------------------+
|                 29|                  9|            1109.85|
|                474|                 13|  774.8199999999999|
|                964|                 11|  739.8800000000001|
|               1677|                 14|  649.9200000000001|
|               1806|                  8|             789.94|
+-------------------+-------------------+-------------------+
only showing top 5 rows

+-------------------+--------------+--------------+
|order_item_order_id|order_quantity|order_subtotal|
+-------------------+--------------+--------------+
|                 29|             9|       1109.85|
|                474|            13|        774.82|
|                964|            11|        739.88|
|               1677|            14|        649.92|
|               1806|             8|        789.94|
+

## Multiple aggregation methods on single grouped dataframe using agg()

In [144]:
order_items.grouped = order_items.groupBy("order_item_order_id") 

# This is of type groupedData, we can use agg func like min,max,sum on top of it directly

# or use the agg() method


order_items.grouped.sum("order_item_quantity").show()

# we can not use max,min on top of this sum as it returns a Dataframe

order_items.grouped.sum("order_item_quantity").min().show()

+-------------------+------------------------+
|order_item_order_id|sum(order_item_quantity)|
+-------------------+------------------------+
|                 29|                       9|
|                474|                      13|
|                964|                      11|
|               1677|                      14|
|               1806|                       8|
|               1950|                      12|
|               2214|                       5|
|               2250|                      10|
|               2453|                       7|
|               2509|                       4|
|               2529|                       1|
|               2927|                       8|
|               3091|                       5|
|               3764|                       2|
|               4590|                      11|
|               4894|                       4|
|               5385|                      10|
|               5409|                       7|
|            

AttributeError: 'DataFrame' object has no attribute 'min'

In [145]:
help(order_items_grouped.agg)

Help on method agg in module pyspark.sql.group:

agg(*exprs) method of pyspark.sql.group.GroupedData instance
    Compute aggregates and returns the result as a :class:`DataFrame`.
    
    The available aggregate functions can be:
    
    1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`
    
    2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`
    
       .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,
           a full shuffle is required. Also, all the data of a group will be loaded into
           memory, so the user should be aware of the potential OOM risk if data is skewed
           and certain groups are too large to fit in memory.
    
       .. seealso:: :func:`pyspark.sql.functions.pandas_udf`
    
    If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
    is the column to perform aggregation on, and the value is the aggregate function.
    
    Alt

In [148]:
order_items.grouped.\
            agg(sum("order_item_quantity"),sum("order_item_subtotal")).printSchema()

# sum("col_name") - it returns a column object only.

root
 |-- order_item_order_id: long (nullable = true)
 |-- sum(order_item_quantity): long (nullable = true)
 |-- sum(order_item_subtotal): double (nullable = true)



In [149]:
order_items.grouped.\
            agg(sum("order_item_quantity"),round(sum("order_item_subtotal"),2)).show()

+-------------------+------------------------+----------------------------------+
|order_item_order_id|sum(order_item_quantity)|round(sum(order_item_subtotal), 2)|
+-------------------+------------------------+----------------------------------+
|                 29|                       9|                           1109.85|
|                474|                      13|                            774.82|
|                964|                      11|                            739.88|
|               1677|                      14|                            649.92|
|               1806|                       8|                            789.94|
|               1950|                      12|                           1015.87|
|               2214|                       5|                            449.96|
|               2250|                      10|                            889.94|
|               2453|                       7|                            999.93|
|               


# we can also pass the dict to agg()




In [151]:
order_items.grouped.\
            agg( {"order_item_quantity":"sum","order_item_subtotal":"sum"} ).printSchema()

root
 |-- order_item_order_id: long (nullable = true)
 |-- sum(order_item_subtotal): double (nullable = true)
 |-- sum(order_item_quantity): long (nullable = true)



23/07/05 21:13:04 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1018111 ms exceeds timeout 120000 ms
23/07/05 21:13:04 WARN SparkContext: Killing executors is not supported by current scheduler.
