In [4]:
from pyspark.sql import SparkSession
# NOTE: `sum` and `max` below are the Spark column aggregates and shadow the
# Python builtins of the same name once this cell has run. The next cell
# calls `sum("weight")` and relies on the Spark version, so the names are
# kept as-is here.
from pyspark.sql.functions import col, sum, avg, max, asc, desc

# Build the session used by every query below (getOrCreate returns the
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName('group by example')
    .getOrCreate()
)

# One tuple per animal, in the column order listed in `schema`.
simpleData = [
    ("1", "Corse", "F1", 80, 7, 0.2),
    ("2", "Gascon", "F2", 83, 8, 0.3),
    ("3", "Gascon", "F1", 81, 7, 0.4),
    ("4", "Corse", "F2", 82, 8, 0.5),
    ("5", "Gascon", "F1", 88, 6, 0.3),
    ("6", "Corse", "F1", 70, 7, 0.4),
    ("7", "Gascon", "F2", 91, 8, 0.5),
    ("8", "Corse", "F1", 78, 7, 0.5),
    ("9", "Gascon", "F1", 94, 8, 0.6),
]

# Only column names are supplied; the column types are inferred from the
# Python values (see the printSchema() output).
schema = ["animal_id", "species", "farm", "weight", "age", "efficiency"]
df = spark.createDataFrame(data=simpleData, schema=schema)
df.printSchema()
df.show(truncate=False)

# Total weight per species using the grouped-data shortcut; the result
# column is automatically named "sum(weight)".
df.groupBy("species").sum("weight").show()

# Same aggregation through agg(), which allows aliasing the result column.
dfGroup = df.groupBy("species").agg(sum("weight").alias("sum_weight"))
dfGroup.show(truncate=False)

# Keep only the species whose total weight exceeds 300.
dfFilter = dfGroup.filter(dfGroup.sum_weight > 300)
dfFilter.show()

# Show the filtered totals in ascending, then descending, order.
# asc() is spelled out so the sort direction is explicit in both calls.
dfFilter.sort(asc("sum_weight")).show()
dfFilter.sort(desc("sum_weight")).show()


root
 |-- animal_id: string (nullable = true)
 |-- species: string (nullable = true)
 |-- farm: string (nullable = true)
 |-- weight: long (nullable = true)
 |-- age: long (nullable = true)
 |-- efficiency: double (nullable = true)

+---------+-------+----+------+---+----------+
|animal_id|species|farm|weight|age|efficiency|
+---------+-------+----+------+---+----------+
|1        |Corse  |F1  |80    |7  |0.2       |
|2        |Gascon |F2  |83    |8  |0.3       |
|3        |Gascon |F1  |81    |7  |0.4       |
|4        |Corse  |F2  |82    |8  |0.5       |
|5        |Gascon |F1  |88    |6  |0.3       |
|6        |Corse  |F1  |70    |7  |0.4       |
|7        |Gascon |F2  |91    |8  |0.5       |
|8        |Corse  |F1  |78    |7  |0.5       |
|9        |Gascon |F1  |94    |8  |0.6       |
+---------+-------+----+------+---+----------+

+-------+-----------+
|species|sum(weight)|
+-------+-----------+
|  Corse|        310|
| Gascon|        437|
+-------+-----------+

+-------+----------+
|

In [6]:

# Four equivalent ways of producing the per-species total weight.
# Relies on `df`, `spark`, `col` and the Spark `sum` from the earlier cell.

# 1) DataFrame API: aggregate, keep totals above 200, largest first.
(
    df.groupBy("species")
    .agg(sum("weight").alias("sum_weight"))
    .where(col("sum_weight") > 200)
    .orderBy(col("sum_weight").desc())
    .show()
)

# 2) The same query in SQL against a temporary view of `df`.
df.createOrReplaceTempView("EMP")
spark.sql(
    "select species, sum(weight) as sum_weight from EMP "
    "group by species having sum_weight > 200 "
    "order by sum_weight desc"
).show()

# 3) Grouped-data shortcut, then rename the auto-generated "sum(weight)" column.
(
    df.groupBy("species")
    .sum("weight")
    .withColumnRenamed("sum(weight)", "sum_weight")
    .show()
)

# 4) Grouped-data shortcut, then alias the aggregate inside a select().
(
    df.groupBy("species")
    .sum("weight")
    .select("species", col("sum(weight)").alias("sum_weight"))
    .show()
)
  

+-------+----------+
|species|sum_weight|
+-------+----------+
| Gascon|       437|
|  Corse|       310|
+-------+----------+

+-------+----------+
|species|sum_weight|
+-------+----------+
| Gascon|       437|
|  Corse|       310|
+-------+----------+

+-------+----------+
|species|sum_weight|
+-------+----------+
|  Corse|       310|
| Gascon|       437|
+-------+----------+

+-------+----------+
|species|sum_weight|
+-------+----------+
|  Corse|       310|
| Gascon|       437|
+-------+----------+

