In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

# Obtain the Spark session for this job; getOrCreate() hands back an existing
# session when one is already active instead of building a second one.
spark = (
    SparkSession.builder
    .appName("CustomerSpendigs")
    .getOrCreate()
)

# Explicit schema for the orders CSV: three nullable columns, in file order.
# NOTE(review): FloatType is a 32-bit float; for monetary amounts DoubleType or
# DecimalType may be the better fit -- confirm before changing, since the
# recorded output below was produced with FloatType.
schema = (
    StructType()
    .add("customerID", IntegerType(), True)
    .add("itemID", IntegerType(), True)
    .add("amount", FloatType(), True)
)

# Read the orders file into a DataFrame, applying the explicit schema defined
# above, then print the resulting column layout.
df = spark.read.csv("./work/data/customer-orders.csv", schema=schema)
df.printSchema()

# Total amount spent per customer, rounded to 2 decimals, biggest spenders
# first. The sum is aggregated and aliased in a single agg() call so the
# pipeline does not depend on Spark's auto-generated "sum(amount)" column name,
# and no separate projection is needed before grouping.
spendings = (
    df.groupBy("customerID")
    .agg(func.round(func.sum("amount"), 2).alias("amount_spent"))
    .orderBy(func.col("amount_spent").desc())
)

# show() is the action that actually runs the job; make sure the session is
# stopped even when that action fails, so it is not left running.
try:
    spendings.show()
finally:
    spark.stop()

root
 |-- customerID: integer (nullable = true)
 |-- itemID: integer (nullable = true)
 |-- amount: float (nullable = true)

+----------+------------+
|customerID|amount_spent|
+----------+------------+
|        68|     6375.45|
|        73|      6206.2|
|        39|     6193.11|
|        54|     6065.39|
|        71|     5995.66|
|         2|     5994.59|
|        97|     5977.19|
|        46|     5963.11|
|        42|     5696.84|
|        59|     5642.89|
|        41|     5637.62|
|         0|     5524.95|
|         8|     5517.24|
|        85|     5503.43|
|        61|     5497.48|
|        32|     5496.05|
|        58|     5437.73|
|        63|     5415.15|
|        15|     5413.51|
|         6|     5397.88|
+----------+------------+
only showing top 20 rows

