In [3]:
from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


In [2]:
# SparkSession => entry to the DataFrame API
spark = SparkSession.builder.appName("CustomerSpend").getOrCreate()
spark


In [6]:
schema = StructType([
    StructField("customer_id", IntegerType(), True),
    StructField("item_id", IntegerType(), True),
    StructField("amount", FloatType(), True)])


In [7]:
df = spark.read.schema(schema).csv("customer-orders.csv")
df.printSchema()


root
 |-- customer_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- amount: float (nullable = true)



In [None]:
df.show(5)

+-----------+-------+------+
|customer_id|item_id|amount|
+-----------+-------+------+
|         44|   8602| 37.19|
|         35|   5368| 65.89|
|          2|   3391| 40.64|
|         47|   6694| 14.98|
|         29|    680| 13.08|
+-----------+-------+------+
only showing top 5 rows



In [13]:
df.groupBy("customer_id").sum("amount").orderBy("sum(amount)", ascending=False).show()

+-----------+------------------+
|customer_id|       sum(amount)|
+-----------+------------------+
|         68| 6375.450028181076|
|         73| 6206.199985742569|
|         39| 6193.109993815422|
|         54| 6065.390002984554|
|         71| 5995.659991919994|
|          2| 5994.589979887009|
|         97| 5977.190007060766|
|         46| 5963.110011339188|
|         42| 5696.840004444122|
|         59| 5642.890004396439|
|         41| 5637.619991332293|
|          0| 5524.950008839369|
|          8|5517.2399980425835|
|         85|  5503.42998456955|
|         61| 5497.479998707771|
|         32| 5496.049998283386|
|         58| 5437.730004191399|
|         63| 5415.150004655123|
|         15| 5413.510010659695|
|          6| 5397.880012750626|
+-----------+------------------+
only showing top 20 rows

