In [0]:
spark

In [0]:
#Reading files
membersDF = (
  spark 
  .read 
  .option("header", True) #Opciones
  .csv("/FileStore/final_project_spark/data/members.csv") #Archivo
)

menuDF = (
  spark 
  .read 
  .option("header", True) #Opciones
  .csv("/FileStore/final_project_spark/data/menu.csv") #Archivo
)

salesDF = (
  spark 
  .read 
  .option("header", True) #Opciones
  .csv("/FileStore/final_project_spark/data/sales.csv") #Archivo
)

In [0]:
from pyspark.sql.types import DateType
#Casting columns in dataframes

membersDF_final = membersDF.withColumn("join_date",membersDF.join_date.cast(DateType()))

menuDF_final = menuDF.withColumn("product_id",menuDF.product_id.cast('int')) \
                     .withColumn("price",menuDF.price.cast('int')) 

salesDF_final = salesDF.withColumn("order_date",salesDF.order_date.cast(DateType())) \
                       .withColumn("product_id",salesDF.product_id.cast('int')) 

In [0]:
from pyspark.sql.functions import sum,col,desc
#Question 1: ¿Cuál es la cantidad total que gastó cada cliente en el restaurante?

salesDF_final.join(menuDF_final,salesDF_final.product_id == menuDF_final.product_id,"left") \
       .select(salesDF_final.customer_id,menuDF_final.price) \
       .groupby("customer_id") \
       .agg(sum("price").alias("total_consume")) \
       .sort(col("total_consume").desc())\
       .show()

+-----------+-------------+
|customer_id|total_consume|
+-----------+-------------+
|          A|           76|
|          B|           74|
|          C|           36|
+-----------+-------------+



In [0]:
from pyspark.sql.functions import countDistinct,col,desc
#Question 2: ¿Cuántos días ha visitado cada cliente el restaurante?
salesDF_final.groupby("customer_id") \
       .agg(countDistinct("order_date").alias("total_days")) \
       .sort(col("total_days").desc())\
       .show()

+-----------+----------+
|customer_id|total_days|
+-----------+----------+
|          B|         6|
|          A|         4|
|          C|         2|
+-----------+----------+



In [0]:
from pyspark.sql.functions import col,row_number
from pyspark.sql.window import Window
#Question 3: ¿Cuál fue el primer artículo del menú comprado por cada cliente?

overCondition  = Window.partitionBy("customer_id").orderBy("order_date")
salesDF_ranked = salesDF_final.withColumn("row_number",row_number().over(overCondition))
salesDF_ranked.join(menuDF_final,salesDF_ranked.product_id == menuDF_final.product_id,'left') \
              .filter(col("row_number") == 1) \
              .select(col("customer_id"),col("product_name")) \
              .show()
       
       

+-----------+------------+
|customer_id|product_name|
+-----------+------------+
|          A|       sushi|
|          B|       curry|
|          C|       ramen|
+-----------+------------+



In [0]:
from pyspark.sql.functions import count,col, desc
#Question 4: ¿Cuál es el artículo más comprado en el menú y cuántas veces lo compraron todos los clientes?

salesDF_final_prev = salesDF_final.groupby("product_id") \
       .agg(count("order_date").alias("orders_number")) \
       .sort(col("orders_number").desc())\
       .limit(1)
salesDF_final_prev.join(menuDF_final,salesDF_final_prev.product_id == menuDF_final.product_id,'left') \
       .select(col("product_name"),col("orders_number")) \
       .show()

+------------+-------------+
|product_name|orders_number|
+------------+-------------+
|       ramen|            8|
+------------+-------------+



In [0]:
from pyspark.sql.functions import col,desc
from pyspark.sql.window import Window
#Question 5: ¿Qué artículo fue el más popular para cada cliente?

salesDF_final_prev = salesDF_final.groupby("customer_id","product_id") \
       .agg(count("order_date").alias("orders_number"))
overCondition  = Window.partitionBy("customer_id").orderBy(desc("orders_number"))
salesDF_ranked = salesDF_final_prev.withColumn("row_number",row_number().over(overCondition))
salesDF_ranked.filter(col("row_number") == 1) \
              .join(menuDF_final,salesDF_ranked.product_id == menuDF_final.product_id,'left') \
              .select(col("customer_id"),col("product_name")) \
              .show()

+-----------+------------+
|customer_id|product_name|
+-----------+------------+
|          A|       ramen|
|          B|       sushi|
|          C|       ramen|
+-----------+------------+



In [0]:
from pyspark.sql.functions import col
from pyspark.sql.window import Window
#Question 6: ¿Qué artículo compró primero el cliente después de convertirse en miembro?
overCondition  = Window.partitionBy("customer_id").orderBy("order_date")
salesDF_ranked = salesDF_final.join(membersDF_final,salesDF_final.customer_id == membersDF_final.customer_id,'left') \
             .join(menuDF_final,salesDF_final.product_id == menuDF_final.product_id,'left') \
             .select(salesDF_final.customer_id,'order_date','join_date','product_name') \
             .filter(col('order_date') >= col('join_date')) \
             .withColumn("row_number",row_number().over(overCondition))
salesDF_ranked.filter(col('row_number') == 1) \
              .select('customer_id','product_name') \
              .show()


+-----------+------------+
|customer_id|product_name|
+-----------+------------+
|          A|       curry|
|          B|       sushi|
+-----------+------------+



In [0]:
from pyspark.sql.functions import col,desc
from pyspark.sql.window import Window
#Question 7: ¿Qué artículo se compró justo antes de que el cliente se convirtiera en miembro?
overCondition  = Window.partitionBy("customer_id").orderBy(desc("order_date"))
salesDF_ranked = salesDF_final.join(membersDF_final,salesDF_final.customer_id == membersDF_final.customer_id,'left') \
             .join(menuDF_final,salesDF_final.product_id == menuDF_final.product_id,'left') \
             .select(salesDF_final.customer_id,'order_date','join_date','product_name') \
             .filter(col('order_date') < col('join_date')) \
             .withColumn("row_number",row_number().over(overCondition))
salesDF_ranked.filter(col('row_number') == 1) \
              .select('customer_id','product_name') \
              .show()


+-----------+------------+
|customer_id|product_name|
+-----------+------------+
|          A|       sushi|
|          B|       sushi|
+-----------+------------+



In [0]:
from pyspark.sql.functions import col,sum
#Question 8: ¿Cuál es el total de artículos y la cantidad gastada por cada miembro antes de convertirse en miembro?

salesDF_final.join(membersDF_final,salesDF_final.customer_id == membersDF_final.customer_id,'left') \
             .join(menuDF_final,salesDF_final.product_id == menuDF_final.product_id,'left') \
             .select(salesDF_final.customer_id,'order_date','join_date',salesDF_final.product_id,'price') \
             .filter(col('order_date') < col('join_date')) \
             .groupby("customer_id") \
             .agg(count("product_id").alias("total_orders"),sum('price').alias('total_money')) \
             .show()


+-----------+------------+-----------+
|customer_id|total_orders|total_money|
+-----------+------------+-----------+
|          B|           3|         40|
|          A|           2|         25|
+-----------+------------+-----------+



In [0]:
from pyspark.sql.functions import col, sum,when
#Question 9: Si cada $ 1 gastado equivale a 10 puntos y el sushi tiene un multiplicador de puntos 2x, ¿cuántos puntos tendría cada cliente?

salesDF_final.join(menuDF_final,salesDF_final.product_id == menuDF_final.product_id,'left') \
             .select(salesDF_final.customer_id,menuDF_final.product_name,menuDF_final.price,
                     when(col('product_name')=='sushi',(col('price')*2))
                     .otherwise(col('price'))
                     .alias('final_price')) \
             .groupby("customer_id") \
             .agg((sum('final_price')*10).alias('total_points')) \
             .show()


+-----------+------------+
|customer_id|total_points|
+-----------+------------+
|          B|         940|
|          C|         360|
|          A|         860|
+-----------+------------+



In [0]:
from pyspark.sql.functions import col, sum,when
#Question 10: En la primera semana después de que un cliente se une al programa (incluida la fecha
# de ingreso), gana el doble de puntos en todos los artículos, no solo en sushi. ¿Cuántos
# puntos tienen los clientes A y B a fines de enero?

salesDF_final.filter((col('order_date') <= '2021-01-31') & (col('customer_id').isin('A','B'))) \
             .join(menuDF_final,salesDF_final.product_id == menuDF_final.product_id,'left') \
             .join(membersDF_final,salesDF_final.customer_id == membersDF_final.customer_id,'left') \
             .select(salesDF_final.customer_id,salesDF_final.order_date,'join_date','price',
                    when(((col('order_date')>=col('join_date')) & (col('order_date')<=(col('join_date')+7)) | (col('product_name')=='sushi')),(col('price')*2))
                    .otherwise(col('price'))
                    .alias('final_price')) \
             .groupby("customer_id") \
             .agg((sum('final_price')*10).alias('total_points')) \
             .show()

+-----------+------------+
|customer_id|total_points|
+-----------+------------+
|          B|         940|
|          A|        1370|
+-----------+------------+

