In [0]:
spark

In [0]:
# Importando data

from pyspark.sql.types import StructField, StructType, StringType, LongType, DateType, IntegerType

# Members
Members_Schema = StructType([
StructField("customer_id", StringType(), True),
StructField("join_date", DateType(), True)])

dmembers = (
  spark
  .read
  .format("csv")
  .option("header", "true")
  .schema(Members_Schema)
  .option('path', "/FileStore/tables/members.txt")
  .load()
)

# Menu
dmenu = (
  spark
  .read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .option('path', "/FileStore/tables/menu.txt")
  .load()
)

# Sales

Sales_Schema = StructType([
StructField("customer_id", StringType(), True),
StructField("order_date", DateType(), True),
StructField("product_id", LongType(), True)])

dsales = (
  spark
  .read
  .format("csv")
  .option("header", "true")
  .schema(Sales_Schema)
  .option('path', "/FileStore/tables/sales.txt")
  .load()
)

In [0]:
# Seleccionando Variables
from pyspark.sql.functions import expr, col, column

dmembers = dmembers.withColumn("member_customer_id", expr("customer_id"))
dmembers = dmembers.select("member_customer_id", "join_date")

dmenu = dmenu.withColumn("menu_product_id", expr("product_id"))
dmenu = dmenu.select("menu_product_id", "product_name", "price")

dsales = dsales.withColumn("sales_customer_id", expr("customer_id"))
dsales = dsales.withColumn("sales_product_id", expr("product_id"))
dsales = dsales.select("sales_customer_id", "order_date", "sales_product_id")

In [0]:
# Revisando Importacion data
dmembers.show()
dmenu.show()
dsales.show()

+------------------+----------+
|member_customer_id| join_date|
+------------------+----------+
|                 A|2021-01-07|
|                 B|2021-01-09|
+------------------+----------+

+---------------+------------+-----+
|menu_product_id|product_name|price|
+---------------+------------+-----+
|              1|       sushi|   10|
|              2|       curry|   15|
|              3|       ramen|   12|
+---------------+------------+-----+

+-----------------+----------+----------------+
|sales_customer_id|order_date|sales_product_id|
+-----------------+----------+----------------+
|                A|2021-01-01|               1|
|                A|2021-01-01|               2|
|                A|2021-01-07|               2|
|                A|2021-01-10|               3|
|                A|2021-01-11|               3|
|                A|2021-01-11|               3|
|                B|2021-01-01|               2|
|                B|2021-01-02|               2|
|                B|

In [0]:
# Creando Vistas Temporales
dmembers.createOrReplaceTempView("members")
dmenu.createOrReplaceTempView("menu")
dsales.createOrReplaceTempView("sales")

In [0]:
# Pregunta 1
# ¿Cuál es la cantidad total que gastó cada cliente en el restaurante?

spark.sql(
  """
  SELECT sales_customer_id, sum(price) AS Gasto_Total
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id
  ORDER BY sales_customer_id
  """).show()

+-----------------+-----------+
|sales_customer_id|Gasto_Total|
+-----------------+-----------+
|                A|         76|
|                B|         74|
|                C|         36|
+-----------------+-----------+



In [0]:
# Pregunta 2
# ¿Cuántos días ha visitado cada cliente el restaurante?
spark.sql(
  """
  SELECT sales_customer_id, count(distinct(order_date)) AS Visitas
  FROM sales
  GROUP BY sales_customer_id
  ORDER BY sales_customer_id
  """).show()

+-----------------+-------+
|sales_customer_id|Visitas|
+-----------------+-------+
|                A|      4|
|                B|      6|
|                C|      2|
+-----------------+-------+



In [0]:
# Pregunta 3
# ¿Cuál fue el primer artículo del menú comprado por cada cliente?
# 1st Method
spark.sql(
  """
  SELECT *  
  FROM(
  SELECT sales_customer_id, product_name, min(order_date) AS 1st_Sale  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id, product_name
  ORDER BY(sales_customer_id, 1st_Sale))
  WHERE sales_customer_id = 'A' LIMIT 1
  """).show()


spark.sql(
  """
  SELECT *  
  FROM(
  SELECT sales_customer_id, product_name, min(order_date) AS 1st_Sale  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id, product_name
  ORDER BY(sales_customer_id, 1st_Sale))
  WHERE sales_customer_id = 'B' LIMIT 1
  """).show()


spark.sql(
  """
  SELECT *  
  FROM(
  SELECT sales_customer_id, product_name, min(order_date) AS 1st_Sale  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id, product_name
  ORDER BY(sales_customer_id, 1st_Sale))
  WHERE sales_customer_id = 'C' LIMIT 1
  """).show()

# 2do Method
spark.sql(
  """
  SELECT DISTINCT(sales_customer_id), product_name FROM
  (SELECT sales_customer_id, product_name, min(order_date) AS 1st_Sale  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id, product_name
  ORDER BY(sales_customer_id, 1st_Sale))
  """).dropDuplicates(['sales_customer_id']).show()

+-----------------+------------+----------+
|sales_customer_id|product_name|  1st_Sale|
+-----------------+------------+----------+
|                A|       sushi|2021-01-01|
+-----------------+------------+----------+

+-----------------+------------+----------+
|sales_customer_id|product_name|  1st_Sale|
+-----------------+------------+----------+
|                B|       curry|2021-01-01|
+-----------------+------------+----------+

+-----------------+------------+----------+
|sales_customer_id|product_name|  1st_Sale|
+-----------------+------------+----------+
|                C|       ramen|2021-01-01|
+-----------------+------------+----------+

+-----------------+------------+
|sales_customer_id|product_name|
+-----------------+------------+
|                A|       sushi|
|                B|       curry|
|                C|       ramen|
+-----------------+------------+



In [0]:
# Pregunta 4
# ¿Cuál es el artículo más comprado en el menú y cuántas veces lo compraron todos los clientes?
spark.sql(
  """
  SELECT menu.product_name, count(order_date) AS Veces_Compra  
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY menu.product_name
  ORDER BY Veces_Compra DESC LIMIT 1
  """).show()

+------------+------------+
|product_name|Veces_Compra|
+------------+------------+
|       ramen|           8|
+------------+------------+



In [0]:
# Pregunta 5
# ¿Qué artículo fue el más popular para cada cliente?
# 1st Method
spark.sql(
  """
  SELECT sales_customer_id, menu.product_name, count(*) AS Veces_Compra  
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id, menu.product_name
  HAVING sales_customer_id = "A"
  ORDER BY sales_customer_id, Veces_Compra DESC LIMIT 1
  """).show()

spark.sql(
  """
  SELECT sales_customer_id, menu.product_name, count(*) AS Veces_Compra  
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id, menu.product_name
  HAVING sales_customer_id = "B"
  ORDER BY sales_customer_id, Veces_Compra DESC LIMIT 1
  """).show()

spark.sql(
  """
  SELECT sales_customer_id, menu.product_name, count(*) AS Veces_Compra  
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id, menu.product_name
  HAVING sales_customer_id = "C"
  ORDER BY sales_customer_id, Veces_Compra DESC LIMIT 1
  """).show()

# 2nd Method
spark.sql(
  """
  SELECT sales_customer_id, menu.product_name, count(*) AS Veces_Compra  
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id
  GROUP BY sales_customer_id, menu.product_name
  ORDER BY sales_customer_id, Veces_Compra DESC
  """).dropDuplicates(['sales_customer_id']).show()

+-----------------+------------+------------+
|sales_customer_id|product_name|Veces_Compra|
+-----------------+------------+------------+
|                A|       ramen|           3|
+-----------------+------------+------------+

+-----------------+------------+------------+
|sales_customer_id|product_name|Veces_Compra|
+-----------------+------------+------------+
|                B|       sushi|           2|
+-----------------+------------+------------+

+-----------------+------------+------------+
|sales_customer_id|product_name|Veces_Compra|
+-----------------+------------+------------+
|                C|       ramen|           3|
+-----------------+------------+------------+

+-----------------+------------+------------+
|sales_customer_id|product_name|Veces_Compra|
+-----------------+------------+------------+
|                A|       ramen|           3|
|                B|       sushi|           2|
|                C|       ramen|           3|
+-----------------+------------

In [0]:
# Pregunta 6
# ¿Qué artículo compró primero el cliente después de convertirse en miembro?
spark.sql(
  """
  SELECT sales_customer_id, menu.product_name, sales.order_date
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id 
  LEFT OUTER JOIN members ON sales_customer_id = member_customer_id 
  WHERE sales.order_date > members.join_date
  GROUP BY sales_customer_id, menu.product_name, sales.order_date
  ORDER BY sales_customer_id, sales.order_date
  """).dropDuplicates(['sales_customer_id']).show()

+-----------------+------------+----------+
|sales_customer_id|product_name|order_date|
+-----------------+------------+----------+
|                A|       ramen|2021-01-10|
|                B|       sushi|2021-01-11|
+-----------------+------------+----------+



In [0]:
# Pregunta 7
# ¿Qué artículo se compró justo antes de que el cliente se convirtiera en miembro?
spark.sql(
  """
  SELECT sales_customer_id, menu.product_name, sales.order_date
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id 
  LEFT OUTER JOIN members ON sales_customer_id = member_customer_id 
  WHERE sales.order_date < members.join_date
  GROUP BY sales_customer_id, menu.product_name, sales.order_date
  ORDER BY sales_customer_id, sales.order_date DESC
  """).dropDuplicates(['sales_customer_id']).show()

+-----------------+------------+----------+
|sales_customer_id|product_name|order_date|
+-----------------+------------+----------+
|                A|       curry|2021-01-01|
|                B|       sushi|2021-01-04|
+-----------------+------------+----------+



In [0]:
# Pregunta 8
# ¿Cuál es el total de artículos y la cantidad gastada por cada miembro antes de convertirse en miembro?
spark.sql(
  """
  SELECT sales_customer_id, COUNT(menu.price) AS Cantidad_Articulos, SUM(menu.price) AS Gasto_Total
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id 
  LEFT OUTER JOIN members ON sales_customer_id = member_customer_id 
  WHERE sales.order_date < members.join_date
  GROUP BY sales_customer_id
  ORDER BY sales_customer_id
  """).show()

+-----------------+------------------+-----------+
|sales_customer_id|Cantidad_Articulos|Gasto_Total|
+-----------------+------------------+-----------+
|                A|                 2|         25|
|                B|                 3|         40|
+-----------------+------------------+-----------+



In [0]:
# Pregunta 9
# Si cada $ 1 gastado equivale a 10 puntos y el sushi tiene un multiplicador de puntos 2x, ¿cuántos puntos tendría cada cliente?

from pyspark.sql.functions import udf

def puntos(product_id, price):
    if product_id == 1:
        return price * 20
    else:
        return price * 10

spark.udf.register("puntos", puntos)

spark.sql(
  """
  SELECT sales_customer_id, sum(Puntos) AS Total_Puntos
  FROM (
  SELECT sales_customer_id, puntos(menu_product_id, Total) AS Puntos 
  FROM(
  SELECT sales_customer_id, menu.menu_product_id, sum(menu.price) AS Total
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id 
  GROUP BY sales_customer_id, menu.menu_product_id))
  GROUP BY sales_customer_id
  ORDER BY sales_customer_id
  """).show()

+-----------------+------------+
|sales_customer_id|Total_Puntos|
+-----------------+------------+
|                A|       860.0|
|                B|       940.0|
|                C|       360.0|
+-----------------+------------+



In [0]:
# Pregunta 10
# En la primera semana después de que un cliente se une al programa (incluida la fecha de ingreso), gana el doble de puntos en todos los artículos, no solo en sushi. ¿Cuántos
# puntos tienen los clientes A y B a fines de enero?

# Definiendo la Funcion
from pyspark.sql.functions import udf
from datetime import datetime
from pyspark.sql.functions import datediff, months_between, to_date, lit, date_sub

def puntos_new(product_id, price, days):
    if product_id == 1:
        return price * 20
    else:
        if days <= 7 and days >= 0:
            return price * 20
        else:
            return price * 10

spark.udf.register("puntos_new", puntos_new)

# Planteando la Consulta
spark.sql(
  """
  SELECT sales_customer_id, sum(Puntos_New) AS Total_Puntos
  FROM (
  SELECT sales_customer_id, puntos_new(menu_product_id, Total, Days_From_Member) AS Puntos_New 
  FROM(
  SELECT sales_customer_id, menu_product_id, Total, datediff(order_date,join_date) AS Days_From_Member
  FROM(
  SELECT sales_customer_id, menu.menu_product_id, sum(menu.price) AS Total, order_date, members.join_date
  FROM sales
  LEFT OUTER JOIN menu ON sales_product_id = menu_product_id 
  LEFT OUTER JOIN members ON sales_customer_id = members.member_customer_id       
  GROUP BY sales_customer_id, menu.menu_product_id, order_date, members.join_date)
  WHERE sales_customer_id != 'C' AND order_date <= '2021-01-31'))
  GROUP BY sales_customer_id
  ORDER BY sales_customer_id
  """).show()

+-----------------+------------+
|sales_customer_id|Total_Puntos|
+-----------------+------------+
|                A|      1370.0|
|                B|       940.0|
+-----------------+------------+

