<h4>   . menu.csv </h4>
<h4>   . members.csv </h4>
<h4>   . sales.csv </h4>

In [0]:
def Import_data (file_location, file_name , file_type):
  # File location and type
  file_location = file_location+""+file_name
  file_type = file_type

  # CSV options
  infer_schema = "true"
  first_row_is_header = "true"
  delimiter = ","

  # The applied options are for CSV files. For other file types, these will be ignored.
  df_ = spark.read.format(file_type) \
        .option("inferSchema", infer_schema) \
        .option("header", first_row_is_header) \
        .option("sep", delimiter) \
        .load(file_location)

  return df_

In [0]:
df_menu=Import_data ("/FileStore/tables/Project_Datapath/","menu.csv","csv")
display(df_menu)

product_id,product_name,price
1,sushi,10
2,curry,15
3,ramen,12


In [0]:
df_members=Import_data ("/FileStore/tables/Project_Datapath/","members.csv","csv")
display(df_members)

customer_id,join_date
A,2021-01-07T00:00:00.000+0000
B,2021-01-09T00:00:00.000+0000


In [0]:
df_sales=Import_data ("/FileStore/tables/Project_Datapath/","sales.csv","csv")
display(df_sales)

customer_id,order_date,product_id
A,2021-01-01T00:00:00.000+0000,1
A,2021-01-01T00:00:00.000+0000,2
A,2021-01-07T00:00:00.000+0000,2
A,2021-01-10T00:00:00.000+0000,3
A,2021-01-11T00:00:00.000+0000,3
A,2021-01-11T00:00:00.000+0000,3
B,2021-01-01T00:00:00.000+0000,2
B,2021-01-02T00:00:00.000+0000,2
B,2021-01-04T00:00:00.000+0000,1
B,2021-01-11T00:00:00.000+0000,1


In [0]:
df_menu = df_menu.withColumnRenamed(
  "product_id", "product_id_menu")

df_members = df_members.withColumnRenamed(
  "customer_id", "customer_id_members")
  
  

<H3> TRANSFORMACIÓN DE DATOS </H3>

In [0]:
df_join= df_sales.join(df_menu,df_sales.product_id ==  df_menu.product_id_menu,"left") 
df_join= df_join.join(df_members,df_join.customer_id == df_members.customer_id_members,"left")


In [0]:
from pyspark.sql.types import StringType, DateType, FloatType, DoubleType
  
df_join = df_join \
  .withColumn("order_date" ,
              df_join["order_date"]
              .cast(DateType())) 

df_join = df_join \
  .withColumn("join_date" ,
              df_join["join_date"]
              .cast(DateType())) 



In [0]:
display(df_join)

customer_id,order_date,product_id,product_id_menu,product_name,price,customer_id_members,join_date
A,2021-01-01,1,1,sushi,10,A,2021-01-07
A,2021-01-01,2,2,curry,15,A,2021-01-07
A,2021-01-07,2,2,curry,15,A,2021-01-07
A,2021-01-10,3,3,ramen,12,A,2021-01-07
A,2021-01-11,3,3,ramen,12,A,2021-01-07
A,2021-01-11,3,3,ramen,12,A,2021-01-07
B,2021-01-01,2,2,curry,15,B,2021-01-09
B,2021-01-02,2,2,curry,15,B,2021-01-09
B,2021-01-04,1,1,sushi,10,B,2021-01-09
B,2021-01-11,1,1,sushi,10,B,2021-01-09


<H3> PREGUNTAS </H3><BR/>
● ¿Cuál es la cantidad total que gastó cada cliente en el restaurante?<BR/>
● ¿Cuántos días ha visitado cada cliente el restaurante?<BR/>
● ¿Cuál fue el primer artículo del menú comprado por cada cliente?<BR/>
● ¿Cuál es el artículo más comprado en el menú y cuántas veces lo compraron todos los
clientes?<BR/>
● ¿Qué artículo fue el más popular para cada cliente?<BR/>
● ¿Qué artículo compró primero el cliente después de convertirse en miembro?<BR/>
● ¿Qué artículo se compró justo antes de que el cliente se convirtiera en miembro?<BR/>
● ¿Cuál es el total de artículos y la cantidad gastada por cada miembro antes de
convertirse en miembro?<BR/>
● Si cada $ 1 gastado equivale a 10 puntos y el sushi tiene un multiplicador de puntos 2x,
¿cuántos puntos tendría cada cliente?<BR/>
● En la primera semana después de que un cliente se une al programa (incluida la fecha
de ingreso), gana el doble de puntos en todos los artículos, no solo en sushi. ¿Cuántos
puntos tienen los clientes A y B a fines de enero?

In [0]:
## Solución 1
from pyspark.sql import functions as Fx

df_join.groupBy("customer_id").agg(Fx.sum("price")).show()

+-----------+----------+
|customer_id|sum(price)|
+-----------+----------+
|          B|        74|
|          C|        36|
|          A|        76|
+-----------+----------+



In [0]:
## Solución 2
from pyspark.sql.functions import countDistinct
df_join.groupBy("customer_id").agg(countDistinct('order_date')) \
    .show(truncate=False)

+-----------+-----------------+
|customer_id|count(order_date)|
+-----------+-----------------+
|B          |6                |
|C          |2                |
|A          |4                |
+-----------+-----------------+



In [0]:
## Solución 3
from pyspark.sql.functions import min, max
from pyspark.sql import Row, functions as F
from pyspark.sql.window import Window

rowfilter =col("rowNum")==1
datefilter = col("product_id")<=10
df_join.filter(datefilter)\
       .sort("customer_id","order_date")\
       .select("customer_id","order_date","product_name",F.row_number().over(Window.partitionBy("customer_id").orderBy("order_date")).alias("rowNum"))\
       .select("customer_id","order_date","product_name")\
       .filter(rowfilter).show()


+-----------+----------+------------+
|customer_id|order_date|product_name|
+-----------+----------+------------+
|          A|2021-01-01|       sushi|
|          B|2021-01-01|       curry|
|          C|2021-01-01|       ramen|
+-----------+----------+------------+



In [0]:
## Solución 4

from pyspark.sql.functions import count, max, col

## Articulo más comprado
df_join.groupBy("product_name")\
   .agg(count("product_name").alias("Cantidad"))\
   .sort(col("Cantidad").desc())\
   .first()


Out[20]: Row(product_name='ramen', Cantidad=8)

In [0]:
## Solución 4

## Cuantas Veces lo compro cada cliente
from pyspark.sql.functions import col
productfilter = col("product_name")=="ramen"
df_join.filter(productfilter)\
       .groupBy("customer_id")\
       .agg(count("product_name").alias("Cantidad"))\
       .show()

+-----------+--------+
|customer_id|Cantidad|
+-----------+--------+
|          B|       2|
|          C|       3|
|          A|       3|
+-----------+--------+



In [0]:
## Solución 5

from pyspark.sql.functions import col
df_join.groupBy("customer_id","product_name")\
       .agg(count("product_name").alias("Cantidad"))\
       .groupBy("customer_id")\
       .agg(max("product_name"),max(col("Cantidad")))\
       .show()


+-----------+-----------------+-------------+
|customer_id|max(product_name)|max(Cantidad)|
+-----------+-----------------+-------------+
|          A|            sushi|            3|
|          B|            sushi|            2|
|          C|            ramen|            3|
+-----------+-----------------+-------------+



In [0]:
## Solución 6

from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql import Row, functions as F

datefilter = col("join_date")<col("order_date")
rowfilter =col("rowNum")==1
df_join.filter(datefilter)\
       .groupBy("customer_id","product_name").agg(min("order_date").alias("min_order"))\
       .sort("customer_id",col("min_order").asc())\
       .select("customer_id","product_name","min_order",F.row_number().over(Window.partitionBy("customer_id").orderBy("customer_id")).alias("rowNum"))\
       .filter(rowfilter).show()

+-----------+------------+----------+------+
|customer_id|product_name| min_order|rowNum|
+-----------+------------+----------+------+
|          A|       ramen|2021-01-10|     1|
|          B|       sushi|2021-01-11|     1|
+-----------+------------+----------+------+



In [0]:
## Solución 7


from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql import Row, functions as F

datefilter = col("join_date")>=col("order_date")
rowfilter =col("rowNum")==1
df_join.filter(datefilter)\
       .groupBy("customer_id","product_name").agg(min("order_date").alias("min_order"))\
       .sort("customer_id",col("min_order").desc())\
       .select("customer_id","product_name","min_order",F.row_number().over(Window.partitionBy("customer_id").orderBy("customer_id")).alias("rowNum"))\
       .filter(rowfilter).show()


+-----------+------------+----------+------+
|customer_id|product_name| min_order|rowNum|
+-----------+------------+----------+------+
|          A|       sushi|2021-01-01|     1|
|          B|       sushi|2021-01-04|     1|
+-----------+------------+----------+------+



In [0]:
## Solución 8

from pyspark.sql.functions import count, max, col, sum

datefilter = col("join_date")>col("order_date")
df_join.filter(datefilter)\
       .groupBy("customer_id").agg(countDistinct("product_name"),sum("price"))\
       .show()


+-----------+-------------------+----------+
|customer_id|count(product_name)|sum(price)|
+-----------+-------------------+----------+
|          B|                  2|      40.0|
|          A|                  2|      25.0|
+-----------+-------------------+----------+



In [0]:
## Solución 9

from pyspark.sql.functions import count, max, col, sum, when

df_join.withColumn("puntos",when(col("product_name")=="sushi",col("price")*2*10)\
                  .when(col("product_name")!="sushi",col("price")*10))\
                  .groupBy("customer_id").agg(sum(col("puntos"))).show()


+-----------+-----------+
|customer_id|sum(puntos)|
+-----------+-----------+
|          B|      940.0|
|          C|      360.0|
|          A|      860.0|
+-----------+-----------+



In [0]:
## Solución 10

from pyspark.sql.functions import count, max, col, sum, when

df_join.withColumn("puntos",when( (col("product_name")=="sushi") & (col("order_date")<col("join_date")),col("price")*2*10)\
                  .when( (col("product_name")!="sushi") & (col("order_date")<col("join_date")),col("price")*10)\
                  .when( (col("order_date")>=col("join_date")),col("price")*2*10) )\
                  .filter(col("order_date")<'2021-01-31')\
                  .groupBy("customer_id").agg(sum(col("puntos")))\
                  .show()




+-----------+-----------+
|customer_id|sum(puntos)|
+-----------+-----------+
|          B|        940|
|          C|       null|
|          A|       1370|
+-----------+-----------+

