In [0]:
dbutils.fs.mkdirs('dbfs:/FileStore/Salesdata')

Out[1]: True

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import MapType,StructType,StructField,StringType,IntegerType,ArrayType,DecimalType

In [0]:
sales_df=(
  spark.read.format('csv').option('header',True)
  .option('inferSchema',True)
  .load('dbfs:/FileStore/Salesdata/sales_short.csv')
  )

In [0]:
sales_df.printSchema()

root
 |-- Order_date: timestamp (nullable = true)
 |-- Order_id: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Address: string (nullable = true)



In [0]:
sales_df.show(5)

+-------------------+--------+--------------------+--------------------+
|         Order_date|Order_id|             Product|             Address|
+-------------------+--------+--------------------+--------------------+
|2020-01-22 21:25:00|  141234|              iPhone|944 Walnut St, Bo...|
|2020-02-28 14:15:00|  141235|Lightning Chargin...|185 Maple St, Por...|
|2019-03-17 13:33:00|  141236|    Wired Headphones|538 Adams St, San...|
|2019-03-05 20:33:00|  141237|    27in FHD Monitor|738 10th St, Los ...|
|2021-04-25 11:59:00|  141238|    Wired Headphones|387 10th St, Aust...|
+-------------------+--------+--------------------+--------------------+
only showing top 5 rows



In [0]:
order_df = spark.read.format("csv").option("header", "true").load('dbfs:/FileStore/Salesdata/sales_order_price.csv')
order_df.show(5)

+--------+---+----------+
|Order_id|qty|price_each|
+--------+---+----------+
|  141234|  1|       700|
|  141235|  1|     14.95|
|  141236|  2|     11.99|
|  141237|  1|    149.99|
|  141238|  1|     11.99|
+--------+---+----------+
only showing top 5 rows



Q1. Display the total price of each product ordered in 1st and 5th month of 2020
 1. 1st DF (sales) will contain all product and order_id that were purchased in 1st and 5th month of 2020. You can keep order date also for reference
 2. 2nd DF (order) will contain order_id and total_price (total_price = qty*price_each)]
 3. Now join the 1st and 2nd df on order id.
 4. Then group by on product and sum total price_

In [0]:
sales_df=sales_df.withColumn('Order_date_new',to_date(col('Order_date'),'yyyy-MM-ddHH:mm:ss.SSSZ'))


+-------------------+--------+--------------------+--------------------+--------------+
|         Order_date|Order_id|             Product|             Address|Order_date_new|
+-------------------+--------+--------------------+--------------------+--------------+
|2020-01-22 21:25:00|  141234|              iPhone|944 Walnut St, Bo...|    2020-01-22|
|2020-02-28 14:15:00|  141235|Lightning Chargin...|185 Maple St, Por...|    2020-02-28|
|2019-03-17 13:33:00|  141236|    Wired Headphones|538 Adams St, San...|    2019-03-17|
|2019-03-05 20:33:00|  141237|    27in FHD Monitor|738 10th St, Los ...|    2019-03-05|
|2021-04-25 11:59:00|  141238|    Wired Headphones|387 10th St, Aust...|    2021-04-25|
|2020-04-29 20:22:00|  141239|AAA Batteries (4-...|775 Willow St, Sa...|    2020-04-29|
|2019-04-26 12:16:00|  141240|27in 4K Gaming Mo...|979 Park St, Los ...|    2019-04-26|
|2019-04-05 12:04:00|  141241|USB-C Charging Cable|181 6th St, San F...|    2019-04-05|
|2019-05-01 10:30:01|  141242|Bo

In [0]:
sales_df.show(5)

+-------------------+--------+--------------------+--------------------+--------------+
|         Order_date|Order_id|             Product|             Address|Order_date_new|
+-------------------+--------+--------------------+--------------------+--------------+
|2020-01-22 21:25:00|  141234|              iPhone|944 Walnut St, Bo...|    2020-01-22|
|2020-02-28 14:15:00|  141235|Lightning Chargin...|185 Maple St, Por...|    2020-02-28|
|2019-03-17 13:33:00|  141236|    Wired Headphones|538 Adams St, San...|    2019-03-17|
|2019-03-05 20:33:00|  141237|    27in FHD Monitor|738 10th St, Los ...|    2019-03-05|
|2021-04-25 11:59:00|  141238|    Wired Headphones|387 10th St, Aust...|    2021-04-25|
+-------------------+--------+--------------------+--------------------+--------------+
only showing top 5 rows



In [0]:
sales_df.printSchema()

root
 |-- Order_date: timestamp (nullable = true)
 |-- Order_id: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Order_date_new: date (nullable = true)



In [0]:
df_product =sales_df.select('Order_id','Product','Order_date_new').filter("month(Order_date_new) in (1,5) and year(Order_date_new) in (2020)")
df_product.show(8)

+--------+---------------+--------------+
|Order_id|        Product|Order_date_new|
+--------+---------------+--------------+
|  141234|         iPhone|    2020-01-22|
|  141250|Vareebadd Phone|    2020-05-10|
|  141256|   Google Phone|    2020-01-29|
+--------+---------------+--------------+



In [0]:
df_order=order_df.withColumn('total_price',(col('qty') * col('price_each')))
df_order.show()

+--------+---+----------+-----------+
|Order_id|qty|price_each|total_price|
+--------+---+----------+-----------+
|  141234|  1|       700|      700.0|
|  141235|  1|     14.95|      14.95|
|  141236|  2|     11.99|      23.98|
|  141237|  1|    149.99|     149.99|
|  141238|  1|     11.99|      11.99|
|  141239|  1|      2.99|       2.99|
|  141240|  1|    389.99|     389.99|
|  141241|  1|     11.95|      11.95|
|  141242|  1|     99.99|      99.99|
|  141243|  1|       150|      150.0|
|  141244|  1|       150|      150.0|
|  141245|  1|      1700|     1700.0|
|  141246|  3|      2.99|       8.97|
|  141247|  1|    149.99|     149.99|
|  141248|  1|       300|      300.0|
|  141249|  1|    149.99|     149.99|
|  141250|  1|       400|      400.0|
|  141251|  1|       150|      150.0|
|  141252|  1|     11.95|      11.95|
|  141253|  1|      3.84|       3.84|
+--------+---+----------+-----------+
only showing top 20 rows



In [0]:
#join 
joined_df=df_product.join(df_order,df_product.Order_id ==df_order.Order_id ).select(df_product['*'],'total_price')

In [0]:
joined_df.show(6)

+--------+---------------+--------------+-----------+
|Order_id|        Product|Order_date_new|total_price|
+--------+---------------+--------------+-----------+
|  141234|         iPhone|    2020-01-22|      700.0|
|  141250|Vareebadd Phone|    2020-05-10|      400.0|
|  141256|   Google Phone|    2020-01-29|      600.0|
+--------+---------------+--------------+-----------+



In [0]:
group_df=joined_df.groupBy('Product').agg(sum('total_price').alias('total_price'))
group_df.show()

+---------------+-----------+
|        Product|total_price|
+---------------+-----------+
|         iPhone|      700.0|
|Vareebadd Phone|      400.0|
|   Google Phone|      600.0|
+---------------+-----------+



Q2. How many no of days between 1st USB-C Charging Cable and 2nd USB-C Charging Cable sales

In [0]:
sales_df=(
  spark.read.format('csv').option('header',True)
  .option('inferSchema',True)
  .load('dbfs:/FileStore/Salesdata/sales_short.csv')
  )

In [0]:
sales_df.filter(expr("Product ='USB-C Charging Cable'")).show(8)

+-------------------+--------+--------------------+--------------------+
|         Order_date|Order_id|             Product|             Address|
+-------------------+--------+--------------------+--------------------+
|2019-04-05 12:04:00|  141241|USB-C Charging Cable|181 6th St, San F...|
|2019-01-30 09:28:00|  141252|USB-C Charging Cable|220 9th St, Los A...|
|2019-02-09 20:55:00|  141255|USB-C Charging Cable|764 11th St, Los ...|
+-------------------+--------+--------------------+--------------------+



In [0]:
sales_df=sales_df.withColumn('Order_date',to_date(col('Order_date'),'yyyy-MM-ddHH:mm:ss.SSSZ'))

In [0]:
sales_df.show(8)

+----------+--------+--------------------+--------------------+
|Order_date|Order_id|             Product|             Address|
+----------+--------+--------------------+--------------------+
|2020-01-22|  141234|              iPhone|944 Walnut St, Bo...|
|2020-02-28|  141235|Lightning Chargin...|185 Maple St, Por...|
|2019-03-17|  141236|    Wired Headphones|538 Adams St, San...|
|2019-03-05|  141237|    27in FHD Monitor|738 10th St, Los ...|
|2021-04-25|  141238|    Wired Headphones|387 10th St, Aust...|
|2020-04-29|  141239|AAA Batteries (4-...|775 Willow St, Sa...|
|2019-04-26|  141240|27in 4K Gaming Mo...|979 Park St, Los ...|
|2019-04-05|  141241|USB-C Charging Cable|181 6th St, San F...|
+----------+--------+--------------------+--------------------+
only showing top 8 rows



In [0]:
from pyspark.sql.window import Window

wind_spec=Window.partitionBy('Product').orderBy('Order_date')

new_sales_df=sales_df.withColumn('prev_order_date',lag('Order_date').over(wind_spec))
new_sales_df.filter(expr("Product ='USB-C Charging Cable'")).show(10)

+----------+--------+--------------------+--------------------+---------------+
|Order_date|Order_id|             Product|             Address|prev_order_date|
+----------+--------+--------------------+--------------------+---------------+
|2019-01-30|  141252|USB-C Charging Cable|220 9th St, Los A...|           null|
|2019-02-09|  141255|USB-C Charging Cable|764 11th St, Los ...|     2019-01-30|
|2019-04-05|  141241|USB-C Charging Cable|181 6th St, San F...|     2019-02-09|
+----------+--------+--------------------+--------------------+---------------+



In [0]:
df_result =new_sales_df.filter(expr("Product ='USB-C Charging Cable'")).withColumn('days_diff', datediff(col('Order_date'),col('prev_order_date')))
df_result.show()

+----------+--------+--------------------+--------------------+---------------+---------+
|Order_date|Order_id|             Product|             Address|prev_order_date|days_diff|
+----------+--------+--------------------+--------------------+---------------+---------+
|2019-01-30|  141252|USB-C Charging Cable|220 9th St, Los A...|           null|     null|
|2019-02-09|  141255|USB-C Charging Cable|764 11th St, Los ...|     2019-01-30|       10|
|2019-04-05|  141241|USB-C Charging Cable|181 6th St, San F...|     2019-02-09|       55|
+----------+--------+--------------------+--------------------+---------------+---------+



Q3. Which month of 2020 has the most no of total sales

In [0]:
sales_df=sales_df.withColumn('Order_date',to_date('Order_date'))

In [0]:
sales_df.filter(expr(" Product ='27in FHD Monitor'")).show(4)

+----------+--------+----------------+--------------------+
|Order_date|Order_id|         Product|             Address|
+----------+--------+----------------+--------------------+
|2019-03-05|  141237|27in FHD Monitor|738 10th St, Los ...|
|2020-03-25|  141247|27in FHD Monitor|512 Wilson St, Sa...|
|2021-05-05|  141249|27in FHD Monitor|440 Cedar St, Por...|
+----------+--------+----------------+--------------------+



In [0]:
df_order.show(4)

+--------+---+----------+-----------+
|Order_id|qty|price_each|total_price|
+--------+---+----------+-----------+
|  141234|  1|       700|      700.0|
|  141235|  1|     14.95|      14.95|
|  141236|  2|     11.99|      23.98|
|  141237|  1|    149.99|     149.99|
+--------+---+----------+-----------+
only showing top 4 rows



In [0]:
sales_order_joined=sales_df.join(df_order,sales_df.Order_id==df_order.Order_id).select(sales_df['*'],'total_price')

In [0]:
sales_order_joined.show(6)

+----------+--------+--------------------+--------------------+-----------+
|Order_date|Order_id|             Product|             Address|total_price|
+----------+--------+--------------------+--------------------+-----------+
|2020-01-22|  141234|              iPhone|944 Walnut St, Bo...|      700.0|
|2020-02-28|  141235|Lightning Chargin...|185 Maple St, Por...|      14.95|
|2019-03-17|  141236|    Wired Headphones|538 Adams St, San...|      23.98|
|2019-03-05|  141237|    27in FHD Monitor|738 10th St, Los ...|     149.99|
|2021-04-25|  141238|    Wired Headphones|387 10th St, Aust...|      11.99|
|2020-04-29|  141239|AAA Batteries (4-...|775 Willow St, Sa...|       2.99|
+----------+--------+--------------------+--------------------+-----------+
only showing top 6 rows



In [0]:
from pyspark.sql import functions as F

(sales_order_joined
    .filter(F.year('Order_date') == 2020)
    .groupBy(F.month('Order_date').alias('month'))
    .agg(F.max('total_price').alias('max_sales'))
).show()


+-----+---------+
|month|max_sales|
+-----+---------+
|    1|    700.0|
|    3|    150.0|
|    5|    400.0|
|    4|     2.99|
|    2|    14.95|
+-----+---------+



Q4. How many months between the purchase of 1st and 3rd 27in FHD Monitor

In [0]:
sales_df.filter(expr(" Product ='27in FHD Monitor'")).show(4)

+----------+--------+----------------+--------------------+
|Order_date|Order_id|         Product|             Address|
+----------+--------+----------------+--------------------+
|2019-03-05|  141237|27in FHD Monitor|738 10th St, Los ...|
|2020-03-25|  141247|27in FHD Monitor|512 Wilson St, Sa...|
|2021-05-05|  141249|27in FHD Monitor|440 Cedar St, Por...|
+----------+--------+----------------+--------------------+



In [0]:
winspec=Window.partitionBy('Product').orderBy('Order_date')

df_fhd_prev_date = sales_df.withColumn('prev_day_3',lag('Order_date',2).over(winspec))
df_fhd_prev_date.filter(expr(" Product ='27in FHD Monitor'")).show(4)

+----------+--------+----------------+--------------------+----------+
|Order_date|Order_id|         Product|             Address|prev_day_3|
+----------+--------+----------------+--------------------+----------+
|2019-03-05|  141237|27in FHD Monitor|738 10th St, Los ...|      null|
|2020-03-25|  141247|27in FHD Monitor|512 Wilson St, Sa...|      null|
|2021-05-05|  141249|27in FHD Monitor|440 Cedar St, Por...|2019-03-05|
+----------+--------+----------------+--------------------+----------+



In [0]:

(df_fhd_prev_date.filter(F.expr("Product = '27in FHD Monitor'"))
    .withColumn('days_diff', F.months_between(F.col('prev_day_3'), F.col('Order_date')))
    ).show()


+----------+--------+----------------+--------------------+----------+---------+
|Order_date|Order_id|         Product|             Address|prev_day_3|days_diff|
+----------+--------+----------------+--------------------+----------+---------+
|2019-03-05|  141237|27in FHD Monitor|738 10th St, Los ...|      null|     null|
|2020-03-25|  141247|27in FHD Monitor|512 Wilson St, Sa...|      null|     null|
|2021-05-05|  141249|27in FHD Monitor|440 Cedar St, Por...|2019-03-05|    -26.0|
+----------+--------+----------------+--------------------+----------+---------+

