In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.master('local').config('spark.driver.memory', '8g').appName('Ready_for_analysis').getOrCreate()



In [2]:
df = spark.read.parquet(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/price_elasticity_model_data/part-*',
    header=True)


In [3]:
df.createOrReplaceTempView('model_data_table')



In [4]:
df.show()

+------------------+-----------+--------------+-----------+-----+
|         item_code|update_date|delivery_weeks| item_price|sales|
+------------------+-----------+--------------+-----------+-----+
|000000001000000008| 2019-01-26|             1|      13.99|    0|
|000000001000000008| 2019-04-12|             0|      13.99|    0|
|000000001000000008| 2019-06-22|             1|      13.99|    0|
|000000001000000008| 2019-08-13|             1|      13.99|    0|
|000000001000000008| 2019-12-27|             0|      13.99|    0|
|000000001000000013| 2018-12-18|             1|      39.99|    0|
|000000001000000013| 2020-01-30|             1|      39.99|    0|
|000000001000000013| 2020-10-26|             0|31.98999999|    0|
|000000001000000013| 2020-10-27|             0|31.98999999|    0|
|000000001000000014| 2019-04-19|             1|74.98999999|    0|
|000000001000000014| 2019-06-16|             1|74.98999999|    0|
|000000001000000014| 2019-11-10|             0|74.98999999|    0|
|000000001

In [5]:
df2 = spark.sql("SELECT * from model_data_table where update_date >= '2020-01-01' ORDER BY update_date ")

In [6]:
df2.createOrReplaceTempView('model_data_table_YTD')

In [7]:
one_item_df = spark.sql(" SELECT item_code, update_date, item_price, delivery_weeks, sales from model_data_table_YTD  "
                       "where item_code = '000000001000016021' ")

In [8]:
one_item_df.show()

+------------------+-----------+------------+--------------+-----+
|         item_code|update_date|  item_price|delivery_weeks|sales|
+------------------+-----------+------------+--------------+-----+
|000000001000016021| 2020-01-01|119.98999999|             5|   23|
|000000001000016021| 2020-01-02|119.98999999|             1|    7|
|000000001000016021| 2020-01-03|119.98999999|             4|   15|
|000000001000016021| 2020-01-04|119.98999999|             4|   16|
|000000001000016021| 2020-01-05|119.98999999|             4|   17|
|000000001000016021| 2020-01-06|119.98999999|             4|   18|
|000000001000016021| 2020-01-07|119.98999999|             4|    3|
|000000001000016021| 2020-01-08|119.98999999|             3|    8|
|000000001000016021| 2020-01-09|119.98999999|             3|    9|
|000000001000016021| 2020-01-10|119.98999999|             1|   18|
|000000001000016021| 2020-01-11|119.98999999|             1|    9|
|000000001000016021| 2020-01-12|119.98999999|             3|  

In [9]:
#CREATE A TABLE VIEW FOR ITEM_CODE = 000000001000016021
one_item_df.createOrReplaceTempView('top_seller')


In [10]:
TABLE_NAME = 'top_seller'

In [11]:
group_by_cons_rows = """WITH tmp_tbl AS (
      SELECT *,
             CASE
                 WHEN lag(delivery_weeks, 1) OVER (PARTITION BY delivery_weeks, item_price ORDER BY update_date ASC) =
                      delivery_weeks
                     THEN delivery_weeks
                 ELSE ROW_NUMBER() OVER (ORDER BY update_date)
                 END AS grouping_dw_col --filled with row number or delivery week if the previous one is part of the same group,
        FROM {table_name}
  ),
       tmp_tbl2 AS (
           SELECT update_date,
                  item_code,
                  sales,
                  item_price,
                  delivery_weeks,
                  CASE
                      WHEN lag(delivery_weeks, 1) OVER (PARTITION BY delivery_weeks, item_price ORDER BY update_date) =
                           delivery_weeks
                          THEN lag(grouping_dw_col, 1) OVER (ORDER BY update_date)
                      ELSE ROW_NUMBER() OVER (ORDER BY update_date)
                      END AS grouping_col_dw
             FROM tmp_tbl
       )
SELECT MIN(update_date)                   AS min_date,
       MAX(update_date)                   AS max_date,
       item_code,
       item_price,
       delivery_weeks,
       avg(sales),
       (datediff(MAX(update_date),MIN(update_date)) + 1) AS bin_width 
  FROM tmp_tbl2
 GROUP BY grouping_col_dw, delivery_weeks, item_price, item_code
 ORDER BY MIN(update_date)""".format(table_name= TABLE_NAME)

In [12]:
print(group_by_cons_rows)

WITH tmp_tbl AS (
      SELECT *,
             CASE
                 WHEN lag(delivery_weeks, 1) OVER (PARTITION BY delivery_weeks, item_price ORDER BY update_date ASC) =
                      delivery_weeks
                     THEN delivery_weeks
                 ELSE ROW_NUMBER() OVER (ORDER BY update_date)
                 END AS grouping_dw_col --filled with row number or delivery week if the previous one is part of the same group,
        FROM top_seller
  ),
       tmp_tbl2 AS (
           SELECT update_date,
                  item_code,
                  sales,
                  item_price,
                  delivery_weeks,
                  CASE
                      WHEN lag(delivery_weeks, 1) OVER (PARTITION BY delivery_weeks, item_price ORDER BY update_date) =
                           delivery_weeks
                          THEN lag(grouping_dw_col, 1) OVER (ORDER BY update_date)
                      ELSE ROW_NUMBER() OVER (ORDER BY update_date)
                      EN

In [13]:
df_try_grouping = spark.sql(group_by_cons_rows)

In [14]:
df_try_grouping.show()

+----------+----------+------------------+------------+--------------+------------------+---------+
|  min_date|  max_date|         item_code|  item_price|delivery_weeks|        avg(sales)|bin_width|
+----------+----------+------------------+------------+--------------+------------------+---------+
|2020-01-01|2020-01-01|000000001000016021|119.98999999|             5|              23.0|        1|
|2020-01-02|2020-08-14|000000001000016021|119.98999999|             1|              8.25|      226|
|2020-01-03|2020-01-04|000000001000016021|119.98999999|             4|              15.5|        2|
|2020-01-05|2020-01-07|000000001000016021|119.98999999|             4|12.666666666666666|        3|
|2020-01-08|2020-01-09|000000001000016021|119.98999999|             3|               8.5|        2|
|2020-01-10|2020-01-10|000000001000016021|119.98999999|             1|              18.0|        1|
|2020-01-11|2020-10-23|000000001000016021|119.98999999|             1|18.793357933579337|      287|


In [15]:
one_item_df.write.csv('/tmp/one_item_pel.csv')

In [16]:
one_item_df.toPandas().to_csv('/Users/gabriele.sabato/Downloads/one_item_pel.csv')
