In [81]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Create (or reuse) a local Spark session with 8g of driver memory for this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .config('spark.driver.memory', '8g')
    .appName('Ready_for_analysis')
    .getOrCreate()
)



In [82]:
# Load the price-elasticity model data (Parquet part files) into a Spark DataFrame.
# Parquet files embed their own schema, so the CSV-only `header` option that was
# previously passed here is dropped: the Parquet reader does not use it.
df = spark.read.parquet(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/price_elasticity_model_data/part-*'
)


In [83]:
# Make the model data queryable from Spark SQL as `model_data_table`.
df.createOrReplaceTempView(name='model_data_table')


In [84]:
# Preview the first 20 rows (the defaults of show(), spelled out).
df.show(n=20, truncate=True)

+------------------+-----------+--------------+-----------+-----+
|         item_code|update_date|delivery_weeks| item_price|sales|
+------------------+-----------+--------------+-----------+-----+
|000000001000000008| 2019-01-26|             1|      13.99|    0|
|000000001000000008| 2019-04-12|             0|      13.99|    0|
|000000001000000008| 2019-06-22|             1|      13.99|    0|
|000000001000000008| 2019-08-13|             1|      13.99|    0|
|000000001000000008| 2019-12-27|             0|      13.99|    0|
|000000001000000013| 2018-12-18|             1|      39.99|    0|
|000000001000000013| 2020-01-30|             1|      39.99|    0|
|000000001000000013| 2020-10-26|             0|31.98999999|    0|
|000000001000000013| 2020-10-27|             0|31.98999999|    0|
|000000001000000014| 2019-04-19|             1|74.98999999|    0|
|000000001000000014| 2019-06-16|             1|74.98999999|    0|
|000000001000000014| 2019-11-10|             0|74.98999999|    0|
|000000001

In [85]:
# Load the d_item table export (all files in the directory) as a Spark DataFrame.
df_d_item = spark.read.parquet(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/redshift_tables/d_item/*'
)


In [86]:
# Make the d_item data queryable from Spark SQL as `d_item_tmp`.
df_d_item.createOrReplaceTempView(name='d_item_tmp')

In [87]:
# Load the item_visits export, restricted by the glob to directories whose name
# contains "2020". The CSV-only `header` option that was previously passed here is
# dropped: Parquet files embed their schema and the Parquet reader does not use it.
df_item_visits = spark.read.parquet(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/redshift_tables/item_visits/*2020*/*.parquet'
)

In [88]:
# Item codes of the top sellers analysed in this notebook, kept as a plain list so
# they can be reused outside of SQL string building.
top_seller_item_codes = [
    '000000001000016021',
    '000000001000016133',
    '000000001000022708',
    '000000001000015958',
    '000000001000016020',
    '000000001000015959',
    '000000001000016019',
    '000000001000022573',
    '000000001000015944',
    '000000001000024281',
]

# Comma-separated, single-quoted codes ready to be dropped into a SQL `IN (...)` clause.
# Derived from the list above so the two can never drift apart, and free of the
# stray newlines/indentation a hand-typed multi-line literal injects into the SQL.
top_seller_list_string = ','.join("'{}'".format(code) for code in top_seller_item_codes)



In [89]:
# Expose the raw item-visit rows to Spark SQL under the name used by the query below.
df_item_visits.createOrReplaceTempView(name='item_visits')

# Per (item_code, meta_date): summed unique views for the selected item codes,
# keeping only rows with meta_date up to and including 2020-11-18.
df_item_visits_df = spark.sql(f"""SELECT item_code,
                                         meta_date,
                                        SUM(unique_views) as all_unique_views
                                    from item_visits
                                  where meta_date <= '2020-11-18'
                                  and item_code in ({top_seller_list_string})
                                  GROUP BY item_code, meta_date""")

In [94]:
# Model-data rows from 2020-01-01 onwards for the selected item codes, ordered by date.
df2 = spark.sql(
    "SELECT * from model_data_table where update_date >= '2020-01-01' "
    f"and item_code in ({top_seller_list_string}) ORDER BY update_date"
)

In [95]:
# Category and parent-item attributes from d_item for the selected item codes.
df4 = spark.sql(
    "SELECT item_code, item_main_category, item_sub_category_1, item_sub_category_2,item_parent_item_code "
    f"from d_item_tmp where item_code in ({top_seller_list_string})"
)

In [96]:
# Register the three filtered DataFrames as temp views so that later SQL cells
# can refer to them by name. The registrations are independent of each other.
df2.createOrReplaceTempView(name='model_data_table_YTD')
df_item_visits_df.createOrReplaceTempView(name='item_views_YTD')
df4.createOrReplaceTempView(name='d_item_filtered')

In [97]:
# Summary statistics of item_code in the aggregated views (row-count sanity check).
df_item_visits_df.describe('item_code').show(n=20, truncate=True)

+-------+------------------+
|summary|         item_code|
+-------+------------------+
|  count|              3230|
|   mean|    1.0000181616E9|
| stddev| 3318.297311634993|
|    min|000000001000015944|
|    max|000000001000024281|
+-------+------------------+



In [98]:
# LEFT JOIN of the model data with the per-day unique views, matched on
# item code and date; every model-data row is kept, views may be NULL.
sql_price_visit_join = (
    " SELECT mdt_YTD.*, it_YTD.all_unique_views FROM model_data_table_YTD as mdt_YTD LEFT JOIN item_views_YTD as it_YTD\n"
    "on mdt_YTD.item_code = it_YTD.item_code and mdt_YTD.update_date = it_YTD.meta_date "
)
print(sql_price_visit_join)

 SELECT mdt_YTD.*, it_YTD.all_unique_views FROM model_data_table_YTD as mdt_YTD LEFT JOIN item_views_YTD as it_YTD
on mdt_YTD.item_code = it_YTD.item_code and mdt_YTD.update_date = it_YTD.meta_date 


In [99]:
# Run the price/visit join and expose the result to SQL as `mdt_YTD_it`.
df_price_visit_join = spark.sql(sqlQuery=sql_price_visit_join)
df_price_visit_join.createOrReplaceTempView(name='mdt_YTD_it')

# Summary statistics of item_code after the join (row-count sanity check).
df_price_visit_join.describe('item_code').show(n=20, truncate=True)


+-------+------------------+
|summary|         item_code|
+-------+------------------+
|  count|              3230|
|   mean|    1.0000181616E9|
| stddev|3318.2973116408034|
|    min|000000001000015944|
|    max|000000001000024281|
+-------+------------------+



In [100]:
# Query that collapses consecutive days with an unchanged (item_price, delivery_weeks)
# pair into one row per item ("gaps and islands"), reading from the `mdt_YTD_it` view.
#
#   tmp_tbl  - per item, ordered by update_date: `ranking_col` is NULL when both
#              delivery_weeks and item_price equal the previous row's values; otherwise
#              it is the row's RANK() within the item. The first row of an item has no
#              previous row, so the lag comparison is NULL and the ELSE branch applies.
#   tmp_tbl2 - replaces each NULL `ranking_col` with the last non-NULL value among the
#              preceding rows of the same item (`last(..., True)` skips NULLs), so every
#              row of a run carries the rank of the run's first row as `ranks`.
#   final    - groups by (ranks, delivery_weeks, item_price, item_code) and returns the
#              run's first/last date, average sales, average unique views,
#              conv_rate = sum(sales) / sum(all_unique_views) (NULL when the view sum
#              is 0) and `bin`, the inclusive number of days between min and max date.
#
# NOTE(review): the inline SQL comment on `ranking_col` does not match the CASE
# expression (it yields NULL or the rank, never delivery_weeks) - confirm and reword.
# NOTE(review): all_unique_views comes from a LEFT JOIN and may be NULL; sum() would
# then skip those rows in the denominator while their sales still count in the
# numerator of conv_rate - confirm this is intended.
sql_top_item_query_group_by = """
WITH tmp_tbl AS (
      SELECT *,
             CASE
                 WHEN lag(delivery_weeks, 1) OVER (PARTITION BY item_code ORDER BY update_date ASC) =
                      delivery_weeks
                     AND
                      lag(item_price, 1) OVER (PARTITION BY item_code ORDER BY update_date ASC) =
                      item_price
                     THEN NULL
                 ELSE RANK() OVER (PARTITION BY item_code ORDER BY update_date)
                 END AS ranking_col --filled with row number or delivery week if the previous one is part of the same group,
        FROM mdt_YTD_it
  ),
       tmp_tbl2 AS (
           SELECT update_date,
                  item_code,
                  sales,
                  all_unique_views,
                  item_price,
                  delivery_weeks,
                  CASE
                      WHEN ranking_col IS NULL
                          THEN last(ranking_col, True) OVER (PARTITION BY item_code ORDER BY update_date ROWS BETWEEN UNBOUNDED PRECEDING and 1 PRECEDING)
                      ELSE ranking_col
                      END AS ranks
             FROM tmp_tbl
       )
SELECT CAST(MIN(update_date) as date)                     AS min_date,
       CAST (MAX(update_date) as date)                    AS max_date,
       item_code,
       item_price,
       delivery_weeks,
       avg(sales) as avg_sales,
       avg(all_unique_views) as avg_unique_views,
       CASE WHEN sum(all_unique_views) <> 0 THEN sum(sales)/sum(all_unique_views) ELSE NULL END as conv_rate,
       CAST (datediff(MAX(update_date), MIN(update_date) ) + 1 as int) AS bin
  FROM tmp_tbl2
 GROUP BY ranks, delivery_weeks, item_price, item_code
 ORDER BY item_code, min_date ASC;
"""

In [101]:
# Echo the grouping query for visual inspection.
print(sql_top_item_query_group_by, end='\n')


WITH tmp_tbl AS (
      SELECT *,
             CASE
                 WHEN lag(delivery_weeks, 1) OVER (PARTITION BY item_code ORDER BY update_date ASC) =
                      delivery_weeks
                     AND
                      lag(item_price, 1) OVER (PARTITION BY item_code ORDER BY update_date ASC) =
                      item_price
                     THEN NULL
                 ELSE RANK() OVER (PARTITION BY item_code ORDER BY update_date)
                 END AS ranking_col --filled with row number or delivery week if the previous one is part of the same group,
        FROM mdt_YTD_it
  ),
       tmp_tbl2 AS (
           SELECT update_date,
                  item_code,
                  sales,
                  all_unique_views,
                  item_price,
                  delivery_weeks,
                  CASE
                      WHEN ranking_col IS NULL
                          THEN last(ranking_col, True) OVER (PARTITION BY item_code ORDER BY update_date ROWS B

In [102]:
# Build the per-run (price / delivery-weeks) aggregation as a Spark DataFrame.
sql_top_item_df = spark.sql(sqlQuery=sql_top_item_query_group_by)

In [103]:
# Print up to 5000 rows with untruncated column values.
sql_top_item_df.show(n=5000, truncate=False)

+----------+----------+------------------+------------+--------------+------------------+------------------+--------------------+---+
|min_date  |max_date  |item_code         |item_price  |delivery_weeks|avg_sales         |avg_unique_views  |conv_rate           |bin|
+----------+----------+------------------+------------+--------------+------------------+------------------+--------------------+---+
|2020-01-01|2020-01-02|000000001000015944|64.98999999 |2             |8.5               |264.0             |0.032196969696969696|2  |
|2020-01-03|2020-01-03|000000001000015944|69.98999999 |2             |7.0               |317.0             |0.022082018927444796|1  |
|2020-01-04|2020-01-10|000000001000015944|69.98999999 |1             |7.142857142857143 |283.2857142857143 |0.025214321734745335|7  |
|2020-01-11|2020-01-22|000000001000015944|64.98999999 |1             |6.5               |274.8333333333333 |0.023650697392359005|12 |
|2020-01-23|2020-01-24|000000001000015944|64.98999999 |2      

In [104]:
# Expose the aggregated top-seller rows to Spark SQL as `top_sellers`.
sql_top_item_df.createOrReplaceTempView(name='top_sellers')


In [105]:
# Enrich every `top_sellers` row with the parent item code and category columns
# from `d_item_filtered` (inner join on item_code), ordered by the run's start date.
sql_join_query = (
    " SELECT top.*, di.item_parent_item_code, di.item_main_category, di.item_sub_category_1, di.item_sub_category_2"
    "        from top_sellers as top "
    "INNER JOIN d_item_filtered as di on di.item_code = top.item_code "
    "ORDER BY top.min_date ASC"
)

In [106]:
# Echo the join query for visual inspection.
print(sql_join_query, end='\n')


 SELECT top.*, di.item_parent_item_code, di.item_main_category, di.item_sub_category_1, di.item_sub_category_2        from top_sellers as top INNER JOIN d_item_filtered as di on di.item_code = top.item_code ORDER BY top.min_date ASC


In [107]:
# Execute the enrichment join; the result is the final Spark-side dataset.
final_top_seller_df = spark.sql(sqlQuery=sql_join_query)

In [108]:
# Print up to 3000 rows with untruncated column values.
final_top_seller_df.show(n=3000, truncate=False)


+----------+----------+------------------+------------+--------------+------------------+------------------+--------------------+---+---------------------+---------------------+-------------------+-------------------+
|min_date  |max_date  |item_code         |item_price  |delivery_weeks|avg_sales         |avg_unique_views  |conv_rate           |bin|item_parent_item_code|item_main_category   |item_sub_category_1|item_sub_category_2|
+----------+----------+------------------+------------+--------------+------------------+------------------+--------------------+---+---------------------+---------------------+-------------------+-------------------+
|2020-01-01|2020-01-05|000000001000016133|119.98999999|1             |10.6              |459.6             |0.023063533507397736|5  |000000008000001083   |ESSZIMMERSTÜHLE&BÄNKE|ESSZIMMERSTÜHLE    |KLASSISCH          |
|2020-01-01|2020-02-10|000000001000022708|99.98999999 |1             |3.1219512195121952|110.6829268292683 |0.028206258263552227

In [109]:
# Summary statistics of the run length in days (`bin`).
final_top_seller_df.describe('bin').show(n=20, truncate=True)

+-------+------------------+
|summary|               bin|
+-------+------------------+
|  count|               200|
|   mean|             16.15|
| stddev|39.848425124772774|
|    min|                 1|
|    max|               275|
+-------+------------------+



In [111]:
# Collect the joined Spark result to the driver as a pandas DataFrame.
pd_df_top_sellers = (
    final_top_seller_df
    .toPandas()
)


In [112]:
import pickle
import pandas as pd
import numpy as np


In [113]:
# Persist the frame as collected from Spark (before any derived columns are added).
pd_df_top_sellers.to_pickle(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/'
    'Top_seller_YTD_group_by_item_visit.pickle'
)

In [114]:
# Natural log of the item price, stored as a new column.
pd_df_top_sellers['log_price'] = np.log(pd_df_top_sellers.loc[:, 'item_price'])

In [115]:
# log(1 + delivery_weeks): the +1 shift keeps rows with 0 delivery weeks finite.
# np.log1p evaluates log(1 + x) directly and stays accurate for values near zero.
pd_df_top_sellers['log_delivery_weeks'] = np.log1p(pd_df_top_sellers['delivery_weeks'])

In [116]:
# log(1 + avg_sales): the +1 shift keeps runs with zero average sales finite.
# np.log1p evaluates log(1 + x) directly and stays accurate for values near zero.
pd_df_top_sellers['log_sales'] = np.log1p(pd_df_top_sellers['avg_sales'])


In [117]:
# log(1 + avg_unique_views), using the same +1 shift as the other log features.
# np.log1p evaluates log(1 + x) directly and stays accurate for values near zero.
pd_df_top_sellers['log_avg_unique_views'] = np.log1p(pd_df_top_sellers['avg_unique_views'])


In [118]:
# Persist the frame again, now including the log-transformed columns.
pd_df_top_sellers.to_pickle(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/'
    'Top_seller_YTD_group_by_item_visit.pickle'
)

In [120]:
# Summary statistics of all numeric columns (default quartiles, spelled out).
pd_df_top_sellers.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,item_price,delivery_weeks,avg_sales,avg_unique_views,conv_rate,bin,log_price,log_delivery_weeks,log_sales,log_avg_unique_views
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,99.71,2.57,9.940302,236.716368,0.050228,16.15,4.540459,1.156021,2.22498,5.285745
std,31.984632,1.841755,6.499238,136.753351,0.035966,39.848425,0.368547,0.476291,0.613488,0.678411
min,49.99,0.0,0.0,8.0,0.0,1.0,3.911823,0.0,0.0,2.197225
25%,59.99,1.0,6.0,155.1,0.025905,1.0,4.094178,0.693147,1.94591,5.050466
50%,119.99,2.0,8.583333,217.5,0.038462,3.0,4.787408,1.098612,2.259988,5.386783
75%,119.99,3.0,12.402941,296.5,0.058162,7.0,4.787408,1.386294,2.595474,5.69541
max,139.99,9.0,43.0,775.0,0.175824,275.0,4.941571,2.302585,3.78419,6.654153


In [76]:
# Persist the frame under the `Top_seller_YTD_group_by` file name.
# NOTE(review): this cell's execution count (76) is lower than that of every other
# cell in the notebook, so it was last run before them - confirm it is still meant
# to be executed with the current contents of `pd_df_top_sellers`.
pd_df_top_sellers.to_pickle(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/'
    'Top_seller_YTD_group_by.pickle'
)