In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.master('local').config('spark.driver.memory', '8g').appName('Ready_for_analysis').getOrCreate()



In [2]:
df = spark.read.parquet(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/price_elasticity_model_data/part-*',
    header=True)


In [3]:
df.createOrReplaceTempView('model_data_table')



In [4]:
df.show()

+------------------+-----------+--------------+-----------+-----+
|         item_code|update_date|delivery_weeks| item_price|sales|
+------------------+-----------+--------------+-----------+-----+
|000000001000000008| 2019-01-26|             1|      13.99|    0|
|000000001000000008| 2019-04-12|             0|      13.99|    0|
|000000001000000008| 2019-06-22|             1|      13.99|    0|
|000000001000000008| 2019-08-13|             1|      13.99|    0|
|000000001000000008| 2019-12-27|             0|      13.99|    0|
|000000001000000013| 2018-12-18|             1|      39.99|    0|
|000000001000000013| 2020-01-30|             1|      39.99|    0|
|000000001000000013| 2020-10-26|             0|31.98999999|    0|
|000000001000000013| 2020-10-27|             0|31.98999999|    0|
|000000001000000014| 2019-04-19|             1|74.98999999|    0|
|000000001000000014| 2019-06-16|             1|74.98999999|    0|
|000000001000000014| 2019-11-10|             0|74.98999999|    0|
|000000001

In [5]:
df_d_item = spark.read.parquet('/Users/gabriele.sabato/PycharmProjects/raw_data/redshift_tables/d_item/*')


In [6]:
df_d_item.createOrReplaceTempView('d_item_tmp')

In [7]:
df2 = spark.sql("SELECT * from model_data_table where update_date >= '2020-01-01' ORDER BY update_date ")

In [8]:
df4 = spark.sql ("SELECT item_code, item_main_category, item_sub_category_1, item_sub_category_2,item_parent_item_code from d_item_tmp")

In [9]:
df4.createOrReplaceTempView('d_item_filtered')
df2.createOrReplaceTempView('model_data_table_YTD')

In [10]:
list_of_top_seller_item_code = ['000000001000016021','000000001000016133','000000001000022708', '000000001000015958',
                       '000000001000016020','000000001000015959','000000001000016019','000000001000022573','000000001000015944',
                       '000000001000024281']

In [11]:
top_seller_list_string = """'000000001000016021','000000001000016133','000000001000022708', '000000001000015958',
                       '000000001000016020','000000001000015959','000000001000016019','000000001000022573','000000001000015944',
                       '000000001000024281'"""

In [12]:
sql_top_item_query = """SELECT item_code, update_date, item_price, delivery_weeks, sales from model_data_table_YTD
where item_code in ({lista})""".format(lista=top_seller_list_string)

In [13]:
print(sql_top_item_query)

SELECT item_code, update_date, item_price, delivery_weeks, sales from model_data_table_YTD
where item_code in ('000000001000016021','000000001000016133','000000001000022708', '000000001000015958',
                       '000000001000016020','000000001000015959','000000001000016019','000000001000022573','000000001000015944',
                       '000000001000024281')


In [14]:
sql_top_item_df = spark.sql(sql_top_item_query)

In [15]:
sql_top_item_df.show()

+------------------+-----------+------------+--------------+-----+
|         item_code|update_date|  item_price|delivery_weeks|sales|
+------------------+-----------+------------+--------------+-----+
|000000001000016019| 2020-01-01|119.98999999|             5|    0|
|000000001000016021| 2020-01-01|119.98999999|             5|   23|
|000000001000022708| 2020-01-01| 99.98999999|             1|    2|
|000000001000022573| 2020-01-01| 99.98999999|             2|    7|
|000000001000015959| 2020-01-01|      129.99|             0|    1|
|000000001000024281| 2020-01-01|       59.99|             1|   10|
|000000001000015958| 2020-01-01|      129.99|             1|   11|
|000000001000015944| 2020-01-01| 64.98999999|             2|    4|
|000000001000016020| 2020-01-01|119.98999999|             2|    8|
|000000001000016133| 2020-01-01|119.98999999|             1|    3|
|000000001000015958| 2020-01-02|      129.99|             6|   17|
|000000001000016020| 2020-01-02|119.98999999|             2|  

In [16]:
#CREATE A TABLE VIEW FOR Top10 sellers
sql_top_item_df.createOrReplaceTempView('top_sellers')


In [17]:
#JOIN THE TWO TABLEs
sql_join_query = " SELECT top.*, di.item_parent_item_code, di.item_main_category, di.item_sub_category_1, di.item_sub_category_2" \
                 "        from top_sellers as top " \
                 "INNER JOIN d_item_filtered as di on di.item_code = top.item_code "\
                 "ORDER BY top.update_date ASC"

In [18]:
print(sql_join_query)


 SELECT top.*, di.item_parent_item_code, di.item_main_category, di.item_sub_category_1, di.item_sub_category_2        from top_sellers as top INNER JOIN d_item_filtered as di on di.item_code = top.item_code ORDER BY top.update_date ASC


In [19]:
final_top_seller_df = spark.sql(sql_join_query)

In [20]:
final_top_seller_df.show()

+------------------+-----------+------------+--------------+-----+---------------------+--------------------+-------------------+-------------------+
|         item_code|update_date|  item_price|delivery_weeks|sales|item_parent_item_code|  item_main_category|item_sub_category_1|item_sub_category_2|
+------------------+-----------+------------+--------------+-----+---------------------+--------------------+-------------------+-------------------+
|000000001000016019| 2020-01-01|119.98999999|             5|    0|   000000008000001083|ESSZIMMERSTÜHLE&B...|    ESSZIMMERSTÜHLE|          KLASSISCH|
|000000001000016021| 2020-01-01|119.98999999|             5|   23|   000000008000001083|ESSZIMMERSTÜHLE&B...|    ESSZIMMERSTÜHLE|          KLASSISCH|
|000000001000022708| 2020-01-01| 99.98999999|             1|    2|   000000008000017295|MATRATZEN&LATTENR...|        LATTENROSTE|          FEDERHOLZ|
|000000001000022573| 2020-01-01| 99.98999999|             2|    7|   000000008000001979|MATRATZEN&LA

In [21]:
final_top_seller_df.describe()

DataFrame[summary: string, item_code: string, item_price: string, delivery_weeks: string, sales: string, item_parent_item_code: string, item_main_category: string, item_sub_category_1: string, item_sub_category_2: string]

In [22]:
pd_df_top_sellers = final_top_seller_df.toPandas()

In [23]:
import pickle
import pandas as pd
import numpy as np


In [24]:
pd_df_top_sellers.to_pickle('/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/Top_seller_YTD.pickle')

In [25]:
pd_df_top_sellers['log_price'] = np.log(pd_df_top_sellers['item_price'])

In [26]:
pd_df_top_sellers['log_delivery_weeks'] = np.log(pd_df_top_sellers['delivery_weeks']+1)

In [27]:
pd_df_top_sellers['log_sales'] = np.log(pd_df_top_sellers['sales']+1)


In [28]:
pd_df_top_sellers.to_pickle('/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/Top_seller_YTD.pickle')