In [1]:
from pyspark.sql import SparkSession
# Create the Spark session used by the cells below, or reuse the one that
# already exists in this kernel (getOrCreate). It is configured for local
# mode, a requested driver memory of 16g, and the application name 'Captain'.
spark = (
    SparkSession.builder
    .master('local')
    .config('spark.driver.memory', '16g')
    .appName('Captain')
    .getOrCreate()
)

import pandas as pd
import numpy as np


In [2]:
# Load the d_item table into a Spark DataFrame from the Parquet files matched
# by the trailing glob.
# NOTE(review): this is a machine-specific absolute path, while the CSV inputs
# in this notebook are read via '../raw_data/...' - presumably the same
# raw_data directory; confirm before unifying the two.
df_d_item = spark.read.parquet(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/redshift_tables/d_item/*'
)


In [3]:
# Expose df_d_item to Spark SQL under the view name 'd_item_tmp'
# (referenced by the join query further down).
df_d_item.createOrReplaceTempView('d_item_tmp')

In [4]:
# Load the two pre-campaign item-margin CSV exports (the "up" and "down" files
# tagged 202106). index_col=0 turns the first CSV column into the pandas index
# instead of a data column.
original_df_up = pd.read_csv(
    '../raw_data/Excel_files/Item_margin_up_pre_campaign_202106.csv',
    index_col=0,
)
original_df_down = pd.read_csv(
    '../raw_data/Excel_files/Item_margin_down_pre_campaign_202106.csv',
    index_col=0,
)

In [5]:
# Stack the two margin tables row-wise ("down" rows first, then "up") into one frame.
# ignore_index is left at its default, so each input keeps its own index labels.
frames = [original_df_down,original_df_up]
together_df = pd.concat(frames)

In [6]:
# Convert the combined pandas frame into a Spark DataFrame so it can be queried with Spark SQL.
# NOTE(review): the show() output of this frame lists no index column, so the pandas
# index is presumably not carried over - confirm if it is needed downstream.
df_spark_together = spark.createDataFrame(together_df)

In [9]:
# Register the combined margin data as the Spark SQL view 'margin_total'
# (referenced by the join query further down).
# NOTE(review): this cell's execution count (9) is higher than the next cell's (7), so the
# cells were run out of order; confirm the notebook passes a clean Restart & Run All.
df_spark_together.createOrReplaceTempView('margin_total')

In [7]:
# Sanity check: print a sample of the combined margin data as seen by Spark.
df_spark_together.show()

+----------+------------+----------+--------------+--------+------------+--------------------+-----------------+---------+-------------+-----------------+---------------+------+-------+------------------+--------------------+------------------+--------------------+--------------------+----------+--------------------+
| item_code|item_price_x|  max_date|delivery_weeks|sales_pe|sales_pe_err|       beta_del_week|beta_del_week_err|intercept|intercept_err|              pc2|      item_skey|     x|Delta_x|        Delta_cost|       Delta_N_Sales|       log_x_prime|       N_prime_sales|        Delta_Margin|Group_flag|           Increment|
+----------+------------+----------+--------------+--------+------------+--------------------+-----------------+---------+-------------+-----------------+---------------+------+-------+------------------+--------------------+------------------+--------------------+--------------------+----------+--------------------+
|1000054054|      329.99|2021-04-26|       

In [10]:
# Inspect d_item's schema as (column name, Spark type) pairs; the output shows the
# join key item_skey as 'bigint' and item_code as 'string'.
df_d_item.dtypes


[('item_skey', 'bigint'),
 ('item_code', 'string'),
 ('item_parent_item_code', 'string'),
 ('item_name', 'string'),
 ('item_parent_name', 'string'),
 ('item_brand', 'string'),
 ('item_designer', 'string'),
 ('item_series', 'string'),
 ('item_style', 'string'),
 ('item_colors', 'string'),
 ('item_materials', 'string'),
 ('item_gtin', 'string'),
 ('item_gtin_comparable', 'string'),
 ('item_picture_url_first', 'string'),
 ('item_pdp_url', 'string'),
 ('item_exclusivity', 'string'),
 ('item_is_trusted_shop', 'string'),
 ('item_is_in_catalog', 'string'),
 ('item_main_category', 'string'),
 ('item_sub_category_1', 'string'),
 ('item_sub_category_2', 'string'),
 ('item_type', 'string'),
 ('item_cross_plant_material_status', 'string'),
 ('item_cross_distribution_chain_material_status', 'string'),
 ('item_supplier_item_code', 'string'),
 ('item_supplier_regular', 'string'),
 ('item_supplier_regular_id', 'int'),
 ('item_first_activation_date', 'date'),
 ('item_first_sellable_date', 'date'),
 ('i

In [11]:
# Build the output dataset: item attributes from d_item (alias di) joined to the
# price / elasticity columns of the margin data (alias pe) on item_skey.
# - INNER JOIN: rows whose item_skey has no match on the other side are dropped.
# - item_code is taken from d_item; the show() output in the next cell displays it as a
#   zero-padded string (e.g. '000000001000054054').
# - price_pct_change is pe.Increment passed through unchanged; the displayed values are
#   strings such as '-14.000000000000002%'.
#   NOTE(review): looks like a float-formatting artifact from upstream - confirm whether
#   consumers of the CSV expect a numeric value instead.
df_captain_ready = spark.sql("""
SELECT di.item_code,
       di.item_main_category,
       di.item_inventory_flag,
       di.item_inco_terms,
       pe.item_price_x as price,
       pe.sales_pe as price_elasticity,
       pe.sales_pe_err as price_elasticity_err,
       pe.Increment as price_pct_change

  FROM d_item_tmp as di
       INNER JOIN margin_total as pe ON pe.item_skey = di.item_skey
""")

In [12]:
# Preview the joined result; show() prints a sample with long cell values truncated,
# not the full table.
df_captain_ready.show()

# NOTE(review): leftover commented-out call; df_list_top_seller is not defined in the visible cells.
#df_list_top_seller.show()



+------------------+--------------------+-------------------+---------------+------+----------------+--------------------+--------------------+
|         item_code|  item_main_category|item_inventory_flag|item_inco_terms| price|price_elasticity|price_elasticity_err|    price_pct_change|
+------------------+--------------------+-------------------+---------------+------+----------------+--------------------+--------------------+
|000000001000054054|     ESSZIMMERTISCHE|                mto|            DDP|329.99|         -0.8534|              0.1496|-14.000000000000002%|
|000000001000137922|              BETTEN|               laso|            EXW|629.99|          -1.952|              0.6018|-14.000000000000002%|
|000000001000007431|ESSZIMMERSTÜHLE&B...|               laso|            FOB|104.99|        -18.2928|              2.4617|-14.000000000000002%|
|000000001000048003|              WOHNEN|               laso|            DDP|349.99|         -2.7317|              0.3851|-14.0000000000

In [None]:
#df_item_visits_df.show()

In [15]:

# Collect the whole joined Spark result into a pandas DataFrame on the driver
# (every row is loaded into local memory).
pd_captain = df_captain_ready.toPandas()

In [16]:
# Persist the result as CSV. The pandas index is written as well (to_csv default),
# so the file gets an unnamed leading index column - the same layout the input CSVs
# are read with (index_col=0).
# NOTE(review): machine-specific absolute path - confirm before sharing the notebook.
pd_captain.to_csv(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/Fixed_increment_14_pct_captain_ready_202106.csv'
)


