In [1]:
#IMPORTING SETUP PACKAGES
import statsmodels.formula.api as smf
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', 500)

In [2]:
number_of_top_items = '5000'
plot_name = number_of_top_items[0] + 'k'

#print(plot_name)

df = pd.read_pickle('/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/Top'+number_of_top_items+'_seller_YTD_group_by_item_visit_20210226.pickle')


In [3]:
print(df.item_code.nunique())



4488


In [4]:
print(df.dtypes)

min_date                  object
max_date                  object
item_code                 object
item_price               float64
delivery_weeks             int64
avg_sales                float64
avg_unique_views         float64
conv_rate                float64
bin                        int32
item_parent_item_code     object
item_main_category        object
item_sub_category_1       object
item_sub_category_2       object
log_price                float64
log_delivery_weeks       float64
log_sales                float64
log_avg_unique_views     float64
log_conv_rate            float64
dtype: object


In [5]:
def filter_df(DF = df,bins=6, avg_PDP = 19.):
    df_filt = DF[DF['bin'] > bins]
    df_items = df_filt.groupby('item_code').item_price.nunique() > 1
    df_filt  = df_filt.merge(df_items,on='item_code',how='inner')
    df_filt  = df_filt[df_filt['item_price_y']== True]
    #filter PDP
    #print(df_filt)
    df_filt = df_filt[df_filt['avg_unique_views'] > avg_PDP]
    #print(df_filt)
    return df_filt

In [6]:
df_filt = filter_df(df)


print(df_filt.item_code.nunique())


4181


In [7]:
#introducing the similarity score
sim_score = pd.read_parquet('/Users/gabriele.sabato/PycharmProjects/raw_data/recommendations_prod/part-00000-5f77392a-0bfa-4f0b-a4ae-d816ce2d71de-c000.snappy.parquet')

In [8]:
sim_score

Unnamed: 0,sku,recommended_sku,score
0,000000001000200072,000000001000200087,0.260000
1,000000001000200072,000000001000183538,0.216580
2,000000001000200072,000000001000142209,0.200000
3,000000001000200072,000000001000204906,0.190794
4,000000001000200072,000000001000183468,0.183980
...,...,...,...
3827600,000000001000064787,000000001000115010,0.203238
3827601,000000001000064787,000000001000112641,0.201339
3827602,000000001000064787,000000001000046204,0.200052
3827603,000000001000064787,000000001000114086,0.199867


In [9]:
# Filter table with only topX item_code

unique_item_codes = list(df_filt.item_code.unique())


sim_score_filt = sim_score[sim_score['sku'].isin(unique_item_codes)]

sim_score_filt

Unnamed: 0,sku,recommended_sku,score
780,000000001000135008,000000001000135012,0.881024
781,000000001000135008,000000001000121202,0.715546
782,000000001000135008,000000001000167468,0.633481
783,000000001000135008,000000001000166799,0.564164
784,000000001000135008,000000001000208980,0.544883
...,...,...,...
3827360,000000001000176744,000000001000171107,0.339302
3827361,000000001000176744,000000001000163202,0.335340
3827362,000000001000176744,000000001000163190,0.327887
3827363,000000001000176744,000000001000176438,0.324293


In [10]:
#Create a map with X subset for X linear fit:

def create_map(list_of_items = unique_item_codes , df=sim_score_filt, score_cut = 0.):
    map_of_items = {}
    for item_code in list_of_items:
        df_mod = df[df['sku'] == item_code]
        df_score_filt = df_mod[df_mod['score'] > score_cut]
        final_set_sku = list(df_score_filt.recommended_sku)
        final_set_sku.insert(0,item_code)
        map_of_items[item_code] = final_set_sku
    #print(map_of_items)
    return map_of_items


In [11]:
sim_score_bin = [0.16]#0.3,0.4,0.5,0.6,0.7,0.8,0.9]




In [12]:
map_of_map_of_items = {}
for sim_score_i in sim_score_bin:
    Map_of_Items = create_map(score_cut = sim_score_i)
    map_of_map_of_items[str(sim_score_i)] = Map_of_Items
#print(map_of_map_of_items)

In [13]:
#load greater dataset 
largest_dataset = '/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/All_catalogue_seller_YTD_group_by_item_visit_20210226.pickle'

#df_k = pd.read_pickle(dataset10k)
df_k = pd.read_pickle(largest_dataset)

In [14]:
PDP_bin = [19]#,30,40,50]

In [15]:
df_PDP_bin_filt = {}
for PDP_view in PDP_bin:
    df_k_filt = filter_df(df_k, avg_PDP= PDP_view)
    df_PDP_bin_filt[PDP_view] = df_k_filt
    #print(df_k_filt)

#print(df_PDP_bin_filt)


In [16]:
def create_map_of_dataframes(items_map = Map_of_Items,df=df_filt):
    map_of_df = {}
    for key in items_map.keys():
        items  = items_map[key]
        #print(f'Item_code = {key}')
        
        df_mod = df[df['item_code'].isin(items)]
        #
        #print(f'Number of items per cluster {df_mod.item_code.nunique()}')
        map_of_df[key]= df_mod
    return map_of_df



In [17]:
import numpy as np
def run_model(item_code, df, map_results):
    #print('\n')
    wls_cr_with_fe = smf.wls("conv_rate ~ log_price + log_delivery_weeks + C(item_code)", df, weights=df['bin']).fit()

    beta_log_price, beta_std_err_log_price = wls_cr_with_fe.params[-2].round(4),wls_cr_with_fe.bse[-2].round(4)
    beta_log_del, beta_std_err_log_del = wls_cr_with_fe.params[-1].round(4),wls_cr_with_fe.bse[-1].round(4)
    Result_log_price = 'Log_price  = ' + str(beta_log_price) + ' ± ' + str(beta_std_err_log_price)
    Result_log_del = 'Log_del_week = ' + str(beta_log_del) + ' ± ' + str(beta_std_err_log_del)
    flag_stat_ok = False
    if beta_log_price != 0 :
        flag_stat_ok = np.fabs((3*beta_std_err_log_price)/beta_log_price) < 1.
    #flag_stat_ok = fabs(beta_std_err_log_price/beta_log_price)

    if flag_stat_ok:
        #print(f'Enough statistics for the item = {item_code}')
        #print(Result_log_price)
        #print(Result_log_del)
        map_results[item_code] = [beta_log_price, beta_std_err_log_price, beta_log_del,beta_std_err_log_del, df.item_code.nunique(), True]
        #print(map_results[item_code])
        return True
    else:
        #print(f'Not enough statistics for the item = {item_code}')
        #print(Result_log_price)
        #print(Result_log_del)
        map_results[item_code] = [beta_log_price, beta_std_err_log_price, beta_log_del,beta_std_err_log_del, df.item_code.nunique(), False]
        #print(map_results[item_code])
        return False


In [18]:
df_result = pd.DataFrame()

map_of_maps_of_df = {}
#print('Cut on PDP, sim_score, result = ')
for PDP_cut in df_PDP_bin_filt.keys():
    for score_cut in  map_of_map_of_items.keys():
        map_of_dfs = create_map_of_dataframes(map_of_map_of_items[score_cut],df=df_PDP_bin_filt[PDP_cut])
        map_of_maps_of_df[PDP_cut] = map_of_dfs
        print(f'Number of items passing the PE cut {len(map_of_dfs)}')

        map_result = {}

        #print("Model used: conv_rate ~ log_price + log_delivery_weeks + C(item_code)\n")
        map_items_stat = {}
        for key in map_of_dfs.keys():
            #map_of_dfs[key].head()
            map_items_stat[key] = run_model(key, map_of_dfs[key], map_result)

            #print(map_result)

        items_stat_series = pd.Series(map_items_stat)
        #print(map_items_stat)
        #print(items_stat_series)


        
        x,y, y_err, rel_err = {'True':[],'False':[]},{'True':[],'False':[]},{'True':[],'False':[]},{'True':[],'False':[]}
        for i_code,flag in items_stat_series.items():
        #flag=T
            if flag == True:

                y['True'].append(map_result[i_code][0])
                y_err['True'].append(map_result[i_code][1])
                x['True'].append(map_result[i_code][-2])
                #print('True',map_result[i_code][0],map_result[i_code][1])
                rel_err['True'].append(abs(map_result[i_code][1]/map_result[i_code][0]))


            else:
                y['False'].append(map_result[i_code][0])
                y_err['False'].append(map_result[i_code][1])
                x['False'].append(map_result[i_code][-2])
                #print('False',map_result[i_code][0],map_result[i_code][1])
                rel_err['False'].append(abs(map_result[i_code][1]/map_result[i_code][0]))

                continue
        #print(f'{PDP_cut},{score_cut} , {len(x)}')
        df_result = pd.DataFrame.from_dict(map_result, orient='index', columns = ['PE', 'PE_err', 'log_del_week', 'log_del_week_err', 'Cluster_size', 'has_enough_stat'])
        #print (df_result.head(100))

Number of items passing the PE cut 4181


  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  return np.dot(wresid, wresid) / self.df_resid
  rel_err['False'].append(abs(map_result[i_code][1]/map_result[i_code][0]))


In [19]:
df_result_true = df_result[df_result['has_enough_stat']==True]
print(len(df_result_true.index))


1907


In [20]:
df_result_false = df_result[df_result['has_enough_stat']==False]
items_false = list(df_result_false.index.values)
print(len(items_false))

2274


In [21]:
import pickle
def save_obj(obj, name ):
    with open('pickle_files/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


In [22]:
df_result_true = df_result_true[['PE','PE_err']]

In [23]:
#df_result_true = df_result_true.rename(column_index={'':'item_code'})
print(df_result_true.head(1000))
print(df_result_true.dtypes)

                        PE  PE_err
000000001000012530 -0.0187  0.0059
000000001000021922 -0.0194  0.0025
000000001000005727 -0.0246  0.0044
000000001000007327 -0.0229  0.0049
000000001000050787 -0.0284  0.0087
...                    ...     ...
000000001000053282 -0.0138  0.0028
000000001000007033 -0.0054  0.0016
000000001000007443 -0.0217  0.0054
000000001000008013 -0.0142  0.0038
000000001000008067 -0.0294  0.0048

[1000 rows x 2 columns]
PE        float64
PE_err    float64
dtype: object


In [24]:
#PREPARE THE CONVERSION RATE FILE
final_items = list(df_result_true.index.values)
print(len(final_items))



1907


In [31]:
print(df_filt.head(10))

      min_date    max_date           item_code  item_price_x  delivery_weeks  \
0   2020-01-01  2020-05-23  000000001000012530         99.99               1   
2   2020-05-31  2020-06-26  000000001000012530         99.99               1   
5   2020-08-06  2020-08-20  000000001000012530         99.99               1   
7   2020-09-16  2020-10-31  000000001000012530         99.99               1   
8   2020-11-01  2020-11-07  000000001000012530         99.99               1   
9   2020-11-10  2020-11-22  000000001000012530         99.99               1   
11  2020-12-08  2020-12-27  000000001000012530         99.99               1   
12  2021-01-05  2021-02-13  000000001000012530         99.99               1   
13  2020-01-01  2020-03-13  000000001000021922        129.99               2   
14  2020-04-15  2020-04-21  000000001000021922        119.99               4   

    avg_sales  avg_unique_views  conv_rate  bin item_parent_item_code  \
0    0.520833         35.743056   0.014573  14

In [32]:
df_final = df_filt[['item_code','max_date','item_price_x','conv_rate','avg_sales', 'avg_unique_views']]

print(df_final.head())

            item_code    max_date  item_price_x  conv_rate  avg_sales  \
0  000000001000012530  2020-05-23         99.99   0.014573   0.520833   
2  000000001000012530  2020-06-26         99.99   0.008923   0.444444   
5  000000001000012530  2020-08-20         99.99   0.026937   0.533333   
7  000000001000012530  2020-10-31         99.99   0.013050   0.478261   
8  000000001000012530  2020-11-07         99.99   0.021128   1.714286   

   avg_unique_views  
0         35.743056  
2         49.814815  
5         19.800000  
7         36.652174  
8         81.142857  


In [33]:
df_final= df_final[df_filt['item_code'].isin(final_items)]

In [34]:
print(df_final.head())

            item_code    max_date  item_price_x  conv_rate  avg_sales  \
0  000000001000012530  2020-05-23         99.99   0.014573   0.520833   
2  000000001000012530  2020-06-26         99.99   0.008923   0.444444   
5  000000001000012530  2020-08-20         99.99   0.026937   0.533333   
7  000000001000012530  2020-10-31         99.99   0.013050   0.478261   
8  000000001000012530  2020-11-07         99.99   0.021128   1.714286   

   avg_unique_views  
0         35.743056  
2         49.814815  
5         19.800000  
7         36.652174  
8         81.142857  


In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
spark = SparkSession.builder.master('local').config('spark.driver.memory', '8g').appName('Ready_for_analysis').getOrCreate()

In [36]:
df_to_spark = df_final.groupby(['item_code']).nth(-1)
df_to_spark.reset_index(level=0,inplace=True)
df_to_spark.rename(columns={'index':'item_code'}, inplace=True)
spark_check = spark.createDataFrame(df_to_spark)


In [37]:
spark_check.show()

+------------------+----------+------------+--------------------+-------------------+------------------+
|         item_code|  max_date|item_price_x|           conv_rate|          avg_sales|  avg_unique_views|
+------------------+----------+------------+--------------------+-------------------+------------------+
|000000001000000145|2020-06-04|      169.99|0.019559359621451104| 1.3880597014925373| 70.97014925373135|
|000000001000002470|2021-02-25|       54.99| 0.10219078102189781|                2.0|19.571428571428573|
|000000001000002471|2021-02-16|       53.99| 0.07605733802816901|             1.6875|           22.1875|
|000000001000002479|2021-02-08|       79.99| 0.04507437526205451| 3.3076923076923075| 73.38461538461539|
|000000001000004855|2021-02-04|      799.99|0.004241882103477523|0.23809523809523808|56.142857142857146|
|000000001000005211|2021-02-25|     1299.99|0.007408407407407408|0.14285714285714285|19.285714285714285|
|000000001000005213|2020-02-06|      599.99|0.012404100

In [38]:
spark_df = spark.createDataFrame(df_final)

In [39]:
spark_df.createOrReplaceTempView('Final_table')


In [40]:
spark_last_price = spark.sql("""with max_price as (SELECT item_code, ROW_NUMBER() over (PARTITION BY item_code ORDER BY max_date DESC) as rn, item_price_x, max_date
                            from Final_table)
                            SELECT item_code, item_price_x, max_date
                            from max_price
                            where rn=1
                            """)
spark_last_price.show()

+------------------+------------+----------+
|         item_code|item_price_x|  max_date|
+------------------+------------+----------+
|000000001000012530|       99.99|2021-02-13|
|000000001000021922|      114.99|2021-02-16|
|000000001000022179|      479.99|2021-02-06|
|000000001000034743|      279.99|2021-02-04|
|000000001000145006|      289.99|2021-02-04|
|000000001000005727|       99.99|2021-02-25|
|000000001000007327|      209.99|2021-02-25|
|000000001000008870|      619.99|2021-02-11|
|000000001000016028|      139.99|2021-02-25|
|000000001000050787|      369.99|2021-02-20|
|000000001000114877|      119.99|2021-02-25|
|000000001000117179|      319.99|2021-02-25|
|000000001000128764|      164.99|2021-02-11|
|000000001000164660|      729.99|2021-02-25|
|000000001000166801|      174.99|2021-02-25|
|000000001000175476|       83.99|2021-02-25|
|000000001000176734|      159.99|2021-02-08|
|000000001000194074|       77.99|2021-02-25|
|000000001000008003|      429.99|2021-02-25|
|000000001

In [41]:
spark_last_price.createOrReplaceTempView('last_price')


In [42]:
spark_check.createOrReplaceTempView('check_last_price')


In [43]:
disjoint_spark = spark.sql("""SELECT *
                                from check_last_price as clp
                                inner join last_price as lp on lp.item_code = clp.item_code and lp.item_price_x!= clp.item_price_x  """)

disjoint_spark.show()

+---------+--------+------------+---------+---------+----------------+---------+------------+--------+
|item_code|max_date|item_price_x|conv_rate|avg_sales|avg_unique_views|item_code|item_price_x|max_date|
+---------+--------+------------+---------+---------+----------------+---------+------------+--------+
+---------+--------+------------+---------+---------+----------------+---------+------------+--------+



In [44]:
spark_filter = spark.sql("""SELECT ft.item_code, AVG(ft.conv_rate) as avg_conv_rate, MAX(ft.item_price_x) as last_price, MAX(ft.max_date) as last_date, AVG(avg_unique_views) as avg_unique_views
                            from Final_table as ft
                            inner join last_price as lp on lp.item_code = ft.item_code and lp.item_price_x = ft.item_price_x
                            GROUP BY ft.item_code
                            """)
spark_filter.show()

+------------------+--------------------+----------+----------+------------------+
|         item_code|       avg_conv_rate|last_price| last_date|  avg_unique_views|
+------------------+--------------------+----------+----------+------------------+
|000000001000021922|0.023490245459702064|    114.99|2021-02-16| 90.11309523809524|
|000000001000022179|0.003121343752816...|    479.99|2021-02-06| 93.25873910040576|
|000000001000145006|0.004873447305681728|    289.99|2021-02-04| 188.1904761904762|
|000000001000034743|0.005481390945972513|    279.99|2021-02-04|110.20249433106576|
|000000001000012530|0.016219531350040728|     99.99|2021-02-13| 66.08138736263736|
|000000001000175476|0.055747730901582936|     83.99|2021-02-25| 80.72222222222223|
|000000001000194074|0.024822180486446917|     77.99|2021-02-25| 97.43777360850532|
|000000001000114877| 0.01852575581003705|    119.99|2021-02-25|269.90909090909093|
|000000001000166801|0.020149259328747133|    174.99|2021-02-25| 44.92005096352923|
|000

In [45]:
spark_filter.createOrReplaceTempView('Final_table_filt')



In [46]:

df_result_true.reset_index(level=0,inplace=True)



In [47]:
df_result_true.rename(columns={'index':'item_code'}, inplace=True)

In [48]:
print(df_result_true.head())

            item_code      PE  PE_err
0  000000001000012530 -0.0187  0.0059
1  000000001000021922 -0.0194  0.0025
2  000000001000005727 -0.0246  0.0044
3  000000001000007327 -0.0229  0.0049
4  000000001000050787 -0.0284  0.0087


In [49]:
spark_PE_df = spark.createDataFrame(df_result_true)

In [50]:
spark_PE_df.show()

+------------------+-------+------+
|         item_code|     PE|PE_err|
+------------------+-------+------+
|000000001000012530|-0.0187|0.0059|
|000000001000021922|-0.0194|0.0025|
|000000001000005727|-0.0246|0.0044|
|000000001000007327|-0.0229|0.0049|
|000000001000050787|-0.0284|0.0087|
|000000001000117179|-0.0242|0.0072|
|000000001000164660| -0.003|7.0E-4|
|000000001000176734|-0.0149| 0.003|
|000000001000008470|-0.0366|0.0073|
|000000001000064521| -0.007|0.0022|
|000000001000124600|-0.0145|0.0034|
|000000001000159178|-0.0215|0.0058|
|000000001000174312|-0.0096|0.0023|
|000000001000174905|-0.0245|0.0054|
|000000001000176716|-0.0258|0.0047|
|000000001000008827|-0.0136|0.0024|
|000000001000012817|-0.0128|0.0025|
|000000001000022615|-0.0625|0.0131|
|000000001000027423|-0.0089| 0.002|
|000000001000032520|-0.0429|0.0046|
+------------------+-------+------+
only showing top 20 rows



In [51]:
spark_PE_df.createOrReplaceTempView('final_PE')

In [52]:
spark_final = spark.sql("""SELECT ftf.*, fpe.PE, fpe.PE_err
                            from Final_table_filt as ftf
                            INNER JOIN final_PE as fpe on fpe.item_code = ftf.item_code""")
spark_final.show()
spark_final.createOrReplaceTempView('spark_final_tmp')

+------------------+--------------------+----------+----------+------------------+-------+------+
|         item_code|       avg_conv_rate|last_price| last_date|  avg_unique_views|     PE|PE_err|
+------------------+--------------------+----------+----------+------------------+-------+------+
|000000001000012530|0.016219531350040728|     99.99|2021-02-13| 66.08138736263736|-0.0187|0.0059|
|000000001000021922|0.023490245459702064|    114.99|2021-02-16| 90.11309523809524|-0.0194|0.0025|
|000000001000022179|0.003121343752816...|    479.99|2021-02-06| 93.25873910040576|-0.0136|0.0033|
|000000001000034743|0.005481390945972513|    279.99|2021-02-04|110.20249433106576|-0.0186|0.0048|
|000000001000145006|0.004873447305681728|    289.99|2021-02-04| 188.1904761904762|-0.0086|0.0025|
|000000001000005727|0.017079081037479026|     99.99|2021-02-25| 170.0990338164251|-0.0246|0.0044|
|000000001000007327| 0.01119257340355497|    209.99|2021-02-25|46.738461538461536|-0.0229|0.0049|
|000000001000008870|

In [53]:
spark.sql("""SELECT COUNT(DISTINCT item_code)
from spark_final_tmp""").show()


+-------------------------+
|count(DISTINCT item_code)|
+-------------------------+
|                     1907|
+-------------------------+



In [54]:
spark.sql("""SELECT COUNT(item_code)
from spark_final_tmp""").show()



+----------------+
|count(item_code)|
+----------------+
|            1907|
+----------------+



In [55]:
#Introduce d_item table
sp_df_item = spark.read.parquet(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/redshift_tables/d_item/*.parquet',
    header=True)
sp_df_item.show()

+---------------+------------------+---------------------+-------------+----------------+----------+-------------+-----------+----------+-----------+--------------+-------------+--------------------+----------------------+--------------------+------------------+--------------------+------------------+-------------------+-------------------+---------+--------------------------------+---------------------------------------------+-----------------------+---------------------+------------------------+--------------------------+------------------------+--------------------+-----------+-------------------+------------------+-------------------+----------------------------+---------------+-----------------------+-----------------------+----------------------+------------------------------------+-------------------------------------------+-------------------------------------------------+--------------------------------------------------------+------------------------+---------------------+---

In [56]:
#Introduce PC2 table
sp_df_pc2 = spark.read.parquet(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/redshift_tables/f_item_status_last/*.parquet',
    header=True)
sp_df_pc2.show()

+-----------+---------+----------------+-----------------------+--------------------------+--------------------------+--------------------------+--------------------------------+---------------------------------+---------------------------------------+
|  item_skey|shop_skey|item_status_skey|item_status_weekly_skey|item_pc1_costs_before_cogs|item_pc2_costs_before_cogs|group_pc2_margin_simulated|group_pc2_margin_ratio_simulated|group_pc2_margin_simulated_pocket|group_pc2_margin_ratio_simulated_pocket|
+-----------+---------+----------------+-----------------------+--------------------------+--------------------------+--------------------------+--------------------------------+---------------------------------+---------------------------------------+
| 4184608796|        1|               4|                      4|                   674.169|                   800.340|                      null|                            null|                             null|                             

In [57]:
sp_df_item.createOrReplaceTempView('d_item_tmp')
sp_df_pc2.createOrReplaceTempView('d_item_pc2')

In [58]:
pc2_table = spark.sql(""" WITH pc2_avg as (
                            SELECT item_skey,
                            item_pc2_costs_before_cogs as pc2
                            from d_item_pc2
                            where shop_skey = 1)
                            SELECT pc.item_skey,di.item_code, pc.pc2
                            from d_item_tmp as di
                            INNER JOIN pc2_avg as pc on pc.item_skey = di.item_skey
                            where di.item_code like '0%'
                            """)
pc2_table.show()


+------------+------------------+-------+
|   item_skey|         item_code|    pc2|
+------------+------------------+-------+
|  4184608796|000000001000157424|800.340|
| 43499951512|000000001000114687|342.220|
| 63209271384|000000001000193143|113.050|
| 64339265064|000000001000038173|307.399|
| 67389826157|000000001000232523| 13.970|
|178212850851|000000001000220797|111.220|
|251843179141|000000001000125573|510.380|
|321153977917|000000001000151885|532.309|
|324513273733|000000001000176129| 29.980|
|360959333013|000000001000222621| 73.629|
|391563460770|000000001000061915|212.229|
|414751597565|000000001000024545| 65.040|
|451907443162|000000001000103142|285.139|
|453294840650|000000001000047001|146.060|
|462665822707|000000001000112387| 46.340|
|464745354617|000000001000020098|346.680|
|518044503882|000000001000016906|619.179|
|526193144479|000000001000010513| 33.609|
|551043742079|000000001000219102| 13.289|
|556925096750|000000001000187198| 15.800|
+------------+------------------+-

In [59]:
pc2_table.createOrReplaceTempView('pc2_table')

In [60]:
spark.sql("""SELECT COUNT(DISTINCT item_skey) FROM pc2_table""").show()
spark.sql("""SELECT COUNT(item_skey) FROM pc2_table""").show()

+-------------------------+
|count(DISTINCT item_skey)|
+-------------------------+
|                   229627|
+-------------------------+

+----------------+
|count(item_skey)|
+----------------+
|          229627|
+----------------+



In [61]:
spark_pc2_final = spark.sql(""" SELECT sp.*, pc.pc2, pc.item_skey
                                from spark_final_tmp as sp
                                 INNER JOIN pc2_table as pc on sp.item_code = pc.item_code
                                 """)

spark_pc2_final.show()

+------------------+--------------------+----------+----------+------------------+-------+------+-------+---------------+
|         item_code|       avg_conv_rate|last_price| last_date|  avg_unique_views|     PE|PE_err|    pc2|      item_skey|
+------------------+--------------------+----------+----------+------------------+-------+------+-------+---------------+
|000000001000012530|0.016219531350040728|     99.99|2021-02-13| 66.08138736263736|-0.0187|0.0059| 44.680|247523697279378|
|000000001000021922|0.023490245459702064|    114.99|2021-02-16| 90.11309523809524|-0.0194|0.0025|   null|156995084198890|
|000000001000022179|0.003121343752816...|    479.99|2021-02-06| 93.25873910040576|-0.0136|0.0033|276.870| 11954970507506|
|000000001000034743|0.005481390945972513|    279.99|2021-02-04|110.20249433106576|-0.0186|0.0048|171.030|206871238538442|
|000000001000145006|0.004873447305681728|    289.99|2021-02-04| 188.1904761904762|-0.0086|0.0025|188.819|280637889289282|
|000000001000005727|0.01

In [62]:

df_final_spark = spark_pc2_final.toPandas()

In [63]:
import math
import numpy as np


In [64]:
df_final_spark=df_final_spark.astype({'pc2': 'float64'})

print(df_final_spark.dtypes)

df_final_spark.dropna(inplace=True)

item_code            object
avg_conv_rate       float64
last_price          float64
last_date            object
avg_unique_views    float64
PE                  float64
PE_err              float64
pc2                 float64
item_skey             int64
dtype: object


In [65]:
#df_final_spark= df_final_spark[df_final_spark['PE'] < 0]


In [66]:
VAT = 1.19



In [67]:
def create_Margin( increment = 0.01,dataf= df_final_spark,):

    #dataf['price_up']   = dataf['last_price']*math.exp(increment)
    dataf['price_down'] = dataf['last_price']*math.exp(-increment)
    #dataf['conv_rate_up'] =   dataf['avg_conv_rate']*np.exp(dataf['PE']*np.log(dataf['price_up']/dataf['last_price']))
    dataf['conv_rate_down'] = dataf['avg_conv_rate']*np.exp(dataf['PE']*np.log(dataf['price_down']/dataf['last_price']))
    #dataf['conv_rate_up_err']   = np.fabs(dataf['conv_rate_up'])*np.fabs(dataf['PE_err']*np.log(dataf['price_up']/dataf['last_price']))
    dataf['conv_rate_down_err'] = np.fabs(dataf['conv_rate_down'])*np.fabs(dataf['PE_err']*np.log(dataf['price_down']/dataf['last_price']))

    dataf['Delta_price'] = (dataf['last_price']/VAT - dataf['pc2'])
    #dataf['Margin_up']     = dataf['avg_unique_views']*dataf['conv_rate_up']*(dataf['Delta_price'])
    dataf['Margin_down']   = dataf['avg_unique_views']*dataf['conv_rate_down']*(dataf['Delta_price'])
    if increment == 0.01:
        dataf['Margin_base']   = dataf['avg_unique_views']*dataf['avg_conv_rate']*(dataf['Delta_price'])

    #Propagate Margin error
    #dataf['Margin_up_err']     = dataf['avg_unique_views']*dataf['conv_rate_up_err']*(dataf['Delta_price'])
    dataf['Margin_down_err']   = dataf['avg_unique_views']*dataf['conv_rate_down_err']*(dataf['Delta_price'])

    #dataf['Group_flag'] = np.where(dataf['Margin_base'] < (dataf['Margin_up'] - 3*dataf['Margin_up_err']), 'up',
    #                                    np.where(dataf['Margin_base'] < (dataf['Margin_down'] - 3*dataf['Margin_down_err']), 'down','neutral'))
    dataf['Group_flag'] = np.where(dataf['Margin_base'] < (dataf['Margin_down'] - 3*dataf['Margin_down_err']), 'down','neutral')




In [68]:
print(df_final_spark[['item_code','avg_conv_rate','last_price']].head())
#create_Margin(0.01)

            item_code  avg_conv_rate  last_price
0  000000001000012530       0.016220       99.99
2  000000001000022179       0.003121      479.99
3  000000001000034743       0.005481      279.99
4  000000001000145006       0.004873      289.99
5  000000001000005727       0.017079       99.99


In [69]:
original_df= df_final_spark
tmp_df = pd.DataFrame()


In [70]:
for inc in np.linspace(0.01,0.1,10):
    print('Increment = ' + str(inc))
    create_Margin(inc,original_df)
    #print(original_df)
    tmp_df = tmp_df.append(original_df[original_df['Group_flag'] == 'neutral'])
    original_df = original_df[original_df['Group_flag'] != 'neutral']
    #original_df['last_price']    = original_df['price_down']
    #original_df['avg_conv_rate'] = original_df['conv_rate_down']
    original_df['Margin_base']   = original_df['Margin_down']
    #print(original_df[['item_code','avg_conv_rate','conv_rate_down','last_price','price_down','Margin_base','Margin_down']].head())
    print(len(original_df.index))
    print(len(tmp_df.index))



Increment = 0.01
1887
13
Increment = 0.020000000000000004
356
1544
Increment = 0.030000000000000006
23
1877
Increment = 0.04000000000000001
1
1899
Increment = 0.05000000000000001
0
1900
Increment = 0.06000000000000001
0
1900
Increment = 0.07
0
1900
Increment = 0.08
0
1900
Increment = 0.09000000000000001
0
1900
Increment = 0.1
0
1900


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_df['Margin_base']   = original_df['Margin_down']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataf['price_down'] = dataf['last_price']*math.exp(-increment)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataf['conv_rate_down'] = dataf['avg_conv_rate']*np.exp(dataf['PE']*np.log(dataf['pr

In [71]:
df_final_spark.head(400)


Unnamed: 0,item_code,avg_conv_rate,last_price,last_date,avg_unique_views,PE,PE_err,pc2,item_skey,price_down,conv_rate_down,conv_rate_down_err,Delta_price,Margin_down,Margin_base,Margin_down_err,Group_flag
0,1000012530,0.01622,99.99,2021-02-13,66.081387,-0.0187,0.0059,44.68,247523697279378,98.995083,0.016223,9.571313e-07,39.34521,42.178442,42.170556,0.002488528,down
2,1000022179,0.003121,479.99,2021-02-06,93.258739,-0.0136,0.0033,276.87,11954970507506,475.21402,0.003122,1.030184e-07,126.482941,36.823254,36.818246,0.001215167,down
3,1000034743,0.005481,279.99,2021-02-04,110.202494,-0.0186,0.0048,171.03,206871238538442,277.204053,0.005482,2.631557e-07,64.255714,38.821717,38.814497,0.001863442,down
4,1000145006,0.004873,289.99,2021-02-04,188.190476,-0.0086,0.0025,188.819,280637889289282,287.104551,0.004874,1.218467e-07,54.870076,50.32767,50.323342,0.001258192,down
5,1000005727,0.017079,99.99,2021-02-25,170.099034,-0.0246,0.0044,40.729,71488185050378,98.995083,0.017083,7.516645e-07,43.29621,125.812289,125.781343,0.005535741,down
6,1000007327,0.011193,209.99,2021-02-25,46.738462,-0.0229,0.0049,71.5,9591881119337,207.900565,0.011195,5.485617e-07,104.962185,54.920778,54.908202,0.002691118,down
7,1000008870,0.001093,619.99,2021-02-11,130.857143,-0.0148,0.0036,259.209,242371880591990,613.820996,0.001093,3.934313e-08,261.791,37.438512,37.432972,0.001347786,down
8,1000016028,0.01629,139.99,2021-02-25,26.446154,-0.0328,0.0057,52.43,48800168481821,138.597076,0.016295,9.288084e-07,65.208655,28.100822,28.091607,0.001601747,down
9,1000050787,1e-06,369.99,2021-02-20,27.333333,-0.0284,0.0087,150.389,50327313240902,366.308538,1e-06,8.702471e-11,160.526966,0.004389,0.004388,3.818416e-07,down
10,1000114877,0.018526,119.99,2021-02-25,269.909091,-0.0155,0.0029,77.859,25038272605281,118.79608,0.018529,5.373302e-07,22.972933,114.888671,114.870864,0.003331771,down


In [72]:
tmp_df.head(400)


Unnamed: 0,item_code,avg_conv_rate,last_price,last_date,avg_unique_views,PE,PE_err,pc2,item_skey,price_down,conv_rate_down,conv_rate_down_err,Delta_price,Margin_down,Margin_base,Margin_down_err,Group_flag
239,1000175282,0.007469,249.99,2020-12-11,124.402116,0.0916,0.0278,131.59,148907049945338,247.502558,0.007462,2.074375e-06,78.48563,72.855164,72.92193,0.02025374,neutral
298,1000136054,0.001121,699.99,2020-11-05,136.439394,-0.0051,0.0017,465.79,200873009917477,693.024983,0.001121,1.90642e-08,122.436891,18.733617,18.732661,0.0003184715,neutral
420,1000112266,0.030676,84.99,2021-02-16,36.222222,-0.0783,0.0109,112.31,184970899271917,84.144335,0.0307,3.346286e-06,-40.889832,-45.470217,-45.434628,-0.004956254,neutral
468,1000129379,0.054341,94.99,2021-02-17,67.475758,-0.0551,0.0111,124.599,265399766737684,94.044834,0.054371,6.035198e-06,-44.775471,-164.269324,-164.178836,-0.01823389,neutral
562,1000126205,0.009288,214.99,2021-01-30,93.727778,-0.0102,0.0034,98.219,92777316620627,212.850814,0.009289,3.158145e-07,82.444866,71.776956,71.769635,0.002440417,neutral
757,1000150447,0.008452,619.99,2020-12-29,101.428571,-0.0102,0.0034,238.169,208878918158620,613.820996,0.008453,2.873873e-07,282.831,242.47999,242.455259,0.00824432,neutral
1083,1000114009,0.002832,274.99,2021-02-21,159.631944,-0.0135,0.0045,125.3,202498930410110,272.253804,0.002833,1.274753e-07,105.784034,47.835859,47.829402,0.002152614,neutral
1097,1000196335,0.000697,549.99,2021-01-06,112.662565,0.0001,0.0,334.639,185532945316284,544.517508,0.000697,0.0,127.537471,10.0179,10.01791,0.0,neutral
1126,1000112304,0.048644,72.99,2021-02-11,98.222222,-0.0708,0.0146,105.01,202959285352840,72.263737,0.048678,7.106986e-06,-43.673866,-208.816103,-208.668314,-0.03048715,neutral
1334,1000205104,0.004933,329.99,2021-02-25,115.857143,-0.0099,0.0033,158.78,152791380798837,326.706545,0.004934,1.628111e-07,118.522521,67.747593,67.740887,0.002235671,neutral


In [73]:
tmp_df = tmp_df.sort_index()

In [74]:
tmp_df['Margin_last'] = tmp_df['Margin_base']
tmp_df.head()

Unnamed: 0,item_code,avg_conv_rate,last_price,last_date,avg_unique_views,PE,PE_err,pc2,item_skey,price_down,conv_rate_down,conv_rate_down_err,Delta_price,Margin_down,Margin_base,Margin_down_err,Group_flag,Margin_last
0,1000012530,0.01622,99.99,2021-02-13,66.081387,-0.0187,0.0059,44.68,247523697279378,98.010065,0.016226,1.914621e-06,39.34521,42.18633,42.178442,0.004978,neutral,42.178442
2,1000022179,0.003121,479.99,2021-02-06,93.258739,-0.0136,0.0033,276.87,11954970507506,470.485561,0.003122,2.060647e-07,126.482941,36.828262,36.823254,0.002431,neutral,36.823254
3,1000034743,0.005481,279.99,2021-02-04,110.202494,-0.0186,0.0048,171.03,206871238538442,274.445827,0.005483,5.264093e-07,64.255714,38.828938,38.821717,0.003728,neutral,38.821717
4,1000145006,0.004873,289.99,2021-02-04,188.190476,-0.0086,0.0025,188.819,280637889289282,284.247813,0.004874,2.437143e-07,54.870076,50.331998,50.32767,0.002517,neutral,50.32767
5,1000005727,0.017079,99.99,2021-02-25,170.099034,-0.0246,0.0044,40.729,71488185050378,98.010065,0.017087,1.503699e-06,43.29621,125.843243,125.812289,0.011074,neutral,125.812289


In [75]:
create_Margin(0.01,df_final_spark)
df_final_spark.head()

Unnamed: 0,item_code,avg_conv_rate,last_price,last_date,avg_unique_views,PE,PE_err,pc2,item_skey,price_down,conv_rate_down,conv_rate_down_err,Delta_price,Margin_down,Margin_base,Margin_down_err,Group_flag
0,1000012530,0.01622,99.99,2021-02-13,66.081387,-0.0187,0.0059,44.68,247523697279378,98.995083,0.016223,9.571313e-07,39.34521,42.178442,42.170556,0.002489,down
2,1000022179,0.003121,479.99,2021-02-06,93.258739,-0.0136,0.0033,276.87,11954970507506,475.21402,0.003122,1.030184e-07,126.482941,36.823254,36.818246,0.001215,down
3,1000034743,0.005481,279.99,2021-02-04,110.202494,-0.0186,0.0048,171.03,206871238538442,277.204053,0.005482,2.631557e-07,64.255714,38.821717,38.814497,0.001863,down
4,1000145006,0.004873,289.99,2021-02-04,188.190476,-0.0086,0.0025,188.819,280637889289282,287.104551,0.004874,1.218467e-07,54.870076,50.32767,50.323342,0.001258,down
5,1000005727,0.017079,99.99,2021-02-25,170.099034,-0.0246,0.0044,40.729,71488185050378,98.995083,0.017083,7.516645e-07,43.29621,125.812289,125.781343,0.005536,down


In [76]:
tmp_df['Margin_base']=df_final_spark['Margin_base']

tmp_df['Margin_rate_to_base'] = np.log(tmp_df['Margin_down']/tmp_df['Margin_base'])
#tmp_df['Margin_rate_to_last'] = np.log(tmp_df['Margin_down']/tmp_df['Margin_last'])

In [77]:
tmp_df[['item_code','avg_conv_rate'	,'last_price','avg_unique_views', 'price_down', 'Delta_price', 'Margin_down','Margin_base','Margin_down_err','Margin_last','Group_flag','Margin_rate_to_base']]

Unnamed: 0,item_code,avg_conv_rate,last_price,avg_unique_views,price_down,Delta_price,Margin_down,Margin_base,Margin_down_err,Margin_last,Group_flag,Margin_rate_to_base
0,000000001000012530,0.016220,99.99,66.081387,98.010065,39.345210,42.186330,42.170556,0.004978,42.178442,neutral,0.000374
2,000000001000022179,0.003121,479.99,93.258739,470.485561,126.482941,36.828262,36.818246,0.002431,36.823254,neutral,0.000272
3,000000001000034743,0.005481,279.99,110.202494,274.445827,64.255714,38.828938,38.814497,0.003728,38.821717,neutral,0.000372
4,000000001000145006,0.004873,289.99,188.190476,284.247813,54.870076,50.331998,50.323342,0.002517,50.327670,neutral,0.000172
5,000000001000005727,0.017079,99.99,170.099034,98.010065,43.296210,125.843243,125.781343,0.011074,125.812289,neutral,0.000492
...,...,...,...,...,...,...,...,...,...,...,...,...
1902,000000001000176709,0.036867,99.99,186.000000,98.010065,46.766210,320.835626,320.691282,0.036575,320.763446,neutral,0.000450
1903,000000001000176740,0.027844,99.99,49.038462,98.010065,50.965210,69.608544,69.589613,0.006265,69.599078,neutral,0.000272
1904,000000001000179473,0.016499,104.99,73.510638,102.911059,57.106891,69.313318,69.261491,0.013863,69.287400,neutral,0.000748
1905,000000001000182823,0.005275,529.99,190.994048,519.495495,288.589748,290.807321,290.725906,0.018030,290.766610,neutral,0.000280


In [78]:
print(tmp_df['Margin_rate_to_base'].mean()*100)

0.054547473684211


In [79]:
tmp_df.to_csv('../raw_data/Excel_files/Margin_result_PE_analysis_PC2_no_stock_filter_20210226.csv')