In [1]:

from pyspark.sql import SparkSession
#from pyspark.sql.functions import *
# Local Spark session with 8 GB of driver memory, named 'Ready_for_analysis'.
spark = (
    SparkSession.builder
    .master('local')
    .config('spark.driver.memory', '8g')
    .appName('Ready_for_analysis')
    .getOrCreate()
)



In [2]:

#IMPORTING SETUP PACKAGES
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed; use whichever name this installation provides
# and fall back to the default style if neither exists.
_whitegrid_styles = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
plt.style.use(next((s for s in _whitegrid_styles if s in plt.style.available), 'default'))

# `display` and `HTML` are public in IPython.display; importing them from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Let notebook cells use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Show up to 500 rows before pandas truncates a displayed frame.
pd.set_option('display.max_rows', 500)



In [3]:
# Size of the top-seller selection; it is part of the input file name below.
number_of_top_items = '5000'

# Short label such as '5k'. It is derived from the whole number rather than its
# first character, so '10000' becomes '10k' instead of '1k'. Sizes that are not
# a multiple of 1000 keep the plain number as their label.
_n_top_items = int(number_of_top_items)
plot_name = f'{_n_top_items // 1000}k' if _n_top_items % 1000 == 0 else number_of_top_items

# NOTE: read_pickle can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle('/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/Top'+number_of_top_items+'_seller_YTD_group_by_item_visit_20210226.pickle')


In [5]:
# Column schema of the loaded frame, as a sanity check on the pickle contents.
print(df.dtypes)

min_date                  object
max_date                  object
item_code                 object
item_price               float64
delivery_weeks             int64
avg_sales                float64
avg_unique_views         float64
conv_rate                float64
bin                        int32
item_parent_item_code     object
item_main_category        object
item_sub_category_1       object
item_sub_category_2       object
log_price                float64
log_delivery_weeks       float64
log_sales                float64
log_avg_unique_views     float64
log_conv_rate            float64
dtype: object


In [6]:
def filter_df(DF=None, bins=6, avg_PDP=19.):
    """Select the rows usable for the per-item price fits.

    Three cuts are applied in order:

    1. keep rows whose ``bin`` is strictly greater than ``bins``;
    2. among those, keep items (``item_code``) that show more than one
       distinct ``item_price``;
    3. keep rows whose ``avg_unique_views`` is strictly greater than
       ``avg_PDP``.

    Parameters
    ----------
    DF : pandas.DataFrame, optional
        Frame with at least ``item_code``, ``item_price``, ``bin`` and
        ``avg_unique_views``. When omitted, the notebook-level ``df`` is
        looked up at call time (not at definition time), so the function can
        be defined before ``df`` exists and always sees its current value.
    bins : number
        Exclusive lower bound on ``bin``.
    avg_PDP : number
        Exclusive lower bound on ``avg_unique_views``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows. Because the price-variation flag is merged back
        under the same column name, the original price column is returned as
        ``item_price_x`` and the (all-True) flag as ``item_price_y``.
    """
    if DF is None:
        DF = df  # late-bound fallback to the notebook-level frame

    above_bin_cut = DF[DF['bin'] > bins]

    # Boolean Series indexed by item_code: True when the item has several distinct prices.
    has_price_variation = above_bin_cut.groupby('item_code').item_price.nunique() > 1

    flagged = above_bin_cut.merge(has_price_variation, on='item_code', how='inner')
    with_price_variation = flagged[flagged['item_price_y'] == True]

    return with_price_variation[with_price_variation['avg_unique_views'] > avg_PDP]

In [7]:
# Apply the default cuts to the top-seller frame and report how many distinct items remain.
df_filt = filter_df(df)
print(df_filt['item_code'].nunique())


4181


In [8]:
# Load the item-to-item similarity scores (columns: sku, recommended_sku, score).
sim_score = pd.read_parquet('/Users/gabriele.sabato/PycharmProjects/raw_data/recommendations_prod/part-00000-5f77392a-0bfa-4f0b-a4ae-d816ce2d71de-c000.snappy.parquet')

In [9]:
# Preview the similarity table.
sim_score

Unnamed: 0,sku,recommended_sku,score
0,000000001000200072,000000001000200087,0.260000
1,000000001000200072,000000001000183538,0.216580
2,000000001000200072,000000001000142209,0.200000
3,000000001000200072,000000001000204906,0.190794
4,000000001000200072,000000001000183468,0.183980
...,...,...,...
3827600,000000001000064787,000000001000115010,0.203238
3827601,000000001000064787,000000001000112641,0.201339
3827602,000000001000064787,000000001000046204,0.200052
3827603,000000001000064787,000000001000114086,0.199867


In [10]:
# Restrict the similarity table to rows whose source SKU is one of the filtered items.
unique_item_codes = list(df_filt['item_code'].unique())
sim_score_filt = sim_score[sim_score['sku'].isin(unique_item_codes)]

sim_score_filt

Unnamed: 0,sku,recommended_sku,score
780,000000001000135008,000000001000135012,0.881024
781,000000001000135008,000000001000121202,0.715546
782,000000001000135008,000000001000167468,0.633481
783,000000001000135008,000000001000166799,0.564164
784,000000001000135008,000000001000208980,0.544883
...,...,...,...
3827360,000000001000176744,000000001000171107,0.339302
3827361,000000001000176744,000000001000163202,0.335340
3827362,000000001000176744,000000001000163190,0.327887
3827363,000000001000176744,000000001000176438,0.324293


In [11]:
# Build a map from each item to the subset of SKUs (the item itself plus its similar items) used for that item's linear fit:

def create_map(list_of_items=None, df=None, score_cut=0.):
    """Map every item to itself followed by its sufficiently similar SKUs.

    Parameters
    ----------
    list_of_items : iterable, optional
        Item codes to build entries for. When omitted, the notebook-level
        ``unique_item_codes`` is looked up at call time.
    df : pandas.DataFrame, optional
        Similarity table with ``sku``, ``recommended_sku`` and ``score``
        columns. When omitted, the notebook-level ``sim_score_filt`` is
        looked up at call time.
    score_cut : float
        Exclusive lower bound on ``score``.

    Returns
    -------
    dict
        ``{item_code: [item_code, recommended_sku, ...]}``. The recommended
        SKUs keep the row order of ``df``; an item with no row above the cut
        maps to a list containing only itself.
    """
    if list_of_items is None:
        list_of_items = unique_item_codes
    if df is None:
        df = sim_score_filt

    # Apply the threshold once and group once, instead of rescanning the whole
    # table for every item.
    above_cut = df[df['score'] > score_cut]
    recommended_by_sku = {
        sku: list(recommended)
        for sku, recommended in above_cut.groupby('sku', sort=False)['recommended_sku']
    }

    return {
        item_code: [item_code] + recommended_by_sku.get(item_code, [])
        for item_code in list_of_items
    }


In [12]:
# Similarity-score thresholds to scan; only 0.16 is active, the other candidates are commented out.
sim_score_bin = [0.16]#0.3,0.4,0.5,0.6,0.7,0.8,0.9]




In [13]:
# Build one item -> SKU-list map per similarity threshold, keyed by the threshold as a string (e.g. '0.16').
map_of_map_of_items = {}
for sim_score_i in sim_score_bin:
    # NOTE: Map_of_Items stays defined after the loop; create_map_of_dataframes uses it
    # when no `items_map` argument is passed.
    Map_of_Items = create_map(score_cut = sim_score_i)
    map_of_map_of_items[str(sim_score_i)] = Map_of_Items
#print(map_of_map_of_items)

In [14]:
# Load the larger, full-catalogue dataset; the per-item fits further down are run on it.
largest_dataset = '/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/All_catalogue_seller_YTD_group_by_item_visit_20210226.pickle'

# NOTE: read_pickle can execute arbitrary code; only load pickles from a trusted source.
df_k = pd.read_pickle(largest_dataset)

In [15]:
# Thresholds on avg_unique_views to scan (passed to filter_df as avg_PDP); only 19 is active.
PDP_bin = [19]#,30,40,50]

In [16]:
# Filter the full-catalogue frame once per avg_unique_views threshold, keyed by that threshold.
df_PDP_bin_filt = {}
for PDP_view in PDP_bin:
    df_k_filt = filter_df(df_k, avg_PDP= PDP_view)
    df_PDP_bin_filt[PDP_view] = df_k_filt
    #print(df_k_filt)

#print(df_PDP_bin_filt)


In [17]:
def create_map_of_dataframes(items_map=None, df=None):
    """Slice ``df`` into one sub-frame per entry of ``items_map``.

    Parameters
    ----------
    items_map : dict, optional
        ``{key: iterable of item codes}``. When omitted, the notebook-level
        ``Map_of_Items`` is looked up at call time (not at definition time).
    df : pandas.DataFrame, optional
        Frame with an ``item_code`` column. When omitted, the notebook-level
        ``df_filt`` is looked up at call time.

    Returns
    -------
    dict
        ``{key: rows of df whose item_code is in items_map[key]}``, in the
        iteration order of ``items_map``. A key whose items do not occur in
        ``df`` maps to an empty frame.
    """
    if items_map is None:
        items_map = Map_of_Items
    if df is None:
        df = df_filt

    return {
        key: df[df['item_code'].isin(items)]
        for key, items in items_map.items()
    }



In [20]:
import numpy as np
def run_model(item_code, df, map_results):
    """Fit the weighted log-log sales model for one cluster and record the result.

    The model ``log_sales ~ log_price + log_delivery_weeks + C(item_code)`` is
    fitted by weighted least squares, using the ``bin`` column as weights.

    Parameters
    ----------
    item_code : hashable
        Key under which the result is stored in ``map_results``.
    df : pandas.DataFrame
        Rows to fit; must provide ``log_sales``, ``log_price``,
        ``log_delivery_weeks``, ``item_code`` and ``bin``.
    map_results : dict
        Updated in place with
        ``[price_coef, price_std_err, delivery_coef, delivery_std_err,
        n_distinct_item_codes, flag]``; coefficients and standard errors are
        rounded to 4 decimals.

    Returns
    -------
    bool
        The same flag that is stored in ``map_results``: True when the rounded
        price coefficient is non-zero and three times its standard error is
        smaller than its absolute value.
    """
    fit = smf.wls(
        "log_sales ~ log_price + log_delivery_weeks + C(item_code)",
        df,
        weights=df['bin'],
    ).fit()

    # Read the coefficients by term name rather than by position, so the lookup
    # does not depend on how the design-matrix columns happen to be ordered.
    # The regression summary is intentionally not built: its text was discarded
    # and its diagnostics only added warnings on small clusters.
    beta_log_price = fit.params['log_price'].round(4)
    beta_std_err_log_price = fit.bse['log_price'].round(4)
    beta_log_del = fit.params['log_delivery_weeks'].round(4)
    beta_std_err_log_del = fit.bse['log_delivery_weeks'].round(4)

    # A coefficient of exactly zero cannot be tested with a relative error; a
    # NaN coefficient makes the comparison False.
    flag_stat_ok = False
    if beta_log_price != 0:
        flag_stat_ok = bool(np.fabs((3*beta_std_err_log_price)/beta_log_price) < 1.)

    map_results[item_code] = [
        beta_log_price,
        beta_std_err_log_price,
        beta_log_del,
        beta_std_err_log_del,
        df.item_code.nunique(),
        flag_stat_ok,
    ]
    return flag_stat_ok


In [21]:
df_result = pd.DataFrame()
map_of_maps_of_df = {}

# Scan every (avg_unique_views cut, similarity cut) combination.
# NOTE(review): map_of_maps_of_df is keyed by PDP_cut only and df_result is rebound on
# every pass, so with more than one combination only the last one is kept — confirm
# this is intended before enabling more thresholds.
for PDP_cut, pdp_frame in df_PDP_bin_filt.items():
    for score_cut, items_for_cut in map_of_map_of_items.items():
        map_of_dfs = create_map_of_dataframes(items_for_cut, df=pdp_frame)
        map_of_maps_of_df[PDP_cut] = map_of_dfs
        print(f'Number of items passing the PE cut {len(map_of_dfs)}')

        # One fit per item: run_model fills map_result and returns the has_enough_stat flag.
        map_result = {}
        map_items_stat = {}
        for key in map_of_dfs.keys():
            map_items_stat[key] = run_model(key, map_of_dfs[key], map_result)
        items_stat_series = pd.Series(map_items_stat)

        # Split the fit results by flag. x = cluster size, y = price coefficient,
        # y_err = its standard error, rel_err = |y_err / y|.
        x, y, y_err, rel_err = ({'True': [], 'False': []} for _ in range(4))
        for i_code, flag in items_stat_series.items():
            label = 'True' if flag == True else 'False'
            fit_row = map_result[i_code]
            y[label].append(fit_row[0])
            y_err[label].append(fit_row[1])
            x[label].append(fit_row[-2])
            rel_err[label].append(abs(fit_row[1]/fit_row[0]))

        df_result = pd.DataFrame.from_dict(
            map_result,
            orient='index',
            columns=['Sales_PE', 'Sales_PE_err', 'log_del_week', 'log_del_week_err', 'Cluster_size', 'has_enough_stat'],
        )
        print (df_result.head(100))

Number of items passing the PE cut 4181
                    Sales_PE  Sales_PE_err  log_del_week  log_del_week_err  \
000000001000012530   -2.9051        1.1702        0.0043            0.0764   
000000001000021922   -6.5170        1.1057       -0.9297            0.2101   
000000001000034762   -6.4893        2.9438       -1.6689            0.6093   
000000001000125896   -7.0887        2.8063       -0.8351            0.4253   
000000001000199717  -10.6119        5.4424       -0.7920            0.3898   
000000001000005727   -2.6853        0.8351        0.0433            0.0984   
000000001000007327   -4.2463        1.8361       -0.5654            0.2854   
000000001000007822   -4.2083        2.6291       -0.0014            0.2000   
000000001000016194   -5.6962        2.7671       -0.0991            0.2552   
000000001000022565  -10.4964       10.3810       -3.4724            1.3953   
000000001000050787   -6.1549        2.6971       -1.0676            0.5293   
000000001000116614   -1.

  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  llf = -np.log(SSR) * nobs2      # concentrated likelihood
  dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observat

In [24]:
# Number of rows in df_result (one per fitted item).
len(df_result.index)

4181

In [25]:
# Keep only the fits flagged has_enough_stat by run_model.
df_result_true = df_result[df_result['has_enough_stat']==True]

In [26]:
# Range of the retained price coefficients.
print(df_result_true['Sales_PE'].max(),df_result_true['Sales_PE'].min())

48.3576 -47.5047


In [27]:
# Five largest and five smallest price coefficients, to inspect the tails.
print(df_result_true['Sales_PE'].nlargest(5),'\n',df_result_true['Sales_PE'].nsmallest(5) )

000000001000119266    48.3576
000000001000128816    32.6864
000000001000080472     0.2994
000000001000080481     0.2992
000000001000196335    -0.9367
Name: Sales_PE, dtype: float64 
 000000001000123336   -47.5047
000000001000009278   -34.9123
000000001000009867   -33.0342
000000001000125632   -28.8310
000000001000031306   -23.1860
Name: Sales_PE, dtype: float64


In [28]:
# Keep only negative price coefficients. This rebinds df_result_true, so the
# outputs shown in the cells above reflect the frame before this cut.
df_result_true = df_result_true[(df_result_true['Sales_PE'] < 0.)]

In [29]:
# Mean of the retained (negative) price coefficients.
print(df_result_true['Sales_PE'].mean())

-6.642132061978545


In [30]:
import math

In [31]:
# Standard error of that mean: sample standard deviation divided by sqrt(number of rows).
print(df_result_true['Sales_PE'].std()/math.sqrt(len(df_result_true.index)))

0.1528508449718939


In [32]:
# Rows with a missing price coefficient; an empty frame means there are none.
df_result_true[df_result_true['Sales_PE'].isna()]

Unnamed: 0,Sales_PE,Sales_PE_err,log_del_week,log_del_week_err,Cluster_size,has_enough_stat


In [33]:
# Export view: the price coefficient and its standard error only.
df_result_true_final = df_result_true[['Sales_PE','Sales_PE_err']]

In [34]:
# Preview the first rows of the export view.
df_result_true_final.head()


Unnamed: 0,Sales_PE,Sales_PE_err
1000021922,-6.517,1.1057
1000005727,-2.6853,0.8351
1000164660,-9.4314,2.6096
1000176734,-3.0015,0.9799
1000008470,-5.8561,1.648


In [35]:
# Derive the size label in the file name from number_of_top_items, so the export
# always matches the analysed selection instead of a hard-coded 'Top5k'
# (this still yields 'Top5k_Sales_PE.csv' for '5000').
df_result_true_final.to_csv(f"./../raw_data/Excel_files/Top{int(number_of_top_items) // 1000}k_Sales_PE.csv")
