In [1]:

from pyspark.sql import SparkSession

#from pyspark.sql.functions import *
spark = SparkSession.builder.master('local').config('spark.driver.memory', '8g').appName(
    'Ready_for_analysis').getOrCreate()



In [3]:

#IMPORTING SETUP PACKAGES
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt

plt.style.use('seaborn-whitegrid')

from IPython.core.display import display, HTML

display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', 500)



In [4]:
number_of_top_items = '5000'
plot_name = number_of_top_items[0] + 'k'

#print(plot_name)

df = pd.read_pickle(
    '/Users/gabriele.sabato/PycharmProjects/raw_data/DataFrames/Top' + number_of_top_items + '_seller_YTD_group_by_item_visit_20210503.pickle')


In [6]:

list_of_top_seller_item_code = ['000000001000016021','000000001000016133','000000001000022708', '000000001000015958',
                       '000000001000016020','000000001000015959','000000001000016019','000000001000022573','000000001000015944',
                       '000000001000024281']

df = df[df['item_code'].isin(list_of_top_seller_item_code)]

In [8]:
def filter_df(DF=df, bins=6, avg_PDP=19.):
    df_filt = DF[DF['bin'] > bins]
    df_items = df_filt.groupby('item_code').item_price.nunique() > 1
    df_filt = df_filt.merge(df_items, on='item_code', how='inner')
    df_filt = df_filt[df_filt['item_price_y'] == True]
    #filter PDP
    #print(df_filt)
    df_filt = df_filt[df_filt['avg_unique_views'] > avg_PDP]
    #print(df_filt)
    return df_filt

In [9]:
df_filt = filter_df(df)

print(df_filt.item_code.nunique())


10


In [14]:
wls_cr_with_fe = smf.wls("avg_sales ~ log_avg_sessions + C(item_code) ", df_filt, weights=df_filt['bin']).fit()

In [15]:
print(wls_cr_with_fe.summary())

                            WLS Regression Results                            
Dep. Variable:              avg_sales   R-squared:                       0.626
Model:                            WLS   Adj. R-squared:                  0.582
Method:                 Least Squares   F-statistic:                     14.38
Date:                Wed, 05 May 2021   Prob (F-statistic):           1.27e-14
Time:                        22:47:38   Log-Likelihood:                -286.51
No. Observations:                  97   AIC:                             595.0
Df Residuals:                      86   BIC:                             623.3
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                         coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Inte

In [103]:
import numpy as np

def test_session_model(item_code, df_tmp, map_results):
    #print('Starting test model \n')
    wls_cr_with_fe = smf.wls("avg_sales ~ log_avg_sessions + C(item_code)", df_tmp, weights=df_tmp['bin']).fit()
    wls_cr_with_fe.summary()
    beta_log_session, beta_std_err_log_session = wls_cr_with_fe.params[1].round(4), wls_cr_with_fe.bse[1].round(4)
    intercept, intercept_err = wls_cr_with_fe.params[0].round(4), wls_cr_with_fe.bse[0].round(4)
    flag_stat_ok = False
    if beta_log_session != 0:
        flag_stat_ok = np.fabs((3 * beta_std_err_log_session) / beta_log_session) < 1.
    #flag_stat_ok = fabs(beta_std_err_log_price/beta_log_price)

    if flag_stat_ok:
        #print(f'Enough statistics for the item = {item_code}')
        #print(Result_log_price)
        #print(Result_log_del)
        map_results[item_code] = [beta_log_session, beta_std_err_log_session, intercept, intercept_err,
                                  df_tmp.item_code.nunique(), True]
        #print(map_results[item_code])
        return True
    else:
        #print(f'Not enough statistics for the item = {item_code}')
        #print(Result_log_price)
        #print(Result_log_del)
        map_results[item_code] = [beta_log_session, beta_std_err_log_session, intercept, intercept_err,
                                  df_tmp.item_code.nunique(), False]
        #print(map_results[item_code])
        return False


In [118]:
#print('Cut on PDP, sim_score, result = ')
def run_clusters(which_model='std', df_cluster=df_PDP_bin_filt, map_items=map_of_map_of_items):
    map_of_maps_of_df = {}

    for PDP_cut in df_cluster.keys():
        for score_cut in map_items.keys():
            map_of_dfs = create_map_of_dataframes(map_items[score_cut], df_tmp=df_PDP_bin_filt[PDP_cut])
            map_of_maps_of_df[PDP_cut] = map_of_dfs
            print(f'Number of items passing the PE cut {len(map_of_dfs)}')
            map_result = {}
            #print("Model used: conv_rate ~ log_price + log_delivery_weeks + C(item_code)\n")
            map_items_stat = {}
            for key in map_of_dfs.keys():
                #map_of_dfs[key].head()
                if which_model == 'std':
                    map_items_stat[key] = run_model(key, map_of_dfs[key], map_result, log_session_flag=False)
                elif which_model == 'std_session':
                    map_items_stat[key] = run_model(key, map_of_dfs[key], map_result, log_session_flag=True)
                elif which_model == 'test':
                    map_items_stat[key] = test_session_model(key, map_of_dfs[key], map_result)
                #print(map_result)
            items_stat_series = pd.Series(map_items_stat)
            #print(map_items_stat)
            #print(items_stat_series)
            x, y, y_err, rel_err = {'True': [], 'False': []}, {'True': [], 'False': []}, {'True': [], 'False': []}, {
                'True': [], 'False': []}
            for i_code, flag in items_stat_series.items():
                #flag=T
                if flag == True:
                    y['True'].append(map_result[i_code][0])
                    y_err['True'].append(map_result[i_code][1])
                    x['True'].append(map_result[i_code][-2])
                    #print('True',map_result[i_code][0],map_result[i_code][1])
                    rel_err['True'].append(abs(map_result[i_code][1] / map_result[i_code][0]))

                else:
                    y['False'].append(map_result[i_code][0])
                    y_err['False'].append(map_result[i_code][1])
                    x['False'].append(map_result[i_code][-2])
                    #print('False',map_result[i_code][0],map_result[i_code][1])
                    rel_err['False'].append(abs(map_result[i_code][1] / map_result[i_code][0]))

                    continue
            #print(f'{PDP_cut},{score_cut} , {len(x)}')
        list_of_col_df = []
        if which_model == 'std':
            list_of_col_df = ['Sales_PE', 'Sales_PE_err', 'del_week', 'del_week_err',
                              'intercept', 'intercept_err', 'Cluster_size',
                              'has_enough_stat']
        elif which_model == 'test':
            list_of_col_df = ['log_avg_sessions', 'log_avg_sessions_err', 'Cluster_size', 'intercept', 'intercept_err',
                              'has_enough_stat']
        elif which_model == 'std_session':
            list_of_col_df = ['Sales_PE', 'Sales_PE_err', 'del_week', 'del_week_err',
                              'intercept', 'intercept_err', 'log_avg_sessions', 'log_avg_sessions_err', 'Cluster_size',
                              'has_enough_stat']
        df_result = pd.DataFrame.from_dict(map_result, orient='index',
                                           columns=list_of_col_df)
        print(df_result.head(100))
        return df_result

In [111]:
df_result = run_clusters(which_model='std_session')


Number of items passing the PE cut 4148


  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.d

                    Sales_PE  Sales_PE_err  del_week  del_week_err  intercept  \
000000001000012530   -0.3058        0.1424   -0.2049        0.1244    12.6982   
000000001000021922   -0.4365        0.1511    1.4839        0.1522    17.5105   
000000001000034762   -0.1026        0.0809   -0.0143        0.2039     4.2289   
000000001000080319    0.3263        0.0906    0.3382        0.0896    15.4846   
000000001000125896   -0.0530        0.0555    0.0536        0.0712     6.6076   
...                      ...           ...       ...           ...        ...   
000000001000012836   -0.7226        0.4483   -0.4417        0.3340    14.8846   
000000001000051675   -0.9025        0.3133   -0.5530        0.4441    14.8465   
000000001000123220   -0.4305        0.2722   -2.7548        0.3855    25.7100   
000000001000012560    0.4416        0.2215    1.2323        0.6358     9.5829   
000000001000047960    2.5466        0.6921    0.5176        0.1438    17.9840   

                    interce

In [119]:
df_result_test = run_clusters(which_model='test')



Number of items passing the PE cut 4148


  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  return 1 - self.ssr/self.centered_tss
  return 1 - (np.divide(self.nobs - self.k_constant, self.df_resid)
  return np.dot(wresid, wresid) / self.df_resid
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn("omni_normtest is not valid with less than 8 observations; %i "
  warn(

                    log_avg_sessions  log_avg_sessions_err  Cluster_size  \
000000001000012530            1.3204                1.6958        1.1914   
000000001000021922            5.4450                2.0232        5.7461   
000000001000034762           -0.1394                1.0662       -0.1735   
000000001000080319           10.7158                1.2882       10.4528   
000000001000125896            2.1515                0.6641        2.0974   
...                              ...                   ...           ...   
000000001000012836            7.4900                1.9676        7.1168   
000000001000051675            5.9701                1.7060        5.7717   
000000001000123220            4.6762                2.0765        3.2623   
000000001000012560            3.3403                2.1903        3.1489   
000000001000047960            5.3998                1.4280        5.9923   

                    intercept  intercept_err  has_enough_stat  
000000001000012530     

In [120]:
def only_enough_stat(df_tmp=df_result):
    df_true = df_tmp[df_tmp['has_enough_stat'] == True]
    return df_true


In [121]:
df_result_true = only_enough_stat(df_result)
df_result_test_true = only_enough_stat(df_result_test)

In [126]:
def obtain_stats(df_tmp=df_result_test_true, which_model='test'):
    print('Model results: ', which_model)
    column = ''
    PE = ''
    if which_model == 'test':
        column = ['intercept','log_avg_sessions']
        PE = 'log_avg_sessions'
    elif which_model == 'std':
        column = ['intercept', 'Sales_PE', 'del_week']
        PE = ['Sales_PE']
    elif which_model == 'std_session':
        column = ['intercept', 'Sales_PE', 'del_week', 'log_avg_sessions']
        PE = ['Sales_PE']

    print('Number of items = ', len(df_tmp.index))  #print number of items
    print('Max value = ', round(df_tmp[PE].max(), 3), ', Min value =', round(df_tmp[PE].min(), 3))  # print max & min
    #print(df_tmp[PE].nlargest(5),'\n',df_tmp[PE].nsmallest(5))
    import math
    print('Mean\n', round(df_tmp[column].mean(), 3), '\nError\n', round(df_tmp[column].sem(), 3))
    print('Median\n ', round(df_tmp[column].median(), 3), '\nError\n ', round(1.253 * df_tmp[column].sem(), 3))
    #print('Std ',round(df_tmp[column].std(),3))

In [127]:
obtain_stats(df_result_test_true, 'test')

Model results:  test
Number of items =  1426
Max value =  95.658 , Min value = -58.03
Mean
 intercept            3.180
log_avg_sessions    12.232
dtype: float64 
Error
 intercept           0.092
log_avg_sessions    0.342
dtype: float64
Median
  intercept           2.006
log_avg_sessions    8.251
dtype: float64 
Error
  intercept           0.115
log_avg_sessions    0.428
dtype: float64


In [115]:
for outliers in df_result_true[(df_result_true['Sales_PE'] > 0.)].index:
    print(outliers)
    df_result_true.drop(index=outliers, inplace=True)


000000001000012530
000000001000050787
000000001000164660
000000001000008470
000000001000008827
000000001000008474
000000001000073238
000000001000119830
000000001000054980
000000001000005773
000000001000012529
000000001000054039
000000001000008796
000000001000012559
000000001000079630
000000001000138403
000000001000009286
000000001000012675
000000001000012484
000000001000016142
000000001000117280
000000001000008769
000000001000086539
000000001000176729
000000001000024343
000000001000171105
000000001000015609
000000001000071866
000000001000169678
000000001000123475
000000001000046717
000000001000005759
000000001000127784
000000001000027340
000000001000128457
000000001000092012
000000001000113629
000000001000138429
000000001000007105
000000001000173494
000000001000054056
000000001000012526
000000001000052289
000000001000176739
000000001000112681
000000001000134815
000000001000138424
000000001000156835
000000001000183822
000000001000005744
000000001000163087
000000001000007449
000000001000

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


000000001000016090
000000001000124118
000000001000169127
000000001000087175
000000001000123336
000000001000092354
000000001000119268
000000001000016098
000000001000121703
000000001000086507
000000001000034755
000000001000182348
000000001000027342
000000001000199469
000000001000008204
000000001000052422
000000001000136719
000000001000126927
000000001000166816
000000001000008487
000000001000152183
000000001000116680
000000001000128192
000000001000046288
000000001000021892
000000001000050870
000000001000131585
000000001000169679
000000001000062037
000000001000138434
000000001000030917
000000001000016113
000000001000008483
000000001000063637
000000001000006084
000000001000016026
000000001000159195
000000001000032621
000000001000104040
000000001000161395
000000001000119358
000000001000151794
000000001000118134
000000001000118883
000000001000166781
000000001000016095
000000001000183574
000000001000021855
000000001000116650
000000001000007446
000000001000169125
000000001000016028
000000001000

In [101]:
obtain_stats(df_result_true, 'std')



Model results:  std
Number of items =  1523
Max value =  Sales_PE    25.439
dtype: float64 , Min value = Sales_PE   -24.557
dtype: float64
Mean
 intercept    15.279
Sales_PE     -0.128
del_week     -0.122
dtype: float64 
Error
 intercept    0.360
Sales_PE     0.074
del_week     0.071
dtype: float64
Median
  intercept    12.030
Sales_PE     -0.336
del_week     -0.132
dtype: float64 
Error
  intercept    0.451
Sales_PE     0.093
del_week     0.089
dtype: float64


In [37]:
plt.scatter(rel_err['False'], y['False'], c='red')
plt.scatter(rel_err['True'], y['True'], c='blue')
plt.ylabel('Price elasticity')
plt.xlabel('% Relative error ')
plt.xscale('log')
#plt.yscale('log')
create_legend()
plt.savefig('./plots/Relative_error_vs_PE_top' + plot_name + '_20210226.png')
plt.show()


#df_result_true = df_result_true[(df_result_true['Sales_PE'] < 0.)]


2381


In [44]:
df_result_true_final = df_result_true[['Sales_PE', 'Sales_PE_err']]

In [1]:
df_result_true_final.head()


NameError: name 'df_result_true_final' is not defined

In [46]:
df_result_true_final.to_csv('./../raw_data/Excel_files/Top5k_Sales_PE_new_model.csv')
