In [30]:
import json
import os
import pandas as pd
import datetime
import numpy as np
from iteround import saferound
from tqdm.notebook import tqdm
from multiprocessing import Pool

In [40]:
# Scratch check: axis=1 sums each row of the 2x2 array.
np.array([[1,2],[1,4]]).sum(axis=1)

array([3, 5])

In [41]:
# Scratch check: np.zeros(10) gives a length-10 float array of zeros.
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [2]:
# Root folder of the dataset files (relative path).
DATASET_FOLDER = "./dataset"
# Processed per-SKU feature table (parquet), read into df_train_v1 below.
train_processed_data = os.path.join(DATASET_FOLDER, 'processed/sku_feature_data.parquet')
# Test set (CSV), read into df_test below.
test_data = os.path.join(DATASET_FOLDER, 'test_data.csv')

In [3]:
# Read the processed feature table and key it by SKU so rows can be fetched with .loc[sku].
df_train_v1 = pd.read_parquet(train_processed_data, engine='fastparquet').set_index('sku')

In [29]:
# Inspect the column dtypes of the SKU feature table.
df_train_v1.dtypes

item_domain_id                            object
item_id                                    int64
item_title                                object
site_id                                   object
product_id                                object
product_family_id                         object
count                                      int64
date_first                        datetime64[ns]
date_last                         datetime64[ns]
date_diff                                  int64
sold_quantity_first                        int64
sold_quantity_last                         int64
sold_quantity_sum                          int64
sold_quantity_mean                       float64
sold_quantity_std                        float64
sold_quantity_min                          int64
sold_quantity_max                          int64
sold_quantity_mode                         int64
sold_quantity_mode_tx                    float64
current_price_first                      float64
current_price_last  

In [4]:
# Load the test set; later cells read its 'sku' and 'target_stock' columns.
df_test = pd.read_csv(test_data)

In [119]:
# uniform baseline: 0.0333 on the first 29 columns and 0.0343 on the last one,
# so every row adds up to 1 at four decimal places.
predictions = np.full((len(df_test), 30), 0.0333)
predictions[:, -1] = 0.0343

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/uniform.csv', index=False, header=False)

In [126]:
#random uniform
# Seeded generator so this baseline file can be regenerated exactly.
rng = np.random.default_rng(126)
random_uniform = rng.uniform(size=(len(df_test), 30))

def normalize_f(data_point):
    """Scale one row so it sums to 1 and round it to 4 decimal places.

    Parameters
    ----------
    data_point : np.ndarray
        One row of raw random weights.

    Returns
    -------
    np.ndarray
        The row divided by its sum, rounded with ``saferound`` (iteround),
        which presumably keeps the rounded values summing to the same
        total -- confirm against the iteround docs.
    """
    data = data_point/data_point.sum()
    rounded = saferound(data, places=4)
    return np.array(rounded)

predictions = []
with Pool(50) as p:
    # chunksize batches many rows per worker round-trip; the default of 1
    # pays the inter-process overhead once per row.
    for data in tqdm(p.imap(normalize_f, random_uniform, chunksize=1024), total=len(random_uniform)):
        predictions.append(data)

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/random.csv', index=False, header=False)

  0%|          | 0/551472 [00:00<?, ?it/s]

In [155]:
#random cumulative
# Seeded generator so this baseline file can be regenerated exactly.
rng = np.random.default_rng(155)
random_uniform = rng.uniform(size=(len(df_test), 30))

def cumm_f(data_point):
    """Turn one row of random weights into an increasing probability vector.

    The row is replaced by its running sum, divided by the total of that
    running sum so it adds up to 1, and rounded to 4 decimal places with
    ``saferound``.

    Parameters
    ----------
    data_point : np.ndarray
        One row of raw random weights.

    Returns
    -------
    np.ndarray
        The normalised, rounded cumulative row.
    """
    data = np.cumsum(data_point)
    data = data/data.sum()
    rounded = saferound(data, places=4)
    return np.array(rounded)

predictions = []
with Pool(50) as p:
    # chunksize batches many rows per worker round-trip; the default of 1
    # pays the inter-process overhead once per row.
    for data in tqdm(p.imap(cumm_f, random_uniform, chunksize=1024), total=len(random_uniform)):
        predictions.append(data)

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/random_cummulative.csv', index=False, header=False)

  0%|          | 0/551472 [00:00<?, ?it/s]

In [153]:
#random specific date
# Seeded generator so this baseline file can be regenerated exactly.
# integers(0, 30) draws from 0..29, the same range np.random.randint(0, 30) uses.
rng = np.random.default_rng(153)
random_int = rng.integers(0, 30, size=len(df_test))

def uniform_after_date(specific_date):
    """Build a 30-day vector that is zero before ``specific_date`` and uniform after.

    Parameters
    ----------
    specific_date : int
        Index in 0..29 of the first day that receives probability mass.

    Returns
    -------
    np.ndarray
        Length-30 vector with ``1/(30-specific_date)`` from ``specific_date``
        onward, rounded to 4 decimal places with ``saferound``.
    """
    data = np.zeros(30)
    # The slice is a view, so fill() writes into `data` itself.
    data[specific_date:].fill(1/(30-specific_date))
    rounded = saferound(data, places=4)
    return np.array(rounded)

predictions = []
with Pool(100) as p:
    # chunksize batches many items per worker round-trip; the default of 1
    # pays the inter-process overhead once per item.
    for data in tqdm(p.imap(uniform_after_date, random_int, chunksize=1024), total=len(random_int)):
        predictions.append(data)

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/uniform_after_random_specific_date.csv', index=False, header=False)

  0%|          | 0/551472 [00:00<?, ?it/s]

In [171]:
#spike specific date
# One-hot row per test item: all probability mass on a single randomly chosen day.
# Seeded generator so this baseline file can be regenerated exactly.
rng = np.random.default_rng(171)
predictions = np.eye(30)[rng.choice(30, size=len(df_test))]

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/spike_random_specific_date.csv', index=False, header=False)

In [156]:
def simple_first_30_days(data):
    """Derive a 30-day probability vector from a SKU's first 30 days of sales.

    The cumulative sales of the first 30 recorded days are expressed as a
    fraction of ``target_stock`` (clipped to [0, 1]), then normalised so the
    vector sums to 1 and rounded to 4 decimal places with ``saferound``.

    Parameters
    ----------
    data : tuple
        ``(index, row)`` as produced by ``df_test.iterrows()``; ``row`` must
        provide ``'sku'`` and ``'target_stock'``.

    Returns
    -------
    tuple
        ``(sku, probabilities)`` where ``probabilities`` is a length-30
        ``np.ndarray``.
    """
    i, row = data
    sku = row['sku']
    target_stock = row['target_stock']
    train_row = df_train_v1.loc[sku]
    sold_quantity_series = json.loads(train_row['sold_quantity_series'])

    missing_days = 30 - len(sold_quantity_series)
    if missing_days > 0:
        # Pad at the END only. A scalar pad width pads both sides, which would
        # push the real sales to the end of the 30-day window.
        sold_quantity_series = np.pad(sold_quantity_series, (0, missing_days))

    sold_quantity_series = sold_quantity_series[:30]

    sold_quantity_cumsum = np.cumsum(sold_quantity_series)
    stock_percentage = sold_quantity_cumsum/target_stock
    stock_percentage = np.clip(stock_percentage,0,1)

    # No sales in the window: put all the mass on the last day so the
    # normalisation below does not divide by zero.
    if stock_percentage[-1] == 0:
        stock_percentage[-1] = 1

    probalities = stock_percentage/stock_percentage.sum()
    probalities = saferound(probalities, places=4)
    probalities = np.array(probalities)
    return (sku, probalities)

# Evaluate the heuristic for every test row in worker processes; imap yields results in input order.
predictions, skus = [], []
with Pool(100) as pool:
    ordered_results = pool.imap(simple_first_30_days, df_test.iterrows())
    for data in tqdm(ordered_results, total=len(df_test)):
        sku, probabilities = data
        skus.append(sku)
        predictions.append(probabilities)

# Sanity check: the collected SKUs must line up row-for-row with df_test.
skus = np.array(skus)
comparison = (skus == df_test['sku'].to_numpy())
assert comparison.all()

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/simple_first_30_days.csv', index=False, header=False)

  0%|          | 0/551472 [00:00<?, ?it/s]

In [28]:
def simple_first_30_days_fixed_spike(data):
    """Put all probability on the first day the 30-day consumption peaks.

    The SKU's sales history is zero-padded at the end to 30 days, cut to 30
    days, accumulated, divided by ``target_stock`` and clipped to [0, 1].
    The result is a one-hot vector at the first index where that clipped
    fraction reaches its maximum.

    Parameters
    ----------
    data : tuple
        ``(index, row)`` from ``df_test.iterrows()``; ``row`` provides
        ``'sku'`` and ``'target_stock'``.

    Returns
    -------
    tuple
        ``(sku, one_hot)`` with ``one_hot`` a length-30 ``np.ndarray``.
    """
    _, row = data
    sku = row['sku']
    target_stock = row['target_stock']
    history = json.loads(df_train_v1.loc[sku]['sold_quantity_series'])

    n_days = len(history)
    if n_days < 30:
        history = np.pad(history, (0, 30 - n_days))
    window = history[:30]

    consumed = np.clip(np.cumsum(window) / target_stock, 0, 1)

    # argmax over the boolean mask gives the FIRST index that attains the maximum.
    first_peak = np.argmax(consumed == consumed.max())
    return (sku, np.eye(30)[first_peak])

# Evaluate the heuristic for every test row in worker processes; imap yields results in input order.
predictions, skus = [], []
with Pool(100) as pool:
    ordered_results = pool.imap(simple_first_30_days_fixed_spike, df_test.iterrows())
    for data in tqdm(ordered_results, total=len(df_test)):
        sku, probabilities = data
        skus.append(sku)
        predictions.append(probabilities)

# Sanity check: the collected SKUs must line up row-for-row with df_test.
skus = np.array(skus)
comparison = (skus == df_test['sku'].to_numpy())
assert comparison.all()

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/simple_first_30_days_fixed_spike.csv', index=False, header=False)

  0%|          | 0/551472 [00:00<?, ?it/s]

In [5]:
def shift(arr, num, fill_value=0):
    """Return a copy of ``arr`` moved by ``num`` positions, without wrap-around.

    A positive ``num`` moves elements toward higher indices, a negative
    ``num`` toward lower indices; vacated slots are set to ``fill_value``.
    The result has the shape and dtype of ``np.empty_like(arr)``.
    """
    shifted = np.empty_like(arr)
    if num == 0:
        shifted[:] = arr
        return shifted
    if num > 0:
        # Move right: blank the head, copy all but the last `num` items.
        shifted[:num] = fill_value
        shifted[num:] = arr[:-num]
        return shifted
    # Move left: blank the tail, copy everything after the first `-num` items.
    shifted[num:] = fill_value
    shifted[:num] = arr[-num:]
    return shifted

def voted_shifted_padded_spike(data):
    """Vote over shifted 30-day windows and return a one-hot on the winning day.

    The SKU's sales history is zero-padded at the end to 59 entries. For
    every offset in ``range(original_len)`` the history is shifted left by
    that offset, the first 30 days are accumulated, divided by
    ``target_stock`` and clipped to [0, 1]. Each window votes for the first
    day at which its clipped fraction peaks, weighted by that peak value.
    The day with the highest vote total (first one on ties) receives all
    the probability.

    Parameters
    ----------
    data : tuple
        ``(index, row)`` from ``df_test.iterrows()``; ``row`` provides
        ``'sku'`` and ``'target_stock'``.

    Returns
    -------
    tuple
        ``(sku, one_hot)`` with ``one_hot`` a length-30 ``np.ndarray``.
    """
    _, row = data
    sku = row['sku']
    target_stock = row['target_stock']
    history = json.loads(df_train_v1.loc[sku]['sold_quantity_series'])

    original_len = len(history)
    if original_len < 59:
        history = np.pad(history, (0, 59 - original_len))

    one_hot = np.eye(30)
    votes = np.zeros(30)
    for offset in range(original_len):
        window = shift(history, -offset)[:30]
        consumed = np.clip(np.cumsum(window) / target_stock, 0, 1)
        peak = consumed.max()
        # Weighted vote for the first day on which this window reaches its peak.
        votes += one_hot[np.argmax(consumed == peak)] * peak

    winner = np.argmax(votes == votes.max())
    return (sku, np.eye(30)[winner])

# Evaluate the heuristic for every test row in worker processes; imap yields results in input order.
predictions, skus = [], []
with Pool(100) as pool:
    ordered_results = pool.imap(voted_shifted_padded_spike, df_test.iterrows())
    for data in tqdm(ordered_results, total=len(df_test)):
        sku, probabilities = data
        skus.append(sku)
        predictions.append(probabilities)

# Sanity check: the collected SKUs must line up row-for-row with df_test.
skus = np.array(skus)
comparison = (skus == df_test['sku'].to_numpy())
assert comparison.all()

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/voted_shifted_padded_spike.csv', index=False, header=False)

  0%|          | 0/551472 [00:00<?, ?it/s]

In [6]:
def shift(arr, num, fill_value=0):
    """Return a copy of ``arr`` moved by ``num`` positions, without wrap-around.

    A positive ``num`` moves elements toward higher indices, a negative
    ``num`` toward lower indices; vacated slots are set to ``fill_value``.
    The result has the shape and dtype of ``np.empty_like(arr)``.
    """
    shifted = np.empty_like(arr)
    if num == 0:
        shifted[:] = arr
        return shifted
    if num > 0:
        # Move right: blank the head, copy all but the last `num` items.
        shifted[:num] = fill_value
        shifted[num:] = arr[:-num]
        return shifted
    # Move left: blank the tail, copy everything after the first `-num` items.
    shifted[num:] = fill_value
    shifted[:num] = arr[-num:]
    return shifted

def voted_shifted_padded_probs(data):
    """Vote over shifted 30-day windows and return the normalised vote vector.

    The SKU's sales history is zero-padded at the end to 59 entries. For
    every offset in ``range(original_len)`` the history is shifted left by
    that offset, the first 30 days are accumulated, divided by
    ``target_stock`` and clipped to [0, 1]. Each window votes for the first
    day at which its clipped fraction peaks, weighted by that peak value.
    The vote totals are divided by their sum and rounded to 4 decimal
    places with ``saferound``; if no window cast a non-zero vote, all the
    mass goes to day 0.

    Parameters
    ----------
    data : tuple
        ``(index, row)`` from ``df_test.iterrows()``; ``row`` provides
        ``'sku'`` and ``'target_stock'``.

    Returns
    -------
    tuple
        ``(sku, probabilities)`` where ``probabilities`` is whatever
        ``saferound`` returns for the 30 normalised votes.
    """
    _, row = data
    sku = row['sku']
    target_stock = row['target_stock']
    history = json.loads(df_train_v1.loc[sku]['sold_quantity_series'])

    original_len = len(history)
    if original_len < 59:
        history = np.pad(history, (0, 59 - original_len))

    one_hot = np.eye(30)
    votes = np.zeros(30)
    for offset in range(original_len):
        window = shift(history, -offset)[:30]
        consumed = np.clip(np.cumsum(window) / target_stock, 0, 1)
        peak = consumed.max()
        # Weighted vote for the first day on which this window reaches its peak.
        votes += one_hot[np.argmax(consumed == peak)] * peak

    # Fallback so the normalisation below never divides by zero.
    if (votes == np.zeros(30)).all():
        votes[0] = 1

    return (sku, saferound(votes / votes.sum(), places=4))

# Evaluate the heuristic for every test row in worker processes; imap yields results in input order.
predictions, skus = [], []
with Pool(100) as pool:
    ordered_results = pool.imap(voted_shifted_padded_probs, df_test.iterrows())
    for data in tqdm(ordered_results, total=len(df_test)):
        sku, probabilities = data
        skus.append(sku)
        predictions.append(probabilities)

# Sanity check: the collected SKUs must line up row-for-row with df_test.
skus = np.array(skus)
comparison = (skus == df_test['sku'].to_numpy())
assert comparison.all()

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/voted_shifted_padded_probs.csv', index=False, header=False)

  0%|          | 0/551472 [00:00<?, ?it/s]

In [27]:
def shift(arr, num, fill_value=0):
    """Return a copy of ``arr`` moved by ``num`` positions, without wrap-around.

    A positive ``num`` moves elements toward higher indices, a negative
    ``num`` toward lower indices; vacated slots are set to ``fill_value``.
    The result has the shape and dtype of ``np.empty_like(arr)``.
    """
    shifted = np.empty_like(arr)
    if num == 0:
        shifted[:] = arr
        return shifted
    if num > 0:
        # Move right: blank the head, copy all but the last `num` items.
        shifted[:num] = fill_value
        shifted[num:] = arr[:-num]
        return shifted
    # Move left: blank the tail, copy everything after the first `-num` items.
    shifted[num:] = fill_value
    shifted[:num] = arr[-num:]
    return shifted

def gaussian_kernel1d(sigma, length, order=0):
    """
    Computes a 1-D Gaussian convolution kernel.

    Parameters
    ----------
    sigma : float
        Standard deviation of the Gaussian, in samples. Must be non-zero.
    length : int
        Number of samples in the kernel. The peak sits at index
        ``length // 2``.
    order : int, optional
        Only validated (must be non-negative); it does not change the result.

    Returns
    -------
    np.ndarray
        Array of exactly ``length`` weights that sum to 1.

    Raises
    ------
    ValueError
        If ``order`` is negative.
    """
    if order < 0:
        raise ValueError('order must be non-negative')
    sigma2 = sigma * sigma
    # Sample positions centred on index length // 2. For even lengths this is
    # the same grid as arange(floor(-length/2), ceil(length/2)); for odd
    # lengths that expression produced length + 1 off-centre samples.
    x = np.arange(length, dtype=float) - length // 2
    phi_x = np.exp(-0.5 / sigma2 * x ** 2)
    return phi_x / phi_x.sum()


def voted_shifted_padded_gaussian_probs(data):
    """Vote over shifted 30-day windows, then weight the votes with a Gaussian.

    Votes are collected per day: the history is zero-padded at the end to 59
    entries, and each left-shifted 30-day window votes for the first day its
    clipped consumed fraction peaks, weighted by that peak. The vote vector
    is then multiplied element-wise by a length-30 Gaussian kernel moved so
    its centre lands on the winning day, normalised to sum to 1 and rounded
    to 4 decimal places with ``saferound``. The kernel's sigma is the square
    root of the votes' standard deviation, or 1 when that is not positive.

    Parameters
    ----------
    data : tuple
        ``(index, row)`` from ``df_test.iterrows()``; ``row`` provides
        ``'sku'`` and ``'target_stock'``.

    Returns
    -------
    tuple
        ``(sku, probabilities)`` where ``probabilities`` is whatever
        ``saferound`` returns for the 30 weighted votes.
    """
    _, row = data
    sku = row['sku']
    target_stock = row['target_stock']
    history = json.loads(df_train_v1.loc[sku]['sold_quantity_series'])

    original_len = len(history)
    if original_len < 59:
        history = np.pad(history, (0, 59 - original_len))

    one_hot = np.eye(30)
    votes = np.zeros(30)
    for offset in range(original_len):
        window = shift(history, -offset)[:30]
        consumed = np.clip(np.cumsum(window) / target_stock, 0, 1)
        peak = consumed.max()
        # Weighted vote for the first day on which this window reaches its peak.
        votes += one_hot[np.argmax(consumed == peak)] * peak

    # The winning day is chosen BEFORE the all-zero fallback is applied.
    winner = np.argmax(votes == votes.max())

    if (votes == np.zeros(30)).all():
        votes[0] = 1

    gaussian_len = 30
    sigma = np.sqrt(votes.std())
    if not sigma > 0:
        sigma = 1
    kernel = gaussian_kernel1d(sigma, gaussian_len)
    kernel_center = int(np.floor(gaussian_len / 2))

    # Slide the kernel so its centre sits on the winning day.
    weighted = votes * shift(kernel, winner - kernel_center)
    weighted = weighted / weighted.sum()
    return (sku, saferound(weighted, places=4))

# Evaluate the heuristic for every test row in worker processes; imap yields results in input order.
predictions, skus = [], []
with Pool(100) as pool:
    ordered_results = pool.imap(voted_shifted_padded_gaussian_probs, df_test.iterrows())
    for data in tqdm(ordered_results, total=len(df_test)):
        sku, probabilities = data
        skus.append(sku)
        predictions.append(probabilities)

# Sanity check: the collected SKUs must line up row-for-row with df_test.
skus = np.array(skus)
comparison = (skus == df_test['sku'].to_numpy())
assert comparison.all()

df_predictions = pd.DataFrame(predictions)
df_predictions.to_csv('predictions/voted_shifted_padded_gaussian_probs.csv', index=False, header=False)

  0%|          | 0/551472 [00:00<?, ?it/s]

In [25]:
def shift(arr, num, fill_value=0):
    """Return a copy of ``arr`` moved by ``num`` positions, without wrap-around.

    A positive ``num`` moves elements toward higher indices, a negative
    ``num`` toward lower indices; vacated slots are set to ``fill_value``.
    The result has the shape and dtype of ``np.empty_like(arr)``.
    """
    shifted = np.empty_like(arr)
    if num == 0:
        shifted[:] = arr
        return shifted
    if num > 0:
        # Move right: blank the head, copy all but the last `num` items.
        shifted[:num] = fill_value
        shifted[num:] = arr[:-num]
        return shifted
    # Move left: blank the tail, copy everything after the first `-num` items.
    shifted[num:] = fill_value
    shifted[:num] = arr[-num:]
    return shifted

def gaussian_kernel1d(sigma, length, order=0):
    """
    Computes a 1-D Gaussian convolution kernel.

    Parameters
    ----------
    sigma : float
        Standard deviation of the Gaussian, in samples. Must be non-zero.
    length : int
        Number of samples in the kernel. The peak sits at index
        ``length // 2``.
    order : int, optional
        Only validated (must be non-negative); it does not change the result.

    Returns
    -------
    np.ndarray
        Array of exactly ``length`` weights that sum to 1.

    Raises
    ------
    ValueError
        If ``order`` is negative.
    """
    if order < 0:
        raise ValueError('order must be non-negative')
    sigma2 = sigma * sigma
    # Sample positions centred on index length // 2. For even lengths this is
    # the same grid as arange(floor(-length/2), ceil(length/2)); for odd
    # lengths that expression produced length + 1 off-centre samples.
    x = np.arange(length, dtype=float) - length // 2
    phi_x = np.exp(-0.5 / sigma2 * x ** 2)
    return phi_x / phi_x.sum()

# Step-by-step walk-through of the voted/shifted/Gaussian heuristic on a single
# test row (position 606), printing the intermediate values.
row = df_test.iloc[606]
data = (0, row)
#test_case

i, row = data
sku = row['sku']
target_stock = row['target_stock']
train_row = df_train_v1.loc[sku]
sold_quantity_series = json.loads(train_row['sold_quantity_series'])

print('sold_quantity_series', sold_quantity_series)

original_len = len(sold_quantity_series)

# Zero-pad at the end so a 30-day window is still available after shifting.
if original_len < 59:
    sold_quantity_series = np.pad(sold_quantity_series, (0, 59-original_len))

#sold_quantity_series = sold_quantity_series[:30]
voted_probalities = np.zeros(30)
for i in range(original_len):
    # Window of 30 days starting i days into the history.
    shifted_sum_sold_quantity_series = shift(sold_quantity_series, -i)[:30]
    
    sold_quantity_cumsum = np.cumsum(shifted_sum_sold_quantity_series)
    stock_percentage = sold_quantity_cumsum/target_stock
    stock_percentage_clipped = np.clip(stock_percentage,0,1) 

    # Vote for the first day at which this window peaks, weighted by the peak value.
    index_max = np.argmax(stock_percentage_clipped == stock_percentage_clipped.max())
    shifted_probalities = np.eye(30)[index_max]
    shifted_probalities *= stock_percentage_clipped.max()
    #shifted_probalities *= stock_percentage[index_max]
    voted_probalities += shifted_probalities
    
# First day with the highest vote total (taken before the all-zero fallback below).
index_max = np.argmax(voted_probalities == voted_probalities.max())
#probalities = np.eye(30)[index_max]

# Fallback so the later normalisation cannot divide by zero.
if (voted_probalities == np.zeros(30)).all():
    voted_probalities[0] = 1
    
print('voted_probalities', voted_probalities)

gaussian_len = 30
# sigma is the square root of the votes' standard deviation; 1 when that is not positive.
sigma = np.sqrt(voted_probalities.std())
sigma = sigma if sigma > 0 else 1
gaussian = gaussian_kernel1d(sigma, gaussian_len)
gaussian_central_point = int(np.floor(gaussian_len/2))
# Move the kernel so its centre lands on the winning day.
shift_amount = index_max-gaussian_central_point

probalities = voted_probalities*shift(gaussian, shift_amount)

probalities = probalities/probalities.sum()
probalities = saferound(probalities, places=4)
probalities

sold_quantity_series [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]
voted_probalities [0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5
 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]


[0.5704,
 0.3459,
 0.0772,
 0.0063,
 0.0002,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [16]:
# Scratch check of the sigma expression: square root of the votes' standard deviation.
np.sqrt(voted_probalities.std())

0.0

In [11]:
# Scratch experiment: 30 normal samples scaled to the mean and std of the vote vector.
voted_probalities.mean() + voted_probalities.std() * np.random.standard_normal(size=30)

array([ 1.80774322,  0.30721028,  2.28631206, -1.2033625 ,  1.89920547,
        1.19630212,  2.60930263,  2.33853422,  0.68637868,  2.74119347,
        0.89575834, -0.63289762,  0.49892918,  1.23123853,  2.10912054,
        2.04006513,  1.55623063,  0.73979904,  1.06822032,  1.06394256,
        1.40492017,  0.28809414,  1.66671544,  2.31546173,  2.44752861,
        0.17956176, -0.55929072,  2.93896606,  1.94720861,  1.25051815])

In [79]:
# Inspect a length-30 kernel with sigma=1.2 and the weight at its central index.
gaussian_len = 30
# The notebook defines this helper as `gaussian_kernel1d`; the underscore-prefixed
# name used here previously is not defined anywhere and fails on a fresh kernel.
gaussian = gaussian_kernel1d(1.2, gaussian_len)
central_point = int(np.floor(gaussian_len/2))
print(central_point, gaussian.round(2)[central_point])
gaussian.round(2)

15 0.33


array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.01, 0.08, 0.23, 0.33, 0.23, 0.08, 0.01, 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ])

In [87]:
# Vote total at the winning index.
voted_probalities[index_max]

4.5

In [92]:
# Scratch experiment: move the winning index to position 15, multiply by the
# Gaussian element-wise, then shift the result back by the same amount.
shift_amount = 15-index_max
shift(shift(voted_probalities, shift_amount)*gaussian, -shift_amount)

array([3.39336650e-09, 3.09733232e-07, 7.05864686e-05, 2.89177312e-03,
       6.20793976e-02, 3.52314867e-01, 7.63511329e-01, 1.49603355e+00,
       8.22242970e-01, 1.24346423e-01, 2.92138342e-02, 1.28523250e-03,
       5.64691749e-05, 1.23893293e-06, 1.35734660e-08, 7.42575016e-11,
       2.02860044e-13, 2.76731954e-16, 1.88507534e-19, 6.41216552e-23,
       1.08914913e-26, 9.23797067e-31, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00])

In [77]:
# Display the kernel's central index.
central_point

15.0

In [20]:
[0, 1, 0, 0, 1, (1), 1, 0, 1, 1, 1, 5, 1]
[-, 1, 0, 0, 1, 1, (1), 0, 1, 1, 1, 5, 1, 0]
[-, -, 0, 0, 1, 1, (1), 0, 1, 1, 1, 5, 1, 0, 0]

array([0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 5, 1, 1, 5, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 2, 0, 0, 3, 0, 0, 1, 0, 0, 0, 0, 0, 0])