In [1]:
import os
import sys
import time
import json
from datetime import datetime

import numpy as np
import pandas as pd

from models import models
from utils import read_df, read_numpy, write_df
import logging

from evaluate import rps

In [2]:
#Model imports
import numpy as np
import json

from tqdm.auto import tqdm
from multiprocessing import Pool
from iteround import saferound

from model import Model
from utils import read_df

In [3]:
DATASET_PATH = './dataset/processed/train_v1'
TEST_DATA_FILENAME = 'test_data.csv'
GROUND_TRUTH_FILENAME = 'test_ground_truth.npy'
TRAIN_DATA_FILENAME = 'train_data.parquet'
TRAIN_DATA_PROCESSED_FILENAME = 'train_sku_feature_data.parquet'

In [4]:
dateset_indexes = [0]
model_name = 'simple_first_30_days_fixed_spike'

In [5]:
dataset_index = dateset_indexes[0]
dataset_current_path = os.path.join(DATASET_PATH, str(dataset_index))

test_data_filepath = os.path.join(dataset_current_path, TEST_DATA_FILENAME)
ground_truth_filepath = os.path.join(dataset_current_path, GROUND_TRUTH_FILENAME)
train_data_filepath = os.path.join(dataset_current_path, TRAIN_DATA_FILENAME)
train_data_processed_filepath = os.path.join(dataset_current_path, TRAIN_DATA_PROCESSED_FILENAME)

In [6]:
df_test = read_df(test_data_filepath)
ground_truth = read_numpy(ground_truth_filepath)
df_train = read_df(train_data_filepath)
df_train_processed = read_df(train_data_processed_filepath)

In [7]:
df_train_v1 = df_train_processed.set_index('sku')
df_train_v1['sold_quantity_series'] = df_train_v1['sold_quantity_series'].apply(lambda x: np.array(json.loads(x)))
training_data = df_train_v1[df_train_v1['sold_quantity_sum'] > 0]
training_data = training_data[training_data['count'] >= 21]

In [8]:
def rolling_window(a, window):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def build_training_data(sold_quantity_series):
    windows = rolling_window(sold_quantity_series, 21)
    X = windows[:, :-1]
    y = windows[:, -1]
    return X, y

In [9]:
X = []
y = []
with Pool(100) as p:
    for data in tqdm(p.imap(build_training_data, training_data['sold_quantity_series'].values), total=len(training_data['sold_quantity_series'])):
        X.append(data[0])
        y.append(data[1])

  0%|          | 0/608752 [00:00<?, ?it/s]

In [10]:
X_train = np.concatenate(X)
y_train = np.concatenate(y)

In [11]:
len(X_train)

19499387

In [12]:
indexes = np.random.choice(np.arange(len(X_train)), 2000000)

In [13]:
y_non_zeros = y_train[indexes] > 0
index_non_zeros = indexes[y_non_zeros]

y_zeros = y_train[indexes] == 0
index_zeros = indexes[y_zeros]

index_balanced = np.concatenate((index_non_zeros, index_zeros[:int(len(index_non_zeros))]))
len(index_balanced)

1113504

In [14]:
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor(max_depth=20, n_jobs=70)
clf = clf.fit(X_train[index_balanced], y_train[index_balanced])

In [15]:
import sklearn

mask = np.ones(X_train.shape[0],dtype=bool)
mask[index_balanced ] = False
test_index = np.arange(len(X_train))[~mask]

indexes_2 = np.random.choice(test_index, 1000000)
y_pred = clf.predict(X_train[indexes_2])
y_true = y_train[indexes_2]
sklearn.metrics.mean_squared_error(y_true, y_pred.round())

14.230686

In [16]:
def simple_random_forest_20dp(data):
    i, row = data
    sku = row['sku']
    target_stock = row['target_stock']
    train_row = df_train_v1.loc[sku]
    sold_quantity_series = train_row['sold_quantity_series']
    
    original_len = len(sold_quantity_series)
    len_to_pad = 20
    
    if original_len < len_to_pad:
        sold_quantity_series = np.pad(sold_quantity_series, (len_to_pad-original_len, 0))
    
    X_init = sold_quantity_series[-20:]
    predict_series = np.zeros(50)
    predict_series[:20] = X_init
    stock_count = 0
    for i in range(30):
        prediction = clf.predict([predict_series[i:20+i]]).round()[0]
        predict_series[20+i] = prediction
        stock_count += prediction
        if stock_count >= target_stock:
            break
            
    #print(X_init, predict_series[20:])  
    return np.eye(30)[i]

"""
n = 5000
n = min(n, len(df_test))
skus = []
predictions = []
with Pool(100) as p:
    for data in tqdm(p.imap(simple_random_forest_20dp, df_test[:n].iterrows()), total=n):
        sku, probabilities = data
        skus.append(sku)
        predictions.append(probabilities)
        
index = 1
print('pred', simple_random_forest_20dp((0, df_test.iloc[index])))
print('pred', ground_truth[index])

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-16-9e240ea095d8>, line 41)

In [None]:
rps(predictions, ground_truth[:len(predictions)])