In [1]:
#Model imports
import numpy as np
import json
import os
import pandas as pd
from datetime import datetime
import time

from tqdm.auto import tqdm
from multiprocessing import Pool
from iteround import saferound
import scipy.stats as st
import tweedie
from category_encoders import OrdinalEncoder
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=50)

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

from utils import read_df, read_numpy, write_df
from evaluate import rps

INFO: Pandarallel will run on 50 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
DATASET_PATH = './dataset/processed/train_v1'
TEST_DATA_FILENAME = 'test_data.csv'
GROUND_TRUTH_FILENAME = 'test_ground_truth.npy'
TRAIN_DATA_FILENAME = 'train_data.parquet'
TRAIN_DATA_PROCESSED_FILENAME = 'train_data_features.parquet'
ITEM_DATA_FILEPATH = './dataset/items_static_metadata_full.jl'

TRAIN_DATA_X_FILENAME = 'train_data_x.parquet'
TRAIN_DATA_Y_FILENAME = 'train_data_y.parquet'
TRAIN_DATA_X_PROCESSED_FILENAME = 'train_data_x_features.parquet'
TRAIN_DATA_Y_PROCESSED_FILENAME = 'train_data_y_features.parquet'

TEST_FROMTRAIN_DATA_LAST29_FILENAME = 'test_fromtrain_data_last29.parquet'
TEST_FROMTRAIN_DATA_LAST29_PROCESSED_FILENAME = 'test_fromtrain_data_last29_features.parquet'

TRAIN_DATA_X_PROCESSED_TSFRESH_FILENAME = 'train_data_x_features_tsfresh.parquet'
TEST_FROMTRAIN_DATA_LAST29_PROCESSED_TSFRESH_FILENAME = 'test_fromtrain_data_last29_features_tsfresh.parquet'

In [3]:
dateset_indexes = [0]
model_name = 'simple_first_30_days_fixed_spike'

In [4]:
dataset_index = dateset_indexes[0]
dataset_current_path = os.path.join(DATASET_PATH, str(dataset_index))

test_data_filepath = os.path.join(dataset_current_path, TEST_DATA_FILENAME)
ground_truth_filepath = os.path.join(dataset_current_path, GROUND_TRUTH_FILENAME)
train_data_filepath = os.path.join(dataset_current_path, TRAIN_DATA_FILENAME)
train_data_processed_filepath = os.path.join(dataset_current_path, TRAIN_DATA_PROCESSED_FILENAME)

train_data_x_filepath = os.path.join(dataset_current_path, TRAIN_DATA_X_FILENAME)
train_data_y_filepath = os.path.join(dataset_current_path, TRAIN_DATA_Y_FILENAME)
train_data_x_processed_filepath = os.path.join(dataset_current_path, TRAIN_DATA_X_PROCESSED_FILENAME)
train_data_y_processed_filepath = os.path.join(dataset_current_path, TRAIN_DATA_Y_PROCESSED_FILENAME)

test_fromtrain_data_last29_filepath = os.path.join(dataset_current_path, TEST_FROMTRAIN_DATA_LAST29_FILENAME)
test_fromtrain_data_last29_processed_filepath = os.path.join(dataset_current_path, TEST_FROMTRAIN_DATA_LAST29_PROCESSED_FILENAME)

train_data_x_processed_tsfresh_filepath = os.path.join(dataset_current_path, TRAIN_DATA_X_PROCESSED_TSFRESH_FILENAME)
test_fromtrain_data_last29_processed_tsfresh_filepath = os.path.join(dataset_current_path, TEST_FROMTRAIN_DATA_LAST29_PROCESSED_TSFRESH_FILENAME)

In [5]:
df_train = read_df(train_data_filepath)
df_item = read_df(ITEM_DATA_FILEPATH)

In [6]:
df_test = read_df(test_data_filepath)
ground_truth = read_numpy(ground_truth_filepath)

#df_train_processed = read_df(train_data_processed_filepath)

In [7]:
df_all = df_train.join(df_item, how='left', on='sku', rsuffix="_item")

In [26]:
df_all['date'] = pd.to_datetime(df_all['date'])

In [17]:
import tsfresh
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, EfficientFCParameters

In [21]:
df_item.columns

Index(['item_domain_id', 'item_id', 'item_title', 'site_id', 'sku',
       'product_id', 'product_family_id'],
      dtype='object')

In [10]:
df_item_domain = df_all[df_all['item_domain_id'] == 'MLM-HEADPHONES']

In [11]:
id_column = 'item_domain_id'
sort_column = 'date'
numeric_columns = ['sold_quantity', 'current_price', 'minutes_active']

features = [id_column] + [sort_column] + numeric_columns

In [12]:
df_item_domain = df_item_domain[features]

In [15]:
df_features = extract_features(df_item_domain, column_id=id_column, column_sort=sort_column,
                         default_fc_parameters=EfficientFCParameters(), n_jobs=10, disable_progressbar=False)

Feature Extraction: 100%|██████████| 3/3 [42:50<00:00, 856.84s/it] 


In [18]:
tsfresh.feature_extraction.settings.from_columns(df_features.columns)

{'minutes_active': {'variance_larger_than_standard_deviation': None,
  'has_duplicate_max': None,
  'has_duplicate_min': None,
  'has_duplicate': None,
  'sum_values': None,
  'abs_energy': None,
  'mean_abs_change': None,
  'mean_change': None,
  'mean_second_derivative_central': None,
  'median': None,
  'mean': None,
  'length': None,
  'standard_deviation': None,
  'variation_coefficient': None,
  'variance': None,
  'skewness': None,
  'kurtosis': None,
  'root_mean_square': None,
  'absolute_sum_of_changes': None,
  'longest_strike_below_mean': None,
  'longest_strike_above_mean': None,
  'count_above_mean': None,
  'count_below_mean': None,
  'last_location_of_maximum': None,
  'first_location_of_maximum': None,
  'last_location_of_minimum': None,
  'first_location_of_minimum': None,
  'percentage_of_reoccurring_values_to_all_values': None,
  'percentage_of_reoccurring_datapoints_to_all_datapoints': None,
  'sum_of_reoccurring_values': None,
  'sum_of_reoccurring_data_points': N

In [None]:
from scipy.stats import linregress

In [92]:
df_all.iloc[[1,1,1]]

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,item_title,site_id,sku_item,product_id,product_family_id
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,464801,MLB9838512,MLB9838510
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,464801,MLB9838512,MLB9838510
1,464801,2021-02-02,0,156.78,REA,classic,fulfillment,free_shipping,1440.0,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,464801,MLB9838512,MLB9838510


In [95]:
item_columns = ['item_domain_id', 'item_id', 'item_title', 'site_id', 'sku', 'product_id', 'product_family_id']
sku_numeric_columns = ['sold_quantity', 'current_price', 'minutes_active']
sku_categorical_columns = ['currency', 'listing_type', 'shipping_logistic_type', 'shipping_payment']
#sku_date_columns = ['date']
sku_columns = sku_numeric_columns + sku_categorical_columns

df = df_all[df_all['sku'] == df_all['sku'].iloc[0]]

#sku = df['sku'].iloc[0]
new_row = df_item[item_rows].iloc[0].to_dict()

count = len(df)
new_row['count'] = count
new_row['date__first'] = df['date'].iloc[0]
new_row['date__last'] = df['date'].iloc[-1]
new_row['date__diff'] = (new_row['date__last'] - new_row['date__first']).days
for date in ['date__first', 'date__last']:
    new_row[date+'_day'] = new_row[date].day
    new_row[date+'_month'] = new_row[date].month
    new_row[date+'_dayofweek'] = new_row[date].dayofweek
    new_row[date+'_weekofmonth'] = np.floor(new_row[date].day/((new_row[date].daysinmonth + 1)/4)).astype(int)
    

for column in sku_columns:
    new_row[column+'__first'] = df[column].iloc[0]
    new_row[column+'__last'] = df[column].iloc[-1]
    new_row[column+'__mode'] = df[column].mode().iloc[0]
    new_row[column+'__count_of_mode'] = df[column].value_counts().iloc[0]
    
for column in sku_numeric_columns:
    series = df[column].values
    new_row[column+'__sum'] = series.sum()
    new_row[column+'__mean'] = series.mean()
    new_row[column+'__median'] = np.median(series)
    new_row[column+'__std'] = series.std()
    new_row[column+'__var'] = series.var()
    new_row[column+'__variance_large_than_std'] = new_row[column+'__var'] > new_row[column+'__std']
    new_row[column+'__min'] = series.min()
    new_row[column+'__max'] = series.max()
    new_row[column+'__last_location_of_maximum'] = count - 1 - np.argmax(series[::-1])
    new_row[column+'__last_location_of_minimum'] = count - 1 - np.argmin(series[::-1])
    zero_locations = np.where(df[column] == 0)[0]
    non_zeros_locations = np.where(series != 0)[0]
    new_row[column+'__count_of_zero'] = zero_locations.shape[0]
    new_row[column+'__count_of_non_zero'] = non_zeros_locations.shape[0] 
    new_row[column+'__last_location_of_zero'] = zero_locations[-1] if zero_locations.shape[0] != 0 else -1 
    new_row[column+'__last_location_of_non_zero'] = non_zeros_locations[-1] if non_zeros_locations.shape[0] != 0 else -1
    
    regression = linregress(np.arange(count), series)
    new_row[column+'__linregress_slope'] = regression.slope
    new_row[column+'__linregress_intercept'] = regression.intercept
    new_row[column+'__linregress_pvalue'] = regression.pvalue
    new_row[column+'__linregress_rvalue'] = regression.rvalue
    new_row[column+'__linregress_stderr'] = regression.stderr
    
    series_change = np.diff(series)
    new_row[column+'___change_sum'] = series_change.sum()
    new_row[column+'__change_mean'] = series_change.mean()
    new_row[column+'__change_median'] = np.median(series_change)
    new_row[column+'__change_std'] = series_change.std()
    new_row[column+'__change_min'] = series_change.min()
    new_row[column+'__change_max'] = series_change.max()
    
    series_change_abs = np.abs(series_change)
    new_row[column+'___change_absolute_sum'] = series_change_abs.sum()
    new_row[column+'__change_absolute_mean'] = series_change_abs.mean()
    new_row[column+'__change_absolute_median'] = np.median(series_change_abs)
    new_row[column+'__change_absolute_std'] = series_change_abs.std()
    new_row[column+'__change_absolute_min'] = series_change_abs.min()
    new_row[column+'__change_absolute_max'] = series_change_abs.max()

for column in sku_columns:
    new_row[column+'__series'] = df[column].to_json(orient='values')
new_row

{'item_domain_id': 'MLB-SNEAKERS',
 'item_id': 492155,
 'item_title': 'Tênis Masculino Olympikus Cyber Barato Promoçao',
 'site_id': 'MLB',
 'sku': 0,
 'product_id': None,
 'product_family_id': 'MLB15832732',
 'count': 59,
 'date__first': Timestamp('2021-02-01 00:00:00'),
 'date__last': Timestamp('2021-03-31 00:00:00'),
 'date__diff': 58,
 'date__first_day': 1,
 'date__first_month': 2,
 'date__first_dayofweek': 0,
 'date__first_weekofmonth': 0,
 'date__last_day': 31,
 'date__last_month': 3,
 'date__last_dayofweek': 2,
 'date__last_weekofmonth': 3,
 'sold_quantity__first': 0,
 'sold_quantity__last': 0,
 'sold_quantity__mode': 0,
 'sold_quantity__count_of_mode': 49,
 'current_price__first': 156.78,
 'current_price__last': 169.99,
 'current_price__mode': 169.99,
 'current_price__count_of_mode': 22,
 'minutes_active__first': 1440.0,
 'minutes_active__last': 1440.0,
 'minutes_active__mode': 1440.0,
 'minutes_active__count_of_mode': 59,
 'currency__first': 'REA',
 'currency__last': 'REA',
 '

In [102]:
value_counts = df['sold_quantity'].value_counts()

In [103]:
value_counts

0    49
1     8
5     1
2     1
Name: sold_quantity, dtype: int64

In [98]:
np.sum(series ** 2)

122342400.0

In [86]:
np.floor(new_row[date].day/((new_row[date].daysinmonth)/4)).astype(int)

4

In [90]:
1/((new_row[date].daysinmonth + 1)/4)

0.125

In [77]:
new_row[date].day

31

In [58]:
regression = linregress(np.arange(count), df['sold_quantity'].values)

In [61]:
regression.count

<function LinregressResult.count>

In [30]:
zero_locations

array([], dtype=int64)

In [27]:
np.argmax(x)

NameError: name 'x' is not defined