In [1]:
#Model imports
import numpy as np
import json
import os
import pandas as pd
from datetime import datetime
import time

from tqdm.auto import tqdm
from multiprocessing import Pool
from iteround import saferound
import scipy.stats as st
import tweedie
from category_encoders import OrdinalEncoder
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=50)

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

from utils import read_df, read_numpy, write_df, read_json
from evaluate import rps

INFO: Pandarallel will run on 50 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# --- Fixed locations (relative to the notebook's working directory) ---
DATASET_PATH = './dataset/processed/train_v1'
FEATURES_METADATA_PATH = './dataset/features_metadata.json'
ITEM_DATA_FILEPATH = './dataset/items_static_metadata_full.jl'

# --- File names joined onto a per-index dataset directory further down ---
# Raw train / test split and its ground truth.
TRAIN_DATA_FILENAME = 'train_data.parquet'
TEST_DATA_FILENAME = 'test_data.csv'
GROUND_TRUTH_FILENAME = 'test_ground_truth.npy'

# Train data split into an X part and a Y part.
TRAIN_DATA_X_FILENAME = 'train_data_x.parquet'
TRAIN_DATA_Y_FILENAME = 'train_data_y.parquet'

# Test window cut from the train data ("last29").
TEST_FROMTRAIN_DATA_LAST29_FILENAME = 'test_fromtrain_data_last29.parquet'

# Feature tables derived from the files above.
TRAIN_DATA_PROCESSED_FILENAME = 'train_data_features.parquet'
TRAIN_DATA_X_PROCESSED_FILENAME = 'train_data_x_features.parquet'
TRAIN_DATA_Y_PROCESSED_FILENAME = 'train_data_y_features.parquet'
TEST_FROMTRAIN_DATA_LAST29_PROCESSED_FILENAME = 'test_fromtrain_data_last29_features.parquet'

# tsfresh-flavoured feature tables.
TRAIN_DATA_X_PROCESSED_TSFRESH_FILENAME = 'train_data_x_features_tsfresh.parquet'
TEST_FROMTRAIN_DATA_LAST29_PROCESSED_TSFRESH_FILENAME = 'test_fromtrain_data_last29_features_tsfresh.parquet'

In [3]:
# Indexes of the dataset splits to work on; the first one is turned into a
# sub-directory of DATASET_PATH when the file paths are resolved.
# NOTE(review): `dateset_indexes` looks like a typo for `dataset_indexes`; the
# path-resolution cell reads it under this spelling, so rename both together.
dateset_indexes = [0]
# Identifier of the model/experiment; not referenced by any other visible cell.
model_name = 'simple_first_30_days_fixed_spike'

In [4]:
# Resolve every per-split file against the directory of the first dataset index.
dataset_index = dateset_indexes[0]
dataset_current_path = os.path.join(DATASET_PATH, str(dataset_index))


def _in_current_dataset(filename):
    """Return the path of `filename` inside the current dataset directory."""
    return os.path.join(dataset_current_path, filename)


# Raw split files.
test_data_filepath = _in_current_dataset(TEST_DATA_FILENAME)
ground_truth_filepath = _in_current_dataset(GROUND_TRUTH_FILENAME)
train_data_filepath = _in_current_dataset(TRAIN_DATA_FILENAME)
train_data_processed_filepath = _in_current_dataset(TRAIN_DATA_PROCESSED_FILENAME)

# X / Y parts of the train data, raw and with features.
train_data_x_filepath = _in_current_dataset(TRAIN_DATA_X_FILENAME)
train_data_y_filepath = _in_current_dataset(TRAIN_DATA_Y_FILENAME)
train_data_x_processed_filepath = _in_current_dataset(TRAIN_DATA_X_PROCESSED_FILENAME)
train_data_y_processed_filepath = _in_current_dataset(TRAIN_DATA_Y_PROCESSED_FILENAME)

# Test window cut from the train data, raw and with features.
test_fromtrain_data_last29_filepath = _in_current_dataset(TEST_FROMTRAIN_DATA_LAST29_FILENAME)
test_fromtrain_data_last29_processed_filepath = _in_current_dataset(TEST_FROMTRAIN_DATA_LAST29_PROCESSED_FILENAME)

# tsfresh feature tables.
train_data_x_processed_tsfresh_filepath = _in_current_dataset(TRAIN_DATA_X_PROCESSED_TSFRESH_FILENAME)
test_fromtrain_data_last29_processed_tsfresh_filepath = _in_current_dataset(TEST_FROMTRAIN_DATA_LAST29_PROCESSED_TSFRESH_FILENAME)

In [5]:
# Load the raw training data of the selected split and the item metadata file
# through the project's `read_df` helper (from utils).
df_train = read_df(train_data_filepath)
df_item = read_df(ITEM_DATA_FILEPATH)

In [6]:
# Load the test data and its ground-truth array for the selected split.
# Neither is referenced again in the visible cells.
df_test = read_df(test_data_filepath)
ground_truth = read_numpy(ground_truth_filepath)

# Disabled: loading of the pre-computed training feature table.
#df_train_processed = read_df(train_data_processed_filepath)

In [7]:
# Key columns of the daily SKU table.
id_column = 'sku'
date_column = 'date'

# Columns describing the item behind a SKU.
item_string_columns = ['item_title']
item_categorical_columns = [
    'item_domain_id',
    'item_id',
    'site_id',
    'product_id',
    'product_family_id',
]
item_columns = [*item_string_columns, *item_categorical_columns]

# Columns recorded for the SKU itself.
sku_numeric_columns = [
    'sold_quantity',
    'current_price',
    'minutes_active',
]
sku_categorical_columns = [
    'currency',
    'listing_type',
    'shipping_logistic_type',
    'shipping_payment',
]
sku_columns = [*sku_numeric_columns, *sku_categorical_columns]

# Per-type views. `string_columns` and `numeric_columns` are aliases of the
# lists above (same objects); `categorical_columns` is a new combined list.
string_columns = item_string_columns
numeric_columns = sku_numeric_columns
categorical_columns = [*sku_categorical_columns, *item_categorical_columns]

In [8]:
# Load the stored feature metadata JSON; not referenced again in the visible cells.
features_metadata = read_json(FEATURES_METADATA_PATH)

In [29]:
# Import the feature-extraction module and force a reload so the latest
# version of feature_extraction_v2 on disk is the one that runs.
# NOTE(review): this cell carries execution count 29 while the next one shows
# 16, so the notebook was run out of order - confirm it survives Restart & Run All.
import feature_extraction_v2
from importlib import reload 
reload(feature_extraction_v2)
# Trial run on the first 1000 training rows only, with 10 workers.
df_features_v2 = feature_extraction_v2.extract_features_per_sku(df_train.iloc[0:1000], df_item, n_workers=10)

preprocess_data: 0.2561614513397217 seconds
sku_split: 0.2609446048736572 seconds


  0%|          | 0/20 [00:00<?, ?it/s]

sku_processing: 1.741872787475586 seconds
item_domain_split: 1.8712129592895508 seconds
      sku       date  sold_quantity  current_price currency listing_type  \
0   99516 2021-02-01              0           22.9      REA      premium   
1   99516 2021-02-02              2           22.9      REA      premium   
2   99516 2021-02-03              6           22.9      REA      premium   
3   99516 2021-02-04              2           22.9      REA      premium   
4   99516 2021-02-05              1           22.9      REA      premium   
5   99516 2021-02-06              2           22.9      REA      premium   
6   99516 2021-02-07              3           22.9      REA      premium   
7   99516 2021-02-08              1           22.9      REA      premium   
8   99516 2021-02-09             10           22.9      REA      premium   
9   99516 2021-02-10              3           22.9      REA      premium   
10  99516 2021-02-11              2           22.9      REA      premium   


  0%|          | 0/14 [00:00<?, ?it/s]


n
       sku       date  sold_quantity  current_price currency listing_type  \
0   464801 2021-02-01              0         156.78      REA      classic   
1   464801 2021-02-02              0         156.78      REA      classic   
2   464801 2021-02-03              0         156.78      REA      classic   
3   464801 2021-02-04              0         156.78      REA      classic   
4   464801 2021-02-05              1         156.78      REA      classic   
..     ...        ...            ...            ...      ...          ...   
83  645793 2021-02-25              0         164.99      REA      classic   
84  645793 2021-02-26              0         164.99      REA      classic   
85  645793 2021-02-27              0         164.99      REA      classic   
86  645793 2021-02-28              0         164.99      REA      classic   
87  645793 2021-03-01              0         164.99      REA      classic   

   shipping_logistic_type shipping_payment  minutes_active  item_domain_

In [16]:
# Display the extracted feature table.
df_features_v2

Unnamed: 0,item_title,item_domain_id,item_id,site_id,product_id,product_family_id,sku,count,date__first,date__last,...,minutes_active__by_item_domain_id__by_dayofthemonth__with_smaller_sum,minutes_active__by_item_domain_id__by_dayofthemonth__with_smaller_mean,minutes_active__by_item_domain_id__by_dayofthemonth__bigger_std,minutes_active__by_item_domain_id__by_dayofthemonth__bigger_var,minutes_active__by_item_domain_id__by_dayofthemonth__smaller_std,minutes_active__by_item_domain_id__by_dayofthemonth__smaller_var,minutes_active__by_item_domain_id__by_dayofthemonth__with_bigger_std,minutes_active__by_item_domain_id__by_dayofthemonth__with_bigger_var,minutes_active__by_item_domain_id__by_dayofthemonth__with_smaller_std,minutes_active__by_item_domain_id__by_dayofthemonth__with_smaller_var
0,Fita Dupla Face Preta Colar Touch Lcd 3mm Celu...,MLB-ADHESIVE_TAPES,221252,MLB,,,99516,59,2021-02-01,2021-03-31,...,15,15,0.0,0.0,0.0,0.0,1,1,1,1
1,Kit 12 Plantas Mini Suculentas Artificiais C/ ...,MLB-ARTIFICIAL_FLOWERS,192180,MLB,,,58546,59,2021-02-01,2021-03-31,...,26,26,0.0,0.0,0.0,0.0,1,1,1,1
2,[6-18m] Saco Antialérgico Para Bebê Dormir Est...,MLB-BABY_BLANKETS,194503,MLB,,,274856,59,2021-02-01,2021-03-31,...,10,10,0.0,0.0,0.0,0.0,1,1,1,1
3,Fio De Cobre P/ Reparo Jamper Trilha Placa Cel...,MLB-CELLPHONE_PARTS,61865,MLB,,,80056,59,2021-02-01,2021-03-31,...,12,12,0.0,0.0,0.0,0.0,1,1,1,1
4,Adesivo De Parede Unicornio + 30 Florais,MLB-DECORATIVE_VINYLS,168198,MLB,,,557191,59,2021-02-01,2021-03-31,...,1,1,0.0,0.0,0.0,0.0,1,1,1,1
5,Incensário E Castiçal Flor De Lotus Em Resina ...,MLB-INCENSE_HOLDERS,490376,MLB,,,105633,59,2021-02-01,2021-03-31,...,3,3,1018.233765,1036800.0,0.0,0.0,3,3,1,1
6,Incensário E Castiçal Flor De Lotus Em Resina ...,MLB-INCENSE_HOLDERS,491262,MLB,,,261478,29,2021-02-01,2021-03-01,...,3,3,1018.233765,1036800.0,0.0,0.0,3,3,1,1
7,Incensário E Castiçal Flor De Lotus Em Resina ...,MLB-INCENSE_HOLDERS,114861,MLB,,,426773,59,2021-02-01,2021-03-31,...,3,3,1018.233765,1036800.0,0.0,0.0,3,3,1,1
8,Amortecedor Pro Link Broz 150 125 Original Cofap,MLB-MOTORCYCLE_SHOCK_ABSORBERS,333050,MLB,,,378086,29,2021-02-01,2021-03-01,...,1,1,0.0,0.0,0.0,0.0,1,1,1,1
9,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB-NEBULIZERS,344151,MLB,MLB9838512,MLB9838510,464801,59,2021-02-01,2021-03-31,...,15,15,1018.233765,1036800.0,0.0,0.0,15,15,1,1


In [None]:
# Persist the feature table to a scratch file via the project's `write_df`
# helper (presumably parquet, judging by the extension - confirm in utils).
write_df(df_features_v2, './dataset/test.parquet')

In [None]:
# Print every feature column together with its dtype, one per line.
for name, dtype in df_features_v2.dtypes.items():
    print(name, dtype)

In [None]:
# Sort every feature column into exactly one bucket, based on the pieces of
# its '__'-separated name. The checks run in priority order: the suffix
# (last piece) is inspected first, then the source column (first piece).
numeric_features, categorical_features = [], []
positional_features, counting_features = [], []
date_features, string_features = [], []
series_features, id_features = [], []

for feature in df_features_v2.columns:
    feature_components = feature.split('__')
    head, tail = feature_components[0], feature_components[-1]

    if 'with' in tail or 'location' in tail:
        # "which bucket/position" style features (argmax/argmin, last location).
        bucket = positional_features
    elif 'count' in tail:
        bucket = counting_features
    elif tail == 'series':
        bucket = series_features
    elif head in categorical_columns:
        bucket = categorical_features
    elif head == 'date':
        # Only the two raw timestamps are dates; everything derived from them
        # (day, month, diff, ...) is treated as positional.
        bucket = date_features if feature in ('date__first', 'date__last') else positional_features
    elif head in string_columns:
        bucket = string_features
    elif head == id_column:
        bucket = id_features
    else:
        bucket = numeric_features

    bucket.append(feature)

In [None]:
# Inspect which feature names were classified as categorical.
categorical_features

In [None]:
# Parse the date column into pandas datetimes (the `.dt` accessor is used on it later).
# NOTE(review): `df_all` is not created in any visible cell - it must already
# exist in the kernel, so this fails on a fresh Restart & Run All.
df_all['date'] = pd.to_datetime(df_all['date'])

In [None]:
# Metadata registry: one entry per column, keyed by column name.
# NOTE(review): `df_all` is not created in any visible cell; it must already
# exist in the kernel.
features_data = {}

for column in categorical_columns:
    data = {
        'column': column,
        'type': 'category'
    }
    # Cast through str first so every value (including missing ones) becomes a
    # string before the column is turned into a categorical.
    df_all[column] = df_all[column].astype(str).astype('category')
    value_counts = df_all[column].value_counts()
    # Categories ordered from most to least frequent.
    data['categories'] = list(value_counts.index)
    data['value_counts'] = list(value_counts.values)
    # `set_categories` returns a new Series instead of modifying in place; the
    # original discarded the result, so the frequency ordering never reached
    # `df_all`. Assign it back so the category codes follow `data['categories']`.
    df_all[column] = df_all[column].cat.set_categories(data['categories'])
    data['size'] = len(data['categories'])
    features_data[column] = data

In [None]:
# Summary statistics and value distribution for every numeric column.
for column in numeric_columns:
    column_values = df_all[column]
    data = {
        'column': column,
        'type': 'numeric',
        'dtype': str(column_values.dtype)
    }
    data['min'] = column_values.min()
    data['max'] = column_values.max()
    data['var'] = column_values.var()
    data['std'] = column_values.std()
    data['mean'] = column_values.mean()
    data['sum'] = column_values.sum()
    value_counts = column_values.value_counts()
    data['unique'] = list(value_counts.index)
    data['value_counts'] = list(value_counts.values)
    data['size_unique'] = len(data['unique'])
    features_data[column] = data

# Free-text columns: only the number of distinct values is recorded.
for column in string_columns:
    features_data[column] = {
        'column': column,
        'type': 'string',
        'dtype': 'str',
        # Fixed: the original counted distinct values of the date column here.
        'size_unique': df_all[column].nunique()
    }

features_data[id_column] = {
    'column': id_column,
    'type': 'id',
    'dtype': str(df_all[id_column].dtype),
    'dataset_type': 'metadata',
    'size_unique': df_all[id_column].nunique()
}

features_data[date_column] = {
    'column': date_column,
    'type': 'date',
    'dataset_type': 'timeseries',
    # Fixed: the original stored min() under 'date_max' and max() under 'date_min'.
    'date_min': str(df_all[date_column].min()),
    'date_max': str(df_all[date_column].max()),
    'size_unique': df_all[date_column].nunique(),
}

# Tag every column with the dataset it belongs to.
# Fixed: the original lines were bare annotations (`x['k']: 'v'`), which store
# nothing; they are real assignments now.
# NOTE(review): the original labelled item columns 'timeseries' and SKU
# columns 'metadata'. Item columns come from the static item metadata file and
# SKU columns are the daily series, so the labels are swapped here - confirm
# against whatever consumes `dataset_type`.
# Requires the categorical-columns cell to have populated `features_data`.
for column in item_columns:
    features_data[column]['dataset_type'] = 'metadata'
for column in sku_columns:
    features_data[column]['dataset_type'] = 'timeseries'

In [None]:
# Scratch check: dtype string of whichever column the last loop left in `column`.
str(df_all[column].dtype)

In [None]:
# Scratch check: integer category codes of `column` (only valid for a categorical column).
df_all[column].cat.codes

In [None]:
# Scratch check: category codes after reversing the category order. The
# result is only displayed; `df_all` itself is left unchanged.
df_all[column].cat.set_categories(np.flip(df_all[column].cat.categories)).cat.codes

In [None]:
from scipy.stats import linregress

In [None]:
def extract_series_default_features(new_row, series, name, zerout=False):
    """Write the basic summary statistics of `series` into `new_row`.

    Every statistic is stored under the key ``name + '__' + <statistic>``.
    `new_row` is modified in place and nothing is returned.

    Args:
        new_row: dict that receives the features.
        series: array-like exposing `.shape`, `.sum()`, `.mean()`, `.std()`,
            `.var()`, `.min()` and `.max()` (the callers in this notebook
            pass numpy arrays).
        name: key prefix for the generated features.
        zerout: when True, or when `series` is empty, neutral values are
            written instead of computed ones (0 everywhere, False for the
            boolean feature).
    """
    prefix = name + '__'
    is_empty = series.shape[0] == 0

    if zerout or is_empty:
        # Same keys, in the same order, as the computed branch below.
        neutral_values = (
            ('sum', 0),
            ('mean', 0),
            ('median', 0),
            ('std', 0),
            ('var', 0),
            ('variance_large_than_std', False),
            ('min', 0),
            ('max', 0),
            ('abs_energy', 0),
            ('count_of_zero', 0),
            ('count_of_non_zero', 0),
        )
        for stat, neutral in neutral_values:
            new_row[prefix + stat] = neutral
        return

    new_row[prefix + 'sum'] = series.sum()
    new_row[prefix + 'mean'] = series.mean()
    new_row[prefix + 'median'] = np.median(series)
    std_value = series.std()
    new_row[prefix + 'std'] = std_value
    var_value = series.var()
    new_row[prefix + 'var'] = var_value
    new_row[prefix + 'variance_large_than_std'] = var_value > std_value
    new_row[prefix + 'min'] = series.min()
    new_row[prefix + 'max'] = series.max()
    # Sum of squared values.
    new_row[prefix + 'abs_energy'] = np.dot(series, series)
    new_row[prefix + 'count_of_zero'] = np.count_nonzero(series == 0)
    new_row[prefix + 'count_of_non_zero'] = np.count_nonzero(series)

In [None]:
def dayoftheweek_filter(df_dt, n):
    """Tell whether the date(s) behind `df_dt` fall on weekday number `n`.

    `df_dt` is any object exposing `.dayofweek` (the notebook passes a
    Series `.dt` accessor). The result is the comparison `dayofweek == n`,
    i.e. a boolean mask when `df_dt` is an accessor.
    """
    weekday = df_dt.dayofweek
    return weekday == n

def weekofthemonth_filter(df_dt, n):
    """Tell whether the date(s) behind `df_dt` fall in month-quarter `n`.

    The month is cut into four equal slices of `(daysinmonth + 1) / 4` days;
    a day belongs to slice `floor(day / slice_length)`. The `+ 1` keeps the
    last day of the month inside slice 3. `df_dt` must expose `.day` and
    `.daysinmonth` (the notebook passes a Series `.dt` accessor).
    """
    slice_length = (df_dt.daysinmonth + 1) / 4
    slice_index = np.floor(df_dt.day / slice_length)
    return slice_index == n

In [None]:
def extract_series_date_features(new_row, df, name, date_filter, quantity):
    """Add per-calendar-bucket statistics of a value column to `new_row`.

    For each bucket index in `range(quantity)` the rows of `df` selected by
    `date_filter(df['date'].dt, index)` are summarised with
    `extract_series_default_features` under the prefix `name + '_' + index`.
    Afterwards the index of the bucket with the most / least zeros, the
    most / least non-zeros and the biggest / smallest mean is stored under
    `name + '__with_...'`. `new_row` is modified in place.

    Args:
        new_row: dict that receives the features.
        df: frame with a datetime 'date' column; the LAST column is the one
            being summarised.
        name: key prefix for the generated features.
        date_filter: callable `(dt_accessor, index) -> boolean mask`.
        quantity: number of buckets to evaluate.
    """
    value_column = df.columns[-1]

    # Per-bucket values collected for the arg-max / arg-min features below.
    collected = {'count_of_zero': [], 'count_of_non_zero': [], 'mean': []}

    for bucket in range(quantity):
        bucket_prefix = f'{name}_{bucket}'
        in_bucket = date_filter(df['date'].dt, bucket)
        bucket_values = df[in_bucket][value_column].values
        extract_series_default_features(new_row, bucket_values, bucket_prefix)

        for stat, values in collected.items():
            values.append(new_row[f'{bucket_prefix}__{stat}'])

    zero_totals = collected['count_of_zero']
    non_zero_totals = collected['count_of_non_zero']
    means = collected['mean']

    new_row[name+'__with_most_count_of_zero'] = np.argmax(zero_totals)
    new_row[name+'__with_most_count_of_non_zero'] = np.argmax(non_zero_totals)
    new_row[name+'__with_bigger_mean'] = np.argmax(means)
    new_row[name+'__with_least_count_of_zero'] = np.argmin(zero_totals)
    new_row[name+'__with_least_count_of_non_zero'] = np.argmin(non_zero_totals)
    new_row[name+'__with_smaller_mean'] = np.argmin(means)

In [None]:
# Build one feature row per SKU from its daily history.
# NOTE(review): `df_all` is not created in any visible cell. "first"/"last"
# values are taken positionally, so rows are assumed to be date-ordered within
# each SKU - confirm. The SKU id itself is not stored on the row.
rows = []
for sku in df_all['sku'].unique()[:10]:  # trial run: first 10 SKUs only
    df = df_all[df_all['sku'] == sku]

    # Item attributes are read from the SKU's first row.
    new_row = df[item_columns].iloc[0].to_dict()

    count = len(df)
    new_row['count'] = count
    new_row['date__first'] = df['date'].iloc[0]
    new_row['date__last'] = df['date'].iloc[-1]
    new_row['date__diff'] = (new_row['date__last'] - new_row['date__first']).days
    for date in ['date__first', 'date__last']:
        new_row[date+'_day'] = new_row[date].day
        new_row[date+'_month'] = new_row[date].month
        new_row[date+'_dayofweek'] = new_row[date].dayofweek
        # Same month-quarter formula as weekofthemonth_filter (values 0..3).
        new_row[date+'_weekofmonth'] = np.floor(new_row[date].day/((new_row[date].daysinmonth + 1)/4)).astype(int)

    # First / last / most frequent value of every SKU column.
    for column in sku_columns:
        new_row[column+'__first'] = df[column].iloc[0]
        new_row[column+'__last'] = df[column].iloc[-1]
        new_row[column+'__mode'] = df[column].mode().iloc[0]
        new_row[column+'__count_of_mode'] = df[column].value_counts().iloc[0]

    for column in sku_numeric_columns:
        series = df[column].values
        extract_series_default_features(new_row, series, column)

        # Index of the LAST occurrence: search the reversed series and map back.
        new_row[column+'__last_location_of_maximum'] = count - 1 - np.argmax(series[::-1])
        new_row[column+'__last_location_of_minimum'] = count - 1 - np.argmin(series[::-1])
        zero_locations = np.where(series == 0)[0]
        non_zeros_locations = np.where(series != 0)[0]
        # -1 marks "never happened".
        new_row[column+'__last_location_of_zero'] = zero_locations[-1] if zero_locations.shape[0] != 0 else -1
        new_row[column+'__last_location_of_non_zero'] = non_zeros_locations[-1] if non_zeros_locations.shape[0] != 0 else -1

        # Boolean: does the distribution *look symmetric*, i.e.
        # |mean(X) - median(X)| < r * (max(X) - min(X))
        mean_minus_median_abs = np.abs(new_row[column+'__mean'] - new_row[column+'__median'])
        max_minus_min = new_row[column+'__max'] - new_row[column+'__min']
        for r in [0.3, 0.5, 0.7]:
            new_row[column+f'__symmetry_looking_r_{r}'] = mean_minus_median_abs < r*max_minus_min

        # Share of the total energy (sum of squares) held by the last chunk
        # when the series is cut into n_chuncks pieces.
        full_series_energy = np.sum(series ** 2)
        for n_chuncks, focus in [(3,2), (5,4), (10,9)]:
            if full_series_energy != 0:
                series_split = np.array_split(series, n_chuncks)
                selected_series = series_split[focus]
                # Short series leave the trailing chunks empty; fall back to
                # the closest non-empty chunk before `focus`.
                for i in range(focus, -1, -1):
                    if series_split[i].shape[0] > 0:
                        selected_series = series_split[i]
                        break
                new_row[column+f'__energy_ratio_by_chunks_{n_chuncks}_segments_focus_{focus}'] = np.sum(selected_series ** 2) / full_series_energy
            else:
                new_row[column+f'__energy_ratio_by_chunks_{n_chuncks}_segments_focus_{focus}'] = 0

        # Linear trend over the row position; needs at least two points.
        if count > 1:
            regression = linregress(np.arange(count), series)
            new_row[column+'__linregress_slope'] = regression.slope
            new_row[column+'__linregress_intercept'] = regression.intercept
            new_row[column+'__linregress_pvalue'] = regression.pvalue
            new_row[column+'__linregress_rvalue'] = regression.rvalue
            new_row[column+'__linregress_stderr'] = regression.stderr
        else:
            new_row[column+'__linregress_slope'] = 0
            new_row[column+'__linregress_intercept'] = series[0]
            new_row[column+'__linregress_pvalue'] = 0
            new_row[column+'__linregress_rvalue'] = 0
            new_row[column+'__linregress_stderr'] = 0

        # Day-to-day change, signed and absolute.
        series_change = np.diff(series)
        extract_series_default_features(new_row, series_change, column+'__change')

        series_change_abs = np.abs(series_change)
        extract_series_default_features(new_row, series_change_abs, column+'__change_absolute')

        # Statistics of the most recent observations.
        last_5_series = series[-5:]
        extract_series_default_features(new_row, last_5_series, column+'__last_5')

        last_3_series = series[-3:]
        # Fixed: the original passed `last_5_series` here, so every `__last_3`
        # feature duplicated its `__last_5` counterpart.
        extract_series_default_features(new_row, last_3_series, column+'__last_3')

        weekdate_prefix = '__by_dayoftheweek'
        extract_series_date_features(new_row, df[['date', column]], column+weekdate_prefix, dayoftheweek_filter, 7)

        monthweek_prefix = '__by_weekofthemonth'
        extract_series_date_features(new_row, df[['date', column]], column+monthweek_prefix, weekofthemonth_filter, 4)

    # Keep the raw series too, serialised as a JSON array string.
    for column in sku_columns:
        new_row[column+'__series'] = df[column].to_json(orient='values')

    print(len(new_row))    
    rows.append(new_row)

In [None]:
# One DataFrame row per SKU dict collected in `rows`.
df_sku_features = pd.DataFrame(rows)

In [None]:
# Display the per-SKU feature table.
df_sku_features

In [None]:
# Display only the item attribute columns of the per-SKU feature table.
df_sku_features[item_columns]

In [None]:
#df = df_all[df_all['item_domain_id'] == 'MLA-RADIO_BASE_STATIONS']


In [None]:
# Build one feature row per item domain, aggregating over all of its SKUs.
# NOTE(review): `df_all` is not created in any visible cell; only a single
# hard-coded domain is processed here.
item_rows = []
for item_domain_id in ['MLB-ADHESIVE_TAPES']:
    df = df_all[df_all['item_domain_id'] == item_domain_id]
    df_group_date = df.groupby('date')

    count = len(df)
    new_row = {
        'item_domain_id': df['item_domain_id'].iloc[0],
        'count__by_item_domain_id': count,
        'count_sku__by_item_domain_id': df['sku'].nunique()
    }

    by_prefix = '__by_item_domain_id'

    # Most frequent value of every SKU column within the domain.
    for column in sku_columns:
        new_row[column+by_prefix+'__mode'] = df[column].mode().iloc[0]
        new_row[column+by_prefix+'__count_of_mode'] = df[column].value_counts().iloc[0]

    for column in sku_numeric_columns:
        # Statistics over every row of the domain.
        series = df[column].values
        extract_series_default_features(new_row, series, column+by_prefix)

        # Per-date aggregates, computed once and reused below.
        values_by_date = df_group_date[column]
        daily_sum = values_by_date.sum()
        daily_mean = values_by_date.mean()
        daily_std = values_by_date.std()
        daily_var = values_by_date.var()

        # Linear trend of the daily mean; needs at least two dates.
        series = daily_mean.values
        if series.shape[0] > 1:
            regression = linregress(np.arange(series.shape[0]), series)
            new_row[column+by_prefix+'__mean__linregress_slope'] = regression.slope
            new_row[column+by_prefix+'__mean__linregress_intercept'] = regression.intercept
            new_row[column+by_prefix+'__mean__linregress_pvalue'] = regression.pvalue
            new_row[column+by_prefix+'__mean__linregress_rvalue'] = regression.rvalue
            new_row[column+by_prefix+'__mean__linregress_stderr'] = regression.stderr
        else:
            new_row[column+by_prefix+'__mean__linregress_slope'] = 0
            new_row[column+by_prefix+'__mean__linregress_intercept'] = series[0]
            new_row[column+by_prefix+'__mean__linregress_pvalue'] = 0
            new_row[column+by_prefix+'__mean__linregress_rvalue'] = 0
            new_row[column+by_prefix+'__mean__linregress_stderr'] = 0

        weekdate_prefix = '__by_dayoftheweek'
        extract_series_date_features(new_row, df[['date', column]], column+by_prefix+weekdate_prefix, dayoftheweek_filter, 7)

        monthweek_prefix = '__by_weekofthemonth'
        extract_series_date_features(new_row, df[['date', column]], column+by_prefix+monthweek_prefix, weekofthemonth_filter, 4)

        # Features keyed by the day of the month on which an extreme occurs.
        daymonth_prefix = '__by_dayofthemonth'
        count_zero = values_by_date.apply(lambda x: np.where(x == 0)[0].shape[0])
        count_non_zero = values_by_date.apply(lambda x: np.where(x != 0)[0].shape[0])
        new_row[column+by_prefix+daymonth_prefix+'__with_most_count_of_zero'] = count_zero.idxmax().day
        new_row[column+by_prefix+daymonth_prefix+'__with_most_count_of_non_zero'] = count_non_zero.idxmax().day
        new_row[column+by_prefix+daymonth_prefix+'__with_least_count_of_zero'] = count_zero.idxmin().day
        new_row[column+by_prefix+daymonth_prefix+'__with_least_count_of_non_zero'] = count_non_zero.idxmin().day

        # Number of distinct dates present for this domain.
        count_date = values_by_date.count().shape[0]
        new_row[column+by_prefix+daymonth_prefix+'__bigger_sum'] = daily_sum.max()
        new_row[column+by_prefix+daymonth_prefix+'__bigger_mean'] = daily_mean.max()
        new_row[column+by_prefix+daymonth_prefix+'__smaller_sum'] = daily_sum.min()
        new_row[column+by_prefix+daymonth_prefix+'__smaller_mean'] = daily_mean.min()
        # Fixed: the original wrote the sum's arg-max under '__with_bigger_mean'
        # (immediately overwritten), so '__with_bigger_sum' never existed.
        new_row[column+by_prefix+daymonth_prefix+'__with_bigger_sum'] = daily_sum.idxmax().day
        new_row[column+by_prefix+daymonth_prefix+'__with_bigger_mean'] = daily_mean.idxmax().day
        # Fixed: the original derived '__with_smaller_sum' from the daily mean.
        new_row[column+by_prefix+daymonth_prefix+'__with_smaller_sum'] = daily_sum.idxmin().day
        new_row[column+by_prefix+daymonth_prefix+'__with_smaller_mean'] = daily_mean.idxmin().day

        if count_date > 1:
            new_row[column+by_prefix+daymonth_prefix+'__bigger_std'] = daily_std.max()
            new_row[column+by_prefix+daymonth_prefix+'__bigger_var'] = daily_var.max()
            new_row[column+by_prefix+daymonth_prefix+'__smaller_std'] = daily_std.min()
            new_row[column+by_prefix+daymonth_prefix+'__smaller_var'] = daily_var.min()
            new_row[column+by_prefix+daymonth_prefix+'__with_bigger_std'] = daily_std.idxmax().day
            new_row[column+by_prefix+daymonth_prefix+'__with_bigger_var'] = daily_var.idxmax().day
            new_row[column+by_prefix+daymonth_prefix+'__with_smaller_std'] = daily_std.idxmin().day
            new_row[column+by_prefix+daymonth_prefix+'__with_smaller_var'] = daily_var.idxmin().day
        else:
            # Single date: no spread to compare, so point at that one date.
            only_day = daily_std.index[0].day
            new_row[column+by_prefix+daymonth_prefix+'__bigger_std'] = 0
            new_row[column+by_prefix+daymonth_prefix+'__bigger_var'] = 0
            new_row[column+by_prefix+daymonth_prefix+'__smaller_std'] = 0
            new_row[column+by_prefix+daymonth_prefix+'__smaller_var'] = 0
            new_row[column+by_prefix+daymonth_prefix+'__with_bigger_std'] = only_day
            new_row[column+by_prefix+daymonth_prefix+'__with_bigger_var'] = only_day
            new_row[column+by_prefix+daymonth_prefix+'__with_smaller_std'] = only_day
            new_row[column+by_prefix+daymonth_prefix+'__with_smaller_var'] = only_day
    print(len(new_row)) 
    item_rows.append(new_row)

In [None]:
# Inspect the feature dict left behind by the most recently run loop cell.
new_row

In [None]:
# One DataFrame row per item-domain dict collected in `item_rows`.
df_item_domain_id_features = pd.DataFrame(item_rows)
# Disabled alternative that re-indexed the per-SKU table by item_domain_id.
#df_item_domain_id_features = df_sku_features.set_index('item_domain_id')

In [None]:
# Display the per-domain feature table.
df_item_domain_id_features

In [None]:
# Attach the domain-level features to each SKU row by joining on
# item_domain_id, then list the resulting column names.
print(list(df_sku_features.merge(df_item_domain_id_features, on='item_domain_id').columns))

In [None]:
# Scratch comparison: with the default axis, concat stacks the two tables
# row-wise instead of joining them on item_domain_id.
pd.concat([df_sku_features, df_item_domain_id_features])

In [None]:
# Scratch: number of distinct dates. Relies on `df_group_date` and `column`
# left over from the item-domain loop cell.
df_group_date[column].count().shape

In [None]:
# Scratch: the per-date groupby selection for `column` (loop leftovers).
df_group_date[column]

In [None]:
# Scratch: first date in the per-date std index (used by the single-date fallback).
df_group_date[column].std().index[0]

In [None]:
# Scratch: number of non-null values of `column` per date.
df_group_date[column].count()

In [None]:
# Scratch: number of zero values of `column` per date.
df_group_date[column].apply(lambda x: np.where(x == 0)[0].shape[0])

In [None]:
# Scratch sanity check: rows whose month-quarter index equals 4. With the
# `+ 1` in the divisor the index stays within 0..3, so this should be empty.
df[np.floor(df['date'].dt.day/((df['date'].dt.daysinmonth + 1)/4)) == 4]

In [None]:
# NOTE(review): `df_date` is only assigned inside extract_series_date_features
# in the visible cells, so this depends on leftover kernel state.
df_date

In [None]:
# NOTE(review): `zeros_counts` is only assigned inside
# extract_series_date_features in the visible cells (leftover kernel state).
zeros_counts

In [None]:
# NOTE(review): `non_zeros_counts` is only assigned inside
# extract_series_date_features in the visible cells (leftover kernel state).
non_zeros_counts

In [None]:
# Scratch: inspect the last of the three chunks (`series` is a loop leftover).
# Fixed: splitting into 3 pieces yields indices 0..2, so the original `[3]`
# always raised IndexError.
np.array_split(series, 3)[2]

In [None]:
# Scratch: distribution of sold_quantity in whichever frame the last loop left
# in `df`. Rebinds `value_counts`, which the metadata cells also use.
value_counts = df['sold_quantity'].value_counts()

In [None]:
# Display the value counts from the previous scratch cell.
value_counts

In [None]:
# Scratch: total energy (sum of squares) of the leftover `series`.
np.sum(series ** 2)

In [None]:
# Scratch: month-quarter index WITHOUT the `+ 1` in the divisor. On the last
# day of a month this evaluates to 4, outside the 0..3 range.
# Relies on `new_row` and `date` left over from the per-SKU loop cell.
np.floor(new_row[date].day/((new_row[date].daysinmonth)/4)).astype(int)

In [None]:
# Scratch: how much one day advances the month-quarter index.
1/((new_row[date].daysinmonth + 1)/4)

In [None]:
# Scratch: day of the month of the leftover date value.
new_row[date].day

In [None]:
# Scratch: refit the sold_quantity trend over row position. `count` and `df`
# are leftovers and must describe the same frame for the lengths to match.
regression = linregress(np.arange(count), df['sold_quantity'].values)

In [None]:
# NOTE(review): presumably resolves to the tuple-style `count` method of the
# result object rather than a regression statistic - verify before relying on it.
regression.count

In [None]:
# Scratch: positions of zero values found in the last per-SKU loop iteration.
zero_locations

In [None]:
# NOTE(review): `x` is not assigned at notebook level in any visible cell;
# this depends on leftover kernel state.
np.argmax(x)