In [5]:
#Model imports
import numpy as np
import json
import os
import pandas as pd
from datetime import datetime
import time

from tqdm.auto import tqdm
from multiprocessing import Pool
from iteround import saferound
import scipy.stats as st
import tweedie
from category_encoders import OrdinalEncoder
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=50)

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

from utils import read_df, read_numpy, write_df, read_json
from evaluate import rps

INFO: Pandarallel will run on 50 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [9]:
DATASET_PATH = './dataset/processed/train_v1'

FEATURES_METADATA_PATH = './dataset/features_metadata.json'

TEST_DATA_FILENAME = 'test_data.csv'
GROUND_TRUTH_FILENAME = 'test_ground_truth.npy'
TRAIN_DATA_FILENAME = 'train_data.parquet'
TRAIN_DATA_PROCESSED_FILENAME = 'train_data_features.parquet'
ITEM_DATA_FILEPATH = './dataset/items_static_metadata_full.jl'

TRAIN_DATA_X_FILENAME = 'train_data_x.parquet'
TRAIN_DATA_Y_FILENAME = 'train_data_y.parquet'
TRAIN_DATA_X_PROCESSED_FILENAME = 'train_data_x_features.parquet'
TRAIN_DATA_Y_PROCESSED_FILENAME = 'train_data_y_features.parquet'

TEST_FROMTRAIN_DATA_LAST29_FILENAME = 'test_fromtrain_data_last29.parquet'
TEST_FROMTRAIN_DATA_LAST29_PROCESSED_FILENAME = 'test_fromtrain_data_last29_features.parquet'

TRAIN_DATA_X_PROCESSED_TSFRESH_FILENAME = 'train_data_x_features_tsfresh.parquet'
TEST_FROMTRAIN_DATA_LAST29_PROCESSED_TSFRESH_FILENAME = 'test_fromtrain_data_last29_features_tsfresh.parquet'

In [2]:
dateset_indexes = [0]
model_name = 'simple_first_30_days_fixed_spike'

In [6]:
dataset_index = dateset_indexes[0]
dataset_current_path = os.path.join(DATASET_PATH, str(dataset_index))

test_data_filepath = os.path.join(dataset_current_path, TEST_DATA_FILENAME)
ground_truth_filepath = os.path.join(dataset_current_path, GROUND_TRUTH_FILENAME)
train_data_filepath = os.path.join(dataset_current_path, TRAIN_DATA_FILENAME)
train_data_processed_filepath = os.path.join(dataset_current_path, TRAIN_DATA_PROCESSED_FILENAME)

train_data_x_filepath = os.path.join(dataset_current_path, TRAIN_DATA_X_FILENAME)
train_data_y_filepath = os.path.join(dataset_current_path, TRAIN_DATA_Y_FILENAME)
train_data_x_processed_filepath = os.path.join(dataset_current_path, TRAIN_DATA_X_PROCESSED_FILENAME)
train_data_y_processed_filepath = os.path.join(dataset_current_path, TRAIN_DATA_Y_PROCESSED_FILENAME)

test_fromtrain_data_last29_filepath = os.path.join(dataset_current_path, TEST_FROMTRAIN_DATA_LAST29_FILENAME)
test_fromtrain_data_last29_processed_filepath = os.path.join(dataset_current_path, TEST_FROMTRAIN_DATA_LAST29_PROCESSED_FILENAME)

train_data_x_processed_tsfresh_filepath = os.path.join(dataset_current_path, TRAIN_DATA_X_PROCESSED_TSFRESH_FILENAME)
test_fromtrain_data_last29_processed_tsfresh_filepath = os.path.join(dataset_current_path, TEST_FROMTRAIN_DATA_LAST29_PROCESSED_TSFRESH_FILENAME)

In [7]:
df_train = read_df(train_data_filepath)
df_item = read_df(ITEM_DATA_FILEPATH)

In [6]:
df_test = read_df(test_data_filepath)
ground_truth = read_numpy(ground_truth_filepath)

#df_train_processed = read_df(train_data_processed_filepath)

In [14]:
id_column =  'sku'
date_column = 'date'

item_string_columns = ['item_title']
item_categorical_columns = ['item_domain_id', 'item_id', 'site_id', 'product_id', 'product_family_id']
item_columns = item_string_columns + item_categorical_columns

sku_numeric_columns = ['sold_quantity', 'current_price', 'minutes_active']
sku_categorical_columns = ['currency', 'listing_type', 'shipping_logistic_type', 'shipping_payment']
sku_columns = sku_numeric_columns + sku_categorical_columns

string_columns = item_string_columns
categorical_columns = sku_categorical_columns + item_categorical_columns
numeric_columns = sku_numeric_columns

In [10]:
features_metadata = read_json(FEATURES_METADATA_PATH)

In [57]:
import feature_extraction_v2
from importlib import reload 
reload(feature_extraction_v2)
df_features_v2 = feature_extraction_v2.extract_features_per_sku(df_train.iloc[0:1000], df_item, n_workers=10)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

In [52]:
write_df(df_features_v2, './dataset/test.parquet')

'./dataset/test.parquet'

In [58]:
for name, dtype in df_features_v2.dtypes.items():
    print(name, dtype)

item_title object
item_domain_id category
item_id category
site_id category
product_id category
product_family_id category
sku int64
count int64
date__first datetime64[ns]
date__last datetime64[ns]
date__diff int64
date__first_day int64
date__first_month int64
date__first_dayofweek int64
date__first_weekofmonth int64
date__last_day int64
date__last_month int64
date__last_dayofweek int64
date__last_weekofmonth int64
sold_quantity__first int64
sold_quantity__last int64
sold_quantity__mode int64
sold_quantity__count_of_mode int64
current_price__first float64
current_price__last float64
current_price__mode float64
current_price__count_of_mode int64
minutes_active__first float64
minutes_active__last float64
minutes_active__mode float64
minutes_active__count_of_mode int64
currency__first category
currency__last category
currency__mode category
currency__count_of_mode int64
listing_type__first category
listing_type__last category
listing_type__mode category
listing_type__count_of_mode int64
s

In [54]:
numeric_features = []
categorical_features = []
positional_features = []
counting_features = []
date_features = []
string_features = []
series_features = []
id_features = []
for feature in df_features_v2.columns:
    feature_components = feature.split('__')
    if ('with' in feature_components[-1]) or ('location' in feature_components[-1]):
        positional_features.append(feature)
    elif ('count' in feature_components[-1]):
        counting_features.append(feature)
    elif feature_components[-1] == 'series':
        series_features.append(feature)
    elif feature_components[0] in categorical_columns:
        categorical_features.append(feature)
    elif feature_components[0] == 'date':
        if feature == 'date__first' or feature == 'date__last':
            date_features.append(feature)
        else:
            positional_features.append(feature)
    elif feature_components[0] in string_columns:
        string_features.append(feature)
    elif feature_components[0] == id_column:
        id_features.append(feature)
    else:
        numeric_features.append(feature)

In [55]:
categorical_features

['item_domain_id',
 'item_id',
 'site_id',
 'product_id',
 'product_family_id',
 'currency__first',
 'currency__last',
 'currency__mode',
 'listing_type__first',
 'listing_type__last',
 'listing_type__mode',
 'shipping_logistic_type__first',
 'shipping_logistic_type__last',
 'shipping_logistic_type__mode',
 'shipping_payment__first',
 'shipping_payment__last',
 'shipping_payment__mode',
 'currency__by_item_domain_id__mode',
 'listing_type__by_item_domain_id__mode',
 'shipping_logistic_type__by_item_domain_id__mode',
 'shipping_payment__by_item_domain_id__mode']

In [7]:
df_all['date'] = pd.to_datetime(df_all['date'])

In [101]:
features_data = {}

for column in categorical_columns:
    data = {
        'column': column,
        'type': 'category'
    }
    df_all[column] = df_all[column].astype(str).astype('category')
    value_counts = df_all[column].value_counts()
    data['categories'] = list(value_counts.index)
    data['value_counts'] = list(value_counts.values)
    df_all[column].cat.set_categories(data['categories'])
    data['size'] = len(data['categories'])
    features_data[column] = data

In [99]:
for column in numeric_columns:
    data = {
        'column': column,
        'type': 'numeric',
        'dtype': str(df_all[column].dtype)
    }
    data['min'] = df_all[column].min()
    data['max'] = df_all[column].max()
    data['var'] = df_all[column].var()
    data['std'] = df_all[column].std()
    data['mean'] = df_all[column].mean()
    data['sum'] = df_all[column].sum()
    value_counts = df_all[column].value_counts()
    data['unique'] = list(value_counts.index)
    data['value_counts'] = list(value_counts.values)
    data['size_unique'] = len(data['unique'])
    features_data[column] = data

for column in string_columns:
    features_data[column] = {
        'column': column,
        'type': 'string',
        'dtype': 'str',
        'size_unique': df_all[date_column].nunique()
    }
    
features_data[id_column] = {
    'column': id_column,
    'type': 'id',
    'dtype': str(df_all['sku'].dtype),
    'dataset_type': 'metadata',
    'size_unique': df_all['sku'].nunique()
}

features_data[date_column] = {
    'column': date_column,
    'type': 'date',
    'dataset_type': 'timeseries',
    'date_max': str(df_all[date_column].min()),
    'date_min': str(df_all[date_column].max()),
    'size_unique': df_all[date_column].nunique(),
}

for column in item_columns:
    features_data[column]['dataset_type']: 'timeseries'
for column in sku_columns:
    features_data[column]['dataset_type']: 'metadata'

In [91]:
str(df_all[column].dtype)

'float64'

In [80]:
df_all[column].cat.codes

0           18926
1           18926
2           18926
3           18926
4           18926
            ...  
33346154    29600
33346155     1965
33346156    29600
33346157     2517
33346158    29600
Length: 33346159, dtype: int16

In [84]:
df_all[column].cat.set_categories(np.flip(df_all[column].cat.categories)).cat.codes

0           10674
1           10674
2           10674
3           10674
4           10674
            ...  
33346154        0
33346155    27635
33346156        0
33346157    27083
33346158        0
Length: 33346159, dtype: int16

In [19]:
from scipy.stats import linregress

In [8]:
def extract_series_default_features(new_row, series, name, zerout=False):
    if series.shape[0] == 0:
        zerout = True
    
    if not zerout:
        new_row[name+'__sum'] = series.sum()
        new_row[name+'__mean'] = series.mean()
        new_row[name+'__median'] = np.median(series)
        new_row[name+'__std'] = series.std()
        new_row[name+'__var'] = series.var()
        new_row[name+'__variance_large_than_std'] = new_row[name+'__var'] > new_row[name+'__std']
        new_row[name+'__min'] = series.min()
        new_row[name+'__max'] = series.max()
        new_row[name+'__abs_energy'] = np.dot(series, series)
        new_row[name+'__count_of_zero'] = np.count_nonzero(series==0)
        new_row[name+'__count_of_non_zero'] = np.count_nonzero(series)
    else:
        new_row[name+'__sum'] = 0
        new_row[name+'__mean'] = 0
        new_row[name+'__median'] = 0
        new_row[name+'__std'] = 0
        new_row[name+'__var'] = 0
        new_row[name+'__variance_large_than_std'] = False
        new_row[name+'__min'] = 0
        new_row[name+'__max'] = 0
        new_row[name+'__abs_energy'] = 0
        new_row[name+'__count_of_zero'] = 0
        new_row[name+'__count_of_non_zero'] = 0

In [9]:
def dayoftheweek_filter(df_dt, n):
    return df_dt.dayofweek == n

def weekofthemonth_filter(df_dt, n):
    return np.floor(df_dt.day/((df_dt.daysinmonth + 1)/4)) == n

In [10]:
def extract_series_date_features(new_row, df, name, date_filter, quantity):
    column = df.columns[-1]
    
    zeros_counts = []
    non_zeros_counts = []
    mean_values = []
    for n_date in range(quantity):
        n_date_prefix = name + '_' + str(n_date)
        df_date = df[date_filter(df['date'].dt, n_date)]
        
        series = df_date[column].values
        extract_series_default_features(new_row, series, n_date_prefix)

        zeros_counts.append(new_row[n_date_prefix+'__count_of_zero'])
        non_zeros_counts.append(new_row[n_date_prefix+'__count_of_non_zero'])
        mean_values.append(new_row[n_date_prefix+'__mean'])

    new_row[name+'__with_most_count_of_zero'] = np.argmax(zeros_counts)
    new_row[name+'__with_most_count_of_non_zero'] = np.argmax(non_zeros_counts)
    new_row[name+'__with_bigger_mean'] = np.argmax(mean_values)
    new_row[name+'__with_least_count_of_zero'] = np.argmin(zeros_counts)
    new_row[name+'__with_least_count_of_non_zero'] = np.argmin(non_zeros_counts)
    new_row[name+'__with_smaller_mean'] = np.argmin(mean_values)

In [24]:
rows = []
for sku in df_all['sku'].unique()[:10]:
    df = df_all[df_all['sku'] == sku]

    #sku = df['sku'].iloc[0]
    new_row = df[item_columns].iloc[0].to_dict()

    count = len(df)
    #new_row['sku'] = df['sku'].iloc[0]
    new_row['count'] = count
    new_row['date__first'] = df['date'].iloc[0]
    new_row['date__last'] = df['date'].iloc[-1]
    new_row['date__diff'] = (new_row['date__last'] - new_row['date__first']).days
    for date in ['date__first', 'date__last']:
        new_row[date+'_day'] = new_row[date].day
        new_row[date+'_month'] = new_row[date].month
        new_row[date+'_dayofweek'] = new_row[date].dayofweek
        new_row[date+'_weekofmonth'] = np.floor(new_row[date].day/((new_row[date].daysinmonth + 1)/4)).astype(int)


    for column in sku_columns:
        new_row[column+'__first'] = df[column].iloc[0]
        new_row[column+'__last'] = df[column].iloc[-1]
        new_row[column+'__mode'] = df[column].mode().iloc[0]
        new_row[column+'__count_of_mode'] = df[column].value_counts().iloc[0]

    for column in sku_numeric_columns:
        series = df[column].values
        extract_series_default_features(new_row, series, column)
        new_row[column+'__last_location_of_maximum'] = count - 1 - np.argmax(series[::-1])
        new_row[column+'__last_location_of_minimum'] = count - 1 - np.argmin(series[::-1])
        zero_locations = np.where(series == 0)[0]
        non_zeros_locations = np.where(series != 0)[0]
        new_row[column+'__last_location_of_zero'] = zero_locations[-1] if zero_locations.shape[0] != 0 else -1 
        new_row[column+'__last_location_of_non_zero'] = non_zeros_locations[-1] if non_zeros_locations.shape[0] != 0 else -1

        #Boolean variable denoting if the distribution of x *looks symmetric*  | mean(X)-median(X)| < r * (max(X)-min(X))
        mean_minus_median_abs = np.abs(new_row[column+'__mean'] - new_row[column+'__median'])
        max_minus_min = new_row[column+'__max'] - new_row[column+'__min']
        for r in [0.3, 0.5, 0.7]:
            new_row[column+f'__symmetry_looking_r_{r}'] = mean_minus_median_abs < r*max_minus_min

        #series energy ratio last chunk
        full_series_energy = np.sum(series ** 2)
        for n_chuncks, focus in [(3,2), (5,4), (10,9)]:
            if full_series_energy != 0:
                series_split = np.array_split(series, n_chuncks)
                selected_series = series_split[focus]
                for i in range(focus, -1, -1):
                    if series_split[i].shape[0] > 0:
                        selected_series = series_split[i]
                        break
                new_row[column+f'__energy_ratio_by_chunks_{n_chuncks}_segments_focus_{focus}'] = np.sum(selected_series ** 2) / full_series_energy
            else:
                new_row[column+f'__energy_ratio_by_chunks_{n_chuncks}_segments_focus_{focus}'] = 0

        if count > 1:
            regression = linregress(np.arange(count), series)
            new_row[column+'__linregress_slope'] = regression.slope
            new_row[column+'__linregress_intercept'] = regression.intercept
            new_row[column+'__linregress_pvalue'] = regression.pvalue
            new_row[column+'__linregress_rvalue'] = regression.rvalue
            new_row[column+'__linregress_stderr'] = regression.stderr
        else:
            new_row[column+'__linregress_slope'] = 0
            new_row[column+'__linregress_intercept'] = series[0]
            new_row[column+'__linregress_pvalue'] = 0
            new_row[column+'__linregress_rvalue'] = 0
            new_row[column+'__linregress_stderr'] = 0

        series_change = np.diff(series)
        extract_series_default_features(new_row, series_change, column+'__change')

        series_change_abs = np.abs(series_change)
        extract_series_default_features(new_row, series_change_abs, column+'__change_absolute')

        last_5_series = series[-5:]
        extract_series_default_features(new_row, last_5_series, column+'__last_5')

        last_3_series = series[-3:]
        extract_series_default_features(new_row, last_5_series, column+'__last_3')

        weekdate_prefix = '__by_dayoftheweek'
        extract_series_date_features(new_row, df[['date', column]], column+weekdate_prefix, dayoftheweek_filter, 7)

        monthweek_prefix = '__by_weekofthemonth'
        extract_series_date_features(new_row, df[['date', column]], column+monthweek_prefix, weekofthemonth_filter, 4)

    for column in sku_columns:
        new_row[column+'__series'] = df[column].to_json(orient='values')

    print(len(new_row))    
    rows.append(new_row)

663
663
663
663
663
663
663
663
663
663


In [26]:
df_sku_features = pd.DataFrame(rows)

In [36]:
df_sku_features

Unnamed: 0,item_domain_id,item_id,item_title,site_id,sku,product_id,product_family_id,count,date__first,date__last,...,minutes_active__by_weekofthemonth__with_least_count_of_zero,minutes_active__by_weekofthemonth__with_least_count_of_non_zero,minutes_active__by_weekofthemonth__with_smaller_mean,sold_quantity__series,current_price__series,minutes_active__series,currency__series,listing_type__series,shipping_logistic_type__series,shipping_payment__series
0,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,464801,MLB9838512,MLB9838510,59,2021-02-01,2021-03-31,...,0,0,0,"[0,0,0,0,1,0,0,0,1,1,0,0,0,1,5,1,0,0,0,0,0,0,0...","[156.78,156.78,156.78,156.78,156.78,157.78,156...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""classic"",""classic"",""classic"",""classic"",""clas...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""free_shipping"",""free_shipping"",""free_shippin..."
1,MLB-NEBULIZERS,438135,"Inalador Infantil, 2 Anos De Garantia, G-tech,...",MLB,645793,,MLB9838510,29,2021-02-01,2021-03-01,...,0,2,2,"[0,0,2,1,0,1,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0...","[134.23,134.23,145.9,145.9,145.9,145.9,149.99,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""classic"",""classic"",""classic"",""classic"",""clas...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""free_shipping"",""free_shipping"",""free_shippin..."
2,MLB-TOOLS,440115,Esteira Porta Cabos 10x10mm Interno 1m -pontei...,MLB,77402,,,59,2021-02-01,2021-03-31,...,0,3,3,"[0,0,0,0,0,0,0,0,0,3,2,2,1,1,0,4,0,0,0,0,4,0,0...","[59.5,59.9,59.9,59.9,59.9,59.9,59.9,59.9,59.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""classic"",""classic"",""classic"",""classic"",""clas...","[""cross_docking"",""cross_docking"",""cross_dockin...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
3,MLB-ARTIFICIAL_FLOWERS,192180,Kit 12 Plantas Mini Suculentas Artificiais C/ ...,MLB,58546,,,59,2021-02-01,2021-03-31,...,0,0,3,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[157.0,157.0,157.0,157.0,157.0,157.0,157.0,157...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""classic"",""classic"",""classic"",""classic"",""clas...","[""cross_docking"",""cross_docking"",""cross_dockin...","[""free_shipping"",""free_shipping"",""free_shippin..."
4,MLB-ADHESIVE_TAPES,221252,Fita Dupla Face Preta Colar Touch Lcd 3mm Celu...,MLB,99516,,,59,2021-02-01,2021-03-31,...,0,1,1,"[0,2,6,2,1,2,3,1,10,3,2,3,1,1,0,0,0,0,0,0,3,2,...","[22.9,22.9,22.9,22.9,22.9,22.9,22.9,22.9,22.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
5,MLB-SCHOOL_AND_OFFICE_GLUES,62099,Cola T-7000 Black Pra Uso Em Touch E Celular E...,MLB,538100,,,59,2021-02-01,2021-03-31,...,0,0,0,"[0,0,2,1,2,0,0,0,0,1,0,2,1,0,0,0,0,0,0,1,0,0,2...","[34.9,34.9,34.9,34.9,34.9,34.9,34.9,34.9,34.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
6,MLB-DECORATIVE_VINYLS,168198,Adesivo De Parede Unicornio + 30 Florais,MLB,557191,,,59,2021-02-01,2021-03-31,...,3,1,1,"[0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,3,0,1,1,1,0,0,0...","[39.81,39.81,39.81,39.81,39.81,39.81,39.81,39....","[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1405.1166...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
7,MLB-CELLPHONE_PARTS,61865,Fio De Cobre P/ Reparo Jamper Trilha Placa Cel...,MLB,80056,,,59,2021-02-01,2021-03-31,...,0,1,1,"[2,1,0,5,1,0,1,2,0,2,1,0,0,0,0,0,0,0,0,0,1,1,3...","[19.9,19.9,19.9,19.9,19.9,19.9,19.9,19.9,19.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
8,MLB-BABY_BLANKETS,194503,[6-18m] Saco Antialérgico Para Bebê Dormir Est...,MLB,274856,,,59,2021-02-01,2021-03-31,...,0,2,2,"[0,2,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
9,MLB-SOLDERING_IRONS,306726,Ponta Ferro De Solda Ponteira Hakko Yaxun Ya X...,MLB,425233,,,59,2021-02-01,2021-03-31,...,0,3,3,"[0,0,1,0,0,1,1,0,0,0,5,0,0,0,0,1,1,2,1,1,0,0,0...","[29.9,29.9,29.9,29.9,29.9,29.9,29.9,29.9,29.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."


In [27]:
df_sku_features[item_columns]

Unnamed: 0,item_domain_id,item_id,item_title,site_id,sku,product_id,product_family_id
0,MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,464801,MLB9838512,MLB9838510
1,MLB-NEBULIZERS,438135,"Inalador Infantil, 2 Anos De Garantia, G-tech,...",MLB,645793,,MLB9838510
2,MLB-TOOLS,440115,Esteira Porta Cabos 10x10mm Interno 1m -pontei...,MLB,77402,,
3,MLB-ARTIFICIAL_FLOWERS,192180,Kit 12 Plantas Mini Suculentas Artificiais C/ ...,MLB,58546,,
4,MLB-ADHESIVE_TAPES,221252,Fita Dupla Face Preta Colar Touch Lcd 3mm Celu...,MLB,99516,,
5,MLB-SCHOOL_AND_OFFICE_GLUES,62099,Cola T-7000 Black Pra Uso Em Touch E Celular E...,MLB,538100,,
6,MLB-DECORATIVE_VINYLS,168198,Adesivo De Parede Unicornio + 30 Florais,MLB,557191,,
7,MLB-CELLPHONE_PARTS,61865,Fio De Cobre P/ Reparo Jamper Trilha Placa Cel...,MLB,80056,,
8,MLB-BABY_BLANKETS,194503,[6-18m] Saco Antialérgico Para Bebê Dormir Est...,MLB,274856,,
9,MLB-SOLDERING_IRONS,306726,Ponta Ferro De Solda Ponteira Hakko Yaxun Ya X...,MLB,425233,,


In [269]:
#df = df_all[df_all['item_domain_id'] == 'MLA-RADIO_BASE_STATIONS']


In [56]:
item_rows = []
for item_domain_id in ['MLB-ADHESIVE_TAPES']:
    df = df_all[df_all['item_domain_id'] == item_domain_id]
    df_group_date = df.groupby('date')

    count = len(df)
    new_row = {
        'item_domain_id': df['item_domain_id'].iloc[0],
        'count__by_item_domain_id': len(df),
        'count_sku__by_item_domain_id': df['sku'].nunique()
    }

    by_prefix = '__by_item_domain_id'

    for column in sku_columns:
        new_row[column+by_prefix+'__mode'] = df[column].mode().iloc[0]
        new_row[column+by_prefix+'__count_of_mode'] = df[column].value_counts().iloc[0]

    for column in sku_numeric_columns:
        series = df[column].values
        extract_series_default_features(new_row, series, column+by_prefix)

        series = df_group_date[column].mean().values
        if series.shape[0] > 1:
            regression = linregress(np.arange(series.shape[0]), series)
            new_row[column+by_prefix+'__mean__linregress_slope'] = regression.slope
            new_row[column+by_prefix+'__mean__linregress_intercept'] = regression.intercept
            new_row[column+by_prefix+'__mean__linregress_pvalue'] = regression.pvalue
            new_row[column+by_prefix+'__mean__linregress_rvalue'] = regression.rvalue
            new_row[column+by_prefix+'__mean__linregress_stderr'] = regression.stderr
        else:
            new_row[column+by_prefix+'__mean__linregress_slope'] = 0
            new_row[column+by_prefix+'__mean__linregress_intercept'] = series[0]
            new_row[column+by_prefix+'__mean__linregress_pvalue'] = 0
            new_row[column+by_prefix+'__mean__linregress_rvalue'] = 0
            new_row[column+by_prefix+'__mean__linregress_stderr'] = 0

        weekdate_prefix = '__by_dayoftheweek'
        extract_series_date_features(new_row, df[['date', column]], column+by_prefix+weekdate_prefix, dayoftheweek_filter, 7)

        monthweek_prefix = '__by_weekofthemonth'
        extract_series_date_features(new_row, df[['date', column]], column+by_prefix+monthweek_prefix, weekofthemonth_filter, 4)

        daymonth_prefix = '__by_dayofthemonth'
        #new_row[column+by_prefix+daymonth_prefix+'__with_smaller_mean'] = df_group_date[column].max().idxmax().day
        count_zero = df_group_date[column].apply(lambda x: np.where(x == 0)[0].shape[0])
        count_non_zero = df_group_date[column].apply(lambda x: np.where(x != 0)[0].shape[0])
        new_row[column+by_prefix+daymonth_prefix+'__with_most_count_of_zero'] = count_zero.idxmax().day
        new_row[column+by_prefix+daymonth_prefix+'__with_most_count_of_non_zero'] = count_non_zero.idxmax().day
        new_row[column+by_prefix+daymonth_prefix+'__with_least_count_of_zero'] = count_zero.idxmin().day
        new_row[column+by_prefix+daymonth_prefix+'__with_least_count_of_non_zero'] = count_non_zero.idxmin().day

        count_date = df_group_date[column].count().shape[0]
        new_row[column+by_prefix+daymonth_prefix+'__bigger_sum'] = df_group_date[column].sum().max()
        new_row[column+by_prefix+daymonth_prefix+'__bigger_mean'] = df_group_date[column].mean().max()
        new_row[column+by_prefix+daymonth_prefix+'__smaller_sum'] = df_group_date[column].sum().min()
        new_row[column+by_prefix+daymonth_prefix+'__smaller_mean'] = df_group_date[column].mean().min()
        new_row[column+by_prefix+daymonth_prefix+'__with_bigger_mean'] = df_group_date[column].sum().idxmax().day
        new_row[column+by_prefix+daymonth_prefix+'__with_bigger_mean'] = df_group_date[column].mean().idxmax().day
        new_row[column+by_prefix+daymonth_prefix+'__with_smaller_sum'] = df_group_date[column].mean().idxmin().day
        new_row[column+by_prefix+daymonth_prefix+'__with_smaller_mean'] = df_group_date[column].mean().idxmin().day

        if count_date > 1:
            new_row[column+by_prefix+daymonth_prefix+'__bigger_std'] = df_group_date[column].std().max()
            new_row[column+by_prefix+daymonth_prefix+'__bigger_var'] = df_group_date[column].var().max()
            new_row[column+by_prefix+daymonth_prefix+'__smaller_std'] = df_group_date[column].std().min()
            new_row[column+by_prefix+daymonth_prefix+'__smaller_var'] = df_group_date[column].var().min()
            new_row[column+by_prefix+daymonth_prefix+'__with_bigger_std'] = df_group_date[column].std().idxmax().day
            new_row[column+by_prefix+daymonth_prefix+'__with_bigger_var'] = df_group_date[column].var().idxmax().day
            new_row[column+by_prefix+daymonth_prefix+'__with_smaller_std'] = df_group_date[column].std().idxmin().day
            new_row[column+by_prefix+daymonth_prefix+'__with_smaller_var'] = df_group_date[column].var().idxmin().day
        else:
            new_row[column+by_prefix+daymonth_prefix+'__bigger_std'] = 0
            new_row[column+by_prefix+daymonth_prefix+'__bigger_var'] = 0
            new_row[column+by_prefix+daymonth_prefix+'__smaller_std'] = 0
            new_row[column+by_prefix+daymonth_prefix+'__smaller_var'] = 0
            new_row[column+by_prefix+daymonth_prefix+'__with_bigger_std'] = df_group_date[column].std().index[0].day
            new_row[column+by_prefix+daymonth_prefix+'__with_bigger_var'] = df_group_date[column].var().index[0].day
            new_row[column+by_prefix+daymonth_prefix+'__with_smaller_std'] = df_group_date[column].std().index[0].day
            new_row[column+by_prefix+daymonth_prefix+'__with_smaller_var'] = df_group_date[column].var().index[0].day
    print(len(new_row)) 
    item_rows.append(new_row)

521


In [57]:
new_row

{'item_domain_id': 'MLB-ADHESIVE_TAPES',
 'count__by_item_domain_id': 30599,
 'count_sku__by_item_domain_id': 594,
 'sold_quantity__by_item_domain_id__mode': 0,
 'sold_quantity__by_item_domain_id__count_of_mode': 22824,
 'current_price__by_item_domain_id__mode': 29.9,
 'current_price__by_item_domain_id__count_of_mode': 787,
 'minutes_active__by_item_domain_id__mode': 1440.0,
 'minutes_active__by_item_domain_id__count_of_mode': 21653,
 'currency__by_item_domain_id__mode': 'REA',
 'currency__by_item_domain_id__count_of_mode': 30599,
 'listing_type__by_item_domain_id__mode': 'premium',
 'listing_type__by_item_domain_id__count_of_mode': 16067,
 'shipping_logistic_type__by_item_domain_id__mode': 'fulfillment',
 'shipping_logistic_type__by_item_domain_id__count_of_mode': 24340,
 'shipping_payment__by_item_domain_id__mode': 'paid_shipping',
 'shipping_payment__by_item_domain_id__count_of_mode': 28247,
 'sold_quantity__by_item_domain_id__sum': 34080,
 'sold_quantity__by_item_domain_id__mean': 

In [34]:
df_item_domain_id_features = pd.DataFrame(item_rows)
#df_item_domain_id_features = df_sku_features.set_index('item_domain_id')

In [30]:
df_item_domain_id_features

Unnamed: 0_level_0,item_id,item_title,site_id,sku,product_id,product_family_id,count,date__first,date__last,date__diff,...,minutes_active__by_weekofthemonth__with_least_count_of_zero,minutes_active__by_weekofthemonth__with_least_count_of_non_zero,minutes_active__by_weekofthemonth__with_smaller_mean,sold_quantity__series,current_price__series,minutes_active__series,currency__series,listing_type__series,shipping_logistic_type__series,shipping_payment__series
item_domain_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MLB-NEBULIZERS,344151,Inalador E Nebulizador Infantil Nebdog Superfl...,MLB,464801,MLB9838512,MLB9838510,59,2021-02-01,2021-03-31,58,...,0,0,0,"[0,0,0,0,1,0,0,0,1,1,0,0,0,1,5,1,0,0,0,0,0,0,0...","[156.78,156.78,156.78,156.78,156.78,157.78,156...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""classic"",""classic"",""classic"",""classic"",""clas...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""free_shipping"",""free_shipping"",""free_shippin..."
MLB-NEBULIZERS,438135,"Inalador Infantil, 2 Anos De Garantia, G-tech,...",MLB,645793,,MLB9838510,29,2021-02-01,2021-03-01,28,...,0,2,2,"[0,0,2,1,0,1,1,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0...","[134.23,134.23,145.9,145.9,145.9,145.9,149.99,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""classic"",""classic"",""classic"",""classic"",""clas...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""free_shipping"",""free_shipping"",""free_shippin..."
MLB-TOOLS,440115,Esteira Porta Cabos 10x10mm Interno 1m -pontei...,MLB,77402,,,59,2021-02-01,2021-03-31,58,...,0,3,3,"[0,0,0,0,0,0,0,0,0,3,2,2,1,1,0,4,0,0,0,0,4,0,0...","[59.5,59.9,59.9,59.9,59.9,59.9,59.9,59.9,59.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""classic"",""classic"",""classic"",""classic"",""clas...","[""cross_docking"",""cross_docking"",""cross_dockin...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
MLB-ARTIFICIAL_FLOWERS,192180,Kit 12 Plantas Mini Suculentas Artificiais C/ ...,MLB,58546,,,59,2021-02-01,2021-03-31,58,...,0,0,3,"[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[157.0,157.0,157.0,157.0,157.0,157.0,157.0,157...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""classic"",""classic"",""classic"",""classic"",""clas...","[""cross_docking"",""cross_docking"",""cross_dockin...","[""free_shipping"",""free_shipping"",""free_shippin..."
MLB-ADHESIVE_TAPES,221252,Fita Dupla Face Preta Colar Touch Lcd 3mm Celu...,MLB,99516,,,59,2021-02-01,2021-03-31,58,...,0,1,1,"[0,2,6,2,1,2,3,1,10,3,2,3,1,1,0,0,0,0,0,0,3,2,...","[22.9,22.9,22.9,22.9,22.9,22.9,22.9,22.9,22.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
MLB-SCHOOL_AND_OFFICE_GLUES,62099,Cola T-7000 Black Pra Uso Em Touch E Celular E...,MLB,538100,,,59,2021-02-01,2021-03-31,58,...,0,0,0,"[0,0,2,1,2,0,0,0,0,1,0,2,1,0,0,0,0,0,0,1,0,0,2...","[34.9,34.9,34.9,34.9,34.9,34.9,34.9,34.9,34.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
MLB-DECORATIVE_VINYLS,168198,Adesivo De Parede Unicornio + 30 Florais,MLB,557191,,,59,2021-02-01,2021-03-31,58,...,3,1,1,"[0,0,0,0,0,0,0,0,0,0,0,0,1,0,2,3,0,1,1,1,0,0,0...","[39.81,39.81,39.81,39.81,39.81,39.81,39.81,39....","[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1405.1166...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
MLB-CELLPHONE_PARTS,61865,Fio De Cobre P/ Reparo Jamper Trilha Placa Cel...,MLB,80056,,,59,2021-02-01,2021-03-31,58,...,0,1,1,"[2,1,0,5,1,0,1,2,0,2,1,0,0,0,0,0,0,0,0,0,1,1,3...","[19.9,19.9,19.9,19.9,19.9,19.9,19.9,19.9,19.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
MLB-BABY_BLANKETS,194503,[6-18m] Saco Antialérgico Para Bebê Dormir Est...,MLB,274856,,,59,2021-02-01,2021-03-31,58,...,0,2,2,"[0,2,0,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0...","[89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,89.0,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."
MLB-SOLDERING_IRONS,306726,Ponta Ferro De Solda Ponteira Hakko Yaxun Ya X...,MLB,425233,,,59,2021-02-01,2021-03-31,58,...,0,3,3,"[0,0,1,0,0,1,1,0,0,0,5,0,0,0,0,1,1,2,1,1,0,0,0...","[29.9,29.9,29.9,29.9,29.9,29.9,29.9,29.9,29.9,...","[1440.0,1440.0,1440.0,1440.0,1440.0,1440.0,144...","[""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""REA"",""RE...","[""premium"",""premium"",""premium"",""premium"",""prem...","[""fulfillment"",""fulfillment"",""fulfillment"",""fu...","[""paid_shipping"",""paid_shipping"",""paid_shippin..."


In [40]:
print(list(df_sku_features.merge(df_item_domain_id_features, on='item_domain_id').columns))

['item_domain_id', 'item_id', 'item_title', 'site_id', 'sku', 'product_id', 'product_family_id', 'count', 'date__first', 'date__last', 'date__diff', 'date__first_day', 'date__first_month', 'date__first_dayofweek', 'date__first_weekofmonth', 'date__last_day', 'date__last_month', 'date__last_dayofweek', 'date__last_weekofmonth', 'sold_quantity__first', 'sold_quantity__last', 'sold_quantity__mode', 'sold_quantity__count_of_mode', 'current_price__first', 'current_price__last', 'current_price__mode', 'current_price__count_of_mode', 'minutes_active__first', 'minutes_active__last', 'minutes_active__mode', 'minutes_active__count_of_mode', 'currency__first', 'currency__last', 'currency__mode', 'currency__count_of_mode', 'listing_type__first', 'listing_type__last', 'listing_type__mode', 'listing_type__count_of_mode', 'shipping_logistic_type__first', 'shipping_logistic_type__last', 'shipping_logistic_type__mode', 'shipping_logistic_type__count_of_mode', 'shipping_payment__first', 'shipping_paymen

In [272]:
pd.concat([df_sku_features, df_item_domain_id_features])

Unnamed: 0,item_domain_id,item_id,item_title,site_id,sku,product_id,product_family_id,count,date__first,date__last,...,minutes_active__by_item_domain_id__by_dayofthemonth__with_smaller_sum,minutes_active__by_item_domain_id__by_dayofthemonth__with_smaller_mean,minutes_active__by_item_domain_id__by_dayofthemonth__bigger_std,minutes_active__by_item_domain_id__by_dayofthemonth__bigger_var,minutes_active__by_item_domain_id__by_dayofthemonth__smaller_std,minutes_active__by_item_domain_id__by_dayofthemonth__smaller_var,minutes_active__by_item_domain_id__by_dayofthemonth__with_bigger_std,minutes_active__by_item_domain_id__by_dayofthemonth__with_bigger_var,minutes_active__by_item_domain_id__by_dayofthemonth__with_smaller_std,minutes_active__by_item_domain_id__by_dayofthemonth__with_smaller_var
0,MLB-SNEAKERS,492155.0,Tênis Masculino Olympikus Cyber Barato Promoçao,MLB,0.0,,MLB15832732,1.0,2021-03-31,2021-03-31,...,,,,,,,,,,
0,MLB-SNEAKERS,,,,,,,,NaT,NaT,...,20.0,20.0,669.758564,448576.534583,596.501749,355814.336693,20.0,20.0,26.0,26.0


In [244]:
df_group_date[column].count().shape

(1,)

In [234]:
df_group_date[column]

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ffc85444518>

In [241]:
df_group_date[column].std().index[0]

Timestamp('2021-03-31 00:00:00')

In [228]:
df_group_date[column].count()

date
2021-03-31    1
Name: sold_quantity, dtype: int64

In [185]:
df_group_date[column].apply(lambda x: np.where(x == 0)[0].shape[0])

date
2021-02-01    1
2021-02-02    0
2021-02-03    0
2021-02-04    0
2021-02-05    1
2021-02-06    1
2021-02-07    1
2021-02-08    1
2021-02-09    2
2021-02-10    3
2021-02-11    4
2021-02-12    5
2021-02-13    3
2021-02-14    3
2021-02-15    4
2021-02-16    4
2021-02-17    4
2021-02-18    5
2021-02-19    4
2021-02-20    4
2021-02-21    4
2021-02-22    4
2021-02-23    4
2021-02-24    5
2021-02-25    5
2021-02-26    5
2021-02-27    5
2021-02-28    5
2021-03-01    5
2021-03-02    3
2021-03-03    3
2021-03-04    3
2021-03-05    4
2021-03-06    6
2021-03-07    6
2021-03-08    6
2021-03-09    6
2021-03-10    7
2021-03-11    7
2021-03-12    7
2021-03-13    7
2021-03-14    7
2021-03-15    7
2021-03-16    7
2021-03-17    5
2021-03-18    4
2021-03-19    4
2021-03-20    4
2021-03-21    4
2021-03-22    4
2021-03-23    5
2021-03-24    5
2021-03-25    3
2021-03-26    3
2021-03-27    3
2021-03-28    3
2021-03-29    3
2021-03-30    3
2021-03-31    4
Name: minutes_active, dtype: int64

In [154]:
df[np.floor(df['date'].dt.day/((df['date'].dt.daysinmonth + 1)/4)) == 4]

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,item_title,site_id,sku_item,product_id,product_family_id


{'item_domain_id': 'MLA-THERMAL_PADS',
 'count__by_item_doimain_id': 878,
 'sold_quantity__by_item_doimain_id__mode': 0,
 'current_price__by_item_doimain_id__mode': 1802.85,
 'minutes_active__by_item_doimain_id__mode': 1440.0,
 'currency__by_item_doimain_id__mode': 'ARG',
 'listing_type__by_item_doimain_id__mode': 'classic',
 'shipping_logistic_type__by_item_doimain_id__mode': 'fulfillment',
 'shipping_payment__by_item_doimain_id__mode': 'paid_shipping',
 'sold_quantity__by_item_doimain_id__sum': 508,
 'sold_quantity__by_item_doimain_id__mean': 0.5785876993166287,
 'sold_quantity__by_item_doimain_id__median': 0.0,
 'sold_quantity__by_item_doimain_id__std': 1.5273055497538237,
 'sold_quantity__by_item_doimain_id__var': 2.33266224230883,
 'sold_quantity__by_item_doimain_id__variance_large_than_std': True,
 'sold_quantity__by_item_doimain_id__min': 0,
 'sold_quantity__by_item_doimain_id__max': 14,
 'sold_quantity__by_item_doimain_id__abs_energy': 2342,
 'sold_quantity__by_item_doimain_id_

In [145]:
df_date

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active,item_domain_id,item_id,item_title,site_id,sku_item,product_id,product_family_id


In [137]:
zeros_counts

[235, 235, 235, 235, 235, 235, 235]

In [138]:
non_zeros_counts

[643, 643, 643, 643, 643, 643, 643]

In [120]:
np.array_split(series, 3)[3]

IndexError: list index out of range

In [102]:
value_counts = df['sold_quantity'].value_counts()

In [103]:
value_counts

0    49
1     8
5     1
2     1
Name: sold_quantity, dtype: int64

In [98]:
np.sum(series ** 2)

122342400.0

In [86]:
np.floor(new_row[date].day/((new_row[date].daysinmonth)/4)).astype(int)

4

In [90]:
1/((new_row[date].daysinmonth + 1)/4)

0.125

In [77]:
new_row[date].day

31

In [58]:
regression = linregress(np.arange(count), df['sold_quantity'].values)

In [61]:
regression.count

<function LinregressResult.count>

In [30]:
zero_locations

array([], dtype=int64)

In [27]:
np.argmax(x)

NameError: name 'x' is not defined