In [1]:
import json
import os
import pandas as pd
import datetime
import numpy as np
from fastparquet import write
import sklearn
from sklearn.model_selection import KFold
from utils import read_df, write_df
from feature_extraction import extract_features_per_sku

In [2]:
DATASET_FOLDER = "./dataset"
#train_processed_data = os.path.join(DATASET_FOLDER, 'processed/sku_feature_data.parquet')
train_data = os.path.join(DATASET_FOLDER, 'train_data.parquet')
item_data = os.path.join(DATASET_FOLDER, 'items_static_metadata_full.jl')

In [3]:
target_folder = "./dataset/processed/train_v2"
if not os.path.exists(target_folder):
    os.makedirs(target_folder)

In [4]:
df_train = pd.read_parquet(train_data, engine='fastparquet')

#df_train_v1 = pd.read_parquet(train_processed_data, engine='fastparquet')
#df_train_v1 = df_train_v1.set_index('sku')

df_item = pd.read_json(item_data, lines=True)

In [5]:
skus = df_train['sku'].unique()

In [6]:
# Create sku to index on train data
index_to_sku = df_train[df_train['sku'].diff() != 0]['sku']
shifted_index = np.append(index_to_sku.index.values[1:].copy(), [len(df_train)])
index_range = list(zip(index_to_sku.index.values, shifted_index))
sku_to_index_range = pd.Series(index_range, index=index_to_sku)

In [7]:
kf = KFold(n_splits=4, random_state=None, shuffle=True)

In [8]:
for kfold, (train_index, test_index) in enumerate(kf.split(skus)):
    print(kfold, len(train_index), len(test_index))
    
    #Create folder structure
    data_target_folder = os.path.join(target_folder, str(kfold))
    if not os.path.exists(data_target_folder):
        os.makedirs(data_target_folder)
    
    #Pick last 30 datapoints from train
    def pick_last_30(index_range):
        x1, x2 = index_range
        if x2-x1 <= 30:
            return np.array([np.nan])
        else:
            return np.arange(x2-30, x2)

    test_df_index = np.concatenate(sku_to_index_range[test_index].apply(pick_last_30).values)
    test_df_index = test_df_index[~np.isnan(test_df_index)].astype('int64')
    df_kfold_test = df_train.loc[test_df_index]
    
    #Remove sku's with total sold_quantity == 0 from test set
    test_zero_solded_sku = df_kfold_test.groupby('sku')['sold_quantity'].sum() == 0
    invalid_sku = test_zero_solded_sku[test_zero_solded_sku].index.values
    df_kfold_test = df_kfold_test[~df_kfold_test['sku'].isin(invalid_sku)]
    
    #Rebuild train data with rows not in test data
    df_kfold_train = df_train.loc[~df_train.index.isin(df_kfold_test.index)].copy().reset_index(drop=True)
    df_kfold_test = df_kfold_test.copy().reset_index(drop=True)
    
    #Build test data with random target_stock
    test_sold_quantity_agg = df_kfold_test.groupby('sku')['sold_quantity'].agg(list).apply(np.array)
    test_sold_quantity_possibilites = test_sold_quantity_agg.apply(np.cumsum).apply(lambda x: np.unique(x, return_index=True))
    
    def random_choose_target_stock(x):
        target_stocks, target_dates = x
        #ignore target_stock == 0
        if target_stocks[0] == 0:
            target_stocks = target_stocks[1:]
            target_dates = target_dates[1:]
        randint = np.random.randint(0, target_stocks.shape[0])
        target_date = target_dates[randint]
        target_stock = target_stocks[randint]
        return target_stock, target_date_0

    test_data = test_sold_quantity_possibilites.apply(random_choose_target_stock)
    test_data = pd.DataFrame([[sku, stock, date] for sku, (stock, date) in zip(test_data.index.values, test_data.values)], columns=['sku','target_stock', 'target_date_0'])
    
    ground_truth = np.eye(30)[test_data['target_date_0'].values]
    
    #Write data to folder
    write_df(df_kfold_train, os.path.join(data_target_folder, 'train_data.parquet'))
    write_df(df_kfold_test, os.path.join(data_target_folder, 'test_fromtrain_data.parquet'))
    write_df(test_data[['sku', 'target_stock']], os.path.join(data_target_folder, 'test_data.csv'))
    write_df(pd.DataFrame(ground_truth), os.path.join(data_target_folder, 'test_ground_truth.csv'), header=False)
    np.save(os.path.join(data_target_folder, 'test_ground_truth.npy'), ground_truth)
    
    with open(os.path.join(data_target_folder, 'test_sku.txt'), 'w') as f:
        for sku in test_data['sku']:
            f.write(str(sku)+'\n')
            
    #Feature extraction train data
    df_kfold_train_processed = extract_features_per_sku(df_kfold_train, df_item)
    
    write_df(df_kfold_train_processed, os.path.join(data_target_folder, 'train_sku_feature_data.parquet'))

0 495687 165229


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/opt/conda/envs/torch/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-0bb2e2ad70e9>", line 51, in <module>
    write_df(df_kfold_train, os.path.join(data_target_folder, 'train_data.parquet'))
  File "/home/edugarcia/kaggle/ml2021/utils.py", line 34, in write_df
    write(path, df)
  File "/opt/conda/envs/torch/lib/python3.6/site-packages/fastparquet/writer.py", line 956, in write
    compression, open_with, has_nulls, append)
  File "/opt/conda/envs/torch/lib/python3.6/site-packages/fastparquet/writer.py", line 790, in write_simple
    compression=compression)
  File "/opt/conda/envs/torch/lib/python3.6/site-packages/fastparquet/writer.py", line 656, in make_row_group
    compression=comp)
  File "/opt/conda/envs/torch/lib/python3.6/site-packages/fastparquet/writer.py", line 534, in write_column
    repetition_data, definition_d

TypeError: object of type 'NoneType' has no len()

In [20]:
df_kfold_train_processed = extract_features_per_sku(df_kfold_train, df_item)

  0%|          | 0/660916 [00:00<?, ?it/s]

In [21]:
df_kfold_train_processed.set_index('sku').loc[df_kfold_test['sku']]['count'].value_counts()

29    4077120
26      18180
28      14670
27      12690
19      12510
25      12210
18      11610
20      11250
13      10230
6       10170
22      10050
21       9780
5        9720
12       9600
7        9270
8        9270
11       8310
15       8310
14       7920
4        7830
1        7770
24       6780
17       4440
23       4020
10       3330
3        3120
9        2550
16       2490
2        2400
Name: count, dtype: int64

In [19]:
df_kfold_train[df_kfold_train['sku'].isin(df_kfold_test['sku'])]['sku'].value_counts().value_counts()

29    135904
26       606
28       489
27       423
19       417
25       407
18       387
20       375
13       341
6        339
22       335
21       326
5        324
12       320
7        309
8        309
11       277
15       277
14       264
4        261
1        259
24       226
17       148
23       134
10       111
3        104
9         85
16        83
2         80
Name: sku, dtype: int64

In [9]:
df_kfold_test

Unnamed: 0,sku,date,sold_quantity,current_price,currency,listing_type,shipping_logistic_type,shipping_payment,minutes_active
0,1,2021-03-02,0,135.90,REA,premium,fulfillment,free_shipping,1440.0
1,1,2021-03-03,0,135.90,REA,premium,fulfillment,free_shipping,1440.0
2,1,2021-03-04,0,135.90,REA,premium,fulfillment,free_shipping,1440.0
3,1,2021-03-05,1,135.90,REA,premium,fulfillment,free_shipping,1440.0
4,1,2021-03-06,0,135.90,REA,premium,fulfillment,free_shipping,1440.0
...,...,...,...,...,...,...,...,...,...
4317595,660915,2021-03-27,0,79.99,MEX,classic,fulfillment,paid_shipping,0.0
4317596,660915,2021-03-28,0,79.99,MEX,classic,fulfillment,paid_shipping,0.0
4317597,660915,2021-03-29,0,79.99,MEX,classic,fulfillment,paid_shipping,0.0
4317598,660915,2021-03-30,0,99.99,MEX,classic,fulfillment,paid_shipping,0.0
