In [1]:
# %load 501_train_test_split.py
from glob import glob
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import minmax_scale
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder

def read_pickles(path):
    """Load every ``*.pkl`` file matching *path* and join them on ``instance_id``.

    *path* is used as a plain string prefix for the glob pattern
    ``path + '*.pkl'``, so it must end with a path separator to address the
    contents of a directory.  Files are processed in sorted name order: the
    first one becomes the base frame and each subsequent one is left-merged
    onto it by the ``instance_id`` column.

    Returns the merged DataFrame, or ``None`` when no file matches.
    """
    pickle_paths = sorted(glob(path + '*.pkl'))
    print(pickle_paths)

    merged = None
    for pickle_path in tqdm(pickle_paths):
        frame = pd.read_pickle(pickle_path)
        # The first file seeds the result; later files are joined onto it.
        merged = (
            frame
            if merged is None
            else pd.merge(merged, frame, on='instance_id', how='left')
        )
    return merged

def data_split(add_features=True):
    """Build the train / validation / test frames and pickle them.

    Loads ``train.pkl`` and ``test.pkl`` from ``../data_valid/``.  When
    *add_features* is true, the feature tables read from
    ``../data_valid/train_feature/`` and ``../data_valid/test_feature/`` are
    left-merged onto them by ``instance_id``.  The columns ``time``,
    ``realtime`` and ``item_property_list`` are then dropped, the training
    rows are split on ``day`` (``<= 22`` for training, ``== 23`` for
    validation), missing values are replaced with ``-1`` and the three frames
    are written to ``train_final.pkl``, ``valid_final.pkl`` and
    ``test_final.pkl`` under ``../data_valid/``.

    Args:
        add_features: whether to merge the extra feature tables before
            splitting.

    Returns:
        None.  The results are written to disk.
    """
    train = load_pickle('../data_valid/train.pkl')
    test = load_pickle('../data_valid/test.pkl')
    print(train.index)

    if add_features:
        train_feats = read_pickles('../data_valid/train_feature/')
        test_feats = read_pickles('../data_valid/test_feature/')

        print(train_feats.index)
        train = pd.merge(train, train_feats, on='instance_id', how='left')
        test = pd.merge(test, test_feats, on='instance_id', how='left')

        # Report the columns with the most missing values after the join.
        print(train.isnull().sum(axis=0).sort_values(ascending=False)[:10])

        # The feature tables are no longer needed once merged.
        del train_feats, test_feats
        gc.collect()

    drop_columns = ['time', 'realtime', 'item_property_list']
    train.drop(drop_columns, axis=1, inplace=True)
    test.drop(drop_columns, axis=1, inplace=True)

    train_df = train[train.day <= 22]
    valid_df = train[train.day == 23]

    print(train_df.isnull().sum(axis=0).sort_values(ascending=False)[:10])

    # Rebind to the frames returned by fillna instead of filling in place:
    # train_df / valid_df are selections taken from `train`, and an in-place
    # fill on such a selection raises pandas' SettingWithCopyWarning.
    train_df = train_df.fillna(-1)
    valid_df = valid_df.fillna(-1)
    test_df = test.fillna(-1)

    print('the shape of train {}'.format(train_df.shape))
    print('the shape of valid {}'.format(valid_df.shape))
    print('the shape of test {}'.format(test_df.shape))

    dump_pickle(train_df, path='../data_valid/train_final.pkl')
    dump_pickle(valid_df, path='../data_valid/valid_final.pkl')
    dump_pickle(test_df, path='../data_valid/test_final.pkl')

def data_onehot(add_features=True):
    """One-hot encode the categorical columns of the final frames.

    Reads ``train_final.pkl``, ``valid_final.pkl`` and ``test_final.pkl``
    from ``../data_valid/``, stacks them into one frame so every split gets
    the same dummy columns, and replaces each categorical column with its
    ``pd.get_dummies`` expansion (prefixed with the column name).  The stacked
    frame is then re-split on ``day``: 19..22 for training, 23 for validation
    and 24 for test.  The splits are written next to the inputs with an
    ``_onehot`` suffix.

    Args:
        add_features: when true, ``time_discrete`` is encoded in addition to
            the base categorical columns.

    Returns:
        None.  The results are written to disk.
    """
    loaded_train = load_pickle(path='../data_valid/train_final.pkl')
    loaded_valid = load_pickle(path='../data_valid/valid_final.pkl')
    loaded_test = load_pickle(path='../data_valid/test_final.pkl')

    categorical = [
        'user_gender_id', 'user_age_level', 'user_occupation_id',
        'item_city_id', 'item_price_level',
        'context_page_id', 'shop_review_num_level',
    ]
    if add_features:
        categorical = ['time_discrete'] + categorical

    combined = pd.concat(
        [loaded_train, loaded_valid, loaded_test], axis=0, ignore_index=True
    )

    for name in tqdm(categorical):
        # pop() removes the raw column and hands it to get_dummies in one step;
        # the indicator columns are appended at the end of the frame.
        indicators = pd.get_dummies(combined.pop(name), prefix=name)
        combined = pd.concat([combined, indicators], axis=1)

    print(loaded_train.index)
    print(loaded_test.index)

    in_train = (combined.day >= 19) & (combined.day <= 22)
    in_valid = combined.day == 23
    in_test = combined.day == 24

    train_data = combined[in_train].copy()
    valid_data = combined[in_valid].copy()
    test_data = combined[in_test].copy()

    print('train shap:',train_data.shape)
    print('cv shape', valid_data.shape)
    print('test shape', test_data.shape)

    dump_pickle(train_data, path='../data_valid/train_final_onehot.pkl')
    dump_pickle(valid_data, path='../data_valid/valid_final_onehot.pkl')
    dump_pickle(test_data, path='../data_valid/test_final_onehot.pkl')


# Run the pipeline. Order matters: data_onehot reads the *_final.pkl files
# that data_split writes, so data_split has to run first.
data_split(add_features=True)
data_onehot(add_features=True)




  0%|          | 0/19 [00:00<?, ?it/s]

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            420666, 420667, 420668, 420669, 420670, 420671, 420672, 420673,
            420674, 420675],
           dtype='int64', length=420676)
['../data_valid/train_feature/002_property_count.pkl', '../data_valid/train_feature/101_user_based_statistics.pkl', '../data_valid/train_feature/102_smooth_features.pkl', '../data_valid/train_feature/103_diff_max_min.pkl', '../data_valid/train_feature/103_feature_last_query.pkl', '../data_valid/train_feature/103_last_day_count.pkl', '../data_valid/train_feature/103_user_time_count.pkl', '../data_valid/train_feature/104_user_visit_time.pkl', '../data_valid/train_feature/201_user_based_statistics.pkl', '../data_valid/train_feature/202_smooth_features.pkl', '../data_valid/train_feature/203_feature_last_query.pkl', '../data_valid/train_feature/203_last_day_count.pkl', '../data_valid/train_feature/203_user_time_count.pkl', '../da

100%|██████████| 19/19 [00:07<00:00,  2.39it/s]
 21%|██        | 4/19 [00:00<00:00, 33.60it/s]

['../data_valid/test_feature/002_property_count.pkl', '../data_valid/test_feature/101_user_based_statistics.pkl', '../data_valid/test_feature/102_smooth_features.pkl', '../data_valid/test_feature/103_diff_max_min.pkl', '../data_valid/test_feature/103_feature_last_query.pkl', '../data_valid/test_feature/103_last_day_count.pkl', '../data_valid/test_feature/103_user_time_count.pkl', '../data_valid/test_feature/104_user_visit_time.pkl', '../data_valid/test_feature/201_user_based_statistics.pkl', '../data_valid/test_feature/202_smooth_features.pkl', '../data_valid/test_feature/203_feature_last_query.pkl', '../data_valid/test_feature/203_last_day_count.pkl', '../data_valid/test_feature/203_user_time_count.pkl', '../data_valid/test_feature/301_smooth_item_features.pkl', '../data_valid/test_feature/301_user_based_statistics.pkl', '../data_valid/test_feature/303_feature_last_query.pkl', '../data_valid/test_feature/303_item_diff_max_min.pkl', '../data_valid/test_feature/303_item_last_day_count.p

100%|██████████| 19/19 [00:01<00:00, 17.56it/s]


Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            420666, 420667, 420668, 420669, 420670, 420671, 420672, 420673,
            420674, 420675],
           dtype='int64', length=420676)
user_id_lastday_count             386705
time_diff_last_query_x            176764
item_id_smooth_rate                84262
shop_id_lastday_count              82930
item_brand_id_lastday_count        80323
shop_id_smooth_rate                80294
item_brand_id_smooth_rate          79247
item_city_id_lastday_count         78296
item_city_id_smooth_rate           78284
item_price_level_lastday_count     78270
dtype: int64
user_id_lastday_count             329534
time_diff_last_query_x            153187
item_id_smooth_rate                83373
shop_id_lastday_count              81816
shop_id_smooth_rate                79984
item_brand_id_lastday_count        79933
item_brand_id_smooth_rate          79124
item_city_id_lastday_cou

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


the shape of train (357066, 168)
the shape of valid (63610, 168)
the shape of test (57411, 168)


100%|██████████| 8/8 [00:15<00:00,  1.90s/it]


Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            357056, 357057, 357058, 357059, 357060, 357061, 357062, 357063,
            357064, 357065],
           dtype='int64', length=357066)
Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            57401, 57402, 57403, 57404, 57405, 57406, 57407, 57408, 57409,
            57410],
           dtype='int64', length=57411)
train shap: (278805, 371)
cv shape (63610, 371)
test shape (57411, 371)
