In [16]:
# %load 501_train_test_split.py
from glob import glob
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import minmax_scale
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder

def read_pickles(path, col=None):
    if col is None:
#         print([pd.read_pickle(f).shape for f in tqdm(sorted(glob(path+'/*.pkl')))])
#         df = pd.concat([pd.read_pickle(f) for f in tqdm(sorted(glob(path+'/*.pkl')))], axis=1)
        f_list = [f for f in tqdm(sorted(glob(path+'/*.pkl')))]
        df = None   
        for f in f_list:
            df = pd.concat([df, pd.read_pickle(f)], axis=1)
            print(df.shape)
    else:
        df = pd.concat([pd.read_pickle(f)[col] for f in tqdm(sorted(glob(path+'/*.pkl')))], axis=1)
    return df

def data_split():
    train = load_pickle('../data/train.pkl')
    test = load_pickle('../data/test.pkl')
    
    print('train_shape:{}'.format(train.shape))
    print('test_shape:{}'.format(test.shape))
    
    train_feats = read_pickles('../data/train_feature')
    test_feats = read_pickles('../data/test_feature')
    
    train_feats.drop('instance_id', axis=1, inplace=True)
    test_feats.drop('instance_id', axis=1, inplace=True)
    
    print('train_feats_shape:{}'.format(train_feats.shape))
    print('test_feats_shape:{}'.format(test_feats.shape))
    
    train = pd.concat([train, train_feats], axis=1)
    test = pd.concat([test, test_feats], axis=1)
#     print(np.intersect1d(train.columns, train_feats.columns))
#     train_feats = train_feats[list(train_feats.columns[~train_feats.columns.duplicated()])]
#     test_feats = test_feats[list(test_feats.columns[~test_feats.columns.duplicated()])]
#     train = train.merge(train_feats, on='instance_id', how='left')
#     test = test.merge(test_feats, on='instance_id', how='left')
    
    del train_feats; gc.collect()
    del test_feats; gc.collect()
    
    drop_columns = ['time', 'realtime']
    train.drop(drop_columns, axis=1, inplace=True)
    test.drop(drop_columns, axis=1, inplace=True)

    train_df = train[train.day<24].copy()
    #train_df = handle_imbalance(train_df)
    valid_df = train[train.day==24].copy()
    test_df = test
    
    train_df.fillna(-1, inplace=True)
    valid_df.fillna(-1, inplace=True)
    test_df.fillna(-1, inplace=True)
    

    print('the shape of train {}'.format(train_df.shape))
    print('the shape of valid {}'.format(valid_df.shape))
    print('the shape of test {}'.format(test_df.shape))
    
    dump_pickle(train_df, path='../data/train_final.pkl')
    dump_pickle(valid_df, path='../data/valid_final.pkl')
    dump_pickle(test_df, path='../data/test_final.pkl')
    del train_df; gc.collect()
    del valid_df; gc.collect()
    del test_df; gc.collect()

def data_onehot():

    train_data = load_pickle(path='../data/train_final.pkl')
    cv_data = load_pickle(path='../data/valid_final.pkl')
    test_data = load_pickle(path='../data/test_final.pkl')

#     cols = ['user_gender_id', 'user_age_level', 'user_occupation_id'
#         , 'item_city_id', 'item_price_level', 'time_discrete'
#         , 'context_page_id', 'shop_review_num_level']
    cols = ['time_discrete']

    data = pd.concat([train_data, cv_data, test_data], axis=0)

    for col in tqdm(cols):
        col_feature = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, col_feature], axis=1)

#     X = minmax_scale(data.values)
#     data = pd.DataFrame(data=X, columns=data.columns)
    
    
    train_data = data.loc[train_data.index]
    cv_data = data.loc[cv_data.index]
    test_data = data.loc[test_data.index]

    train_data.reset_index(inplace=True, drop=True)
    cv_data.reset_index(inplace=True, drop=True)
    test_data.reset_index(inplace=True, drop=True)

    dump_pickle(train_data, path='../data/train_final_onehot.pkl')
    dump_pickle(cv_data, path='../data/valid_final_onehot.pkl')
    dump_pickle(test_data, path='../data/test_final_onehot.pkl')


data_split()
data_onehot()





  0%|          | 0/8 [00:00<?, ?it/s][A
 62%|██████▎   | 5/8 [00:00<00:00, 46.92it/s][A
100%|██████████| 8/8 [00:00<00:00, 47.30it/s][A
  0%|          | 0/8 [00:00<?, ?it/s][A

train_shape:(478138, 45)
test_shape:(18371, 45)
[(478138, 11), (478138, 6), (478138, 2), (478138, 7), (478138, 2), (478138, 4), (478138, 11), (478138, 7)]



 88%|████████▊ | 7/8 [00:00<00:00, 67.40it/s][A
100%|██████████| 8/8 [00:00<00:00, 56.89it/s][A
  0%|          | 0/8 [00:00<?, ?it/s][A
100%|██████████| 8/8 [00:00<00:00, 340.08it/s][A
  0%|          | 0/8 [00:00<?, ?it/s][A
100%|██████████| 8/8 [00:00<00:00, 436.30it/s][A

[(18371, 11), (18371, 6), (18371, 2), (18371, 7), (18371, 2), (18371, 4), (18371, 11), (18371, 7)]
train_feats_shape:(478138, 42)
test_feats_shape:(36742, 42)
the shape of train (420717, 85)
the shape of valid (57421, 85)
the shape of test (36742, 85)



  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  1.59it/s][A
[A

In [2]:
train_feats.shape

NameError: name 'train_feats' is not defined