In [7]:
# %load 501_train_test_split.py
from glob import glob
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import minmax_scale
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder

def read_pickles(path, col=None):
    """Load every ``*.pkl`` file under *path* and join them column-wise.

    Files are read in sorted filename order, so the resulting column order
    is deterministic.

    Args:
        path: directory that contains the pickled pandas objects.
        col: optional column label (or list of labels) to select from each
            file. When ``None`` the whole object is used after
            ``reset_index()``, which gives every frame a fresh 0..n-1 index
            (so the join is by row position) and keeps the previous index
            as a regular column.

    Returns:
        A DataFrame holding the columns of all files side by side.

    Raises:
        ValueError: if *path* contains no ``*.pkl`` files.
    """
    files = sorted(glob(path + '/*.pkl'))
    if not files:
        # Fail here with a readable message instead of returning None (or
        # letting pandas complain about an empty concat) further down.
        raise ValueError('no *.pkl files found in {!r}'.format(path))

    # The progress bar wraps the actual reads, so it reflects loading time.
    if col is None:
        frames = [pd.read_pickle(f).reset_index() for f in tqdm(files)]
    else:
        frames = [pd.read_pickle(f)[col] for f in tqdm(files)]

    # Concatenate once: growing the frame file by file re-copies every
    # previously loaded column on each iteration.
    return pd.concat(frames, axis=1)

def data_split(add_features=True):
    """Create the train / validation / test splits and pickle them.

    Loads ``../data/train.pkl`` and ``../data/test.pkl``. Rows of the
    training file with ``day < 24`` become the training split and rows with
    ``day == 24`` the validation split (rows with a later day end up in
    neither). The test file is kept whole. The ``time`` and ``realtime``
    columns are dropped and missing values are replaced by ``-1`` in all
    three splits before they are written to
    ``../data/{train,valid,test}_final.pkl``.

    Args:
        add_features: when true, the pickles under ``../data/train_feature``
            and ``../data/test_feature`` are loaded with ``read_pickles``,
            their ``instance_id`` column(s) are removed, and the remaining
            columns are appended to the base frames by row position.
    """
    train = load_pickle('../data/train.pkl')
    test = load_pickle('../data/test.pkl')

    print('train_shape_before:{}'.format(train.shape))
    print('test_shape_before:{}'.format(test.shape))

    if add_features:
        train_feats = read_pickles('../data/train_feature')
        test_feats = read_pickles('../data/test_feature')

        train_feats = train_feats.drop('instance_id', axis=1)
        test_feats = test_feats.drop('instance_id', axis=1)

        print('train_feats_shape:{}'.format(train_feats.shape))
        print('test_feats_shape:{}'.format(test_feats.shape))

        # Both sides get a fresh 0..n-1 index so the column-wise concat
        # pairs rows by position; the previous index stays as a column.
        train = pd.concat([train.reset_index(), train_feats.reset_index()], axis=1)
        test = pd.concat([test.reset_index(), test_feats.reset_index()], axis=1)

        print('train_shape:{}'.format(train.shape))
        print('test_shape:{}'.format(test.shape))

        # The feature frames are no longer needed once merged.
        del train_feats, test_feats
        gc.collect()

    unused_columns = ['time', 'realtime']
    train = train.drop(unused_columns, axis=1)
    test = test.drop(unused_columns, axis=1)

    # Boolean indexing and fillna both return new objects, so the splits
    # are independent of ``train``.
    train_df = train[train.day < 24].fillna(-1)
    valid_df = train[train.day == 24].fillna(-1)
    test_df = test.fillna(-1)

    print('the shape of train {}'.format(train_df.shape))
    print('the shape of valid {}'.format(valid_df.shape))
    print('the shape of test {}'.format(test_df.shape))

    dump_pickle(train_df, path='../data/train_final.pkl')
    dump_pickle(valid_df, path='../data/valid_final.pkl')
    dump_pickle(test_df, path='../data/test_final.pkl')

    del train_df, valid_df, test_df
    gc.collect()

def data_onehot(add_features=True):
    """One-hot encode the categorical columns of the final splits.

    Loads ``../data/{train,valid,test}_final.pkl``, stacks the three frames
    so that every split receives the same set of dummy columns, replaces
    each categorical column with its ``pd.get_dummies`` expansion (appended
    at the end of the frame), cuts the stacked frame back into the three
    splits and writes them to ``../data/{train,valid,test}_final_onehot.pkl``.

    Args:
        add_features: when true, ``time_discrete`` is encoded in addition
            to the base categorical columns (presumably it only exists
            when the extra feature pickles were merged in - TODO confirm).
    """
    train_data = load_pickle(path='../data/train_final.pkl')
    cv_data = load_pickle(path='../data/valid_final.pkl')
    test_data = load_pickle(path='../data/test_final.pkl')

    cols = ['user_gender_id', 'user_age_level', 'user_occupation_id',
            'item_city_id', 'item_price_level',
            'context_page_id', 'shop_review_num_level']
    if add_features:
        cols = ['time_discrete'] + cols

    # Remember where each split ends: after ignore_index=True the stacked
    # frame carries new 0..N-1 labels, so the splits must be recovered by
    # position. Looking them up with the frames' original index labels
    # selects the wrong rows (e.g. the test labels overlap the train ones).
    n_train = len(train_data)
    n_cv = len(cv_data)

    data = pd.concat([train_data, cv_data, test_data], axis=0, ignore_index=True)

    for col in tqdm(cols):
        col_feature = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, col_feature], axis=1)

    train_data = data.iloc[:n_train].reset_index(drop=True)
    cv_data = data.iloc[n_train:n_train + n_cv].reset_index(drop=True)
    test_data = data.iloc[n_train + n_cv:].reset_index(drop=True)

    dump_pickle(train_data, path='../data/train_final_onehot.pkl')
    dump_pickle(cv_data, path='../data/valid_final_onehot.pkl')
    dump_pickle(test_data, path='../data/test_final_onehot.pkl')


# Run the pipeline on the base columns only (no extra feature pickles).
# Order matters: data_onehot reads the *_final.pkl files data_split writes.
for pipeline_step in (data_split, data_onehot):
    pipeline_step(add_features=False)




train_shape_before:(478138, 45)
test_shape_before:(18371, 45)
the shape of train (420717, 43)
the shape of valid (57421, 43)
the shape of test (18371, 43)


100%|██████████| 7/7 [00:04<00:00,  1.75it/s]


NameError: name 'train' is not defined