In [33]:
# %load 501_train_test_split.py
from glob import glob
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import minmax_scale
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder

def read_pickles(path):
   
    #   print([pd.read_pickle(f).shape for f in tqdm(sorted(glob(path+'/*.pkl')))])
    #   df = pd.concat([pd.read_pickle(f) for f in tqdm(sorted(glob(path+'/*.pkl')))], axis=1)
    f_list = [f for f in tqdm(sorted(glob(path+'/*.pkl')))]
    df = None   
    for f in f_list:
        if df is None:
#             print(pd.read_pickle(f).shape[0])
            df = pd.read_pickle(f)
#             print(f)
            print(df.shape)
        else:
#             print(pd.read_pickle(f).shape[0])
#             print(f)
            df = pd.merge(df, pd.read_pickle(f), on='instance_id', how='inner')            
            print(df.shape)
#             df = pd.concat([df, pd.read_pickle(f).reset_index()], axis=1)
#             print(df.shape)
    return df

def data_split(add_features=True):
    train = load_pickle('../data/train.pkl')
    test = load_pickle('../data/test.pkl')
    
    print('train_shape_before:{}'.format(train.shape))
    print('test_shape_before:{}'.format(test.shape))
    
    if add_features:
        train_feats = read_pickles('../data/train_feature')
        test_feats = read_pickles('../data/test_feature')

        train_feats.drop('instance_id', axis=1, inplace=True)
        test_feats.drop('instance_id', axis=1, inplace=True)

        print('train_feats_shape:{}'.format(train_feats.shape))
        print('test_feats_shape:{}'.format(test_feats.shape))

        train = pd.concat([train.reset_index(), train_feats.reset_index()], axis=1)
        test = pd.concat([test.reset_index(), test_feats.reset_index()], axis=1)

        print('train_shape:{}'.format(train.shape))
        print('test_shape:{}'.format(test.shape))
    
    
        del train_feats; gc.collect()
        del test_feats; gc.collect()
    
    drop_columns = ['time', 'realtime']
    train.drop(drop_columns, axis=1, inplace=True)
    test.drop(drop_columns, axis=1, inplace=True)

    train_df = train[train.day<24].copy()
    #train_df = handle_imbalance(train_df)
    valid_df = train[train.day==24].copy()
    test_df = test
    
    train_df.fillna(-1, inplace=True)
    valid_df.fillna(-1, inplace=True)
    test_df.fillna(-1, inplace=True)
    
    
    print('the shape of train {}'.format(train_df.shape))
    print('the shape of valid {}'.format(valid_df.shape))
    print('the shape of test {}'.format(test_df.shape))
    
    dump_pickle(train_df, path='../data/train_final.pkl')
    dump_pickle(valid_df, path='../data/valid_final.pkl')
    dump_pickle(test_df, path='../data/test_final.pkl')
    del train_df; gc.collect()
    del valid_df; gc.collect()
    del test_df; gc.collect()

def data_onehot(add_features=True):

    train_data = load_pickle(path='../data/train_final.pkl')
    valid_data = load_pickle(path='../data/valid_final.pkl')
    test_data = load_pickle(path='../data/test_final.pkl')

    if add_features:
        cols = ['time_discrete', 'user_gender_id','user_age_level','user_occupation_id'
                ,'item_city_id','item_price_level'
                ,'context_page_id','shop_review_num_level']
    else:
        cols = ['user_gender_id','user_age_level','user_occupation_id'
                ,'item_city_id','item_price_level'
                ,'context_page_id','shop_review_num_level']

    data = pd.concat([train_data, valid_data, test_data], axis=0, ignore_index=True)

    for col in tqdm(cols):
        col_feature = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, col_feature], axis=1)

    print(train_data.index)
    print(test_data.index)
    train_data = data[data.day<23].copy()
    valid_data = data[(data.day==23)|(data.day==24)].copy()
    test_data = data[data.day==25].copy()
    
#     train_data.reset_index(inplace=True, drop=True)
#     valid_data.reset_index(inplace=True, drop=True)
#     test_data.reset_index(inplace=True, drop=True)

    print('train shap:',train_data.shape)
    print('cv shape', valid_data.shape)
    print('test shape', test_data.shape)
    
    dump_pickle(train_data, path='../data/train_final_onehot.pkl')
    dump_pickle(valid_data, path='../data/valid_final_onehot.pkl')
    dump_pickle(test_data, path='../data/test_final_onehot.pkl')


data_split(add_features=True)
data_onehot(add_features=True)




100%|██████████| 7/7 [00:00<00:00, 59313.39it/s]

train_shape_before:(478087, 36)
test_shape_before:(18371, 36)
(478087, 11)





(478087, 16)
(478087, 22)
(478087, 23)
(478087, 26)
(478087, 36)


100%|██████████| 7/7 [00:00<00:00, 40000.17it/s]

(478087, 42)
(18371, 11)
(18371, 16)
(18371, 22)
(18371, 23)
(18371, 26)
(18371, 36)
(18371, 42)





train_feats_shape:(478087, 41)
test_feats_shape:(18371, 41)
train_shape:(478087, 79)
test_shape:(18371, 79)
the shape of train (420676, 77)
the shape of valid (57411, 77)
the shape of test (18371, 77)


100%|██████████| 8/8 [00:07<00:00,  1.03it/s]


Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            420666, 420667, 420668, 420669, 420670, 420671, 420672, 420673,
            420674, 420675],
           dtype='int64', length=420676)
RangeIndex(start=0, stop=18371, step=1)
train shap: (357066, 280)
cv shape (121021, 280)
test shape (18371, 280)


In [22]:
train_data.dtypes

NameError: name 'train_data' is not defined