In [9]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
import warnings
import datetime

warnings.filterwarnings("ignore")

import time

from utils import raw_data_path, dump_pickle

# Directory prefix that is concatenated with the file names below when the
# raw tables are read.
path = '../data/'
# File names of the training table and the test table, relative to `path`.
train_file = 'round1_ijcai_18_train_20180301.txt'
test_file = 'round1_ijcai_18_test_a_20180301.txt'



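# NOTE: disabled draft of a combined train+test loader, kept for reference only.
# It is incomplete (it has no return statement).
# def load_data():
#     train = pd.read_table(path + train_file, encoding='utf8', delim_whitespace=True)
#     test = pd.read_table(path + test_file, encoding='utf8', delim_whitespace=True)
#     df = pd.concat([train, test], axis=0, ignore_index=True)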

def date_convert(data):
    """Derive datetime, day-of-month and hour features from `context_timestamp`.

    Adds four columns to `data` (the frame is modified in place and returned):

    * ``time``     - `context_timestamp` parsed as epoch seconds.
    * ``realtime`` - ``time`` shifted forward by 8 hours (Beijing time).
    * ``day``      - day of month of ``realtime``.
    * ``hour``     - hour of day of ``realtime``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric ``context_timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns attached.
    """
    # Transform into datetime format (the raw value is a Unix timestamp in seconds).
    data['time'] = pd.to_datetime(data['context_timestamp'], unit='s')

    # Transform into Beijing time. Adding the offset to the whole column at
    # once is vectorised; a per-row ``apply`` would call Python for every row.
    data['realtime'] = data['time'] + datetime.timedelta(hours=8)
    data['day'] = data['realtime'].dt.day
    data['hour'] = data['realtime'].dt.hour

    return data

def _nth_token(value, position, default):
    """Return the `position`-th ';'-separated token of `value`, or `default` if absent."""
    tokens = str(value).split(';')
    return tokens[position] if len(tokens) > position else default


def _impute_sentinel(data, col, strategy, sentinel=-1):
    """Replace `sentinel` entries of ``data[col]`` with the column mean or mode.

    The statistic is computed after the sentinel entries have been masked out,
    so the placeholder itself never contributes to it. Pre-existing NaN values
    are filled with the same statistic. `data` is modified in place.

    Parameters
    ----------
    data : pandas.DataFrame
    col : str
        Name of the column to clean.
    strategy : {'mean', 'mode'}
        Statistic used as the replacement value.
    sentinel : scalar, default -1
        Value that marks a missing entry.

    Raises
    ------
    ValueError
        If `strategy` is neither ``'mean'`` nor ``'mode'``.
    """
    original_dtype = data[col].dtype
    was_integer = pd.api.types.is_integer_dtype(original_dtype)

    # .mask() turns the sentinel into NaN without chained assignment, so the
    # write always reaches `data` itself instead of a temporary copy.
    values = data[col].mask(data[col] == sentinel)

    if strategy == 'mean':
        fill_value = values.mean()
    elif strategy == 'mode':
        modes = values.mode()
        # mode() returns a Series (possibly empty); take its first entry.
        fill_value = modes.iloc[0] if len(modes) else np.nan
    else:
        raise ValueError("strategy must be 'mean' or 'mode', got %r" % (strategy,))

    # If every entry was missing there is nothing sensible to fill with.
    if pd.notna(fill_value):
        values = values.fillna(fill_value)

    # A mode of integer codes is itself an integer code: restore the dtype that
    # the NaN masking widened to float.
    if strategy == 'mode' and was_integer and not values.isna().any():
        values = values.astype(original_dtype)

    data[col] = values


def base_process(data):
    """Encode categorical columns and impute ``-1`` placeholders.

    `data` is modified in place and also returned. Steps, in order:

    * item: levels 1 and 2 of ``item_category_list`` (';'-separated) become the
      label-encoded columns ``item_category_list1``/``item_category_list2`` and
      the source column is dropped; ``item_id``, ``item_brand_id`` and
      ``item_city_id`` are label-encoded; ``item_sales_level`` has ``-1``
      replaced by the column mean.
    * user: ``user_age_level`` has ``-1`` replaced by the mode and is reduced
      modulo 1000; ``user_star_level`` has ``-1`` replaced by the mean and is
      reduced modulo 3000.
    * context: ``date_convert`` adds ``time``/``realtime``/``day``/``hour``;
      the first five ';'-separated tokens of ``predict_category_property``
      become label-encoded columns ``predict_category_property0..4`` and the
      source column is dropped.
    * shop: ``shop_score_service``, ``shop_score_delivery`` and
      ``shop_score_description`` have ``-1`` replaced by the column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column named above plus ``context_timestamp``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    lbl = preprocessing.LabelEncoder()
    print("========================item==========================")
    # Index 0 of item_category_list is skipped; only levels 1 and 2 are kept.
    for i in range(1, 3):
        data['item_category_list' + str(i)] = lbl.fit_transform(
            data['item_category_list'].map(
                lambda x, i=i: _nth_token(x, i, 'missing')))
    del data['item_category_list']

    # item_property_list is intentionally left untouched here.

    for col in ['item_id', 'item_brand_id', 'item_city_id']:
        data[col] = lbl.fit_transform(data[col])

    # Fill missing (-1) with the mean of the observed values.
    _impute_sentinel(data, 'item_sales_level', 'mean')

    print("========================user==========================")
    # user_gender_id and user_occupation_id should be handled with one-hot
    # The placeholder must be imputed BEFORE the modulo: -1 % 1000 == 999 and
    # -1 % 3000 == 2999 would otherwise turn it into a bogus top level.
    _impute_sentinel(data, 'user_age_level', 'mode')
    data['user_age_level'] = data['user_age_level'] % 1000

    _impute_sentinel(data, 'user_star_level', 'mean')
    data['user_star_level'] = data['user_star_level'] % 3000

    print("=====================context==========================")
    data = date_convert(data)

    for i in range(5):
        data['predict_category_property' + str(i)] = lbl.fit_transform(
            data['predict_category_property'].map(
                lambda x, i=i: _nth_token(x, i, '')))
    del data['predict_category_property']

    print("=====================shop===============================")
    for col in ['shop_score_service', 'shop_score_delivery', 'shop_score_description']:
        _impute_sentinel(data, col, 'mean')

    return data
    

if __name__ == "__main__":
    # End-to-end preprocessing run: load the raw tables, clean/encode the
    # training table, then cut it by day into a local train/validation pair.
    start = time.time()
    print("Load Data")
    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True (any run of whitespace separates fields).
    train = pd.read_table(path + train_file, encoding='utf8', sep=r'\s+')
    test = pd.read_table(path + test_file, encoding='utf8', sep=r'\s+')
    train = train.drop_duplicates('instance_id')
    test = test.drop_duplicates('instance_id')
    print('the shape of train {}'.format(train.shape))
    print('the shape of test {}'.format(test.shape))
    len_train = train.shape[0]

    # Only the training table is processed; the test table loaded above is
    # used solely for the shape report and `test` is rebound further down.
    df = train
    print("Start doing preprocessing")

    df = base_process(df)
    dump_pickle(df, path=raw_data_path + 'df.pkl')

    # Rows with day 18..23 form the local training split.
    train = df[(df['day'] >= 18) & (df['day'] <= 23)]
    train.index = np.arange(0, len(train))
    print('the shape of train {}'.format(train.shape))
    print(train.index)
    dump_pickle(train, path='../data_valid/train.pkl')

    # Rows with day 24 form the local hold-out split; its index continues
    # where the training index stops so the two never share a label.
    test = df[df['day'] == 24]
    test.index = np.arange(len(train), len(train) + len(test))
    print('the shape of test {}'.format(test.shape))
    dump_pickle(test, path='../data_valid/test.pkl')

    end = time.time()
    print("Preprocessing done and time elapsed %s" % (end - start))


Load Data
the shape of train (478087, 27)
the shape of test (18371, 26)
Start doing preprocessing
the shape of train (420676, 36)
Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            420666, 420667, 420668, 420669, 420670, 420671, 420672, 420673,
            420674, 420675],
           dtype='int64', length=420676)
the shape of test (57411, 36)
Preprocessing done and time elapsed 26.165168046951294


In [6]:
# Preview the first rows of `train` to eyeball the engineered columns.
train.head()

Unnamed: 0,instance_id,item_id,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,user_id,...,item_category_list2,time,realtime,day,hour,predict_category_property0,predict_category_property1,predict_category_property2,predict_category_property3,predict_category_property4
0,108641074714126964,3760,2072967855524022579;5131280576272319091;263639...,448,50,3,3.0,4,14,4505772604969228686,...,2,2018-09-18 02:09:04,2018-09-18 10:09:04,18,10,3845,1832,2290,8618,3556
1,5754713551599725161,3760,2072967855524022579;5131280576272319091;263639...,448,50,3,3.0,4,14,2692638157208937547,...,2,2018-09-18 04:00:32,2018-09-18 12:00:32,18,12,4319,6083,0,0,0
2,842679481291040981,3760,2072967855524022579;5131280576272319091;263639...,448,50,3,3.0,4,14,5247924392014515924,...,2,2018-09-17 19:04:12,2018-09-18 03:04:12,18,3,4164,3931,7001,0,0
3,937088850059189027,3760,2072967855524022579;5131280576272319091;263639...,448,50,3,3.0,4,14,2681414445369714628,...,2,2018-09-17 22:17:50,2018-09-18 06:17:50,18,6,1827,3133,5292,3885,1019
4,7975697065017708072,3760,2072967855524022579;5131280576272319091;263639...,448,50,3,3.0,4,14,2729475788342039013,...,2,2018-09-18 11:48:40,2018-09-18 19:48:40,18,19,4319,9825,0,0,0
