In [1]:

# coding: utf-8

# In[12]:



import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
import warnings
import datetime

warnings.filterwarnings("ignore")

import time

from utils import raw_data_path, dump_pickle

# Directory prefix that is concatenated with the file names below when the
# raw train/test tables are read.
path = '../data/'
# File names of the raw training table and the raw test table.
train_file = 'round1_ijcai_18_train_20180301.txt'
test_file = 'round1_ijcai_18_test_a_20180301.txt'



# def load_data():
#     train = pd.read_table(path + train_file, encoding='utf8', delim_whitespace=True)
#     test = pd.read_table(path + train_file, encoding='utf8', delim_whitespace=True)
#     df = pd.concat([train, test], axis=0, ignore_index=True)

def date_convert(data):
    """Add datetime-derived columns computed from ``context_timestamp``.

    The frame is modified in place and also returned. Columns written:

    * ``time``     -- ``context_timestamp`` parsed as seconds since the epoch.
    * ``realtime`` -- ``time`` shifted forward by 8 hours (Beijing time).
    * ``day``      -- day of month of ``realtime``.
    * ``hour``     -- hour of day of ``realtime``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``context_timestamp`` column of epoch seconds.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Epoch seconds -> datetime64 values.
    data['time'] = pd.to_datetime(data.context_timestamp, unit='s')

    # Shift to Beijing time with a single vectorized addition instead of a
    # per-row Python lambda; the resulting values are identical.
    data['realtime'] = data['time'] + pd.Timedelta(hours=8)

    local_time = data['realtime'].dt
    data['day'] = local_time.day
    data['hour'] = local_time.hour

    return data

def _nth_token(series, position, default):
    """Map each value to its ``position``-th ';'-separated token, or ``default`` if absent."""
    def pick(value):
        tokens = str(value).split(';')
        return tokens[position] if len(tokens) > position else default

    return series.map(pick)


def _impute_sentinel(data, col, strategy='mean', sentinel=-1):
    """Replace ``sentinel`` entries of ``data[col]`` with the column mean or mode.

    The statistic is computed over the non-sentinel values only. The column is
    reassigned as a whole (no chained indexing), so the write always reaches
    ``data`` itself. If no fill value can be computed (every value is missing),
    the sentinel entries are left as NaN.
    """
    values = data[col].mask(data[col] == sentinel)
    if strategy == 'mode':
        modes = values.mode()
        fill_value = modes.iloc[0] if len(modes) else np.nan
    else:
        fill_value = values.mean()
    if pd.notna(fill_value):
        values = values.fillna(fill_value)
    data[col] = values


def base_process(data):
    """Encode and clean the raw item / user / context / shop columns.

    ``data`` is modified in place and returned. Steps, in order:

    * item: the 2nd and 3rd ';'-separated entries of ``item_category_list``
      are label-encoded into ``item_category_list1`` / ``item_category_list2``
      (absent entries become the token ``'missing'``) and the source column is
      dropped; ``item_id``, ``item_brand_id`` and ``item_city_id`` are
      label-encoded; ``-1`` in ``item_sales_level`` is replaced by the mean.
    * user: ``-1`` in ``user_age_level`` is replaced by the mode and the column
      is reduced modulo 1000; ``-1`` in ``user_star_level`` is replaced by the
      mean and the column is reduced modulo 3000.
    * context: ``date_convert`` adds the time columns; the first five
      ';'-separated entries of ``predict_category_property`` are label-encoded
      into ``predict_category_property0..4`` (absent entries become ``''``)
      and the source column is dropped.
    * shop: ``-1`` in ``shop_score_service``, ``shop_score_delivery`` and
      ``shop_score_description`` is replaced by the respective column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above plus ``context_timestamp``.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    lbl = preprocessing.LabelEncoder()
    print("========================item==========================")
    # Item columns: item_id, item_category_list, item_property_list,
    # item_brand_id, item_city_id, item_price_level, item_sales_level,
    # item_collected_level, item_pv_level.
    # Only category positions 1 and 2 are kept (position 0 is skipped).
    for i in range(1, 3):
        data['item_category_list' + str(i)] = lbl.fit_transform(
            _nth_token(data['item_category_list'], i, 'missing'))
    del data['item_category_list']

    for col in ['item_id', 'item_brand_id', 'item_city_id']:
        data[col] = lbl.fit_transform(data[col])

    # -1 marks a missing value; fill it with the mean of the known values.
    _impute_sentinel(data, 'item_sales_level', strategy='mean')

    print("========================user==========================")
    # user_gender_id and user_occupation_id should be handled with one-hot.
    # The sentinel must be imputed BEFORE the modulo: -1 % 1000 is 999 (and
    # -1 % 3000 is 2999), which would turn "missing" into a bogus top level.
    _impute_sentinel(data, 'user_age_level', strategy='mode')
    data['user_age_level'] = data['user_age_level'] % 1000

    _impute_sentinel(data, 'user_star_level', strategy='mean')
    data['user_star_level'] = data['user_star_level'] % 3000

    print("=====================context==========================")
    data = date_convert(data)

    for i in range(5):
        data['predict_category_property' + str(i)] = lbl.fit_transform(
            _nth_token(data['predict_category_property'], i, ''))
    del data['predict_category_property']

    print("=====================shop===============================")
    # Each score column gets its own sentinel replaced by its own mean.
    for col in ['shop_score_service', 'shop_score_delivery', 'shop_score_description']:
        _impute_sentinel(data, col, strategy='mean')

    return data
    

if __name__ == "__main__":
    # Driver: read the raw train/test tables, de-duplicate them, run
    # base_process on their concatenation, and hand the full frame plus the
    # train/test slices to dump_pickle.
    start = time.time()
    print("Load Data")
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True option.
    train = pd.read_table(path + train_file, encoding='utf8', sep=r'\s+')
    test = pd.read_table(path + test_file, encoding='utf8', sep=r'\s+')
    # Keep only the first row for each instance_id.
    train = train.drop_duplicates('instance_id')
    test = test.drop_duplicates('instance_id')
    print('the shape of train {}'.format(train.shape))
    print('the shape of test {}'.format(test.shape))
    # NOTE(review): len_train is not used below; the split is done on `day`
    # instead of on row position -- confirm that this is intended.
    len_train = train.shape[0]
    # Process train and test together so that encoders and fill values are
    # computed over the same combined set of rows.
    df = pd.concat([train, test], axis=0, ignore_index=True)
    print("Start doing preprocessing")

    df = base_process(df)
    dump_pickle(df, path=raw_data_path + 'df.pkl')

    # Rows whose `day` is 18..24 form the training slice.
    train = df[(df['day'] >= 18) & (df['day'] <= 24)]
    print('the shape of train {}'.format(train.shape))
    dump_pickle(train, path=raw_data_path + 'train.pkl')

    # Rows whose `day` is 25 form the test slice.
    test = df[df['day'] == 25]
    print('the shape of test {}'.format(test.shape))
    dump_pickle(test, path=raw_data_path + 'test.pkl')

    end = time.time()
    print("Preprocessing done and time elapsed %s" % (end-start))


Load Data
the shape of train (478087, 27)
the shape of test (18371, 26)
Start doing preprocessing
the shape of train (478087, 36)
the shape of test (18371, 36)
Preprocessing done and time elapsed 28.34149408340454


In [6]:
 train.head(10)

Unnamed: 0,Unnamed: 1,",instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,user_id,user_gender_id,user_age_level,user_occupation_id,user_star_level,context_id,context_timestamp,context_page_id,predict_category_property,shop_id,shop_review_num_level,shop_review_positive_rate,shop_star_level,shop_score_service,shop_score_delivery,shop_score_description,is_trade,time,realtime,day,hour"
"0,108641074714126964,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;2636395404473730413;8070395809181082496;6434796455031995313;7839592306500064003;3657871859501171040;1782439090818545916;6936314038707413523;5117290135792467652;720840888466250585;8838717711606756251;9172976955054793469;6296485275518770064;8154150695030794787;8199832121759092112;773321667872991333;7270669313837600482;3408398779125901630;4359277201620147874;448184584262668803;9029095803741432716,1975590437749032870,3948283326616421003,3,3,4,14,4505772604969228686,1,1003,2005,3003,282924576738839389,1537236544,4006,5799347067982556520:-1;509660095530134768:-1;5755694407684602296:-1;8277336076276184272:9148482949976129397;7908382889764677758:-1,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0,2018-09-18","02:09:04,2018-09-18","10:09:04,18,10"
"1,5754713551599725161,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;2636395404473730413;8070395809181082496;6434796455031995313;7839592306500064003;3657871859501171040;1782439090818545916;6936314038707413523;5117290135792467652;720840888466250585;8838717711606756251;9172976955054793469;6296485275518770064;8154150695030794787;8199832121759092112;773321667872991333;7270669313837600482;3408398779125901630;4359277201620147874;448184584262668803;9029095803741432716,1975590437749032870,3948283326616421003,3,3,4,14,2692638157208937547,0,1002,2005,3006,4007979028023783431,1537243232,4001,""5799347067982556520:9172976955054793469;7908382889764677758:1787573075717641245,9172976955054793469,5195139481388729954,4621934203383159480,9148482949976129397"",6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0,2018-09-18","04:00:32,2018-09-18","12:00:32,18,12"
"2,842679481291040981,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;2636395404473730413;8070395809181082496;6434796455031995313;7839592306500064003;3657871859501171040;1782439090818545916;6936314038707413523;5117290135792467652;720840888466250585;8838717711606756251;9172976955054793469;6296485275518770064;8154150695030794787;8199832121759092112;773321667872991333;7270669313837600482;3408398779125901630;4359277201620147874;448184584262668803;9029095803741432716,1975590437749032870,3948283326616421003,3,3,4,14,5247924392014515924,0,1003,2005,3004,4884875192608989870,1537211052,4001,5799347067982556520:5131280576272319091;7258015885215914736:5131280576272319091;7908382889764677758:5131280576272319091,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0,2018-09-17","19:04:12,2018-09-18","03:04:12,18,3"
"3,937088850059189027,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;2636395404473730413;8070395809181082496;6434796455031995313;7839592306500064003;3657871859501171040;1782439090818545916;6936314038707413523;5117290135792467652;720840888466250585;8838717711606756251;9172976955054793469;6296485275518770064;8154150695030794787;8199832121759092112;773321667872991333;7270669313837600482;3408398779125901630;4359277201620147874;448184584262668803;9029095803741432716,1975590437749032870,3948283326616421003,3,3,4,14,2681414445369714628,1,1004,2005,3006,840119421106178602,1537222670,4016,""509660095530134768:-1;5799347067982556520:-1;7908382889764677758:1787573075717641245,9148482949976129397;7492960463130085436:9148482949976129397;1950314698730389427:8218883658021718578,6641007758333551406,3749629542652371225,4038060334629950706,6165347051143749031,360508730875480086,9148482949976129397"",6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0,2018-09-17","22:17:50,2018-09-18","06:17:50,18,6"
"4,7975697065017708072,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;2636395404473730413;8070395809181082496;6434796455031995313;7839592306500064003;3657871859501171040;1782439090818545916;6936314038707413523;5117290135792467652;720840888466250585;8838717711606756251;9172976955054793469;6296485275518770064;8154150695030794787;8199832121759092112;773321667872991333;7270669313837600482;3408398779125901630;4359277201620147874;448184584262668803;9029095803741432716,1975590437749032870,3948283326616421003,3,3,4,14,2729475788342039013,0,1002,2005,3001,1736769971710354684,1537271320,4001,5799347067982556520:9172976955054793469;7908382889764677758:9172976955054793469,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0,2018-09-18","11:48:40,2018-09-18","19:48:40,18,19"
"5,7764762765372067286,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;2636395404473730413;8070395809181082496;6434796455031995313;7839592306500064003;3657871859501171040;1782439090818545916;6936314038707413523;5117290135792467652;720840888466250585;8838717711606756251;9172976955054793469;6296485275518770064;8154150695030794787;8199832121759092112;773321667872991333;7270669313837600482;3408398779125901630;4359277201620147874;448184584262668803;9029095803741432716,1975590437749032870,3948283326616421003,3,3,4,14,4512655448325954611,1,1004,2005,3002,4434980272230296456,1537282855,4003,""5799347067982556520:9172976955054793469;8710739180200009128:-1;5755694407684602296:-1;4879721024980945592:5131280576272319091;7908382889764677758:9172976955054793469,5131280576272319091"",6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0,2018-09-18","15:00:55,2018-09-18","23:00:55,18,23"
"6,6956333474094867789,285660928590172217,7908382889764677758;8277336076276184272,2072967855524022579;5131280576272319091;2636395404473730413;9148482949976129397;7199361004668592209;4678095570925618478;2033679869864207699;3802510553218572927;1782439090818545916;820214312075361939;5320468090843686429;1134982063610307090;71803110314516845;1301076623647253687;3258780649701680217;514980552440578167;6411614163944830538;478060273908663971;1418324867218214039;6457208937217973355;4859870894564764703;6048265394477193459;5208530887565657358;2559145093180392146;8119922999783109966;90892965411648070;2935986643229018712;7125679806685817518;4998393090172019193;2653795394150816137;9132046213323956404;7004759496810158079;3674308596153344033;647428987512677251;793305798399120948;8560981779145170452;1364838863277739785;6402837117468309719;1556713051218509070;5905014002669999655;4774054090293502457;1723293555671106692;7527849185955020043;5767144546798382177;730203099166226972;8497879079912276476;7152599495073237509;615076485672811995;8382752466418861499;2296315626544640613,9057103201734987852,548352491538518780,8,9,8,13,8811056487516803043,1,1006,2005,3000,3622211816051289512,1537280317,4001,2011981573061447208:7199361004668592209;8277336076276184272:7199361004668592209;5755694407684602296:-1;7908382889764677758:7199361004668592209;7447522129235776380:-1;8190716055759423915:-1,4885989684392199728,15,0.985427135678392,5012,0.9748783863950958,0.9768625123639959,0.9692782281985368,0,2018-09-18","14:18:37,2018-09-18","22:18:37,18,22"
"7,8387099821892927911,285660928590172217,7908382889764677758;8277336076276184272,2072967855524022579;5131280576272319091;2636395404473730413;9148482949976129397;7199361004668592209;4678095570925618478;2033679869864207699;3802510553218572927;1782439090818545916;820214312075361939;5320468090843686429;1134982063610307090;71803110314516845;1301076623647253687;3258780649701680217;514980552440578167;6411614163944830538;478060273908663971;1418324867218214039;6457208937217973355;4859870894564764703;6048265394477193459;5208530887565657358;2559145093180392146;8119922999783109966;90892965411648070;2935986643229018712;7125679806685817518;4998393090172019193;2653795394150816137;9132046213323956404;7004759496810158079;3674308596153344033;647428987512677251;793305798399120948;8560981779145170452;1364838863277739785;6402837117468309719;1556713051218509070;5905014002669999655;4774054090293502457;1723293555671106692;7527849185955020043;5767144546798382177;730203099166226972;8497879079912276476;7152599495073237509;615076485672811995;8382752466418861499;2296315626544640613,9057103201734987852,548352491538518780,8,9,8,13,6507704883896466138,0,1002,2002,3002,7851031132945961016,1537261120,4001,2011981573061447208:7199361004668592209;8277336076276184272:7199361004668592209;5755694407684602296:-1;7908382889764677758:7199361004668592209;7447522129235776380:-1;8190716055759423915:-1,4885989684392199728,15,0.985427135678392,5012,0.9748783863950958,0.9768625123639959,0.9692782281985368,0,2018-09-18","08:58:40,2018-09-18","16:58:40,18,16"
"8,4021878205550012615,5202355029344881809,7908382889764677758;5755694407684602296,2072967855524022579;5131280576272319091;2636395404473730413;6556896890082244813;3657871859501171040;9173428221474519642;2253152183208719552;357954758031852148;3765394546174375916;8808957892062587408;6621287826538724367;7271630337430743106;8247723580205204722;4564647423883133001;4224097012885011005;542901723744518864;4444792543859265070;1665611025031010859;1266232729783845119;7860359346697893790;549707957212148300;2662182412610098459;6261681079656394086;4803012262423035870;5465522059880368612;1530873973648683953;6503778418253407380;2917822884506250839;2827133331729755680;8271380363138018088;3971663145700298114;3796012835045751390;8724187982730364941;6324941039683947995;5275977934691214612;5119621442439839441;644460148686282775;7123215832059766729;4682240176839116979;537208717985470679;7015116144025242710;4103724031807790102;1267974281451797475;6222435412919027584;4150063881530790669;7205702158058893390,5520678735822176314,548352491538518780,8,9,10,16,6203308008480593423,0,1003,2002,3007,8388974876851097582,1537208871,4001,5755694407684602296:-1;5799347067982556520:-1;7908382889764677758:1354874066266948599;8710739180200009128:-1;509660095530134768:-1;8277336076276184272:-1,4885989684392199728,15,0.985427135678392,5012,0.9748783863950958,0.9768625123639959,0.9692782281985368,0,2018-09-17","18:27:51,2018-09-18","02:27:51,18,2"
"9,6499571365974135517,285660928590172217,7908382889764677758;8277336076276184272,2072967855524022579;5131280576272319091;2636395404473730413;9148482949976129397;7199361004668592209;4678095570925618478;2033679869864207699;3802510553218572927;1782439090818545916;820214312075361939;5320468090843686429;1134982063610307090;71803110314516845;1301076623647253687;3258780649701680217;514980552440578167;6411614163944830538;478060273908663971;1418324867218214039;6457208937217973355;4859870894564764703;6048265394477193459;5208530887565657358;2559145093180392146;8119922999783109966;90892965411648070;2935986643229018712;7125679806685817518;4998393090172019193;2653795394150816137;9132046213323956404;7004759496810158079;3674308596153344033;647428987512677251;793305798399120948;8560981779145170452;1364838863277739785;6402837117468309719;1556713051218509070;5905014002669999655;4774054090293502457;1723293555671106692;7527849185955020043;5767144546798382177;730203099166226972;8497879079912276476;7152599495073237509;615076485672811995;8382752466418861499;2296315626544640613,9057103201734987852,548352491538518780,8,9,8,13,6041712044514783312,0,1003,2005,3006,1138535512266486347,1537285390,4003,""8277336076276184272:820214312075361939;509660095530134768:-1;5799347067982556520:938942550188415640,9148482949976129397;5755694407684602296:-1;3203673979138763595:-1;7908382889764677758:-1"",4885989684392199728,15,0.985427135678392,5012,0.9748783863950958,0.9768625123639959,0.9692782281985368,0,2018-09-18","15:43:10,2018-09-18","23:43:10,18,23"
