In [4]:

# coding: utf-8

# In[12]:



import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
import warnings
import datetime

warnings.filterwarnings("ignore")

import time

from utils import raw_data_path, dump_pickle

# Directory prefix that is concatenated with the file names below when the raw files are read
path = '../data/'
# File name of the training split (joined onto `path` at load time)
train_file = 'round1_ijcai_18_train_20180301.txt'
# File name of the test split (joined onto `path` at load time)
test_file = 'round1_ijcai_18_test_a_20180301.txt'



# def load_data():
#     train = pd.read_table(path + train_file, encoding='utf8', delim_whitespace=True)
#     test = pd.read_table(path + train_file, encoding='utf8', delim_whitespace=True)
#     df = pd.concat([train, test], axis=0, ignore_index=True)

def date_convert(data):
    """Derive calendar features from the ``context_timestamp`` column.

    Four columns are added to ``data`` in place:

    * ``time``     -- ``context_timestamp`` parsed as seconds since the epoch.
    * ``realtime`` -- ``time`` shifted forward by eight hours (the original
      author's comment describes this as Beijing time).
    * ``day``      -- day-of-month of ``realtime``.
    * ``hour``     -- hour-of-day of ``realtime``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``context_timestamp`` column holding epoch seconds.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, now carrying the new columns.
    """
    # Parse epoch seconds into datetimes
    data['time'] = pd.to_datetime(data['context_timestamp'], unit='s')

    # Shift the whole column in one vectorized operation rather than invoking
    # a Python lambda once per row.
    data['realtime'] = data['time'] + pd.Timedelta(hours=8)
    data['day'] = data['realtime'].dt.day
    data['hour'] = data['realtime'].dt.hour

    return data

def _nth_token(value, position, default):
    """Return the ``position``-th ``;``-separated token of ``str(value)``, or ``default`` if there are too few tokens."""
    tokens = str(value).split(';')
    return tokens[position] if len(tokens) > position else default


def _fill_sentinel(column, strategy, sentinel=-1):
    """Return a copy of ``column`` with ``sentinel`` entries imputed.

    Entries equal to ``sentinel`` are turned into missing values and then
    filled with the mean (``strategy='mean'``) or the most frequent value
    (``strategy='mode'``) of the remaining entries. If no fill value can be
    computed (e.g. every entry was the sentinel) the missing values are left
    as they are.

    Raises
    ------
    ValueError
        If ``strategy`` is neither ``'mean'`` nor ``'mode'``.
    """
    # mask() returns a new Series, so nothing depends on chained assignment
    cleaned = column.mask(column == sentinel)
    if strategy == 'mean':
        fill_value = cleaned.mean()
    elif strategy == 'mode':
        # mode() returns a Series (possibly with ties); take the first entry
        modes = cleaned.mode()
        fill_value = modes.iloc[0] if len(modes) else None
    else:
        raise ValueError("strategy must be 'mean' or 'mode', got %r" % (strategy,))
    if fill_value is None or pd.isna(fill_value):
        return cleaned
    return cleaned.fillna(fill_value)


def base_process(data):
    """Encode categorical columns, impute ``-1`` sentinels and add time features.

    The frame is modified in place (columns are added, overwritten and two are
    deleted) and is also returned.

    Steps, in order:

    * item:    levels 1 and 2 of the ``;``-separated ``item_category_list``
      are label-encoded into ``item_category_list1`` / ``item_category_list2``
      and the source column is removed; ``item_id``, ``item_brand_id`` and
      ``item_city_id`` are label-encoded; ``-1`` in ``item_sales_level`` is
      replaced by the column mean.
    * user:    ``-1`` in ``user_age_level`` is replaced by the mode and the
      column is reduced modulo 1000; ``-1`` in ``user_star_level`` is replaced
      by the mean and the column is reduced modulo 3000.
    * context: ``date_convert`` adds ``time`` / ``realtime`` / ``day`` /
      ``hour``; the first five ``;``-separated entries of
      ``predict_category_property`` are label-encoded into
      ``predict_category_property0`` .. ``4`` and the source column is removed.
    * shop:    ``-1`` in ``shop_score_service``, ``shop_score_delivery`` and
      ``shop_score_description`` is replaced by the respective column mean.

    Imputed columns may come back as floating point because missing values
    are represented as NaN during imputation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    lbl = preprocessing.LabelEncoder()
    print("========================item==========================")
    # Only positions 1 and 2 of the category path are extracted; position 0 is
    # skipped, exactly as in the original loop bounds.
    for level in range(1, 3):
        data['item_category_list' + str(level)] = lbl.fit_transform(
            data['item_category_list'].map(
                lambda raw, pos=level: _nth_token(raw, pos, 'missing')))
    del data['item_category_list']

    # item_property_list is intentionally left untouched by this function.

    for col in ['item_id', 'item_brand_id', 'item_city_id']:
        data[col] = lbl.fit_transform(data[col])

    # Fill missing (-1) sales levels with the mean
    data['item_sales_level'] = _fill_sentinel(data['item_sales_level'], 'mean')

    print("========================user==========================")
    # The previous version assigned to a temporary copy and discarded the
    # fillna() result, so -1 survived and the modulo turned it into 999 / 2999.
    data['user_age_level'] = _fill_sentinel(data['user_age_level'], 'mode') % 1000
    data['user_star_level'] = _fill_sentinel(data['user_star_level'], 'mean') % 3000

    print("=====================context==========================")
    data = date_convert(data)

    for slot in range(5):
        data['predict_category_property' + str(slot)] = lbl.fit_transform(
            data['predict_category_property'].map(
                lambda raw, pos=slot: _nth_token(raw, pos, '')))
    del data['predict_category_property']

    print("=====================shop===============================")
    # shop_score_delivery previously never had its -1 sentinel removed because
    # the masking line targeted user_age_level by mistake.
    for col in ['shop_score_service', 'shop_score_delivery', 'shop_score_description']:
        data[col] = _fill_sentinel(data[col], 'mean')

    return data
    

if __name__ == "__main__":
    # Deliberately kept as flat top-level statements (not wrapped in a
    # function) so that `df`, `train` and `test` stay module-level names that
    # later notebook cells can inspect.
    start = time.time()
    print("Load Data")
    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True
    train = pd.read_table(path + train_file, encoding='utf8', sep=r'\s+')
    test = pd.read_table(path + test_file, encoding='utf8', sep=r'\s+')
    train = train.drop_duplicates(subset='instance_id')
    test = test.drop_duplicates(subset='instance_id')
    print('the shape of train {}'.format(train.shape))
    print('the shape of test {}'.format(test.shape))

    # Train rows come first in the concatenated frame; remember how many there
    # are so the two splits can be separated again by position.
    len_train = train.shape[0]
    df = pd.concat([train, test], axis=0, ignore_index=True)
    print("Start doing preprocessing")
    df = base_process(df)
    dump_pickle(df, path=raw_data_path + 'df.pkl')

    # Slice off the train rows by position BEFORE filtering on day, so a test
    # row whose day happens to fall in 18..24 can never end up in `train`.
    train = df.iloc[:len_train]
    train = train[(train['day'] >= 18) & (train['day'] <= 24)]
    print('the shape of train {}'.format(train.shape))
    dump_pickle(train, path=raw_data_path + 'train.pkl')
    test = df.iloc[len_train:]
    print('the shape of test {}'.format(test.shape))
    dump_pickle(test, path=raw_data_path + 'test.pkl')

    end = time.time()
    print("Preprocessing done and time elapsed %s" % (end-start))


Load Data
the shape of train (478087, 27)
the shape of test (18371, 26)
Start doing preprocessing
the shape of train (478087, 36)
the shape of test (18371, 36)
Preprocessing done and time elapsed 29.794352054595947


In [3]:
# Preview only the first rows instead of rendering the entire frame
train.head(10)

Unnamed: 0,context_id,context_page_id,context_timestamp,instance_id,is_trade,item_brand_id,item_city_id,item_collected_level,item_id,item_price_level,...,item_property_list9,time,realtime,day,hour,predict_category_property0,predict_category_property1,predict_category_property2,predict_category_property3,predict_category_property4
0,282924576738839389,4006,1537236544,108641074714126964,0.0,453,50,4,3804,3,...,834,2018-09-18 02:09:04,2018-09-18 10:09:04,18,10,3970,1886,2373,8922,3703
1,4007979028023783431,4001,1537243232,5754713551599725161,0.0,453,50,4,3804,3,...,834,2018-09-18 04:00:32,2018-09-18 12:00:32,18,12,4458,6271,0,0,0
2,4884875192608989870,4001,1537211052,842679481291040981,0.0,453,50,4,3804,3,...,834,2018-09-17 19:04:12,2018-09-18 03:04:12,18,3,4298,4049,7225,0,0
3,840119421106178602,4016,1537222670,937088850059189027,0.0,453,50,4,3804,3,...,834,2018-09-17 22:17:50,2018-09-18 06:17:50,18,6,1892,3226,5464,4029,1056
4,1736769971710354684,4001,1537271320,7975697065017708072,0.0,453,50,4,3804,3,...,834,2018-09-18 11:48:40,2018-09-18 19:48:40,18,19,4458,10134,0,0,0
5,4434980272230296456,4003,1537282855,7764762765372067286,0.0,453,50,4,3804,3,...,834,2018-09-18 15:00:55,2018-09-18 23:00:55,18,23,4458,11362,2373,2318,6190
6,3622211816051289512,4001,1537280317,6956333474094867789,0.0,2030,9,8,339,8,...,1478,2018-09-18 14:18:37,2018-09-18 22:18:37,18,22,959,10875,2373,7776,3557
7,7851031132945961016,4001,1537261120,8387099821892927911,0.0,2030,9,8,339,8,...,1478,2018-09-18 08:58:40,2018-09-18 16:58:40,18,16,959,10875,2373,7776,3557
8,8388974876851097582,4001,1537208871,4021878205550012615,0.0,1267,9,10,5777,8,...,1595,2018-09-17 18:27:51,2018-09-18 02:27:51,18,2,2321,3226,4346,10206,2306
9,1138535512266486347,4003,1537285390,6499571365974135517,0.0,2030,9,8,339,8,...,1478,2018-09-18 15:43:10,2018-09-18 23:43:10,18,23,6709,1886,2914,2695,1656
