In [20]:
#encoding:utf-8
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
import warnings
import datetime

warnings.filterwarnings("ignore")

import time

from utils import raw_data_path, dump_pickle

# Relative directory that the raw data files are read from.
path = '../data/'
# File names (joined onto `path`) of the training split and the test split.
train_file = 'round1_ijcai_18_train_20180301.txt'
test_file = 'round1_ijcai_18_test_a_20180301.txt'

# NOTE(review): disabled, unfinished draft of a loader (it never returns `df`).
# Its `test` line reads `train_file`; it should read `test_file` if re-enabled.
# def load_data():
#     train = pd.read_table(path + train_file, encoding='utf8', delim_whitespace=True)
#     test = pd.read_table(path + train_file, encoding='utf8', delim_whitespace=True)
#     df = pd.concat([train, test], axis=0, ignore_index=True)

def date_convert(data):
    """Add Beijing-time date features derived from ``context_timestamp``.

    The frame is modified in place and also returned. Four columns are
    added:

    * ``time``     - ``context_timestamp`` parsed as Unix seconds.
    * ``realtime`` - ``time`` shifted forward by 8 hours (Beijing time).
    * ``day``      - day of month of ``realtime``.
    * ``hour``     - hour of day of ``realtime``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``context_timestamp`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the columns above added.
    """
    # Transform into datetime format (timestamps are interpreted as seconds).
    data['time'] = pd.to_datetime(data.context_timestamp, unit='s')

    # Transform into Beijing time. A single vectorized addition replaces the
    # former per-row ``apply`` and yields the same values.
    data['realtime'] = data['time'] + pd.Timedelta(hours=8)
    data['day'] = data['realtime'].dt.day
    data['hour'] = data['realtime'].dt.hour

    return data

def base_process(data):
    """Encode categorical columns, impute ``-1`` placeholders and add time features.

    The frame is modified in place and also returned. Steps, in order:

    * item: ``item_category_list`` (positions 1-2) and ``item_property_list``
      (positions 0-9) are split on ``';'`` and each position is label-encoded
      into ``<column><position>``; the source list columns are dropped.
      ``item_id``, ``item_brand_id`` and ``item_city_id`` are label-encoded.
      ``item_sales_level`` is mean-imputed.
    * user: ``user_age_level`` is mode-imputed and reduced modulo 1000;
      ``user_star_level`` is mean-imputed and reduced modulo 3000.
    * context: ``date_convert`` adds ``time``/``realtime``/``day``/``hour``;
      ``predict_category_property`` (positions 0-4) is split and encoded like
      the item lists.
    * shop: ``shop_score_service``, ``shop_score_delivery`` and
      ``shop_score_description`` are mean-imputed.

    Fixes relative to the previous implementation:

    * ``-1`` is replaced only in the column being imputed. Previously
      ``data[mask] = None`` wiped every column of the matching rows.
    * Imputed values are assigned back. Previously the ``fillna`` result was
      discarded, so nothing was filled.
    * The mode is passed to ``fillna`` as a scalar. Previously the Series
      returned by ``mode()`` was passed, which aligns on the index.
    * The delivery/description scores are checked against their own columns
      instead of ``user_age_level``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    lbl = preprocessing.LabelEncoder()
    print(data.columns)

    def _encode_positions(col, positions):
        # Split each value once, then label-encode every requested position.
        # Rows with fewer tokens than the position are encoded from ''.
        tokens = data[col].map(str).str.split(';')
        for i in positions:
            data[col + str(i)] = lbl.fit_transform(tokens.str.get(i).fillna(''))
        del data[col]

    def _impute(col, how='mean'):
        # -1 is the missing marker; blank it in this column only, then fill
        # with the mean or the most frequent value of the remaining entries.
        values = data[col].replace(-1, np.nan)
        if how == 'mode':
            modes = values.mode()
            fill = modes.iloc[0] if len(modes) else np.nan
        else:
            fill = values.mean()
        # If every value is missing there is nothing to fill with.
        if pd.notna(fill):
            values = values.fillna(fill)
        data[col] = values

    print("========================item==========================")
    # Divided into different category levels and LabelEncoder()
    # (position 0 of item_category_list is intentionally skipped, as before).
    _encode_positions('item_category_list', range(1, 3))
    _encode_positions('item_property_list', range(10))

    for col in ['item_id', 'item_brand_id', 'item_city_id']:
        data[col] = lbl.fit_transform(data[col])

    # Fill none with mean
    _impute('item_sales_level', how='mean')

    print("========================user==========================")
    # user_gender_id and user_occupation_id should be handled with one-hot
    _impute('user_age_level', how='mode')
    data['user_age_level'] = data['user_age_level'] % 1000

    _impute('user_star_level', how='mean')
    data['user_star_level'] = data['user_star_level'] % 3000

    print("=====================context==========================")
    data = date_convert(data)

    _encode_positions('predict_category_property', range(5))

    print("=====================shop===============================")
    for col in ['shop_score_service', 'shop_score_delivery', 'shop_score_description']:
        _impute(col, how='mean')

    return data
    

if __name__ == "__main__":
    # Load both splits, preprocess them together so encoders see every value,
    # then pickle the full frame plus the train/valid/test subsets.
    start = time.time()
    print("Load Data")
    train = pd.read_table(path + train_file, encoding='utf8', delim_whitespace=True)
    # Fix: the test split is read from test_file (train_file was loaded twice before).
    test = pd.read_table(path + test_file, encoding='utf8', delim_whitespace=True)
    len_train = train.shape[0]
    # ignore_index keeps row positions aligned with len_train for the split below.
    df = pd.concat([train, test], axis=0, ignore_index=True)
    print("Start doing preprocessing")
    df = base_process(df)
    dump_pickle(df, path=raw_data_path + 'df.pkl')

    # Train/valid are cut by day from the rows that came from the training
    # file only, so rows from the test file can never end up in either.
    labelled = df.iloc[:len_train]
    train = labelled[(labelled['day'] >= 18) & (labelled['day'] <= 23)]
    valid = labelled[labelled['day'] == 24]
    dump_pickle(train, path=raw_data_path + 'train.pkl')
    dump_pickle(valid, path=raw_data_path + 'valid.pkl')

    # Everything after the training rows is the test split.
    test = df.iloc[len_train:]
    dump_pickle(test, path=raw_data_path + 'test.pkl')

    end = time.time()
    print("Preprocessing done and time elapsed %s" % (end-start))
    
    
    
     


Load Data
Start doing preprocessing
Index(['instance_id', 'item_id', 'item_category_list', 'item_property_list',
       'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
       'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id',
       'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id',
       'context_timestamp', 'context_page_id', 'predict_category_property',
       'shop_id', 'shop_review_num_level', 'shop_review_positive_rate',
       'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description', 'is_trade'],
      dtype='object')
Preprocessing done and time elapsed 154.03813195228577


In [4]:
#encoding:utf-8
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
import warnings
import datetime

warnings.filterwarnings("ignore")

import time
from feature_convert import load_data
from utils import raw_data_path, dump_pickle

# Relative directory of the raw data files, and the file names of the
# training and test splits inside it.
path = '../data/'
train_file = 'round1_ijcai_18_train_20180301.txt'
test_file = 'round1_ijcai_18_test_a_20180301.txt'

In [5]:
# load_data is imported from the project's feature_convert module.
# NOTE(review): the column listing further down includes `is_train`, so this
# presumably returns train and test concatenated with a split flag — confirm.
data = load_data()



### base_process

In [9]:
# Inspect which columns are available before building features.
data.columns.values

array([u'context_id', u'context_page_id', u'context_timestamp',
       u'instance_id', u'is_trade', 'is_train', u'item_brand_id',
       u'item_category_list', u'item_city_id', u'item_collected_level',
       u'item_id', u'item_price_level', u'item_property_list',
       u'item_pv_level', u'item_sales_level',
       u'predict_category_property', u'shop_id', u'shop_review_num_level',
       u'shop_review_positive_rate', u'shop_score_delivery',
       u'shop_score_description', u'shop_score_service',
       u'shop_star_level', u'user_age_level', u'user_gender_id',
       u'user_id', u'user_occupation_id', u'user_star_level'],
      dtype=object)

In [15]:
# Bar chart of how many ';'-separated entries each item_property_list holds
# (number of separators + 1 equals the number of entries).
(
    data['item_property_list']
    .map(lambda props: str(props).count(";") + 1)
    .value_counts()
    .plot.bar()
)

<matplotlib.axes._subplots.AxesSubplot at 0x7f9bd7fff950>

In [11]:
# Label-encode positions 1 and 2 of the ';'-separated item_category_list
# (position 0 is skipped). Rows without that position are encoded from ''.
lbl = preprocessing.LabelEncoder()
for i in (1, 2):
    data['item_category_list' + str(i)] = lbl.fit_transform(
        data['item_category_list'].map(str).str.split(';').str.get(i).fillna('')
    )

# del data['item_category_list'] 

Unnamed: 0,context_id,context_page_id,context_timestamp,instance_id,is_trade,is_train,item_brand_id,item_category_list,item_city_id,item_collected_level,...,shop_score_description,shop_score_service,shop_star_level,user_age_level,user_gender_id,user_id,user_occupation_id,user_star_level,item_category_list1,item_category_list2
0,282924576738839389,4006,1537236544,108641074714126964,0.0,1,1975590437749032870,7908382889764677758;5799347067982556520,3948283326616421003,4,...,1.000000,1.000000,5002,1003,1,4505772604969228686,2005,3003,9,0
1,4007979028023783431,4001,1537243232,5754713551599725161,0.0,1,1975590437749032870,7908382889764677758;5799347067982556520,3948283326616421003,4,...,1.000000,1.000000,5002,1002,0,2692638157208937547,2005,3006,9,0
2,4884875192608989870,4001,1537211052,842679481291040981,0.0,1,1975590437749032870,7908382889764677758;5799347067982556520,3948283326616421003,4,...,1.000000,1.000000,5002,1003,0,5247924392014515924,2005,3004,9,0
3,840119421106178602,4016,1537222670,937088850059189027,0.0,1,1975590437749032870,7908382889764677758;5799347067982556520,3948283326616421003,4,...,1.000000,1.000000,5002,1004,1,2681414445369714628,2005,3006,9,0
4,1736769971710354684,4001,1537271320,7975697065017708072,0.0,1,1975590437749032870,7908382889764677758;5799347067982556520,3948283326616421003,4,...,1.000000,1.000000,5002,1002,0,2729475788342039013,2005,3001,9,0
5,4434980272230296456,4003,1537282855,7764762765372067286,0.0,1,1975590437749032870,7908382889764677758;5799347067982556520,3948283326616421003,4,...,1.000000,1.000000,5002,1004,1,4512655448325954611,2005,3002,9,0
6,3622211816051289512,4001,1537280317,6956333474094867789,0.0,1,9057103201734987852,7908382889764677758;8277336076276184272,548352491538518780,8,...,0.969278,0.974878,5012,1006,1,8811056487516803043,2005,3000,11,0
7,7851031132945961016,4001,1537261120,8387099821892927911,0.0,1,9057103201734987852,7908382889764677758;8277336076276184272,548352491538518780,8,...,0.969278,0.974878,5012,1002,0,6507704883896466138,2002,3002,11,0
8,8388974876851097582,4001,1537208871,4021878205550012615,0.0,1,5520678735822176314,7908382889764677758;5755694407684602296,548352491538518780,10,...,0.969278,0.974878,5012,1003,0,6203308008480593423,2002,3007,8,0
9,1138535512266486347,4003,1537285390,6499571365974135517,0.0,1,9057103201734987852,7908382889764677758;8277336076276184272,548352491538518780,8,...,0.969278,0.974878,5012,1003,0,6041712044514783312,2005,3006,11,0


In [None]:
def base_process(data):
    """Encode categorical columns, impute ``-1`` placeholders and add time features.

    The frame is modified in place and also returned. Steps, in order:

    * item: ``item_category_list`` (positions 1-2) and ``item_property_list``
      (positions 0-9) are split on ``';'`` and each position is label-encoded
      into ``<column><position>``; the source list columns are dropped.
      ``item_id``, ``item_brand_id`` and ``item_city_id`` are label-encoded.
      ``item_sales_level`` is mean-imputed.
    * user: ``user_age_level`` is mode-imputed and reduced modulo 1000;
      ``user_star_level`` is mean-imputed and reduced modulo 3000.
    * context: ``date_convert`` adds ``time``/``realtime``/``day``/``hour``;
      ``predict_category_property`` (positions 0-4) is split and encoded like
      the item lists.
    * shop: ``shop_score_service``, ``shop_score_delivery`` and
      ``shop_score_description`` are mean-imputed.

    Fixes relative to the previous implementation:

    * ``-1`` is replaced only in the column being imputed. Previously
      ``data[mask] = None`` wiped every column of the matching rows.
    * Imputed values are assigned back. Previously the ``fillna`` result was
      discarded, so nothing was filled.
    * The mode is passed to ``fillna`` as a scalar. Previously the Series
      returned by ``mode()`` was passed, which aligns on the index.
    * The delivery/description scores are checked against their own columns
      instead of ``user_age_level``.

    NOTE(review): this relies on ``date_convert`` being defined in the
    running kernel — confirm it is defined or imported before this cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    lbl = preprocessing.LabelEncoder()
    print(data.columns)

    def _encode_positions(col, positions):
        # Split each value once, then label-encode every requested position.
        # Rows with fewer tokens than the position are encoded from ''.
        tokens = data[col].map(str).str.split(';')
        for i in positions:
            data[col + str(i)] = lbl.fit_transform(tokens.str.get(i).fillna(''))
        del data[col]

    def _impute(col, how='mean'):
        # -1 is the missing marker; blank it in this column only, then fill
        # with the mean or the most frequent value of the remaining entries.
        values = data[col].replace(-1, np.nan)
        if how == 'mode':
            modes = values.mode()
            fill = modes.iloc[0] if len(modes) else np.nan
        else:
            fill = values.mean()
        # If every value is missing there is nothing to fill with.
        if pd.notna(fill):
            values = values.fillna(fill)
        data[col] = values

    print("========================item==========================")
    # Divided into different category levels and LabelEncoder()
    # (position 0 of item_category_list is intentionally skipped, as before).
    _encode_positions('item_category_list', range(1, 3))
    _encode_positions('item_property_list', range(10))

    for col in ['item_id', 'item_brand_id', 'item_city_id']:
        data[col] = lbl.fit_transform(data[col])

    # Fill none with mean
    _impute('item_sales_level', how='mean')

    print("========================user==========================")
    # user_gender_id and user_occupation_id should be handled with one-hot
    _impute('user_age_level', how='mode')
    data['user_age_level'] = data['user_age_level'] % 1000

    _impute('user_star_level', how='mean')
    data['user_star_level'] = data['user_star_level'] % 3000

    print("=====================context==========================")
    data = date_convert(data)

    _encode_positions('predict_category_property', range(5))

    print("=====================shop===============================")
    for col in ['shop_score_service', 'shop_score_delivery', 'shop_score_description']:
        _impute(col, how='mean')

    return data
    