In [1]:
import time
import numpy as np 
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

### Features

In [2]:
# Read the space-delimited sample file, then show which columns were parsed.
df = pd.read_csv('./data/data.txt', delimiter=' ')
df.columns

Index(['instance_id', 'item_id', 'item_category_list', 'item_property_list',
       'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
       'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id',
       'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id',
       'context_timestamp', 'context_page_id', 'predict_category_property',
       'shop_id', 'shop_review_num_level', 'shop_review_positive_rate',
       'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description', 'is_trade'],
      dtype='object')

In [3]:
# Number of rows, shown as the 1-D shape of the item_id column.
df['item_id'].shape

(478138,)

In [4]:
# Number of columns in the frame.
len(df.columns)

27

In [5]:
# Preview the first ten rows.
df.iloc[:10]

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,context_page_id,predict_category_property,shop_id,shop_review_num_level,shop_review_positive_rate,shop_star_level,shop_score_service,shop_score_delivery,shop_score_description,is_trade
0,108641074714126964,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4006,5799347067982556520:-1;509660095530134768:-1;5...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
1,5754713551599725161,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:9172976955054793469;790838...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
2,842679481291040981,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:5131280576272319091;725801...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
3,937088850059189027,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4016,509660095530134768:-1;5799347067982556520:-1;7...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
4,7975697065017708072,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4001,5799347067982556520:9172976955054793469;790838...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
5,7764762765372067286,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,4003,5799347067982556520:9172976955054793469;871073...,6765930309048922341,4,1.0,5002,1.0,1.0,1.0,0
6,6956333474094867789,285660928590172217,7908382889764677758;8277336076276184272,2072967855524022579;5131280576272319091;263639...,9057103201734987852,548352491538518780,8,9,8,13,...,4001,2011981573061447208:7199361004668592209;827733...,4885989684392199728,15,0.985427,5012,0.974878,0.976863,0.969278,0
7,8387099821892927911,285660928590172217,7908382889764677758;8277336076276184272,2072967855524022579;5131280576272319091;263639...,9057103201734987852,548352491538518780,8,9,8,13,...,4001,2011981573061447208:7199361004668592209;827733...,4885989684392199728,15,0.985427,5012,0.974878,0.976863,0.969278,0
8,4021878205550012615,5202355029344881809,7908382889764677758;5755694407684602296,2072967855524022579;5131280576272319091;263639...,5520678735822176314,548352491538518780,8,9,10,16,...,4001,5755694407684602296:-1;5799347067982556520:-1;...,4885989684392199728,15,0.985427,5012,0.974878,0.976863,0.969278,0
9,6499571365974135517,285660928590172217,7908382889764677758;8277336076276184272,2072967855524022579;5131280576272319091;263639...,9057103201734987852,548352491538518780,8,9,8,13,...,4003,8277336076276184272:820214312075361939;5096600...,4885989684392199728,15,0.985427,5012,0.974878,0.976863,0.969278,0


### Change timestamp to date

In [6]:
# First ten raw timestamps (integer seconds), before converting them to dates.
df['context_timestamp'].iloc[:10]

0    1537236544
1    1537243232
2    1537211052
3    1537222670
4    1537271320
5    1537282855
6    1537280317
7    1537261120
8    1537208871
9    1537285390
Name: context_timestamp, dtype: int64

In [7]:
# Derive a 'YYYY-MM-DD' string per row from the epoch-seconds timestamp.
# NOTE(review): time.localtime() uses the kernel's local time zone, so the
# day each row lands on depends on the machine running the notebook - confirm
# which zone the data is meant to be bucketed in.
df['date'] = df['context_timestamp'].map(
    lambda ts: time.strftime('%Y-%m-%d', time.localtime(ts))
)


In [8]:
# Compare raw timestamps with the derived date strings (first five rows).
df.loc[:, ['context_timestamp', 'date']].head(5)

Unnamed: 0,context_timestamp,date
0,1537236544,2018-09-17
1,1537243232,2018-09-18
2,1537211052,2018-09-17
3,1537222670,2018-09-17
4,1537271320,2018-09-18


In [9]:
# Check that every distinct date string parses into a pandas Timestamp.
unique_date = df['date'].unique()
for day in unique_date:
    date_pivot = pd.to_datetime(day)
    # One print call, two lines of output: the raw string, then the Timestamp.
    print(day, date_pivot, sep='\n')

2018-09-17
2018-09-17 00:00:00
2018-09-18
2018-09-18 00:00:00
2018-09-20
2018-09-20 00:00:00
2018-09-21
2018-09-21 00:00:00
2018-09-19
2018-09-19 00:00:00
2018-09-22
2018-09-22 00:00:00
2018-09-23
2018-09-23 00:00:00
2018-09-24
2018-09-24 00:00:00


In [10]:
# How many distinct items appear in the data.
len(df['item_id'].unique())

10075

In [11]:
# How many distinct brands appear in the data.
len(df['item_brand_id'].unique())

2055

In [12]:
# Distinct price levels, in order of first appearance.
pd.unique(df['item_price_level'])

array([ 3,  8,  7,  5,  4,  6,  9,  2, 10,  1, 11,  0, 17, 16])

In [13]:
# Mean of is_trade per calendar date.
(
    df.groupby('date')['is_trade']
    .mean()
    .reset_index()
)

Unnamed: 0,date,is_trade
0,2018-09-17,0.024691
1,2018-09-18,0.019019
2,2018-09-19,0.020203
3,2018-09-20,0.019152
4,2018-09-21,0.018855
5,2018-09-22,0.019382
6,2018-09-23,0.016242
7,2018-09-24,0.01609


In [14]:
# Mean of is_trade per item; only the first ten items are shown.
(
    df.groupby('item_id')['is_trade']
    .mean()
    .reset_index()
    .head(10)
)

Unnamed: 0,item_id,is_trade
0,696490723789804,0.071429
1,1097631460775571,0.016949
2,1637165183538885,0.0
3,3341342041473146,0.0
4,4055398786868336,0.0
5,4255654217639344,0.0
6,6536469184064787,0.0
7,7683654146703952,0.0
8,9393908124420502,0.040816
9,10102212873966760,0.051282


In [15]:
# One-hot encode the gender ids.
# The encoder returns a sparse matrix by default, so the explicit `sparse=True`
# is dropped: scikit-learn 1.2 renamed that keyword to `sparse_output` and later
# removed the old spelling, which makes the original call raise TypeError on
# current releases. Relying on the default behaves the same on every version.
oh_encoder = OneHotEncoder(categories='auto')
# reshape((-1, 1)) turns the 1-D column into the 2-D (n_samples, 1) input the
# encoder expects; toarray() densifies the result for display.
oh_encoder.fit_transform(df['user_gender_id'].values.reshape((-1, 1))).toarray()

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]])

In [16]:
# Bag-of-tokens encoding of the ';'-separated category id strings.
cv = CountVectorizer()
# Slice the first ten rows while the matrix is still sparse and densify only
# that slice; the displayed values are the same as densifying everything first,
# without allocating the full dense matrix just for a preview.
cv.fit_transform(df['item_category_list'])[0:10].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0]], dtype=int64)

In [17]:
# Dense one-hot gender block and dense category-token block, one row per sample.
space = oh_encoder.fit_transform(df['user_gender_id'].values.reshape((-1, 1))).toarray()
val = cv.fit_transform(df['item_category_list']).toarray()
# Preview the column-wise concatenation. Only the first ten rows of each block
# are stacked, which gives the same rows as stacking everything and slicing
# afterwards, but avoids building the full combined array.
np.hstack((space[0:10], val[0:10]))

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 0., 0.]])

In [18]:
# Load the first saved feature-importance mapping and rank it, highest first.
# The file is opened in a `with` block so the handle is closed deterministically
# (the original left it open until garbage collection).
# NOTE(review): eval() executes whatever the file contains. Only run this on a
# file you produced yourself; if the file is a plain dict literal,
# ast.literal_eval would be the safer parser - confirm the file format first.
with open('feature_importance_1.txt', 'r') as fh:
    feature_importance_1 = eval(fh.read())
sorted(feature_importance_1.items(), key=lambda x: x[1], reverse=True)

[('user_star_level', 1686),
 ('user_age_level', 1039),
 ('shop_score_description', 843),
 ('shop_score_delivery', 742),
 ('item_sales_level', 656),
 ('shop_score_service', 652),
 ('shop_review_positive_rate', 639),
 ('item_collected_level', 535),
 ('item_pv_level', 495),
 ('user_occupation_id_4', 386),
 ('item_price_level', 332),
 ('user_gender_id_2', 318),
 ('user_occupation_id_1', 304),
 ('shop_review_num_level', 286),
 ('shop_star_level', 194),
 ('user_gender_id_1', 184),
 ('item_category_list_7', 100),
 ('item_city_id_94', 95),
 ('item_category_list_13', 86),
 ('item_id_3693', 83),
 ('item_category_list_11', 77),
 ('item_category_list_8', 72),
 ('item_category_list_14', 68),
 ('item_id_3658', 66),
 ('item_city_id_92', 65),
 ('user_occupation_id_3', 63),
 ('item_category_list_9', 60),
 ('item_brand_id_1292', 59),
 ('item_category_list_5', 55),
 ('item_brand_id_47', 55),
 ('item_id_2444', 51),
 ('item_id_1937', 46),
 ('item_brand_id_557', 45),
 ('item_city_id_71', 44),
 ('item_brand_

In [19]:
# Load the second saved feature-importance mapping and rank it, highest first.
# The file is opened in a `with` block so the handle is closed deterministically
# (the original left it open until garbage collection).
# NOTE(review): eval() executes whatever the file contains. Only run this on a
# file you produced yourself; if the file is a plain dict literal,
# ast.literal_eval would be the safer parser - confirm the file format first.
with open('feature_importance_2.txt', 'r') as fh:
    feature_importance_2 = eval(fh.read())
sorted(feature_importance_2.items(), key=lambda x: x[1], reverse=True)

[('user_star_level', 1417),
 ('user_age_level', 892),
 ('item_convrate', 789),
 ('age_item_convraterate', 765),
 ('shop_score_service', 709),
 ('shop_score_description', 639),
 ('shop_score_delivery', 624),
 ('shop_review_positive_rate', 604),
 ('first_to_now', 518),
 ('item_sales_level', 499),
 ('prev_to_now', 491),
 ('item_pv_level', 321),
 ('item_price_level', 298),
 ('shop_review_num_level', 294),
 ('user_occupation_id_1', 277),
 ('user_occupation_id_4', 228),
 ('item_collected_level', 208),
 ('user_gender_id_1', 192),
 ('user_gender_id_2', 164),
 ('shop_star_level', 125),
 ('recent_15_minutes', 94),
 ('item_category_list_11', 86),
 ('item_city_id_96', 82),
 ('item_id_3901', 60),
 ('item_brand_id_288', 59),
 ('item_id_2532', 58),
 ('shop_id_1606', 53),
 ('item_category_list_9', 50),
 ('user_occupation_id_3', 50),
 ('item_category_list_5', 49),
 ('item_brand_id_1327', 49),
 ('item_category_list_8', 48),
 ('item_category_list_14', 48),
 ('item_category_list_13', 46),
 ('item_brand_id

In [20]:
def _build_date_buf(date_pivot, left, right):
    date_buf = []
    for i in range(left, right):
        date = date_pivot + pd.Timedelta(i, unit='d')
        date_buf.append(date.strftime('%Y-%m-%d'))

    return date_buf

In [25]:
# Build a lagged conversion-rate feature: for each day, the mean of is_trade
# per key over the three preceding calendar days.
# Keep only the columns this step needs.
df = df[['user_id', 'item_id', 'context_timestamp', 'instance_id', 'user_age_level','date', 'is_trade']]
unique_date = df['date'].unique()
# Each entry is (grouping key columns, name of the feature column to create).
col_list = [(['item_id'], 'item_convrate')]
# One enriched frame per day, in the order the days appear in unique_date.
data_buf = []
for day in unique_date:
    date_pivot = pd.to_datetime(day)
    # Offsets -3, -2, -1: the three days before `day`; the day itself is
    # excluded, so a row's own outcome never feeds its feature.
    lag_days = _build_date_buf(date_pivot, -3, 0)
    target_df = df[df['date'] == day]
    lag_df = df[df['date'].isin(lag_days)]

    for cols, col_name in col_list:
        lag_g = lag_df.groupby(cols).is_trade.mean().reset_index()
        lag_g.columns = cols + [col_name]
        # Left join keeps every target row. Keys with no rows in the lag window
        # get 2 (presumably a sentinel outside the range a mean of is_trade can
        # take - confirm). The fill is limited to the new feature column so
        # missing values in any other column are not silently overwritten.
        target_df = pd.merge(target_df, lag_g, on=cols, how='left').fillna({col_name: 2})

    # Keep this day's result; previously every day except the last was discarded.
    data_buf.append(target_df)

# Displays the frame for the last day processed.
target_df

Unnamed: 0,user_id,item_id,context_timestamp,instance_id,user_age_level,date,is_trade,item_convrate
0,3434106476873985199,3417228998968466891,1537794490,2137051386648397465,1003,2018-09-24,0,0.000000
1,4790424684324799775,3417228998968466891,1537796611,8636603765413923862,1003,2018-09-24,0,0.000000
2,3873895643821589922,932459418921071956,1537789162,345383580586199930,1003,2018-09-24,0,0.017391
3,1604978399583145196,932459418921071956,1537786370,8731890116021855314,1004,2018-09-24,0,0.017391
4,1382061003987642971,932459418921071956,1537789681,4313209774350468112,1003,2018-09-24,0,0.017391
5,9219322646637270156,932459418921071956,1537780785,3105730058528341650,1003,2018-09-24,0,0.017391
6,5286513253890269807,932459418921071956,1537776331,61137454093449956,1003,2018-09-24,0,0.017391
7,6515290327014674385,932459418921071956,1537785378,4900634264217133300,1002,2018-09-24,1,0.017391
8,988569127238007467,932459418921071956,1537780703,3878416316017114607,1003,2018-09-24,0,0.017391
9,5332786985460695035,932459418921071956,1537792987,2091851204363732790,1004,2018-09-24,0,0.017391
