In [1]:
# coding: utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from collections import Counter
import time
import lightgbm as lgb
import xgboost as xgb
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings('ignore')
np.random.seed(2018)

In [2]:
# Both raw files are whitespace-delimited text tables, so they are read with the same options.
_read_opts = dict(delim_whitespace=True, index_col=None)
train = pd.read_table('../data/round2_train.txt', **_read_opts)
test = pd.read_table('../data/round2_ijcai_18_test_a_20180425.txt', **_read_opts)

# Drop exact duplicate training rows, then stack train on top of test under a fresh 0..n-1 index.
train = train.drop_duplicates().reset_index(drop=True)
data = pd.concat([train, test], ignore_index=True)

In [4]:
# Read the second test file and append it below the rows already held in `data`.
# Note: this cell is not idempotent -- running it twice appends the same rows twice.
test_b = pd.read_table(
    '../glq/2.data/round2_test_b.txt',
    delim_whitespace=True,
    index_col=None,
)
data = pd.concat([data, test_b], ignore_index=True)

In [6]:
def timestamp2date(ts):
    """Convert a Unix timestamp to ``[month, day, hour, minute, second]``.

    The fields are taken from ``time.localtime``, so they are expressed in the
    local time zone of the machine running the notebook.
    """
    local = time.localtime(ts)
    return [local.tm_mon, local.tm_mday, local.tm_hour, local.tm_min, local.tm_sec]

# Decode every timestamp exactly once and reuse the result for all derived columns.
# (Previously each of the three columns re-ran timestamp2date over the whole frame.)
# timestamp2date returns [month, day, hour, minute, second].
_time_parts = pd.DataFrame(
    data['context_timestamp'].apply(timestamp2date).tolist(),
    index=data.index,
    columns=['month', 'day', 'hour', 'minute', 'second'],
)
data['day'] = _time_parts['day']        # day of month
data['hour'] = _time_parts['hour']      # hour of day
data['minute'] = _time_parts['minute']  # minute of hour
# Index of the half-hour slot within the day: two slots per hour.
data['halfHour'] = data['hour'] * 2 + data['minute'] // 30

In [30]:
def hour_range(x):
    """Map an hour of day to a coarse bucket id.

    1-7 -> 0, 8-13 -> 1, 14-18 -> 2, anything else (including hour 0 and
    19-23) -> 3.
    """
    # (inclusive lower bound, inclusive upper bound, bucket id)
    buckets = ((1, 7, 0), (8, 13, 1), (14, 18, 2))
    for low, high, bucket in buckets:
        if low <= x <= high:
            return bucket
    return 3
# Bucket every row's hour of day into the coarse ranges defined by hour_range.
data['hour_range'] = data['hour'].map(hour_range)

In [31]:
# Persist the derived time columns so later sessions can reload them instead of recomputing.
# NOTE(review): 'total_hour' is not created by any cell visible above -- confirm it exists in
# `data` before running this line, otherwise the column selection raises KeyError.
# NOTE(review): this writes under 'featFile-b/' while the cell that reloads the time features
# reads 'featFile/timeFeat.csv' -- verify which directory is intended.
_time_cols = ['day', 'hour', 'minute', 'halfHour', 'total_hour', 'hour_range']
data[_time_cols].to_csv('featFile-b/timeFeat.csv', index=False)

In [5]:
# Reload the cached time features and attach them to `data` as extra columns.
# NOTE(review): the column-wise concat aligns on the row index, so this presumably relies on
# the CSV rows being in the same order as `data` -- confirm.
timeFeat = pd.read_csv('featFile/timeFeat.csv')
data = pd.concat([data, timeFeat], axis='columns')

In [40]:
# Reload the cached category features and attach them to `data` as extra columns.
# NOTE(review): as with the time features, the concat aligns on the row index -- the CSV is
# presumably in the same row order as `data`; confirm.
catFeat = pd.read_csv('featFile/catFeat.csv')
data = pd.concat([data, catFeat], axis='columns')

In [7]:
# Display (n_rows, n_columns) of the assembled frame as a quick sanity check.
data.shape

(12161692, 31)

# 公共函数

In [8]:
def computeCVR(s):
    """Return the conversion rate (share of 1s) of a 0/1 label series.

    Missing labels (NaN) are ignored in both the numerator and the
    denominator, so unlabelled rows that end up in a group do not dilute the
    rate. For a series without NaN the result equals ``s.sum() / len(s)``.

    Parameters
    ----------
    s : pandas.Series
        Labels, 1 for a conversion and 0 otherwise; may contain NaN.

    Returns
    -------
    float
        Conversions divided by the number of labelled rows, or NaN when the
        series holds no labelled row at all.
    """
    labelled = s.count()  # number of non-missing labels
    if labelled == 0:
        # Nothing to average over: report "unknown" instead of dividing by zero.
        return float('nan')
    # Series.sum skips NaN by default, matching the denominator above.
    return s.sum() / float(labelled)

# plotly的基本设置

In [9]:
# Axis styling shared by the plotly figures in this notebook.
_axis_style = {
    'showgrid': True,            # draw grid lines
    'zeroline': False,           # whether to draw the baseline through 0
    'nticks': 20,
    'showline': True,
    'mirror': 'all',
    'zerolinecolor': '#FF0000',
}

# Two independent dicts with the same content, one per axis.
xAxis = dict(_axis_style)
yAxis = dict(_axis_style)

# 点击量和转化率的变化曲线

In [25]:
# Aggregations are passed as a list of (output_name, function) pairs and the group key is
# restored with reset_index(). The previous dict form ({'count': 'count'}) on a single selected
# column is the deprecated "dict renaming" usage of SeriesGroupBy.agg.

# Rows with a known label; only these can contribute to a conversion rate.
_labelled = data[data.is_trade.notnull()]

# Number of records per day (labelled and unlabelled rows alike).
day_count_df = (
    data.groupby('day')['instance_id']
    .agg([('count', 'count')])
    .reset_index()
    .sort_values(by='day')
)

# Conversion rate per day over the labelled rows.
day_cvr_df = (
    _labelled.groupby('day')['is_trade']
    .agg([('cvr', computeCVR)])
    .reset_index()
    .sort_values(by='day')
)

# Number of records per value of total_hour.
hour_count_df = (
    data.groupby('total_hour')['instance_id']
    .agg([('count', 'count')])
    .reset_index()
    .sort_values(by='total_hour')
)

# Conversion rate per value of total_hour over the labelled rows.
hour_cvr_df = (
    _labelled.groupby('total_hour')['is_trade']
    .agg([('cvr', computeCVR)])
    .reset_index()
    .sort_values(by='total_hour')
)

In [16]:
# Display (n_rows, n_columns) of the rows whose `day` column equals 7.
data[data.day==7].shape

(2806831, 32)

# 天级点击量/转化率的变化曲线

In [26]:
# Daily record count, drawn against the default (left) y-axis.
dayClick_trace = go.Scatter(
    name='点击量',
    x=day_count_df['day'],
    y=day_count_df['count'],
)

# Daily conversion rate, drawn against a secondary y-axis.
dayCVR_trace = go.Scatter(
    name='转化率',
    yaxis='y2',
    x=day_cvr_df['day'],
    y=day_cvr_df['cvr'],
)

# The secondary axis sits on the right, overlays the primary one and uses its own colour.
_cvr_axis_color = 'rgb(148, 103, 189)'
layout = go.Layout(
    title='天级点击量/转化率变化曲线图',
    xaxis=xAxis,
    yaxis={'title': '点击量'},
    yaxis2={
        'title': '转化率',
        'titlefont': {'color': _cvr_axis_color},
        'tickfont': {'color': _cvr_axis_color},
        'overlaying': 'y',
        'side': 'right',
    },
)

_data = [dayClick_trace, dayCVR_trace]
fig = go.Figure(data=_data, layout=layout)
py.iplot(fig)

### 对小时进行12个小时的偏移 => 取6号后12小时和7号前12小时作为1天

In [27]:
# Shift the hour axis by 12 before cutting it into 24-hour buckets, so that each bucket runs
# from hour 12 of one day through hour 11 of the next.
data['day_shift'] = data['total_hour'].apply(lambda x: int((x+12)/24))

# Record count and conversion rate per shifted day, over the labelled rows only.
# The aggregations are given as (output_name, function) pairs instead of the deprecated
# dict-renaming form, and the group key is restored with reset_index().
shifted_day_count_cvr_df = (
    data[data.is_trade.notnull()]
    .groupby('day_shift')['is_trade']
    .agg([('count', 'count'), ('cvr', computeCVR)])
    .reset_index()
    .sort_values(by='day_shift')
)

# Record count on the default (left) y-axis.
shiftDayClick_trace = go.Scatter(
    x = shifted_day_count_cvr_df['day_shift'],
    y = shifted_day_count_cvr_df['count'],
    name = '点击量'
)

# Conversion rate on the secondary (right) y-axis.
shiftDayCVR_trace = go.Scatter(
    x = shifted_day_count_cvr_df['day_shift'],
    y = shifted_day_count_cvr_df['cvr'],
    name = '转化率',
    yaxis= 'y2'
)

_data = [shiftDayClick_trace, shiftDayCVR_trace]
layout = go.Layout(
    title='天级点击量/转化率变化曲线图',
    yaxis=dict(
        title='点击量'
    ),
    yaxis2=dict(
        title='转化率',
        titlefont=dict(
            color='rgb(148, 103, 189)'
        ),
        tickfont=dict(
            color='rgb(148, 103, 189)'
        ),
        overlaying='y',
        side='right'
    ),
    xaxis=xAxis
)
fig = go.Figure(data=_data, layout=layout)
py.iplot(fig)

# 小时级的点击量/转化率变化曲线

In [28]:
# Hourly record count, drawn against the default (left) y-axis.
hourClick_trace = go.Scatter(
    name='点击量',
    x=hour_count_df['total_hour'],
    y=hour_count_df['count'],
)

# Hourly conversion rate, drawn against a secondary y-axis.
hourCVR_trace = go.Scatter(
    name='转化率',
    yaxis='y2',
    x=hour_cvr_df['total_hour'],
    y=hour_cvr_df['cvr'],
)

# The secondary axis sits on the right, overlays the primary one and uses its own colour.
_cvr_axis_color = 'rgb(148, 103, 189)'
layout = go.Layout(
    title='小时级点击量/转化率变化曲线图',
    height=450,
    xaxis=xAxis,
    yaxis={'title': '点击量'},
    yaxis2={
        'title': '转化率',
        'titlefont': {'color': _cvr_axis_color},
        'tickfont': {'color': _cvr_axis_color},
        'overlaying': 'y',
        'side': 'right',
    },
)

_data = [hourClick_trace, hourCVR_trace]
fig = go.Figure(data=_data, layout=layout)
py.iplot(fig)

# 7号0点的突变情况

In [9]:
# Split on label availability: `is_trade` is present for some rows and missing for others.
_has_label = data['is_trade'].notnull()
# Labelled rows of day 7.
train_7 = data[_has_label & (data['day'] == 7)]
# Every unlabelled row (note: not restricted by day).
test_7 = data[~_has_label]

In [7]:
# Labelled rows whose total_hour is strictly greater than 7*24 - 5 (= 163).
# NOTE(review): the variable name says "17 hours"; confirm the cut-off matches that intent.
_cutoff_hour = 7 * 24 - 5
last_17_hour = data[data['is_trade'].notnull() & (data['total_hour'] > _cutoff_hour)]

In [17]:
# Peek at the first rows of the unlabelled subset to check its columns and time fields.
test_7.head()

Unnamed: 0,context_id,context_page_id,context_timestamp,instance_id,is_trade,item_brand_id,item_category_list,item_city_id,item_collected_level,item_id,...,user_occupation_id,user_star_level,day,hour,minute,halfHour,total_hour,hour_range,hourBetween1111AndNow,dayBefore1111
10432036,6619815129008424951,4014,1536320389,93294255633855,,3434999509929816124,836752724084922533;768579787521575291,196257267849351217,11,2615021910672727866,...,2002,3003,7,19,39,39,187,3,19,0
10432037,3917195934496477562,4001,1536322611,558322259509454,,2274097622688424543,836752724084922533;5685690139879409547;7497531...,3122721854741763495,12,3748469724871724205,...,2002,3008,7,20,16,40,188,3,20,0
10432038,6767420158915495601,4002,1536335648,594573634113186,,7838285046767229711,836752724084922533;768579787521575291,1019055478500227370,13,2111357467597282780,...,2005,3006,7,23,54,47,191,3,23,0
10432039,6218693273798012600,4001,1536310587,667327653735176,,1962186990434139903,836752724084922533;2211060154630359130;7848078...,2174699138227015967,11,3717651022681022321,...,2005,3005,7,16,56,33,184,2,16,0
10432040,5779090872698966722,4003,1536320745,697732672924394,,5390915051549817288,836752724084922533;1909641874861640857,196257267849351217,12,8284923347262844848,...,2005,3007,7,19,45,39,187,3,19,0


### shop相关特征的突变情况

In [18]:
# Ten shops with the most rows in `last_17_hour`.
top_shops = last_17_hour.groupby('shop_id').size().sort_values(ascending=False).index.tolist()[:10]
for _shop in top_shops:
    target_df = last_17_hour[last_17_hour.shop_id == _shop]

    # Hourly record count of this shop among the unlabelled rows.
    # Aggregations are given as (output_name, function) pairs instead of the deprecated
    # dict-renaming form; reset_index() restores the group key as a column.
    _test_shop = (
        test_7[test_7.shop_id == _shop]
        .groupby('total_hour')['instance_id']
        .agg([('count', 'count')])
        .reset_index()
        .sort_values(by='total_hour')
    )
    # Hourly record count and conversion rate of this shop among the labelled rows.
    _shop_click_cvr = (
        target_df.groupby('total_hour')['is_trade']
        .agg([('count', 'count'), ('cvr', computeCVR)])
        .reset_index()
    )

    # The count curve runs through the labelled window and continues into the unlabelled rows.
    click_trace = go.Scatter(
        x = _shop_click_cvr['total_hour'].tolist() + _test_shop['total_hour'].tolist(),
        y = _shop_click_cvr['count'].tolist() + _test_shop['count'].tolist(),
        name = '点击量',
    )

    # The conversion rate only exists where labels exist.
    cvr_trace = go.Scatter(
        x = _shop_click_cvr['total_hour'],
        y = _shop_click_cvr['cvr'],
        name = '转化率',
        yaxis= 'y2'
    )
    _data = [click_trace, cvr_trace]
    layout = go.Layout(
        title='shop-' + str(_shop) + ' 训练集最后17个小时级点击量/转化率变化曲线图',
        height = 400,
        yaxis=dict(
            title='点击量'
        ),
        yaxis2=dict(
            title='转化率',
            titlefont=dict(
                color='rgb(148, 103, 189)'
            ),
            tickfont=dict(
                color='rgb(148, 103, 189)'
            ),
            anchor='free',
            overlaying='y',
            side='right',
            position=0.9
        ),
        # Shrink the plot area to leave room for the free-floating right axis. This is done on
        # a per-figure copy so the shared xAxis settings used by other figures stay untouched.
        xaxis=dict(xAxis, domain=[0, 0.85])
    )
    fig = go.Figure(data=_data, layout=layout)
    py.iplot(fig)

### item相关属性的突变情况

In [19]:
# Ten items with the most rows in `last_17_hour`.
item_top = last_17_hour.groupby('item_id').size().sort_values(ascending=False).index.tolist()[:10]
for _item in item_top:
    target_df = last_17_hour[last_17_hour.item_id == _item]

    # Hourly record count of this item among the unlabelled rows.
    # Aggregations are given as (output_name, function) pairs instead of the deprecated
    # dict-renaming form; reset_index() restores the group key as a column.
    _test_item = (
        test_7[test_7.item_id == _item]
        .groupby('total_hour')['instance_id']
        .agg([('count', 'count')])
        .reset_index()
        .sort_values(by='total_hour')
    )
    # Hourly record count and conversion rate of this item among the labelled rows.
    _item_click_cvr = (
        target_df.groupby('total_hour')['is_trade']
        .agg([('count', 'count'), ('cvr', computeCVR)])
        .reset_index()
    )

    # The count curve runs through the labelled window and continues into the unlabelled rows.
    click_trace = go.Scatter(
        x = _item_click_cvr['total_hour'].tolist() + _test_item['total_hour'].tolist(),
        y = _item_click_cvr['count'].tolist() + _test_item['count'].tolist(),
        name = '点击量',
    )

    # The conversion rate only exists where labels exist.
    cvr_trace = go.Scatter(
        x = _item_click_cvr['total_hour'],
        y = _item_click_cvr['cvr'],
        name = '转化率',
        yaxis= 'y2'
    )

    # Hourly mean of item_sales_level (author's observation: it jumps at 00:00 of day 7).
    _item_sales = target_df['item_sales_level'].groupby(target_df['total_hour']).mean()
    sales_trace = go.Scatter(
        x = _item_sales.index.tolist(),
        y = _item_sales,
        name = 'item_sales_level',
        yaxis= 'y3'
    )
    # item_collected_level is not plotted (author's observation: it shows no change).
    # Hourly mean of item_pv_level (author's observation: it changes for some items).
    _item_pv = target_df['item_pv_level'].groupby(target_df['total_hour']).mean()
    pv_trace = go.Scatter(
        x = _item_pv.index.tolist(),
        y = _item_pv,
        name = 'item_pv_level',
        yaxis= 'y3'
    )

    _data = [click_trace, cvr_trace, sales_trace, pv_trace]
    layout = go.Layout(
        title='item-' + str(_item) + ' 训练集最后17个小时级点击量/转化率变化曲线图',
        height = 400,
        yaxis=dict(
            title='点击量'
        ),
        yaxis2=dict(
            title='转化率',
            titlefont=dict(
                color='rgb(148, 103, 189)'
            ),
            tickfont=dict(
                color='rgb(148, 103, 189)'
            ),
            anchor='free',
            overlaying='y',
            side='right',
            position=1
        ),
        yaxis3=dict(
            title='level',
            titlefont=dict(
                color='rgb(100, 55, 189)'
            ),
            tickfont=dict(
                color='rgb(100, 55, 189)'
            ),
            anchor='free',
            overlaying='y',
            side='right',
            position=0.9
        ),
        # Shrink the plot area to leave room for the two free-floating right axes. This is done
        # on a per-figure copy so the shared xAxis settings used by other figures stay untouched.
        xaxis=dict(xAxis, domain=[0, 0.85])
    )
    fig = go.Figure(data=_data, layout=layout)
    py.iplot(fig)

### item在天级的变化

In [28]:
# Day-level curves for the items in `item_top`, which must already be defined by the cell
# that ranks items over `last_17_hour`.
for _item in item_top:
    # Restrict to labelled rows: `data` also holds rows whose is_trade is missing, and counting
    # them in the denominator would understate the conversion rate on the days they fall on.
    # (The 'count' column is unaffected: it only ever counted non-missing labels.)
    _item_labelled = data[(data.item_id == _item) & data.is_trade.notnull()]
    # Aggregations are given as (output_name, function) pairs instead of the deprecated
    # dict-renaming form; reset_index() restores the group key as a column.
    _itemDayClickCountCvr = (
        _item_labelled.groupby('day')['is_trade']
        .agg([('count', 'count'), ('cvr', computeCVR)])
        .reset_index()
        .sort_values(by='day')
    )

    click_trace = go.Scatter(
        x = _itemDayClickCountCvr['day'],
        y = _itemDayClickCountCvr['count'],
        name = '点击量',
    )

    cvr_trace = go.Scatter(
        x = _itemDayClickCountCvr['day'],
        y = _itemDayClickCountCvr['cvr'],
        name = '转化率',
        yaxis= 'y2'
    )

    _data = [click_trace, cvr_trace]
    layout = go.Layout(
        title='item-' + str(_item) + ' 训练集天级点击量/转化率变化曲线图',
        height = 400,
        yaxis=dict(
            title='点击量'
        ),
        yaxis2=dict(
            title='转化率',
            titlefont=dict(
                color='rgb(148, 103, 189)'
            ),
            tickfont=dict(
                color='rgb(148, 103, 189)'
            ),
            anchor='free',
            overlaying='y',
            side='right',
            position=1
        ),
        yaxis3=dict(
            title='level',
            titlefont=dict(
                color='rgb(100, 55, 189)'
            ),
            tickfont=dict(
                color='rgb(100, 55, 189)'
            ),
            anchor='free',
            overlaying='y',
            side='right',
            position=0.9
        ),
        # Shrink the plot area to leave room for the free-floating right axes. This is done on
        # a per-figure copy so the shared xAxis settings used by other figures stay untouched.
        xaxis=dict(xAxis, domain=[0, 0.85])
    )
    fig = go.Figure(data=_data, layout=layout)
    py.iplot(fig)

### user的行为

In [25]:
# Ten users with the most rows in `last_17_hour`.
top_users = last_17_hour.groupby('user_id').size().sort_values(ascending=False).index.tolist()[:10]
for _user in top_users:
    target_df = last_17_hour[last_17_hour.user_id == _user]

    # Hourly record count of this user among the unlabelled rows.
    # Aggregations are given as (output_name, function) pairs instead of the deprecated
    # dict-renaming form; reset_index() restores the group key as a column.
    _test_user = (
        test_7[test_7.user_id == _user]
        .groupby('total_hour')['instance_id']
        .agg([('count', 'count')])
        .reset_index()
        .sort_values(by='total_hour')
    )
    # Hourly record count and conversion rate of this user among the labelled rows.
    _user_click_cvr = (
        target_df.groupby('total_hour')['is_trade']
        .agg([('count', 'count'), ('cvr', computeCVR)])
        .reset_index()
    )

    # The count curve runs through the labelled window and continues into the unlabelled rows.
    click_trace = go.Scatter(
        x = _user_click_cvr['total_hour'].tolist() + _test_user['total_hour'].tolist(),
        y = _user_click_cvr['count'].tolist() + _test_user['count'].tolist(),
        name = '点击量',
    )

    # The conversion rate only exists where labels exist.
    cvr_trace = go.Scatter(
        x = _user_click_cvr['total_hour'],
        y = _user_click_cvr['cvr'],
        name = '转化率',
        yaxis= 'y2'
    )
    _data = [click_trace, cvr_trace]
    layout = go.Layout(
        title='user-' + str(_user) + ' 训练集最后17个小时级点击量/转化率变化曲线图',
        height = 400,
        yaxis=dict(
            title='点击量'
        ),
        yaxis2=dict(
            title='转化率',
            titlefont=dict(
                color='rgb(148, 103, 189)'
            ),
            tickfont=dict(
                color='rgb(148, 103, 189)'
            ),
            anchor='free',
            overlaying='y',
            side='right',
            position=0.9
        ),
        # Shrink the plot area to leave room for the free-floating right axis. This is done on
        # a per-figure copy so the shared xAxis settings used by other figures stay untouched.
        xaxis=dict(xAxis, domain=[0, 0.85])
    )
    fig = go.Figure(data=_data, layout=layout)
    py.iplot(fig)

In [34]:
# Inspect one hard-coded user's records up to and including day 5, showing time, item and label,
# ordered by item and then by time.
data[(data.user_id == 2241738602037446802) & (data.day<=5)][['day', 'hour', 'minute',  'item_id', 'is_trade']].sort_values(by=['item_id','day', 'hour', 'minute'])

Unnamed: 0,day,hour,minute,item_id,is_trade
3356767,1,15,38,291863597853999747,0.0
6074440,3,11,59,306724454677942978,0.0
8267140,5,18,46,429734166178108124,0.0
8182643,5,19,12,494414263491800088,0.0
7773433,5,19,6,918942946050817604,0.0
8789158,5,19,0,1175098266167111859,0.0
8174017,5,18,47,1538997591983548946,0.0
2928673,1,15,41,1895536662906439704,0.0
8309364,5,19,2,2136739390345783023,0.0
8803118,5,18,47,3409810223079620877,0.0


In [42]:
# All rows labelled as a conversion (is_trade == 1).
_trades = data[data.is_trade == 1]
# Ten users with the most such rows.
top_trade_users = _trades.groupby('user_id').size().sort_values(ascending=False).index.tolist()[:10]

# For each of them, print the converted records in chronological order.
_trade_cols = ['day', 'hour', 'item_cat_1', 'item_cat_2', 'item_id', 'item_price_level']
for _user in top_trade_users:
    _user_trades = _trades[_trades.user_id == _user]
    print(_user_trades[_trade_cols].sort_values(by=['day', 'hour']))
    print('*********************************')

          day  hour           item_cat_1           item_cat_2  \
7178285     0    19   453525480366550911  9030622377694665158   
617149      6    22  3348197449185791127                   -1   
10172087    6    22   768579787521575291                   -1   
1771806     7     6   768579787521575291                   -1   
4814128     7     6  2871729383671301763  3492642177859571113   
4996263     7     6  2871729383671301763  3492642177859571113   
5064549     7     6  2871729383671301763  3492642177859571113   
5139543     7     6   453525480366550911  9030622377694665158   

                      item_id  item_price_level  
7178285   7675805080042249571                 4  
617149    3661650547622731995                 4  
10172087  8600958267759957157                 6  
1771806   3276757363875879966                 6  
4814128   2602568242504526258                 4  
4996263   3247428869924416739                 6  
5064549   4475772017721756076                 6  
5139543   8211

# shop相关特征

In [29]:
# Distribution of each shop score, bucketed to integer percent (score * 100, truncated).
_score_cols = ['shop_score_service', 'shop_score_delivery', 'shop_score_description', 'shop_review_positive_rate']
for _score in _score_cols:
    # Drop the -1 placeholder BEFORE scaling. The previous code scaled first and then replaced
    # -1, which never matched (the placeholder had already become -100) and left a spurious
    # bucket in the chart. Missing values are dropped as well so the integer cast cannot fail.
    _valid = data.loc[data[_score] != -1, _score].dropna()
    _pct = (_valid * 100).astype('int')
    # Number of records per integer-percent bucket, ordered by bucket value.
    _s = _pct.groupby(_pct).size()
    trace = go.Bar(
        x = _s.index.tolist(),
        y = _s,
    )
    _data = [trace]
    layout = go.Layout(
        # The old title was copied from the hourly click/CVR charts; this is a value distribution.
        title=str(_score) + ' 取值分布(x100 取整)',
        height = 400,
        yaxis=dict(
            title='点击量'
        )
    )
    fig = go.Figure(data=_data, layout=layout)
    py.iplot(fig)

In [34]:
# Every row (labelled or not) whose `day` column equals 7, followed by its (rows, columns) shape.
data_7 = data.loc[data['day'] == 7]
data_7.shape

(2806831, 34)

In [None]:
def func(row, df):
    """Flag whether the row's user has earlier records with the same item / brand / shop.

    "Earlier" means rows of ``df`` with the same ``user_id`` and a strictly
    smaller ``context_timestamp``. Three entries are added to ``row``:
    ``already_buy_item``, ``already_buy_brand`` and ``already_buy_shop``.
    Each is 1 if such an earlier row shares the value, 0 if not, and NaN when
    the row's own value is the placeholder -1.

    NOTE(review): despite the ``already_buy_*`` names, no ``is_trade`` filter
    is applied, so any earlier record counts -- confirm this is intended.

    Parameters
    ----------
    row : pandas.Series
        One record; modified in place and returned.
    df : pandas.DataFrame
        Frame that is searched for the user's earlier records.
    """
    happened_before = df['context_timestamp'] < row['context_timestamp']
    same_user = df['user_id'] == row['user_id']
    history = df[happened_before & same_user]

    # (output flag, source column) pairs, written in this order.
    targets = (
        ('already_buy_item', 'item_id'),
        ('already_buy_brand', 'item_brand_id'),
        ('already_buy_shop', 'shop_id'),
    )
    for flag, column in targets:
        current = row[column]
        if current == -1:
            row[flag] = np.nan
        else:
            row[flag] = int((history[column] == current).any())
    return row

def already_buy(df):
    """Run ``func`` on every row of ``df``, using ``df`` itself as the history to search.

    Returns the frame produced by the row-wise apply, i.e. the input rows
    extended with the ``already_buy_*`` entries that ``func`` adds.

    Note: ``func`` filters the whole frame once per row, so the cost grows
    with the square of the number of rows.
    """
    return df.apply(func, axis=1, args=(df,))
        
    
# Labelled rows of day 7, extended with the already_buy_* flags. Each row is compared against
# the other labelled day-7 rows only.
train_data_7 = data_7.loc[data_7['is_trade'].notnull()]
train_data_7 = already_buy(train_data_7)