In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import datetime
% matplotlib inline

## Load the item table and preview its first rows
items_path = os.path.join(os.getcwd(), 'Data', 'tianchi_fresh_comp_train_item.csv')
items = pd.read_csv(items_path)
items.head()

Unnamed: 0,item_id,item_geohash,item_category
0,100002303,,3368
1,100003592,,7995
2,100006838,,12630
3,100008089,,7791
4,100012750,,9614


In [None]:
## Group the behaviour log by (user, item) pair. For every pair the behaviours,
## dates, hours and categories are concatenated into '|'-terminated strings.
users = pd.read_csv(
    os.path.join(os.getcwd(), 'Data', 'tianchi_fresh_comp_train_user.csv'),
    dtype={'user_id': object, 'item_id': object, 'behavior_type': object, 'item_category': object},
)
print('Read Done!\n')

# `time` is split on the single space into a date part and an hour part.
time_parts = users['time'].str.split(' ')
users['date'] = time_parts.str[0]
users['hours'] = time_parts.str[-1]
users = users.drop(['time'], axis=1)

# Only these columns are concatenated. `user_geohash` is left out explicitly:
# it is absent from the grouped output this notebook records, and relying on
# sum() to silently discard it is not portable across pandas versions.
joined_cols = ['behavior_type', 'date', 'hours', 'item_category']
users[joined_cols] += '|'
users = users.groupby(['user_id', 'item_id'])[joined_cols].agg(''.join).reset_index()

# Saved under Data/ because the feature-building cell reads the file from there.
users.to_csv(os.path.join(os.getcwd(), 'Data', 'users_items_behavior.csv'), index=False)
users.head(5)

In [3]:
# Split the data into a training set and a test set by date
def SplitTrainandTestData(users, date):
    """Split ``users`` on its ``date`` column.

    Parameters
    ----------
    users : DataFrame with a ``date`` column comparable to ``date``.
    date : the split day.

    Returns
    -------
    tuple ``(train_data, test_data)``: rows strictly before ``date`` and rows
    exactly on ``date``. Rows after ``date`` appear in neither part.
    """
    day = users['date']
    on_split_day = day == date
    before_split_day = day < date
    return users[before_split_day], users[on_split_day]

# Hold out 2014-12-18 as the test day; every earlier day is training data.
# NOTE(review): the recorded output below shows un-grouped rows that still have
# a `time` column, so this cell appears to have been run against the raw log
# rather than the grouped `users` frame - confirm the intended input.
split_12_18 = SplitTrainandTestData(users, '2014-12-18')
train_data_12_18, test_data_12_18 = split_12_18
print(train_data_12_18.head())
test_data_12_18.head()

    user_id    item_id  behavior_type user_geohash  item_category  \
0  10001082  285259775              1      97lk14c           4076   
1  10001082    4368907              1          NaN           5503   
2  10001082    4368907              1          NaN           5503   
3  10001082   53616768              1          NaN           9762   
4  10001082  151466952              1          NaN           5232   

            time        date hours  
0  2014-12-08 18  2014-12-08    18  
1  2014-12-12 12  2014-12-12    12  
2  2014-12-12 12  2014-12-12    12  
3  2014-12-02 15  2014-12-02    15  
4  2014-12-12 11  2014-12-12    11  


Unnamed: 0,user_id,item_id,behavior_type,user_geohash,item_category,time,date,hours
217,100029775,247380548,1,9t4qcck,10223,2014-12-18 13,2014-12-18,13
232,100029775,247380548,2,9t4qc3g,10223,2014-12-18 13,2014-12-18,13
250,100029775,205264014,1,9t4qccb,1863,2014-12-18 13,2014-12-18,13
274,100029775,205264014,1,9t4qcc0,1863,2014-12-18 13,2014-12-18,13
317,100029775,87557153,1,9t4qcbm,10894,2014-12-18 13,2014-12-18,13


In [16]:
# Scratch check of the '|'-joined grouping.
# The separators are appended on a derived frame instead of on `users` itself,
# so re-running this cell no longer stacks extra '|' characters onto the shared
# frame (the recorded output `1||||` shows that happening).
users_sep = users.assign(
    behavior_type=users['behavior_type'] + '|',
    date=users['date'] + '|',
    hours=users['hours'] + '|',
)
users_test = users_sep.groupby(['user_id', 'item_id']).sum().reset_index()
users_test.head(5)

Unnamed: 0,user_id,item_id,behavior_type,date,hours,item_category
0,10001082,110790001,1||||,2014-12-14|,16|,13230
1,10001082,115464321,1||||1||||1||||,2014-12-10|2014-12-10|2014-12-10|,13|13|13|,600060006000
2,10001082,117708332,1||||1||||1||||1||||,2014-12-08|2014-12-08|2014-12-08|2014-12-08|,18|19|19|18|,5176517651765176
3,10001082,120438507,1||||1||||1||||,2014-12-02|2014-12-02|2014-12-02|,15|12|12|,666966696669
4,10001082,125083630,1||||,2014-12-14|,03|,4722


In [5]:
# To reuse a previously saved grouping instead of the in-memory frame, uncomment:
# users = pd.read_csv('users_items_behavior.csv')
users.head()

Unnamed: 0,user_id,item_id,behavior_type,date,hours,item_category
0,100002985,101814844,1|1|,2014-11-30|2014-11-30|,14|14|,3381|3381|
1,100002985,11906278,1|1|3|,2014-11-30|2014-11-30|2014-11-30|,14|14|14|,3798|3798|3798|
2,100002985,143305462,1|1|,2014-11-29|2014-11-29|,08|08|,3381|3381|
3,100002985,147599246,4|1|1|,2014-12-17|2014-12-17|2014-12-17|,17|16|16|,10431|10431|10431|
4,100002985,157287038,1|1|,2014-11-30|2014-11-30|,22|22|,3798|3798|


In [152]:
## Convert a string to a datetime
def strtodatetime(datestr,format):
    """Parse ``datestr`` with the strptime pattern ``format``.

    Returns a ``datetime.datetime``; ``strptime`` raises ``ValueError`` when
    the string does not match the pattern.
    """
    parse = datetime.datetime.strptime
    return parse(datestr, format)

## Number of days between two dates
def datediff(beginDate,endDate):
    """Return the number of days from ``beginDate`` to ``endDate``.

    Both arguments are ``'YYYY-MM-DD'`` strings. The result is 0 for equal
    dates, positive when ``endDate`` is later, and negative when ``endDate``
    is earlier. The difference is computed with timedelta arithmetic; the
    previous day-by-day countdown never terminated when ``beginDate`` was
    after ``endDate``.

    Raises ``ValueError`` if either string is not a valid ``'YYYY-MM-DD'`` date.
    """
    date_format = "%Y-%m-%d"
    bd = datetime.datetime.strptime(beginDate, date_format)
    ed = datetime.datetime.strptime(endDate, date_format)
    return (ed - bd).days

## Input: behavior_type string, date string, split date
def get_label(x ,y, time):
    """Return 1 if the pair has a purchase (behaviour ``'4'``) on ``time``, else 0.

    Parameters
    ----------
    x : '|'-separated behaviour codes, e.g. ``'1|4|'``.
    y : '|'-separated dates aligned position-by-position with ``x``.
    time : the split day as a ``'YYYY-MM-DD'`` string.

    The behaviours and dates are paired positionally; a trailing empty token
    from the final '|' never matches. Activity recorded after ``time`` does
    not hide a purchase made on ``time``.
    """
    behaviors = x.split('|')
    days = y.split('|')
    bought_on_day = any(
        day == time and behavior == '4'
        for behavior, day in zip(behaviors, days)
    )
    return 1 if bought_on_day else 0

## Inputs: behavior_type string, date string, split date
## Basic features: decayed click / store / cart scores, purchase count, recency
def get_basic_features(x, y, time):
    """Build five history features for one (user, item) pair.

    Parameters
    ----------
    x : '|'-separated behaviour codes ('1' click, '2' store, '3' cart, '4' buy).
    y : '|'-separated 'YYYY-MM-DD' dates aligned position-by-position with ``x``.
    time : split day; only events strictly before it are used.

    Returns
    -------
    list ``[click_value, store_value, cart_value, buy_value, last_buy]``:
    * click/store/cart values sum ``exp(-0.1 * days_before_split)`` per event;
    * ``buy_value`` is the plain count of purchases (no decay);
    * ``last_buy`` is the smallest days-before-split over ALL kept events,
      whatever their behaviour code, not only over purchases.
    ``[0, 0, 0, 0, 0]`` is returned when no event precedes ``time``.
    """
    click_value, store_value, cart_value, buy_value = 0, 0, 0, 0
    days_before = []
    # Pair tokens positionally. Empty tokens (e.g. the one after a trailing
    # '|') are skipped, so the input works with or without a final separator.
    for behavior, day in zip(x.split('|'), y.split('|')):
        if not behavior or not day:
            continue
        # ISO dates compare correctly as strings; drop events on/after the split.
        if day >= time:
            continue
        age = datediff(day, time)
        weight = np.exp(-0.1 * age)
        if behavior == '1':
            click_value += weight
        elif behavior == '2':
            store_value += weight
        elif behavior == '3':
            cart_value += weight
        elif behavior == '4':
            buy_value += 1
        days_before.append(age)
    if not days_before:
        return [0] * 5
    return [click_value, store_value, cart_value, buy_value, min(days_before)]
    
        
# Rebuild the modelling table from the grouped (user, item) behaviour file.
split_day = '2014-12-18'
grouped_path = os.path.join(os.getcwd(), 'Data', 'users_items_behavior.csv')
users_test = pd.read_csv(grouped_path)

# Keep only the first '|'-separated category token of each pair.
users_test['item_category'] = users_test['item_category'].apply(lambda joined: joined.split('|')[0])

# Label: 1 when the pair has a purchase on the split day.
users_test['label'] = users_test[['behavior_type', 'date']].apply(
    lambda row: get_label(row['behavior_type'], row['date'], split_day), axis=1)

# One five-element feature list per row, then fanned out into named columns.
# The third column keeps the name 'cast_value' because the model cell selects it by that name.
feature_lists = users_test.apply(
    lambda row: get_basic_features(row['behavior_type'], row['date'], split_day), axis=1)
for position, column in enumerate(['click_value', 'store_value', 'cast_value', 'buy_value', 'last_buy']):
    users_test[column] = feature_lists.apply(lambda values, position=position: values[position])
users_test.head()

Unnamed: 0,user_id,item_id,behavior_type,date,hours,item_category,label,click_value,store_value,cast_value,buy_value,last_buy
0,100002985,101814844,1|1|,2014-11-30|2014-11-30|,14|14|,3381,0,0.330598,0,0.0,0,18
1,100002985,11906278,1|1|3|,2014-11-30|2014-11-30|2014-11-30|,14|14|14|,3798,0,0.330598,0,0.165299,0,18
2,100002985,143305462,1|1|,2014-11-29|2014-11-29|,08|08|,3381,0,0.299137,0,0.0,0,19
3,100002985,147599246,4|1|1|,2014-12-17|2014-12-17|2014-12-17|,17|16|16|,10431,0,1.809675,0,0.0,1,1
4,100002985,157287038,1|1|,2014-11-30|2014-11-30|,22|22|,3798,0,0.330598,0,0.0,0,18


In [None]:
# `sklearn.cross_validation` was removed from scikit-learn; cross_val_score
# lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

# Feature columns produced by the feature-building cell ('cast_value' is the
# cart feature; the spelling matches the column name created there).
feature_cols = ['click_value', 'store_value', 'cast_value', 'buy_value', 'last_buy']

# Fixed seed so the forest, and therefore the report, is reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
# `label` is passed as a 1-D column; a one-column DataFrame triggers a
# column-vector conversion warning in scikit-learn.
clf.fit(users_test.loc[0:4000000, feature_cols], users_test.loc[0:4000000, 'label'])
y_pred = clf.predict(users_test.loc[:, feature_cols])
#scores = cross_val_score(clf, users_test.loc[:, feature_cols], users_test['label'])
#scores.mean()
# NOTE(review): the report below is computed on every row, including the rows
# used for fitting, so it is not a held-out estimate.
print(metrics.classification_report(users_test['label'], y_pred))

In [150]:
# Reduce the table to the pair keys and attach the model's prediction as `label`.
users_test = users_test.loc[:, ['user_id', 'item_id']].assign(label=y_pred)
# Pairs predicted as purchases, without the label column.
predict = users_test.loc[users_test['label'] == 1, ['user_id', 'item_id']]
predict

Unnamed: 0,user_id,item_id
13884,100198255,371505655
19776,100260715,195684081
51217,100572118,288185494
79588,100925722,366882456


In [151]:
# Number of (user, item) rows in the scored table.
print(users_test.shape[0])

100001
