In [1]:
import pandas as pd
import numpy as np
import time
from collections import Counter
import datetime

In [2]:
def read_data(train_data_path, test_data_path):
    train = pd.read_csv(train_data_path, sep=' ', encoding='utf-8')
    test = pd.read_csv(test_data_path, sep=' ', encoding='utf-8')
    train.drop_duplicates(inplace=True)
    test['is_trade'] = np.nan
    print('shape of train data: ', train.shape)
    print('shape of test data: ', test.shape)
    # 将训练集和测试集拼接在一起，方便特征处理
    data = pd.concat([train, test], axis=0)
    print('shape of whole data: ', data.shape)
    return train, test, data

In [3]:
train, test, data = read_data('./data/round1_ijcai_18_train_20180301.txt', './data/round1_ijcai_18_test_a_20180301.txt')

shape of train data:  (478111, 27)
shape of test data:  (18371, 27)
shape of whole data:  (496482, 27)


In [4]:
# 处理时间数据
def timestamp_datetime(value):
    format = '%Y-%m-%d %H:%M:%S'
    dt = time.localtime(value)
    dt = time.strftime(format, dt)
    dt = datetime.datetime.strptime(dt, format)
    return dt

In [5]:
data['time'] = data.context_timestamp.apply(timestamp_datetime)
data['day'] = data.time.apply(lambda x: int(x.day))
data['hour'] = data.time.apply(lambda x: int(x.hour))

In [6]:
# 计算历史CTR
def history_ctr(train_data):
    # item_trade_data = train_data[['item_id', 'is_trade']]
    item_id_df = pd.DataFrame(train_data.item_id.unique()).reset_index(drop=True).rename(columns={0: 'item_id'})
    item_id_list = train_data.item_id.unique()
    history_ctr_list = []
    for item in item_id_list:
        display_data = train_data[train_data.item_id == item]
        # 为了避免某些商品展示次数很低，但却有点击，导致历史CTR过高而不真实，故而需要设定阈值干预处理
        history_ctr_value = np.sum(display_data['is_trade']) / len(display_data)
        if len(display_data) < 100 and history_ctr_value > 0.01:
            history_ctr_value = 0.01
        history_ctr_list.append(history_ctr_value)
    item_id_df['history_ctr'] = history_ctr_list
    return item_id_df

In [7]:
history_ctr_df = history_ctr(train)

In [8]:
data = pd.merge(data, history_ctr_df, on='item_id', how='left')
# 对缺失值补0
data['history_ctr'].fillna(0., inplace=True)
data.head()

Unnamed: 0,instance_id,item_id,item_category_list,item_property_list,item_brand_id,item_city_id,item_price_level,item_sales_level,item_collected_level,item_pv_level,...,shop_review_positive_rate,shop_star_level,shop_score_service,shop_score_delivery,shop_score_description,is_trade,time,day,hour,history_ctr
0,108641074714126964,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,1.0,5002,1.0,1.0,1.0,0.0,2018-09-18 10:09:04,18,10,0.0
1,5754713551599725161,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,1.0,5002,1.0,1.0,1.0,0.0,2018-09-18 12:00:32,18,12,0.0
2,842679481291040981,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,1.0,5002,1.0,1.0,1.0,0.0,2018-09-18 03:04:12,18,3,0.0
3,937088850059189027,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,1.0,5002,1.0,1.0,1.0,0.0,2018-09-18 06:17:50,18,6,0.0
4,7975697065017708072,3412720377098676069,7908382889764677758;5799347067982556520,2072967855524022579;5131280576272319091;263639...,1975590437749032870,3948283326616421003,3,3,4,14,...,1.0,5002,1.0,1.0,1.0,0.0,2018-09-18 19:48:40,18,19,0.0


In [9]:
# 对于item_category_list，由于第一个类别都一样，第三个类别只有少量几个样本有，所以只取第二个类别
data['item_category_list'] = data['item_category_list'].apply(lambda x: x.strip().split(';')[1])

In [10]:
# item_property_list取值太多，舍弃不用
item_property = []

item_property_list = data['item_property_list']
for p in item_property_list:
    p_list = p.strip().split(';')
    p_list.sort()
    if len(p_list) < 6:
        print(p)
    new_p_list = []
    for i in range(5):
        try:
            new_p_list.append(p_list[i])
        except:
            new_p_list.append(np.nan)
    item_property.append(new_p_list)
# for i in range(len(data)):
#     item = data.loc([i, 'item_id'])
#     print(item)
len(data['item_property_list'].unique())

11181

In [11]:
item_property_df = pd.DataFrame(item_property).rename(columns={0:'item_property_0', 1: 'item_property_1', 
                                                             2: 'item_property_2', 3: 'item_property_3',
                                                             4: 'item_property_4'})

In [12]:
item_property_df.head()

Unnamed: 0,item_property_0,item_property_1,item_property_2,item_property_3,item_property_4
0,1782439090818545916,2072967855524022579,2636395404473730413,3408398779125901630,3657871859501171040
1,1782439090818545916,2072967855524022579,2636395404473730413,3408398779125901630,3657871859501171040
2,1782439090818545916,2072967855524022579,2636395404473730413,3408398779125901630,3657871859501171040
3,1782439090818545916,2072967855524022579,2636395404473730413,3408398779125901630,3657871859501171040
4,1782439090818545916,2072967855524022579,2636395404473730413,3408398779125901630,3657871859501171040


In [13]:
print(len(item_property_df['item_property_0'].unique()))
print(len(item_property_df['item_property_1'].unique()))
print(len(item_property_df['item_property_2'].unique()))
print(len(item_property_df['item_property_3'].unique()))
print(len(item_property_df['item_property_4'].unique()))

2263
2405
2597
2524
2584


In [14]:
len(data) == len(item_property_df)

True

In [15]:
#data = pd.concat([data, item_property_df], axis=1)

In [16]:
len(data.item_brand_id.unique())

2075

In [17]:
len(data.item_city_id.unique())

128

In [18]:
columns = data.columns
columns

Index(['instance_id', 'item_id', 'item_category_list', 'item_property_list',
       'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level',
       'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id',
       'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id',
       'context_timestamp', 'context_page_id', 'predict_category_property',
       'shop_id', 'shop_review_num_level', 'shop_review_positive_rate',
       'shop_star_level', 'shop_score_service', 'shop_score_delivery',
       'shop_score_description', 'is_trade', 'time', 'day', 'hour',
       'history_ctr'],
      dtype='object')

In [19]:
data.context_page_id.unique()

array([4006, 4001, 4016, 4003, 4010, 4005, 4020, 4014, 4007, 4002, 4004,
       4015, 4011, 4012, 4009, 4018, 4008, 4017, 4013, 4019], dtype=int64)

In [20]:
other_features = ['instance_id', 'item_id', 'shop_id', 'day', 'shop_review_positive_rate', 'shop_score_service', 
                  'shop_score_delivery', 'shop_score_description', 'history_ctr']
#'item_property_0', 'item_property_1',
#                        'item_property_2', 'item_property_3', 'item_property_4',
#  ,'item_brand_id'
categorical_features = ['item_category_list', 'item_city_id',
                        'item_price_level', 'item_sales_level', 
                        'item_collected_level',  'item_pv_level', 'user_gender_id', 'user_age_level', 
                        'user_occupation_id', 'user_star_level', 'hour', 'context_page_id', 
                        'shop_review_num_level', 'shop_star_level', ]

In [21]:
new_data = data[other_features]

In [22]:
for feat in categorical_features:
    categorical_data = data[feat].astype('str')
    categorical_data = pd.get_dummies(categorical_data, dummy_na=True, prefix=feat)
    new_data = pd.concat([new_data, categorical_data], axis=1)
    print(feat, ' finished!')

item_category_list  finished!
item_city_id  finished!
item_price_level  finished!
item_sales_level  finished!
item_collected_level  finished!
item_pv_level  finished!
user_gender_id  finished!
user_age_level  finished!
user_occupation_id  finished!
user_star_level  finished!
hour  finished!
context_page_id  finished!
shop_review_num_level  finished!
shop_star_level  finished!


In [23]:
new_data.head()

Unnamed: 0,instance_id,item_id,shop_id,day,shop_review_positive_rate,shop_score_service,shop_score_delivery,shop_score_description,history_ctr,item_category_list_1968056100269760729,...,shop_star_level_5012,shop_star_level_5013,shop_star_level_5014,shop_star_level_5015,shop_star_level_5016,shop_star_level_5017,shop_star_level_5018,shop_star_level_5019,shop_star_level_5020,shop_star_level_nan
0,108641074714126964,3412720377098676069,6765930309048922341,18,1.0,1.0,1.0,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
1,5754713551599725161,3412720377098676069,6765930309048922341,18,1.0,1.0,1.0,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,842679481291040981,3412720377098676069,6765930309048922341,18,1.0,1.0,1.0,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,937088850059189027,3412720377098676069,6765930309048922341,18,1.0,1.0,1.0,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,7975697065017708072,3412720377098676069,6765930309048922341,18,1.0,1.0,1.0,1.0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
len(new_data)

496482

In [25]:
len(data)

496482

In [26]:
new_data['is_trade'] = data['is_trade']

In [27]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 496482 entries, 0 to 496481
Columns: 358 entries, instance_id to is_trade
dtypes: float64(6), int64(4), uint8(348)
memory usage: 206.4 MB


In [28]:
#new_data.to_csv('./feature_data/new_data_0412__.txt', sep=' ', encoding='utf-8', index=False)

## Data

In [29]:
new_data.shape

(496482, 358)

In [30]:
new_data.columns

Index(['instance_id', 'item_id', 'shop_id', 'day', 'shop_review_positive_rate',
       'shop_score_service', 'shop_score_delivery', 'shop_score_description',
       'history_ctr', 'item_category_list_1968056100269760729',
       ...
       'shop_star_level_5013', 'shop_star_level_5014', 'shop_star_level_5015',
       'shop_star_level_5016', 'shop_star_level_5017', 'shop_star_level_5018',
       'shop_star_level_5019', 'shop_star_level_5020', 'shop_star_level_nan',
       'is_trade'],
      dtype='object', length=358)

In [31]:
train = new_data[new_data.day < 24]
test = new_data[new_data.day == 24]
all_train = new_data[new_data.day <25]
predict = new_data[new_data.day == 25]

In [32]:
test.head()

Unnamed: 0,instance_id,item_id,shop_id,day,shop_review_positive_rate,shop_score_service,shop_score_delivery,shop_score_description,history_ctr,item_category_list_1968056100269760729,...,shop_star_level_5013,shop_star_level_5014,shop_star_level_5015,shop_star_level_5016,shop_star_level_5017,shop_star_level_5018,shop_star_level_5019,shop_star_level_5020,shop_star_level_nan,is_trade
420693,2137051386648397465,3417228998968466891,3011322251466986596,24,0.981128,0.974508,0.97722,0.966373,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
420694,8636603765413923862,3417228998968466891,3011322251466986596,24,0.981128,0.974508,0.97722,0.966373,0.0,0,...,0,0,0,0,0,0,0,0,0,0.0
420695,345383580586199930,932459418921071956,6244933171089048032,24,1.0,0.962925,0.960578,0.983604,0.025822,0,...,0,1,0,0,0,0,0,0,0,0.0
420696,8731890116021855314,932459418921071956,6244933171089048032,24,1.0,0.962925,0.960578,0.983604,0.025822,0,...,0,1,0,0,0,0,0,0,0,0.0
420697,4313209774350468112,932459418921071956,6244933171089048032,24,1.0,0.962925,0.960578,0.983604,0.025822,0,...,0,1,0,0,0,0,0,0,0,0.0


In [33]:
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression



In [34]:
features = []
target = 'is_trade'
for feat in train.columns:
    if feat not in ['instance_id', 'item_id', 'shop_id', 'day', 'is_trade']:
        features.append(feat)

In [35]:
len(train)

420693

In [36]:
clf = LogisticRegression(penalty='l1')
clf.fit(train[features], train[target])
test['predicted_score'] = clf.predict_proba(test[features],)[:, 1]
print(log_loss(test[target], test['predicted_score']))  # 0.08209604257681523  0.08155333572147315 0.08101538254488101

0.08101573523204039


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [37]:
#df_train.to_csv('./data/df_train.csv', encoding='utf-8', index=False)

In [38]:
clf.fit(all_train[features], all_train[target])
predict['predicted_score'] = clf.predict_proba(predict[features])[:, 1]
predict[['instance_id', 'predicted_score']].to_csv('result4_13.csv', index=False,sep=' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [39]:
test['predicted_score'].sort_values(ascending=False)

424526    0.522402
424520    0.517349
467375    0.475316
424513    0.459833
467340    0.457995
424522    0.451904
424533    0.448447
467354    0.411257
467372    0.405989
424519    0.403224
424524    0.398789
467355    0.394073
424532    0.391327
424534    0.390335
424517    0.387762
424523    0.380598
424525    0.376557
424521    0.372563
424518    0.365309
424516    0.362662
424514    0.353896
467346    0.350967
467374    0.346296
424535    0.342942
467344    0.342471
467360    0.337325
422642    0.334112
424531    0.332849
424536    0.328914
467366    0.328598
            ...   
473883    0.001443
435884    0.001439
422478    0.001430
477588    0.001421
453623    0.001418
455173    0.001412
447066    0.001412
430035    0.001409
425074    0.001397
475070    0.001390
450251    0.001385
446984    0.001372
472137    0.001367
476365    0.001332
467618    0.001300
458237    0.001264
470372    0.001197
466213    0.001157
466561    0.001149
442325    0.001133
425075    0.001123
432813    0.

In [40]:
predict['predicted_score'].sort_values(ascending=False)

482650    0.445052
492751    0.426743
492747    0.426569
482646    0.405096
492748    0.378967
482645    0.361000
482649    0.358835
482651    0.349419
492750    0.329901
492746    0.328682
482648    0.327199
480580    0.292561
480582    0.273051
480585    0.264565
480584    0.247102
490082    0.241640
480587    0.238029
490083    0.237976
490076    0.230525
480586    0.215666
494749    0.211532
491179    0.198725
480581    0.196385
493366    0.187227
490081    0.184684
480062    0.181478
493365    0.180330
494750    0.178608
480542    0.174193
490079    0.173962
            ...   
478394    0.001967
483061    0.001964
491152    0.001943
496405    0.001924
486893    0.001917
487945    0.001900
478632    0.001871
490663    0.001867
490265    0.001843
478630    0.001816
492024    0.001804
492568    0.001788
489189    0.001761
486925    0.001700
481666    0.001666
484907    0.001660
486801    0.001650
490665    0.001647
486805    0.001603
495671    0.001555
480037    0.001552
493830    0.