In [51]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
import gc
import numpy as np
import lightgbm as lgb
import time,datetime
from scipy.stats import entropy,kurtosis
from sklearn.metrics import f1_score, auc
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
# app = pd.read_csv('dataset/app.csv')
# user = pd.read_csv('dataset/user.csv')

In [3]:
def reduce_mem(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('{:.2f} Mb, {:.2f} Mb ({:.2f} %)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    gc.collect()
    return df

In [102]:
print('train:',train['date'].min(), train['date'].max())
print('test:', test['date'].min(), test['date'].max())

train: 2019-11-07 23:59:59 2019-11-10 23:59:59
test: 2019-11-10 23:59:59 2019-11-11 23:59:59


In [4]:
print('before:{}'.format(id(train)))
train = reduce_mem(train)
print('after:{}'.format(id(train)))
test = reduce_mem(test)

before:2751646425216
1301.96 Mb, 933.07 Mb (28.33 %)
after:2751646425216
362.37 Mb, 296.17 Mb (18.27 %)


In [14]:
print('train deviceid',len((set(train['deviceid']))))
print('test deviceid',len((set(test['deviceid']))))
print('train&test deviceid',len((set(train['deviceid'])&set(test['deviceid'])))/len(set(test['deviceid'])))
print('train guid',len((set(train['guid']))))
print('test guid',len((set(test['guid']))))
print('train&test guid',len((set(train['guid'])&set(test['guid'])))/len(set(test['guid'])))

NameError: name 'train' is not defined

In [101]:
print('train deviceid ratio:{}'.format(len((set(train['deviceid'])&set(test['deviceid'])))/len(set(train['deviceid']))))
print('test deviceid ratio:{}'.format(len((set(train['deviceid'])&set(test['deviceid'])))/len(set(test['deviceid']))))
print('train guid ratio:{}'.format(len((set(train['guid'])&set(test['guid'])))/len(set(train['guid']))))
print('test guid ratio:{}'.format(len((set(train['guid'])&set(test['guid'])))/len(set(test['guid']))))

train deviceid ratio:0.4471528414298808
test deviceid ratio:0.8262557117905471
train guid ratio:0.44716436793727776
test guid ratio:0.8204920771706442


In [18]:
print('data size:{}'.format(len(data)))
print('8: {}, ratio: {}%'.format(len(data[data['day']==8]),len(data[data['day']==8])/len(data)*100))
print('9: {}, ratio: {}%'.format(len(data[data['day']==9]),len(data[data['day']==9])/len(data)*100))
print('10: {}, ratio: {}%'.format(len(data[data['day']==10]),len(data[data['day']==10])/len(data)*100))
print('11: {}, ratio: {}%'.format(len(data[data['day']==11]),len(data[data['day']==11])/len(data)*100))

data size:15030273
8: 3674871, ratio: 24.44979542287755%
9: 3743690, ratio: 24.907664684467143%
10: 3958141, ratio: 26.334458462597453%
11: 3653560, ratio: 24.30800824442776%


In [5]:
def transform_time(x):    
    try:
        date_style = time.localtime(x/1000)
        date_style = time.strftime('%Y-%m-%d %H:%M:%S',date_style)
        return date_style
    except:
        return np.nan

In [6]:
train['date'] = train['ts'].apply(transform_time)
test['date'] = test['ts'].apply(transform_time)

In [7]:
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

In [8]:
# 年月都是一样，增加天的时间特征
train['day'] = train['date'].dt.day
test['day'] = test['date'].dt.day
train['hour'] = train['date'].dt.hour
test['hour'] = test['date'].dt.hour
train['minute'] = train['date'].dt.minute
test['minute'] = test['date'].dt.minute
train['second'] = train['date'].dt.second
test['second'] = test['date'].dt.second
train['flag'] = train['day']
train[train['day']==7]['flag'] = 8
test['flag'] = 11

In [9]:
print(train['date'].max())
print(train['date'].min())

2019-11-10 23:59:59
2019-11-07 23:59:59


In [10]:
# 将测试机和训练集进行融合，从上往下进行合并
data = pd.concat([train,test],axis=0,sort=False)
# 删除原始数据，减少累存占用
del train,test
gc.collect()

145

In [12]:
# 构建点击特征
click_data = data[data['flag']!=11]
print(click_data.shape)
click_data['total_hour'] = click_data['hour'] + 24*(click_data['day']-8)
# click_data['timestamp'] = click_data['timestamp'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(x/1000)))
click_data['exposure_click_gap'] = click_data['timestamp'] - click_data['ts'] # 从点击到曝光时间差
# 回收内存
del data['date']
gc.collect()

(11376681, 21)


8909

In [15]:
data = reduce_mem(data)

2078.43 Mb, 1490.73 Mb (28.28 %)


In [16]:
click_data = click_data.sort_values('timestamp').reset_index(drop=True)

In [17]:
data['guid'] = data['guid'].fillna('null')

In [24]:
range(2)

range(0, 2)

In [34]:
# 处理其他特征 对其进行简单的地址映射
cate_cols = [
    'deviceid', 'newsid', 'guid', 'pos', 'app_version', 'device_vendor',
    'netmodel', 'osversion', 'device_version'
]

for f in cate_cols:
    print(f)
    map_dict = dict(zip(data[f].unique(), range(data[f].nunique())))
    click_data[f] = click_data[f].map(map_dict).fillna(-1).astype('int32')
    data[f] = data[f].map(map_dict).fillna(-1).astype('int32')
data = reduce_mem(data)
click_data = reduce_mem(click_data)

deviceid
newsid
guid
pos
app_version
device_vendor
netmodel
osversion
device_version
1075.05 Mb, 845.71 Mb (21.33 %)
1280.26 Mb, 607.58 Mb (52.54 %)


In [37]:
sort_data = data.sort_values('ts').reset_index(drop=True)

Unnamed: 0,id,target,timestamp,deviceid,newsid,guid,pos,app_version,device_vendor,netmodel,osversion,lng,lat,device_version,ts,day,hour,minute,second,flag
0,6176945,0.0,,78814,1962,75987,0,0,2,1,0,277,1164,45,1573142399626,7,23,59,59,7
1,3148710,0.0,,78814,1216,75987,2,0,2,1,0,277,1164,45,1573142399631,7,23,59,59,7
2,5419483,0.0,,78814,903,75987,1,0,2,1,0,277,1164,45,1573142399645,7,23,59,59,7
3,9074543,0.0,,75020,10115,72333,2,2,7,2,2,430,651,162,1573142399650,7,23,59,59,7
4,8090764,0.0,,52006,972186,50180,1,0,33,2,5,2,2,583,1573142399767,7,23,59,59,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15030268,test_3609640,,,71873,3513,69293,3,0,3,2,9,189,397,41,1573487999641,11,23,59,59,11
15030269,test_1587182,,,12939,38584,12502,2,0,0,2,0,437,343,10,1573487999651,11,23,59,59,11
15030270,test_3609632,,,71873,604610,69293,2,0,3,2,9,189,397,41,1573487999666,11,23,59,59,11
15030271,test_475380,,,110368,1405,109896,2,4,3,0,1,283,686,3,1573487999684,11,23,59,59,11


In [80]:
mode_cols = []
# for f in ['deviceid']:
f = 'deviceid'
    # 对样本反应时间进行统计
print('exposure click gap stas')
# as_index表示保留 deviceid和day数据
tmp = click_data.groupby([f,'day'], as_index=False)['exposure_click_gap'].agg({
    f + '_prev_day_exposure_click_gap_max':'max',   #反应最大时间
    f + '_prev_day_exposure_click_gap_min':'min',   #反应最小时间
    f + '_prev_day_exposure_click_gap_median':'median',
    f + '_prev_day_exposure_click_gap_std':'std',
    f + '_pre_day_exposure_click_gap_skew':'skew', # 不对称度
    f + '_prev_day_exposure_click_gap_kurt':kurtosis, #峰值
    f + '_prev_day_exposure_click_gap_mean': 'mean',
    f + '_prev_day_exposure_click_gap_q1': lambda x: np.quantile(x, q=0.25), #分位数
    f + '_prev_day_exposure_click_gap_q3': lambda x: np.quantile(x, q=0.75)
})
#最大反应时间-最小反应时间
tmp[f+ '_prev_day_exposure_click_gap_ptp'] = tmp[f + '_prev_day_exposure_click_gap_max'] - tmp[f + '_prev_day_exposure_click_gap_min']
#最大分位数-最小分位数
tmp[f + '_prev_day_exposure_click_gap_iqr'] = tmp[f + '_prev_day_exposure_click_gap_q3'] - tmp[f + '_prev_day_exposure_click_gap_q1']
tmp[f + '_prev_day_exposure_click_gap_mean_ratio_std'] = tmp[f + '_prev_day_exposure_click_gap_mean'] / tmp[f + '_prev_day_exposure_click_gap_std']
tmp[f + '_prev_day_exposure_click_gap_mean_ratio_ptp'] = tmp[f + '_prev_day_exposure_click_gap_mean'] / tmp[f + '_prev_day_exposure_click_gap_ptp']
tmp[f + '_prev_day_exposure_click_gap_mean_ratio_iqr'] = tmp[f + '_prev_day_exposure_click_gap_mean'] / tmp[f + '_prev_day_exposure_click_gap_iqr']

exposure click gap stas


In [82]:
# 构建历史特征,将前一天统计量放置第二天
tmp['day'] += 1
data = data.merge(tmp, on=[f,'day'],how='left')
print('tmp1:{}'.format(id(tmp)))
# 优化内存
data = reduce_mem(data)
del tmp
gc.collect()

tmp1:2753546862152
2623.12 Mb, 2680.46 Mb (-2.19 %)


In [89]:
# 对前一天点击次数进行统计
tmp = click_data.groupby([f,'day'], as_index=False)['id'].agg({f + '_prev_day_click_count':'count'})
tmp['day'] += 1
data = data.merge(tmp,on=[f, 'day'], how='left')
data[f + '_prev_day_click_count'] = data[f + '_prev_day_click_count'].fillna(0)

In [90]:
# 对前一个小时进行统计、暂时确实
tmp = click_data.groupby([f,'hour'], as_index=False)['id'].agg({f + '_prev_day_click_count':'count'})
tmp['day'] += 1
data = data.merge(tmp,on=[f, 'day'], how='left')
data[f + '_prev_day_click_count'] = data[f + '_prev_day_click_count'].fillna(0)
del tmp
gc.collect()

384

In [94]:
# 对前一天曝光量进行统计
tmp = data.groupby([f,'day'], as_index=False)['id'].agg({f + '_prev_day_count':'count'})
tmp['day'] += 1
data = data.merge(tmp,on=[f, 'day'], how='left')
data[f + '_prev_day_count'] = data[f + '_prev_day_count'].fillna(0)

In [95]:
# 计算前一天的点击率

15030273

In [93]:
tmp

Unnamed: 0,deviceid,hour,deviceid_prev_day_count
0,0,19,9
1,1,17,19
2,2,5,14
3,2,6,21
4,2,17,11
...,...,...,...
386610,114579,16,1
386611,114580,14,2
386612,114581,20,1
386613,114582,7,2


In [55]:
# 构建历史特征
history_feature8 = data[data['day']==8]
history_feature9 = data[data['day']==9]
history_feature10 = data[data['day']==10]
history_feature11 = data[data['day']==11]

In [117]:
# 每一天特征几乎相同
print('data size of day {}:{}'.format(8, len(history_feature8)))
print('data size of day {}:{}'.format(9, len(history_feature9)))
print('data size of day {}:{}'.format(10, len(history_feature10)))
print('data size of day {}:{}'.format(11, len(history_feature11)))

data size of day 8:3674871
data size of day 9:3743690
data size of day 10:3958141
data size of day 11:3653560


In [27]:
print(len(set(history_feature8['guid'])))
print(len(set(history_feature9['guid'])))
print(len(set(history_feature10['guid'])))
print(len(set(history_feature11['guid'])))
print(len(set(history_feature8['guid'])&set(history_feature9['guid']))/len(set(history_feature9['guid'])))
print(len(set(history_feature9['guid'])&set(history_feature10['guid']))/len(set(history_feature10['guid'])))
print(len(set(history_feature10['guid'])&set(history_feature11['guid']))/len(set(history_feature11['guid'])))

61277
64284
66286
56861
0.6501773380623483
0.6388528497721992
0.7410879161463921


In [31]:
# 每天点击新闻中，超过一半是前一天出现的
print(len(set(history_feature8['newsid'])))
print(len(set(history_feature9['newsid'])))
print(len(set(history_feature10['newsid'])))
print(len(set(history_feature11['newsid'])))
print(len(set(history_feature8['newsid'])&set(history_feature9['newsid']))/len(set(history_feature9['newsid'])))
print(len(set(history_feature9['newsid'])&set(history_feature10['newsid']))/len(set(history_feature10['newsid'])))
print(len(set(history_feature10['newsid'])&set(history_feature11['newsid']))/len(set(history_feature11['newsid'])))

640066
631547
658787
626907
0.5474525253069051
0.5321021817370409
0.5567141537740048


In [32]:
# deviceid guid timestamp ts 时间特征
def get_history_visit_time(data1,data2):
    data1 = data1.sort_values(['ts','timestamp'])
    data1['timestamp_ts'] = data1['timestamp'] - data1['ts']
    data1_tmp = data1[data1['target']==1].copy()
    del data1
    for col in ['deviceid','guid']:
        for ts in ['timestamp_ts']:
            f_tmp = data1_tmp.groupby([col],as_index=False)[ts].agg({
                '{}_{}_max'.format(col,ts):'max',
                '{}_{}_mean'.format(col,ts):'mean',
                '{}_{}_min'.format(col,ts):'min',
                '{}_{}_median'.format(col,ts):'median'
            })
        data2 = pd.merge(data2,f_tmp,on=[col],how='left',copy=False)
    return data2

In [33]:
history_feature9 = get_history_visit_time(history_feature8, history_feature9)
history_feature10 = get_history_visit_time(history_feature9, history_feature10)
history_feature11 = get_history_visit_time(history_feature10, history_feature11)

In [35]:
history_feature9.head()

Unnamed: 0,id,target,timestamp,deviceid,newsid,guid,pos,app_version,device_vendor,netmodel,...,second,flag,deviceid_timestamp_ts_max,deviceid_timestamp_ts_mean,deviceid_timestamp_ts_min,deviceid_timestamp_ts_median,guid_timestamp_ts_max,guid_timestamp_ts_mean,guid_timestamp_ts_min,guid_timestamp_ts_median
0,1,0.0,,8b2d7f2aed47ab32e9c6ae4f5ae00147,8008333091915950969,9a2c909ebc47aec49d9c160cdb4a6572,1,2.1.5,HONOR,g4,...,46,9,,,,,,,,
1,2,0.0,,8b2d7f2aed47ab32e9c6ae4f5ae00147,8008333091915950969,9a2c909ebc47aec49d9c160cdb4a6572,1,2.1.5,HONOR,w,...,47,9,,,,,,,,
2,6,0.0,,04813dbae7d339a61f38d648e77b2c28,3734327341629052372,3bc11f585ac7b18d7997fa83e19aa439,1,2.1.5,OPPO,o,...,7,9,,,,,,,,
3,7,0.0,,04813dbae7d339a61f38d648e77b2c28,5518070787661276860,3bc11f585ac7b18d7997fa83e19aa439,2,2.1.5,OPPO,o,...,58,9,,,,,,,,
4,8,0.0,,04813dbae7d339a61f38d648e77b2c28,6167225445325229993,,0,2.1.5,OPPO,w,...,2,9,,,,,179832163.0,599228.966301,-3752356.0,8293.0


In [37]:
data = pd.concat([history_feature9,history_feature10],axis=0,sort=False,ignore_index=True)
data = pd.concat([data,history_feature11],axis=0,sort=False,ignore_index=True)
del history_feature9,history_feature10,history_feature11
gc.collect()

8532

In [57]:
data = data.sort_values('ts')
data['ts_next'] = data.groupby(['deviceid'])['ts'].shift(-1)
data['ts_next_ts'] = data['ts_next'] - data['ts']

In [58]:
# 前一天的leak
for col in [['deviceid'],['guid'],['newsid']]:
    print(col)
    data['{}_days_count'.format('_'.join(col))] = data.groupby(['day'] + col)['id'].transform('count')

['deviceid']
['guid']
['newsid']


In [63]:
print('train and predict')
X_train = data[data['flag'].isin([9])]
X_valid = data[data['flag'].isin([10])]
X_test = data[data['flag'].isin([11])]

train and predict


In [51]:
lgb_param = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': -1,
    'seed':42,
    'boost_from_average':'false',
    }

In [52]:
feature = [
       'pos','netmodel',  'hour', 'minute',
       'deviceid_timestamp_ts_max', 'deviceid_timestamp_ts_mean',
       'deviceid_timestamp_ts_min', 'deviceid_timestamp_ts_median',
       'guid_timestamp_ts_max', 'guid_timestamp_ts_mean',
       'guid_timestamp_ts_min', 'guid_timestamp_ts_median',
       'deviceid_days_count', 'guid_days_count','newsid_days_count',
        'ts_next_ts'
           ]

In [53]:
target = 'target'

In [61]:
data['netmodel'] = data['netmodel'].map({'o':1, 'w':2, 'g4':4, 'g3':3, 'g2':2})

In [64]:
lgb_train = lgb.Dataset(X_train[feature].values, X_train[target].values)
lgb_valid = lgb.Dataset(X_valid[feature].values, X_valid[target].values, reference=lgb_train)
lgb_model = lgb.train(lgb_param, lgb_train, num_boost_round=10000, valid_sets=[lgb_train,lgb_valid],
                      early_stopping_rounds=50,verbose_eval=10)

Training until validation scores don't improve for 50 rounds
[10]	training's auc: 0.903714	valid_1's auc: 0.898323
[20]	training's auc: 0.90742	valid_1's auc: 0.902218
[30]	training's auc: 0.912422	valid_1's auc: 0.907078
[40]	training's auc: 0.915565	valid_1's auc: 0.910117
[50]	training's auc: 0.918159	valid_1's auc: 0.911839
[60]	training's auc: 0.919773	valid_1's auc: 0.913268
[70]	training's auc: 0.921015	valid_1's auc: 0.914002
[80]	training's auc: 0.92184	valid_1's auc: 0.914434
[90]	training's auc: 0.922651	valid_1's auc: 0.914666
[100]	training's auc: 0.923296	valid_1's auc: 0.914864
[110]	training's auc: 0.923897	valid_1's auc: 0.914991
[120]	training's auc: 0.924411	valid_1's auc: 0.915047
[130]	training's auc: 0.924925	valid_1's auc: 0.915128
[140]	training's auc: 0.925329	valid_1's auc: 0.915197
[150]	training's auc: 0.92577	valid_1's auc: 0.915162
[160]	training's auc: 0.926137	valid_1's auc: 0.91517
[170]	training's auc: 0.926506	valid_1's auc: 0.915262
[180]	training's 

In [67]:
p_test = lgb_model.predict(X_valid[feature].values,num_iteration=lgb_model.best_iteration)
xx_score = X_valid[[target]].copy()
xx_score['predict'] = p_test
xx_score = xx_score.sort_values('predict',ascending=False)
xx_score = xx_score.reset_index()
xx_score.loc[xx_score.index<=int(xx_score.shape[0]*0.103),'score'] = 1
xx_score['score'] = xx_score['score'].fillna(0)
print(f1_score(xx_score['target'],xx_score['score']))

0.6057699062735438


In [69]:
X_train_2 = data[data['flag'].isin([9,10])]


lgb_train_2 = lgb.Dataset(X_train_2[feature].values, X_train_2[target].values)
lgb_model_2 = lgb.train(lgb_param, lgb_train_2, num_boost_round=lgb_model.best_iteration, valid_sets=[lgb_train_2],verbose_eval=10)

p_predict = lgb_model_2.predict(X_test[feature].values)

submit_score = X_test[['id']].copy()
submit_score['predict'] = p_predict
submit_score = submit_score.sort_values('predict',ascending=False)
submit_score = submit_score.reset_index()
submit_score.loc[submit_score.index<=int(submit_score.shape[0]*0.103),'target'] = 1
submit_score['target'] = submit_score['target'].fillna(0)

submit_score = submit_score.sort_values('id')
submit_score['target'] = submit_score['target'].astype(int)

sample = pd.read_csv('./dataset/sample.csv')
sample.columns = ['id','non_target']
submit_score = pd.merge(sample,submit_score,on=['id'],how='left')

submit_score[['id','target']].to_csv('./baseline.csv',index=False)

[10]	training's auc: 0.901509
[20]	training's auc: 0.905507
[30]	training's auc: 0.909716
[40]	training's auc: 0.912916
[50]	training's auc: 0.915402
[60]	training's auc: 0.916933
[70]	training's auc: 0.918138
[80]	training's auc: 0.918973
[90]	training's auc: 0.919631
[100]	training's auc: 0.920158
[110]	training's auc: 0.920575
[120]	training's auc: 0.920958
[130]	training's auc: 0.921352
[140]	training's auc: 0.921586
[150]	training's auc: 0.921916
[160]	training's auc: 0.922259
[170]	training's auc: 0.922526
[180]	training's auc: 0.922746
