In [1]:
# %load 102_user_smooth.py
from utils import BayesianSmoothing, load_pickle, dump_pickle, raw_data_path
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm


# Load the pickled inputs.
# NOTE(review): `df` is not referenced anywhere in this cell; it is still loaded
# in case another cell relies on it — confirm before removing.
train = load_pickle('../data/train.pkl')
test = load_pickle('../data/test.pkl')
df = load_pickle('../data/df.pkl')

# Bayesian smoothing parameters, passed unchanged to BayesianSmoothing.update().
iter_num, epsilon = 10000, 0.001

# Plan:
#   1. Define the variables that need a smoothed conversion rate.
#   2. For each day, take all click records that happened before that day.
#   3. Count the clicks and the purchases of each variable value.

# Full candidate list, currently disabled:
# smooth_cols = ['user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level']
smooth_cols = ['user_gender_id']

# Frames that accumulate the final result: the raw feature columns plus the
# row identifier and the day used as merge key.
_key_cols = [*smooth_cols, 'instance_id', 'day']
smooth_train = train[_key_cols]
smooth_test = test[_key_cols]

# For every feature in `smooth_cols`, compute a Bayesian-smoothed historical
# trade rate per (feature value, day) and left-join it onto the train / test rows.
for col in tqdm(smooth_cols):
    # Names of the derived columns for this feature.
    col_I = '{}_I'.format(col)  # number of historical rows (count of is_trade)
    col_C = '{}_C'.format(col)  # number of historical trades (sum of is_trade)
    col_smooth_rate = '{}_smooth_rate'.format(col)

    # One frame per day; concatenated once after the loop instead of growing a
    # DataFrame with pd.concat on every iteration.
    daily_CVR = []
    for day in tqdm(range(19, 26)):
        # Only rows strictly before `day` are used, so the rate attached to a
        # given day never includes that day's own labels.
        history = train[train.day < day]

        # Row count and trade count per feature value in a single pass.
        # CVR columns after this step: [col, col_I, col_C].
        CVR = history.groupby(col)['is_trade'].agg(['count', 'sum']).reset_index()
        CVR.columns = [col, col_I, col_C]
        CVR['day'] = day

        # Fit the smoothing parameters on this day's history.
        # NOTE(review): BayesianSmoothing comes from utils; (1, 1) is presumably the
        # initial (alpha, beta) — confirm. The recorded output of this cell shows
        # the alpha/beta differences still far above `epsilon` late in the
        # iteration budget, so the fit may not be converging — verify.
        smooth = BayesianSmoothing(1, 1)
        smooth.update(CVR[col_I].values, CVR[col_C].values, iter_num, epsilon)
        alpha = smooth.alpha
        beta = smooth.beta
        CVR[col_smooth_rate] = (CVR[col_C] + alpha) / (CVR[col_I] + alpha + beta)

        daily_CVR.append(CVR)

    # Stack the per-day tables and attach the rate by (feature value, day).
    # Rows whose (value, day) pair has no history keep NaN because of how='left'.
    CVR_all = pd.concat(daily_CVR, axis=0)
    smooth_train = pd.merge(smooth_train, CVR_all[[col, 'day', col_smooth_rate]], on=[col, 'day'], how='left')
    smooth_test = pd.merge(smooth_test, CVR_all[[col, 'day', col_smooth_rate]], on=[col, 'day'], how='left')


# Keep only `instance_id` and the generated *_smooth_rate columns.
# Every raw feature listed in `smooth_cols` is dropped (not just the first one),
# so enabling more features does not leak raw columns into the saved pickles.
_drop_cols = ['day'] + list(smooth_cols)
smooth_train = smooth_train.drop(_drop_cols, axis=1)
smooth_test = smooth_test.drop(_drop_cols, axis=1)

# Report what is about to be written.
print(smooth_train.columns)
print('the shape of train {}'.format(smooth_train.shape))
print('the shape of test {}'.format(smooth_test.shape))

# Persist the feature frames for the train and test sets.
dump_pickle(smooth_train, path='../data/train_feature/102_smooth_features.pkl')
dump_pickle(smooth_test, path='../data/test_feature/102_smooth_features.pkl')








  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/7 [00:00<?, ?it/s][A

iter_num: 0
difference of alpha is -0.34616350134938834
difference of beta is 0.11515777903884317
iter_num: 50
difference of alpha is 0.0016961739145337495
difference of beta is 0.04267966087034436
iter_num: 100
difference of alpha is 0.0015383463780433426
difference of beta is 0.04886239574387918
iter_num: 150
difference of alpha is 0.001494210175787436
difference of beta is 0.054221365075898476
iter_num: 200
difference of alpha is 0.0014893077849100766
difference of beta is 0.05888458948003361
iter_num: 250
difference of alpha is 0.0015017593999668843
difference of beta is 0.06297887690388038
iter_num: 300
difference of alpha is 0.0015227398996352504
difference of beta is 0.06660431579108206
iter_num: 350
difference of alpha is 0.001548012970723267
difference of beta is 0.06983892669390457
iter_num: 400
difference of alpha is 0.0015753185821336668
difference of beta is 0.07274422934062841
iter_num: 450
difference of alpha is 0.0016033696574739853
difference of beta is 0.0753693786561

iter_num: 4350
difference of alpha is 0.002171487361859903
difference of beta is 0.10901870233209365
iter_num: 4400
difference of alpha is 0.0021700403279929503
difference of beta is 0.1089283274641275
iter_num: 4450
difference of alpha is 0.002168525587354253
difference of beta is 0.10883463271750315
iter_num: 4500
difference of alpha is 0.0021669453084491863
difference of beta is 0.10873773085245375
iter_num: 4550
difference of alpha is 0.0021653015914715468
difference of beta is 0.1086377309371187
iter_num: 4600
difference of alpha is 0.0021635964708117683
difference of beta is 0.1085347384868669
iter_num: 4650
difference of alpha is 0.0021618319174177003
difference of beta is 0.10842885559674187
iter_num: 4700
difference of alpha is 0.0021600098410825552
difference of beta is 0.10832018107123531
iter_num: 4750
difference of alpha is 0.002158132092578313
difference of beta is 0.10820881054155507
iter_num: 4800
difference of alpha is 0.0021562004657997846
difference of beta is 0.1080


 14%|█▍        | 1/7 [00:00<00:05,  1.09it/s][A


iter_num: 7200
difference of alpha is 0.002024966801736028
difference of beta is 0.10082911651863924
iter_num: 7250
difference of alpha is 0.002021792165864511
difference of beta is 0.1006585473440964
iter_num: 7300
difference of alpha is 0.002018609711722519
difference of beta is 0.10048769753268516
iter_num: 7350
difference of alpha is 0.0020154198501725062
difference of beta is 0.10031658700529533
iter_num: 7400
difference of alpha is 0.0020122229808006153
difference of beta is 0.10014523511290463
iter_num: 7450
difference of alpha is 0.002009019492346553
difference of beta is 0.09997366065022106
iter_num: 7500
difference of alpha is 0.002005809762938071
difference of beta is 0.09980188187489603
iter_num: 7550
difference of alpha is 0.002002594160465776
difference of beta is 0.09962991652071196
iter_num: 7600
difference of alpha is 0.001999373042746555
difference of beta is 0.09945778181327114
iter_num: 7650
difference of alpha is 0.001996146757898387
difference of beta is 0.099285

difference of alpha is 0.0018562389596983309
difference of beta is 0.09690028427937136
iter_num: 2750
difference of alpha is 0.0018594680657715301
difference of beta is 0.09706640207963346
iter_num: 2800
difference of alpha is 0.0018625316490945565
difference of beta is 0.09722335438581808
iter_num: 2850
difference of alpha is 0.0018654347445492547
difference of beta is 0.09737144341087856
iter_num: 2900
difference of alpha is 0.001868182185080336
difference of beta is 0.0975109576016564
iter_num: 2950
difference of alpha is 0.0018707786126626047
difference of beta is 0.0976421725277703
iter_num: 3000
difference of alpha is 0.0018732284885496497
difference of beta is 0.09776535169729073
iter_num: 3050
difference of alpha is 0.0018755361028288675
difference of beta is 0.09788074730545304
iter_num: 3100
difference of alpha is 0.0018777055833680834
difference of beta is 0.09798860092320183
iter_num: 3150
difference of alpha is 0.0018797409041537705
difference of beta is 0.0980891441306539


 29%|██▊       | 2/7 [00:01<00:04,  1.04it/s][A

iter_num: 6000
difference of alpha is 0.0018571955822999087
difference of beta is 0.09643341175672049
iter_num: 6050
difference of alpha is 0.0018553238539649186
difference of beta is 0.0963273690575761
iter_num: 6100
difference of alpha is 0.001853422981154651
difference of beta is 0.09621984741545475
iter_num: 6150
difference of alpha is 0.0018514936983127228
difference of beta is 0.09611088589576866
iter_num: 6200
difference of alpha is 0.0018495367232915783
difference of beta is 0.09600052265454906
iter_num: 6250
difference of alpha is 0.0018475527578090123
difference of beta is 0.095888794965731
iter_num: 6300
difference of alpha is 0.0018455424878123239
difference of beta is 0.0957757392413896
iter_num: 6350
difference of alpha is 0.0018435065839774722
difference of beta is 0.09566139105947968
iter_num: 6400
difference of alpha is 0.001841445701993294
difference of beta is 0.09554578518111612
iter_num: 6450
difference of alpha is 0.001839360483044672
difference of beta is 0.09542

iter_num: 1850
difference of alpha is 0.0016562937178981052
difference of beta is 0.08738266193239497
iter_num: 1900
difference of alpha is 0.0016639571747969484
difference of beta is 0.08781317384148224
iter_num: 1950
difference of alpha is 0.0016713741731306797
difference of beta is 0.08822818517842279
iter_num: 2000
difference of alpha is 0.0016785535933303564
difference of beta is 0.08862840053885179
iter_num: 2050
difference of alpha is 0.001685503868085192
difference of beta is 0.08901447396723938
iter_num: 2100
difference of alpha is 0.0016922330067301594
difference of beta is 0.08938701378986025
iter_num: 2150
difference of alpha is 0.0016987486189936618
difference of beta is 0.08974658689186299
iter_num: 2200
difference of alpha is 0.0017050579379049857
difference of beta is 0.0900937225116536
iter_num: 2250
difference of alpha is 0.0017111678417220944
difference of beta is 0.09042891561480815
iter_num: 2300
difference of alpha is 0.0017170848747709577
difference of beta is 0.

difference of alpha is 0.001847919424848854
difference of beta is 0.09736996088747674
iter_num: 6800
difference of alpha is 0.0018470552386347094
difference of beta is 0.09731832106251659
iter_num: 6850
difference of alpha is 0.0018461641574987198
difference of beta is 0.09726527161888043
iter_num: 6900
difference of alpha is 0.0018452466467042683
difference of beta is 0.09721083775036732
iter_num: 6950
difference of alpha is 0.0018443031632227047
difference of beta is 0.09715504419079934
iter_num: 7000
difference of alpha is 0.0018433341559269678
difference of beta is 0.09709791522527667
iter_num: 7050
difference of alpha is 0.0018423400657692213
difference of beta is 0.09703947470063667
iter_num: 7100
difference of alpha is 0.001841321325981582
difference of beta is 0.09697974603545845
iter_num: 7150
difference of alpha is 0.0018402783621933594
difference of beta is 0.09691875222870294
iter_num: 7200
difference of alpha is 0.0018392115926797459
difference of beta is 0.096856515870854


 43%|████▎     | 3/7 [00:02<00:03,  1.00it/s][A


difference of beta is 0.09346765302154836
iter_num: 9300
difference of alpha is 0.0017775343906087926
difference of beta is 0.09336931551422367
iter_num: 9350
difference of alpha is 0.0017757560545987872
difference of beta is 0.0932704193836571
iter_num: 9400
difference of alpha is 0.0017739668745484494
difference of beta is 0.09317097556538556
iter_num: 9450
difference of alpha is 0.001772167053122331
difference of beta is 0.09307099482828107
iter_num: 9500
difference of alpha is 0.0017703567899509665
difference of beta is 0.09297048777546024
iter_num: 9550
difference of alpha is 0.0017685362816806105
difference of beta is 0.09286946484928649
iter_num: 9600
difference of alpha is 0.0017667057220336346
difference of beta is 0.09276793633193847
iter_num: 9650
difference of alpha is 0.001764865301833396
difference of beta is 0.09266591234882071
iter_num: 9700
difference of alpha is 0.001763015209082397
difference of beta is 0.09256340287333842
iter_num: 9750
difference of alpha is 0.001

iter_num: 3950
difference of alpha is 0.0016254840997227404
difference of beta is 0.08676387411475162
iter_num: 4000
difference of alpha is 0.001625811080431916
difference of beta is 0.08677930317719529
iter_num: 4050
difference of alpha is 0.001626068961259719
difference of beta is 0.08679092754078965
iter_num: 4100
difference of alpha is 0.0016262593044036322
difference of beta is 0.08679883738989247
iter_num: 4150
difference of alpha is 0.0016263836272703003
difference of beta is 0.08680312012495506
iter_num: 4200
difference of alpha is 0.0016264434042829734
difference of beta is 0.0868038604864978
iter_num: 4250
difference of alpha is 0.0016264400686054614
difference of beta is 0.08680114067141176
iter_num: 4300
difference of alpha is 0.0016263750137426314
difference of beta is 0.0867950404431781
iter_num: 4350
difference of alpha is 0.0016262495951071543
difference of beta is 0.08678563723600519
iter_num: 4400
difference of alpha is 0.0016260651314761176
difference of beta is 0.08


 57%|█████▋    | 4/7 [00:04<00:03,  1.06s/it][A


iter_num: 8600
difference of alpha is 0.0014856826450095895
difference of beta is 0.07899080635206701
iter_num: 8650
difference of alpha is 0.0014831864097466507
difference of beta is 0.07885456289409376
iter_num: 8700
difference of alpha is 0.0014806809370071505
difference of beta is 0.07871784636381562
iter_num: 8750
difference of alpha is 0.0014781664588934262
difference of beta is 0.07858066922676699
iter_num: 8800
difference of alpha is 0.001475643203738386
difference of beta is 0.07844304374202693
iter_num: 8850
difference of alpha is 0.0014731113961428122
difference of beta is 0.07830498196278768
iter_num: 8900
difference of alpha is 0.0014705712571103646
difference of beta is 0.0781664957448811
iter_num: 8950
difference of alpha is 0.0014680230040315934
difference of beta is 0.07802759674780191
iter_num: 9000
difference of alpha is 0.0014654668508295998
difference of beta is 0.07788829643936879
iter_num: 9050
difference of alpha is 0.0014629030079440497
difference of beta is 0

difference of beta is 0.07975230365832431
iter_num: 4200
difference of alpha is 0.0014862763085812247
difference of beta is 0.07970707479910288
iter_num: 4250
difference of alpha is 0.0014853961789551562
difference of beta is 0.07965867761237178
iter_num: 4300
difference of alpha is 0.0014844600354821935
difference of beta is 0.07960718992626425
iter_num: 4350
difference of alpha is 0.0014834691921956633
difference of beta is 0.07955268728716192
iter_num: 4400
difference of alpha is 0.0014824249276355062
difference of beta is 0.07949524305553268
iter_num: 4450
difference of alpha is 0.0014813284861512344
difference of beta is 0.07943492849500444
iter_num: 4500
difference of alpha is 0.0014801810791951198
difference of beta is 0.07937181285797124
iter_num: 4550
difference of alpha is 0.001478983886513241
difference of beta is 0.0793059634668225
iter_num: 4600
difference of alpha is 0.0014777380573107735
difference of beta is 0.07923744578863534
iter_num: 4650
difference of alpha is 0.00


 71%|███████▏  | 5/7 [00:05<00:02,  1.07s/it][A

difference of alpha is 0.0013560873039057952
difference of beta is 0.07258635736502583
iter_num: 7450
difference of alpha is 0.0013533248972183998
difference of beta is 0.07243598466754975
iter_num: 7500
difference of alpha is 0.0013505504747559627
difference of beta is 0.0722849797367644
iter_num: 7550
difference of alpha is 0.0013477643679742357
difference of beta is 0.07213336059373887
iter_num: 7600
difference of alpha is 0.001344966902372846
difference of beta is 0.07198114492928198
iter_num: 7650
difference of alpha is 0.0013421583976178653
difference of beta is 0.07182835011121824
iter_num: 7700
difference of alpha is 0.0013393391677212207
difference of beta is 0.07167499319143644
iter_num: 7750
difference of alpha is 0.0013365095210922107
difference of beta is 0.07152109091123293
iter_num: 7800
difference of alpha is 0.0013336697606760595
difference of beta is 0.07136665970983813
iter_num: 7850
difference of alpha is 0.0013308201840551703
difference of beta is 0.071211715728622

difference of alpha is 0.0013645225874907574
difference of beta is 0.07398458424948728
iter_num: 1850
difference of alpha is 0.0013691535335307314
difference of beta is 0.07427246090458084
iter_num: 1900
difference of alpha is 0.0013735771990806
difference of beta is 0.07454604579952218
iter_num: 1950
difference of alpha is 0.001377800962274467
difference of beta is 0.07480602987121188
iter_num: 2000
difference of alpha is 0.0013818318749159175
difference of beta is 0.07505305324544054
iter_num: 2050
difference of alpha is 0.0013856766731805692
difference of beta is 0.07528771014381164
iter_num: 2100
difference of alpha is 0.001389341789543419
difference of beta is 0.07551055322511502
iter_num: 2150
difference of alpha is 0.0013928333654105174
difference of beta is 0.07572209743509006
iter_num: 2200
difference of alpha is 0.0013961572640943665
difference of beta is 0.07592282342875478
iter_num: 2250
difference of alpha is 0.0013993190838714753
difference of beta is 0.07611318061967154


iter_num: 6450
difference of alpha is 0.0013700904074003262
difference of beta is 0.07464429995673072
iter_num: 6500
difference of alpha is 0.0013679653180727058
difference of beta is 0.07452649545047052
iter_num: 6550
difference of alpha is 0.001365820411846741
difference of beta is 0.07440760858287376
iter_num: 6600
difference of alpha is 0.0013636561250720547
difference of beta is 0.07428766377250895
iter_num: 6650
difference of alpha is 0.0013614728859057124
difference of beta is 0.07416668496551893
iter_num: 6700
difference of alpha is 0.0013592711145093972
difference of beta is 0.07404469564812644
iter_num: 6750
difference of alpha is 0.0013570512232359278
difference of beta is 0.07392171885749121
iter_num: 6800
difference of alpha is 0.0013548136168104463
difference of beta is 0.07379777719336289
iter_num: 6850
difference of alpha is 0.001352558692520489
difference of beta is 0.07367289282785805
iter_num: 6900
difference of alpha is 0.0013502868403669765
difference of beta is 0.


 86%|████████▌ | 6/7 [00:06<00:01,  1.10s/it][A


iter_num: 9100
difference of alpha is 0.001238424850997788
difference of beta is 0.06736965236746073
iter_num: 9150
difference of alpha is 0.0012357026099145685
difference of beta is 0.06721967028170184
iter_num: 9200
difference of alpha is 0.0012329756663245206
difference of beta is 0.0670694434768393
iter_num: 9250
difference of alpha is 0.001230244195532748
difference of beta is 0.06691898156066145
iter_num: 9300
difference of alpha is 0.0012275083700270528
difference of beta is 0.0667682939858878
iter_num: 9350
difference of alpha is 0.0012247683595614234
difference of beta is 0.06661739005051004
iter_num: 9400
difference of alpha is 0.0012220243311773515
difference of beta is 0.06646627890120271
iter_num: 9450
difference of alpha is 0.0012192764492500174
difference of beta is 0.06631496953627902
iter_num: 9500
difference of alpha is 0.0012165248755309221
difference of beta is 0.06616347080932883
iter_num: 9550
difference of alpha is 0.0012137697692082838
difference of beta is 0.0

iter_num: 4100
difference of alpha is 0.0014006102621939576
difference of beta is 0.07815891307274114
iter_num: 4150
difference of alpha is 0.0014005751378380893
difference of beta is 0.07815707782373238
iter_num: 4200
difference of alpha is 0.001400486625840358
difference of beta is 0.07815214457849606
iter_num: 4250
difference of alpha is 0.0014003458804108249
difference of beta is 0.07814418401221701
iter_num: 4300
difference of alpha is 0.001400154023619038
difference of beta is 0.07813326464849979
iter_num: 4350
difference of alpha is 0.0013999121466312658
difference of beta is 0.07811945295406986
iter_num: 4400
difference of alpha is 0.0013996213109379596
difference of beta is 0.07810281342898406
iter_num: 4450
difference of alpha is 0.0013992825494781869
difference of beta is 0.07808340869013364
iter_num: 4500
difference of alpha is 0.0013988968677400848
difference of beta is 0.078061299551905
iter_num: 4550
difference of alpha is 0.0013984652447813772
difference of beta is 0.07


100%|██████████| 7/7 [00:07<00:00,  1.12s/it][A
[A

difference of beta is 0.07604470700647425
iter_num: 6250
difference of alpha is 0.0013620914361744951
difference of beta is 0.07595629723658703
iter_num: 6300
difference of alpha is 0.0013605180634002068
difference of beta is 0.0758665987322047
iter_num: 6350
difference of alpha is 0.0013589222077659002
difference of beta is 0.0757756368722653
iter_num: 6400
difference of alpha is 0.0013573043007575336
difference of beta is 0.0756834365429313
iter_num: 6450
difference of alpha is 0.0013556647658052867
difference of beta is 0.07559002214890143
iter_num: 6500
difference of alpha is 0.0013540040185127111
difference of beta is 0.0754954176285878
iter_num: 6550
difference of alpha is 0.0013523224668183786
difference of beta is 0.07539964646377939
iter_num: 6600
difference of alpha is 0.0013506205112303604
difference of beta is 0.07530273169362545
iter_num: 6650
difference of alpha is 0.0013488985449683355
difference of beta is 0.07520469592435575
iter_num: 6700
difference of alpha is 0.0013

100%|██████████| 1/1 [00:07<00:00,  7.92s/it]

Index(['instance_id', 'user_gender_id_smooth_rate'], dtype='object')
the shape of train (478087, 2)
the shape of test (18371, 2)





In [3]:
# Fraction of missing values per column of the smoothed train features.
smooth_train.isna().mean()


instance_id                   0.000000
user_gender_id_smooth_rate    0.163693
dtype: float64