In [11]:
import os
import sys
import pandas as pd
import numpy as np
import pickle

In [256]:
data0117 = pd.read_csv('../datasets/nfp/data0117.csv')
data0117.head()

Unnamed: 0,id,hour,app,app_cat,times
0,E0A0D252AB03C498807BFA8CEE739FB6,17,微信,通讯聊天,20
1,FB80F10790E7D793E46918CB0720E91E,10,微信,通讯聊天,2
2,C876D14B19676A576E84C5D1953CFE9B,19,微信,通讯聊天,280
3,CCDA8CF84778FE5F15725D1F1BA46F52,15,腾讯视频,视频服务,22
4,E1A0CF5C94706D3686452D8BBDCC3BC3,7,QQ,通讯聊天,10


In [10]:
len(data0117)

950284

In [125]:
data0117.dtypes

id                object
hour               int64
app               object
app_cat           object
times              int64
installed_apps    object
dtype: object

### 1. 用户的App安装列表

In [261]:
installed_apps = data0117.groupby('id', as_index=False)['app'].agg({'installed_apps': lambda x: set(x.tolist())})
installed_apps.head()

Unnamed: 0,id,installed_apps
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,"{微信, 百度云, 支付宝, Keep, 开心消消乐, 金山WPS OFFICE手机版, 百..."
1,B02B18790004038B27DDBEB461D2EAD8,"{微信, 大众点评网, 腾讯视频, QQ, 贪吃蛇大作战, 百度搜索}"
2,B02B7DD46B57A2374C936C799377A183,"{钉钉, 微信, 爱奇艺, 百度搜索}"
3,B02BA3E7E93A7BC84B5669C974FB0FAE,"{微信, 携程旅行, 京东商城, 支付宝, 网易云音乐, Keep, 腾讯视频, QQ, 大..."
4,B02BCC791EB79100420BF041BAF31D4F,"{微信, 支付宝, 京东商城, Keep, 腾讯视频, 大众点评网, QQ, 开心消消乐, ..."


In [262]:
len(installed_apps)

25413


---
2. 用户当前时刻使用的App列表
---
这一特征需要对数据进行按用户和时间进行排序，然后取时间偏置merge到一起

---

In [178]:
positive_apps = data0117.groupby(['id', 'hour'], as_index=False)['app'].agg({'positive_apps': lambda x: set(x.tolist())})
positive_apps.head()

Unnamed: 0,id,hour,positive_apps
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,{百度搜索}
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,"{微信, 支付宝, Keep, 开心消消乐, 百度搜索, 搜狗输入法, 今日头条}"
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,8,"{微信, 支付宝, Keep, 百度搜索, 搜狗输入法, 今日头条}"
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,9,"{微信, 搜狗输入法, 今日头条}"
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,10,"{微信, 搜狗输入法, 今日头条, 支付宝}"


In [179]:
temp = positive_apps['positive_apps'].apply(lambda x: len(x))
min(temp), max(temp)

(1, 19)

In [287]:
app_cat = data0117.groupby('app', as_index=False).agg({'app_cat': lambda x: ','.join(set(x.tolist()))})
print(len(app_cat))
app_cat.head()

55


Unnamed: 0,app,app_cat
0,58同城,便捷生活
1,Keep,健康医疗
2,QQ,通讯聊天
3,QQ空间,社交网络
4,一号店,电子商务


---
3. 负采样构建训练集
---
将用户当前时刻使用过app作为正例，对未使用过的app做负采样

---

In [174]:
950000, 18*25413*55  # 所以要进行负采样

(950000, 25158870)

In [175]:
applist = list(set(data0117['app']))
len(applist)

55

In [237]:
import random
def neg_sampling(pos_samples, num_samples=5):
    num_samples = min(num_samples, len(applist) - len(pos_samples))
    neg_samples = []
    while len(neg_samples) < num_samples:
        sample = random.choice(applist)
        if sample in pos_samples or sample in neg_samples:
            continue
        neg_samples.append(sample)
    return neg_samples

In [238]:
neg_sampling(set(['微信', '搜狗输入法', '今日头条']))

['苏宁', '美柚', '网易新闻', '京东商城', '美颜相机']

In [243]:
negative_apps = positive_apps.copy()
negative_apps['negative_apps'] = negative_apps['positive_apps'].apply(lambda x: set(neg_sampling(x)))
negative_apps.drop('positive_apps', axis=1, inplace=True)
negative_apps.head()

Unnamed: 0,id,hour,negative_apps
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,"{QQ空间, 一号店, QQ, 开心消消乐, 南方航空}"
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,"{美柚, 网易新闻, 中华万年历, 贪吃蛇大作战, 新浪微博}"
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,8,"{柚宝宝孕育, 蘑菇街, 中华万年历, 金山WPS OFFICE手机版, 中国国际航空}"
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,9,"{咕咚运动, 钉钉, 腾讯视频, 美柚, 洋码头}"
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,10,"{百度地图, 海南航空, 饿了么, 蘑菇街, 追书神器}"


# TODO 
当前负样本是根据某一时刻的正样本产生的，如果当前时刻没有使用手机还要不要产生负样本

---
4. dataframe文本内容列转行，一行转多行
---
```python
df = pd.DataFrame({'A':['1','2','3'],'B':['1','2,3','4,5,6'],'C':['3','3','3']})
df = (df.set_index(['A','C'])['B']
       .str.split(',', expand=True)
       .stack()
       .reset_index(level=2, drop=True)
       .reset_index(name='B'))
print(df)
```

---

In [244]:
pos = positive_apps.set_index(['id','hour'])['positive_apps'].apply(lambda x: ','.join(x)).str.split(',', expand=True)\
.stack().reset_index(level=2, drop=True).reset_index(name='app')
pos['click'] = 1
pos.head()

Unnamed: 0,id,hour,app,click
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,百度搜索,1
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,微信,1
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,支付宝,1
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,Keep,1
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,开心消消乐,1


In [252]:
neg = negative_apps.set_index(['id','hour'])['negative_apps'].apply(lambda x: ','.join(x)).str.split(',', expand=True)\
.stack().reset_index(level=2, drop=True).reset_index(name='app')
neg['click'] = 0
neg.head(7)

Unnamed: 0,id,hour,app,click
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,QQ空间,0
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,一号店,0
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,QQ,0
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,开心消消乐,0
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,南方航空,0
5,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,美柚,0
6,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,网易新闻,0


In [247]:
len(pos), len(neg)

(950284, 1485725)

In [253]:
data = pd.concat([pos, neg], axis=0)
data.head()

Unnamed: 0,id,hour,app,click
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,百度搜索,1
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,微信,1
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,支付宝,1
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,Keep,1
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,开心消消乐,1


In [254]:
data.tail()

Unnamed: 0,id,hour,app,click
1485720,FFFF65C708A33E03D2DF17A588444F82,22,支付宝,0
1485721,FFFF65C708A33E03D2DF17A588444F82,22,去哪儿旅行网,0
1485722,FFFF65C708A33E03D2DF17A588444F82,22,大智慧,0
1485723,FFFF65C708A33E03D2DF17A588444F82,22,太平洋汽车,0
1485724,FFFF65C708A33E03D2DF17A588444F82,22,新浪微博,0


In [255]:
len(data)

2436009

---
5. 添加特征
---
将之前生成用户App安装列表，上一时刻使用的App等特征扩展到数据集中

---

In [264]:
temp = installed_apps.copy()
temp['installed_apps'] = temp['installed_apps'].apply(lambda x: len(x))
temp.head()

Unnamed: 0,id,installed_apps
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,9
1,B02B18790004038B27DDBEB461D2EAD8,6
2,B02B7DD46B57A2374C936C799377A183,4
3,B02BA3E7E93A7BC84B5669C974FB0FAE,12
4,B02BCC791EB79100420BF041BAF31D4F,10


In [265]:
data = pd.merge(data, temp, on=['id'], how='left')
data.head()

Unnamed: 0,id,hour,app,click,installed_apps
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,百度搜索,1,9
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,微信,1,9
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,支付宝,1,9
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,Keep,1,9
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,开心消消乐,1,9


In [276]:
# 上一时刻使用app列表，与原dataframe将来merge到一起
temp = positive_apps.copy()
temp['hour'] += 1  # 记录上一时刻使用的App列表
temp.rename(columns={'positive_apps': 'last_apps'}, inplace=True)
temp['last_apps'] = temp['last_apps'].apply(lambda x: len(x))
temp.head()

Unnamed: 0,id,hour,last_apps
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,1
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,8,7
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,9,6
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,10,3
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,11,4


In [277]:
data = pd.merge(data, temp, on=['id', 'hour'], how='left')
data.head()

Unnamed: 0,id,hour,app,click,installed_apps,positive_apps,last_apps
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,百度搜索,1,9,,
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,微信,1,9,1.0,1.0
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,支付宝,1,9,1.0,1.0
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,Keep,1,9,1.0,1.0
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,开心消消乐,1,9,1.0,1.0


In [281]:
data.fillna(0, inplace=True)
data.head()

Unnamed: 0,id,hour,app,click,installed_apps,last_apps
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,百度搜索,1,9,0.0
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,微信,1,9,1.0
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,支付宝,1,9,1.0
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,Keep,1,9,1.0
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,开心消消乐,1,9,1.0


In [288]:
data = pd.merge(data, app_cat, on=['app'], how='left')
data.head()

Unnamed: 0,id,hour,app,click,installed_apps,last_apps,app_cat
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,百度搜索,1,9,0.0,实用工具
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,微信,1,9,1.0,通讯聊天
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,支付宝,1,9,1.0,金融理财
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,Keep,1,9,1.0,健康医疗
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,开心消消乐,1,9,1.0,游戏服务


In [290]:
node2id['B02AD69BFD4A4FEEDFAB6DBB36DCC167'], node2id['微信']

(0, 25433)

---
6.添加graph embedding特征
---
该特征是通过对User-App网络进行网络表示学习得到节点的embedding表示，不仅包含低维稠密的信息，更包含节点之间的语义信息

---

In [291]:
node2id = pickle.load(open('./node2id.pkl', 'rb'))
len(node2id)

25468

In [295]:
def hadamard(vec1, vec2):
    return list(map(lambda x: x[0]*x[1], zip(vec1, vec2)))

In [293]:
def get_embeddings(file_name):
    embeddings_dict = {}
    embeddings_file = os.path.join("/Users/chenhui/fsdownload/user_app", file_name)
    with open(embeddings_file, 'r') as f:
        num_nodes, emb_size = list(map(int, f.readline().strip().split()))
        print(num_nodes, emb_size)
        for i in range(num_nodes):
            line = list(map(float, f.readline().strip().split()))
            embeddings_dict[line[0]] = line[1:]
    return embeddings_dict

In [296]:
embeddings = get_embeddings('user_app0117.embeddings')

25468 128


In [298]:
userid = node2id['B02AD69BFD4A4FEEDFAB6DBB36DCC167']
appid = node2id['微信']
hadamard(embeddings[userid], embeddings[appid])

[0.00547985938768246,
 0.08373862471515771,
 0.0081825954678621,
 0.02133262384388,
 0.010413039525126,
 4.45402708464813e-05,
 0.0013803316024664398,
 -0.00029128177872351596,
 0.0006449705735577931,
 -0.004441708427,
 -0.011481535397941,
 0.0201514202245125,
 0.0021068330482201204,
 -0.0036660780152397696,
 -0.00472219062326581,
 0.00465142562418688,
 0.0166023902956265,
 -0.009229382901456,
 0.0014234747308017601,
 0.0362908186130871,
 -0.008459967581522461,
 0.0090443487200418,
 0.0027976032952724697,
 0.017538177499626,
 0.0024467861676866097,
 0.049183265648834,
 -0.00451394536758224,
 0.0063135516659559,
 0.005049091436492999,
 0.0004921299257190401,
 0.0018871195055477398,
 0.000513286643622082,
 0.00186529558777864,
 -0.001074308423321588,
 0.00921637968394352,
 0.00073164137754246,
 -0.0012919357864508302,
 -0.0034681163169911398,
 -0.013759772096600039,
 0.00350278641350145,
 0.000443601911097,
 -0.0175489555458028,
 0.00729503111558574,
 -0.018615899576826002,
 0.0187350259

In [294]:
data.columns

Index(['id', 'hour', 'app', 'click', 'installed_apps', 'last_apps', 'app_cat'], dtype='object')

In [317]:
user_apps = data.loc[:, ['id', 'app']].drop_duplicates()
user_apps.head()

Unnamed: 0,id,app
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,百度搜索
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,微信
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,支付宝
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,Keep
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,开心消消乐


In [308]:
len(temp)

959677

In [310]:
temp = user_apps.apply(lambda row: hadamard(embeddings[node2id[row['id']]], embeddings[node2id[row['app']]]), axis=1)

In [315]:
temp = user_apps['graph_emb'].apply(lambda col: ','.join(map(str, col)))

In [316]:
temp.head()

0    -0.0051594073935908,0.0666825147279899,0.01853...
1    0.00547985938768246,0.08373862471515771,0.0081...
2    -0.0058435140981085995,0.1172621817009695,-0.0...
3    -0.0454774110441126,0.12108299415182061,-0.015...
4    0.00423594427914244,-0.02816071338299894,0.027...
Name: graph_emb, dtype: object

In [320]:
temp = temp.str.split(',', expand=True)
temp.columns = ['emb_%s' % i for i in range(128)]
temp.head()

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,-0.0051594073935908,0.0666825147279899,0.018539364552579,0.0114876858774192,0.0063361840954523,-0.0059881853462732,0.0088235055821909,-0.0009744207487502,0.0027596708412773,0.001258217567854,...,0.0532692745031955,0.0053896737044286,-0.018579164025534,-0.0027798116243703,-0.0001637171336776,0.0232853278067449,-0.0004981661970483,0.0072735408459075,-2.95532105771628e-05,-0.0058625028675305
1,0.0054798593876824,0.0837386247151577,0.0081825954678621,0.02133262384388,0.010413039525126,4.45402708464813e-05,0.0013803316024664,-0.0002912817787235,0.0006449705735577,-0.004441708427,...,0.05624127880245,0.0050387043105246,0.0174908688997336,-0.0007727657783177,-0.001254686781139,0.0141222844821219,0.0001524598415045,0.0066193911949005,-7.976739848230561e-05,-0.0040666413826326
2,-0.0058435140981085,0.1172621817009695,-0.0004031217864355,0.0064984914666523,0.0048014106873092,0.0030457186715793,0.0187365228474078,-0.0014447045104649,0.002002165914987,-0.01653579242614,...,0.0647455255691999,0.0037219867270488,0.020390728582534,-0.0048556097269823,0.0035372942365255,0.013201818435589,-0.0030404271371136,-0.0032332441818288,-7.10689383652776e-05,-0.0047330230107115
3,-0.0454774110441126,0.1210829941518206,-0.0150667818984648,0.021571585867524,0.0053595031181847,-0.0058139890027163,0.0259050511611456,0.0070046126723843,0.0033656174420928,-0.000864143795372,...,0.0106611927907857,0.0022276670574426,0.0185119821638586,-0.0052887212912299,0.0006997969256685,-0.0107331734344963,-0.0020549772552446,-0.0024286700911163,0.0002093160423742,-0.0039006221963837
4,0.0042359442791424,-0.0281607133829989,0.02708287429059,-0.018418750754026,0.0073998160262664,-0.0120328013794302,0.0005910400806254,0.0015856612144474,0.0028360615043254,0.007034839892912,...,0.03266895085413,0.0034214489954958,0.057234973404225,0.0007363980335044,-0.0033633517557612,-0.0152770808816899,-0.0026509078813809,0.0015645956417787,-0.0002149809650651,-0.0031446312667613


In [324]:
user_apps = pd.concat([user_apps, temp], axis=1)

In [325]:
data = pd.merge(data, user_apps, on=['id', 'app'], how='left')

In [327]:
data.head()

Unnamed: 0,id,hour,app,click,installed_apps,last_apps,app_cat,emb_0,emb_1,emb_2,...,emb_118,emb_119,emb_120,emb_121,emb_122,emb_123,emb_124,emb_125,emb_126,emb_127
0,B02AD69BFD4A4FEEDFAB6DBB36DCC167,6,百度搜索,1,9,0.0,实用工具,-0.0051594073935908,0.0666825147279899,0.018539364552579,...,0.0532692745031955,0.0053896737044286,-0.018579164025534,-0.0027798116243703,-0.0001637171336776,0.0232853278067449,-0.0004981661970483,0.0072735408459075,-2.95532105771628e-05,-0.0058625028675305
1,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,微信,1,9,1.0,通讯聊天,0.0054798593876824,0.0837386247151577,0.0081825954678621,...,0.05624127880245,0.0050387043105246,0.0174908688997336,-0.0007727657783177,-0.001254686781139,0.0141222844821219,0.0001524598415045,0.0066193911949005,-7.976739848230561e-05,-0.0040666413826326
2,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,支付宝,1,9,1.0,金融理财,-0.0058435140981085,0.1172621817009695,-0.0004031217864355,...,0.0647455255691999,0.0037219867270488,0.020390728582534,-0.0048556097269823,0.0035372942365255,0.013201818435589,-0.0030404271371136,-0.0032332441818288,-7.10689383652776e-05,-0.0047330230107115
3,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,Keep,1,9,1.0,健康医疗,-0.0454774110441126,0.1210829941518206,-0.0150667818984648,...,0.0106611927907857,0.0022276670574426,0.0185119821638586,-0.0052887212912299,0.0006997969256685,-0.0107331734344963,-0.0020549772552446,-0.0024286700911163,0.0002093160423742,-0.0039006221963837
4,B02AD69BFD4A4FEEDFAB6DBB36DCC167,7,开心消消乐,1,9,1.0,游戏服务,0.0042359442791424,-0.0281607133829989,0.02708287429059,...,0.03266895085413,0.0034214489954958,0.057234973404225,0.0007363980335044,-0.0033633517557612,-0.0152770808816899,-0.0026509078813809,0.0015645956417787,-0.0002149809650651,-0.0031446312667613


In [328]:
data.to_csv('./data.csv', index=False)

In [330]:
len(data)

2436009

---
训练
---

---

In [336]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
# import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score

In [338]:
from sklearn.model_selection import train_test_split

In [None]:
X = data.iloc[:, 4:].values
y = data['click'].values
X.shape, y.shape

In [313]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [314]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 32,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

'[1, 2, 3]'

In [None]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [None]:
gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=5000,
                    valid_sets=lgb_eval,
                    early_stopping_rounds=50,
                    verbose_eval=50,
                    )

In [None]:
f1_score(y_test, np.where(gbm.predict(X_test, num_iteration=gbm.best_iteration) > 0.5, 1, 0))