In [3]:
import pandas as pd
import numpy as np
import os
import pickle

In [2]:
node2id = pickle.load(open('./node2id.pkl', 'rb'))
id2node = pickle.load(open('./id2node.pkl', 'rb'))

In [3]:
id2node[0],id2node[25467]

('B02AD69BFD4A4FEEDFAB6DBB36DCC167', '高德地图')

In [4]:
node2id[id2node[0]], node2id[id2node[25467]]

(0, 25467)

In [38]:
base_dir = "/Users/chenhui/fsdownload"
all_data1 = pd.read_csv(os.path.join(base_dir, "input/all_data1.csv"), encoding='gbk')
all_data2 = pd.read_csv(os.path.join(base_dir, "input/all_data2.csv"), encoding='gbk')

In [7]:
all_data1.head()

Unnamed: 0,id,app,link
0,B960ABDB6F588FE3986A7BECB9C4C534,高德地图,1
1,B05F998104D00D8218156EED80145B66,百度搜索,1
2,D353C7D37EDD8B5D1EFE2C1FDE8011A2,开心消消乐,1
3,F5CC6B2CCAC733CB342C924A9D8B2D60,腾讯视频,1
4,FF4DB3AE6F35F5F0362BFC22428CDF89,携程旅行,1


In [8]:
def hadamard(v1, v2):
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)
    return v1 * v2

In [33]:
def get_embeddings(file_name):
    embeddings_dict = {}
    embeddings_file = os.path.join("/Users/chenhui/fsdownload/user_app", file_name)
    with open(embeddings_file, 'r') as f:
        num_nodes, emb_size = list(map(int, f.readline().strip().split()))
        print(num_nodes, emb_size)
        for i in range(num_nodes):
            line = list(map(float, f.readline().strip().split()))
            embeddings_dict[line[0]] = line[1:]
    return embeddings_dict

In [40]:
def generate_features(all_data, embeddings_dict, file_name):
    dir_path = os.path.join('./', "user_app_features")
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    features_file = os.path.join('./user_app_features', "%s" % file_name)
    if os.path.exists(features_file):
        return pd.read_csv(features_file)
    features = []
    for i in range(len(all_data)):
        sample = all_data.iloc[i]
        userid = node2id[sample['id']]
        appid = node2id[sample['app']]
        feature_vec = hadamard(embeddings_dict[userid], embeddings_dict[appid])
        features.append([userid, appid, sample['link']] + feature_vec.tolist())
    feature_data = pd.DataFrame(features, columns=['user', 'app', 'label'] + ['feature_%s' % i for i in range(128)])
    feature_data.to_csv(features_file, index=False)
    return feature_data

In [39]:
embeddings1 = get_embeddings("user_app0117.embeddings")
feature_data1 = generate_features(all_data1, embeddings1, "user_app0117.features")
embeddings2 = get_embeddings("user_app0118.embeddings")
feature_data2 = generate_features(all_data1, embeddings1, "user_app0118.features")

25468 128
25468 128


### 模型

In [4]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, roc_auc_score

In [46]:
features = ["feature_%s" % i for i in range(128)]
X_train = feature_data1.loc[:, features].values
y_train = feature_data1['label'].values
X_test = feature_data2.loc[:, features].values
y_test = feature_data2['label'].values

In [47]:
X_train.shape, y_train.shape

((380480, 128), (380480,))

In [70]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

In [71]:
accuracy_score(y_true=y_test, y_pred=lr_preds),\
precision_score(y_true=y_test, y_pred=lr_preds),\
recall_score(y_true=y_test, y_pred=lr_preds),\
f1_score(y_true=y_test, y_pred=lr_preds)

(0.77175935660218675,
 0.73766716161983004,
 0.84348191757779645,
 0.78703383533324012)

In [85]:
lr_proba = lr_model.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=lr_proba)

0.82071448520208368

In [86]:
naive_auc(preds=lr_proba, labels=y_test), approximate_auc(preds=lr_proba, labels=y_test)

(0.8207144852020837, 0.8207046353094953)

In [67]:
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

0.88016487200929983

In [69]:
accuracy_score(y_true=y_test, y_pred=xgb_preds),\
precision_score(y_true=y_test, y_pred=xgb_preds),\
recall_score(y_true=y_test, y_pred=xgb_preds),\
f1_score(y_true=y_test, y_pred=xgb_preds)

(0.87835103027754413,
 0.86723401650008414,
 0.89348717409587886,
 0.88016487200929983)

In [77]:
xgb_proba = xgb_model.predict_proba(X_test)[:, 1]
roc_auc_score(y_true=y_test, y_score=xgb_proba)

0.94323150970028746

In [84]:
naive_auc(preds=xgb_proba, labels=y_test), approximate_auc(preds=xgb_proba, labels=y_test)

(0.9432314057249008, 0.9431825190429415)

In [75]:
xgb_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [89]:
xgb_model.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': True,
 'subsample': 1}

In [5]:
xgb_model = xgb.XGBClassifier()

In [None]:
xgb_model.

In [83]:
def naive_auc(preds, labels):
    M = sum(labels)
    N = len(labels) - M
    total_pairs = M * N
    accumulated_neg = 0
    satisfied_pairs = 0
    preds_labels = sorted(zip(preds, labels), key=lambda x: x[0])
    for i in range(len(labels)):
        if preds_labels[i][1] == 1:
            satisfied_pairs += accumulated_neg
        else:
            accumulated_neg += 1
    return satisfied_pairs/float(total_pairs)
    
def approximate_auc(preds, labels, bins=100):
    M = sum(labels)
    N = len(labels) - M
    total_pairs = M * N
    accumulated_neg = 0
    satisfied_pairs = 0
    pos = [0] * bins
    neg = [0] * bins
    width = 1.0/bins
    for i in range(len(labels)):
        bin = int(preds[i]/width)
        if labels[i] == 1:
            pos[bin] += 1
        else:
            neg[bin] += 1
    for i in range(bins):
        satisfied_pairs += pos[i] * accumulated_neg + 0.5 * pos[i] * neg[i] # n1 + 0.5*n2
        accumulated_neg += neg[i]
    return satisfied_pairs/float(total_pairs)

In [34]:
import scipy.io as sio
import scipy.sparse as sparse

In [36]:
blog = sio.loadmat('../deepwalk/example_graphs/blogcatalog.mat')

In [37]:
network = blog['network']

In [41]:
type(network), network

(scipy.sparse.csc.csc_matrix,
 <10312x10312 sparse matrix of type '<class 'numpy.float64'>'
 	with 667966 stored elements in Compressed Sparse Column format>)

In [40]:
network[0].todense()

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [25]:
from functools import reduce
a = [1, 2, 3]
reduce(lambda x,y : x*y , a)

6

In [29]:
b = [1, 1, 2, 2, 3]
reduce(lambda x,y : x^y, b)

3

In [19]:
a

[[0, 0, 0], [0, 0, 0]]

In [20]:
a[0][1] = 1

In [21]:
a

[[0, 1, 0], [0, 1, 0]]

In [6]:
5**10

9765625

In [13]:
2.3*3.3

7.589999999999999