# 1. Packages

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence,pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
torch.set_default_tensor_type(torch.DoubleTensor)
import numpy as np
import pandas as pd


import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance

import datetime
from datetime import timedelta,date,datetime
import copy
from scipy import stats
import math
import collections
import importlib

import sklearn
from sklearn import manifold
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score,accuracy_score,f1_score,precision_score,recall_score
from sklearn.preprocessing import MinMaxScaler,RobustScaler,OneHotEncoder
from sklearn.model_selection import GridSearchCV,cross_val_score,train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.mixture import GaussianMixture

import networkx as nx
import community
import dgl

import matplotlib.pyplot as plt 
import seaborn as sns

import os
import argparse
import time
import random

plt.rcParams['font.sans-serif'] = ['SimHei']  # font family set
plt.rcParams['axes.unicode_minus'] = False  # for minus display
plt.style.use('seaborn')
sns.set_style(style='ticks') #set background
plt.rcParams['figure.dpi'] = 200
%config InlineBackend.figure_format = 'svg' #show svg pictures
import warnings
warnings.filterwarnings("ignore")

# 2. Dataset

## 2.1 transaction logs & profile

In [None]:
# portrait2.csv supplies the online_time column; its 'ds' is parsed as
# month/day/2-digit-year and rendered back to text so it can be joined on.
profile2 = pd.read_csv('new_dataset/portrait2.csv')
profile2['ds'] = pd.to_datetime(profile2['ds'], format='%m/%d/%y').astype(str)

# Replace portrait.csv's own online_time with the one from portrait2.csv
# (inner join on day + role id).
profile = (
    pd.read_csv('new_dataset/portrait.csv')
    .drop(columns=['online_time'])
    .merge(profile2[['ds', 'roleid', 'online_time']], on=['ds', 'roleid'])
)
print(profile.columns)

In [None]:
trade = pd.read_csv('new_dataset/trade.csv')
positive_samples = pd.read_csv('new_dataset/label.csv')
# From label2.csv only the rows flagged is_mask == 1 are kept.
new_positive_samples = pd.read_csv('new_dataset/label2.csv').loc[
    lambda labels: labels['is_mask'] == 1
]

# Row counts of every table.
print(trade.shape[0])
print(profile.shape[0])
print(positive_samples.shape[0]) #very few RMT logs
print(len(new_positive_samples))

In [None]:
# Flag each trade that also appears in label.csv; a trade is matched on its
# timestamp and on both role ids.
label_trade = trade.merge(
    positive_samples,
    on=['ts', 'roleid_src', 'roleid_dst'],
    how='left',
    indicator=True,
)
label_trade['label'] = label_trade['_merge'].eq('both').astype(int)
label_trade = label_trade.drop(columns='_merge')
print(label_trade['label'].sum())

# Day part of the timestamp (text before the first whitespace).
label_trade['ds'] = label_trade['ts'].map(lambda stamp: stamp.split()[0])
# Add a constant for the reverse sum (presumably so zero-amount trades still
# carry a sign once the reversed copies are negated — confirm).
label_trade['trade_money'] = label_trade['trade_money'].add(0.1)

# Same matching against the is_mask == 1 rows of label2.csv -> 'label2'.
label_trade = label_trade.merge(
    new_positive_samples,
    on=['ts', 'roleid_src', 'roleid_dst'],
    how='left',
    indicator=True,
)
label_trade['label2'] = label_trade['_merge'].eq('both').astype(int)
print(label_trade['label2'].sum())
label_trade = label_trade.drop(columns='_merge')
print(label_trade.head(5))

In [None]:
# NOTE(review): this cell reads `merger`, which in the visible notebook is
# first assigned in the later "add samples in the reverse trading direction"
# cell, and it repeats the "construct the transaction log for use" cell
# further down line for line. On a fresh top-to-bottom run it would stop with
# a NameError — presumably a leftover cell; confirm before relying on it.
samples = merger.merge(profile,left_on=['roleid_src','ds'],right_on=['roleid','ds'])
print(len(samples.values))
samples = samples.merge(profile,left_on=['roleid_dst','ds'],right_on=['roleid','ds'])
print(len(samples.values))
samples.sort_values(by='ts',inplace=True)
samples.drop(columns=['ds','os_x','os_y','channel_x','channel_y','roleid_x','roleid_y','create_time_x','create_time_y'],inplace=True)

In [None]:
# Add a mirrored copy of every trade (reverse trading direction).
# Original rows get reverse=0; mirrored rows get reverse=1, swapped role ids
# and a negated amount (the original amounts are positive).
label_trade['reverse'] = 0
label_trade_re = (
    label_trade
    .rename(columns={'roleid_src': 'roleid_dst', 'roleid_dst': 'roleid_src'})
    [list(label_trade.columns)]  # restore the original column order after the swap
    .assign(reverse=1, trade_money=lambda frame: -frame['trade_money'])
)

# Stack mirrored + original rows, order them by time, and drop the day column.
merger = (
    pd.concat([label_trade_re, label_trade])
    .sort_values(by='ts')
    .drop(columns='ds')
)

In [None]:
# Feature: accumulated virtual money (net money) of a trader, i.e. the sum of
# the signed trade_money over all rows where the trader is roleid_dst.
node_money = (
    merger.groupby(['roleid_dst'])['trade_money']
    .sum()
    .reset_index()
    .rename(columns={'roleid_dst': 'Id', 'trade_money': 'net_money'})
)

In [None]:
# Keep the (src, dst) pairs whose summed signed amount is non-negative
# (the buyer -> seller direction); only the two id columns are retained.
merger_1 = (
    merger.groupby(['roleid_src', 'roleid_dst'])['trade_money']
    .sum()
    .reset_index()
    .loc[lambda pairs: pairs['trade_money'] >= 0, ['roleid_src', 'roleid_dst']]
)

# Turn the 0/1 reverse flag into a +1/-1 sign and multiply it into the amount,
# which undoes the negation applied to the mirrored rows.
merger['reverse'] = merger['reverse'].eq(0).map({True: 1, False: -1})
merger['trade_money'] = merger['trade_money'].mul(merger['reverse'])
# Day part of the timestamp, needed to join the daily profile.
merger['ds'] = merger['ts'].map(lambda stamp: stamp.split()[0])

In [None]:
# Construct the transaction log for use: attach the same-day profile of the
# source role, then of the destination role (inner joins; the second join
# produces the _x / _y suffixes), printing the row count after each join.
samples = merger
for endpoint in ('roleid_src', 'roleid_dst'):
    samples = samples.merge(profile, left_on=[endpoint, 'ds'], right_on=['roleid', 'ds'])
    print(samples.shape[0])

# Order by time and drop the join keys / categorical profile columns.
samples = samples.sort_values(by='ts').drop(columns=[
    'ds',
    'os_x', 'os_y',
    'channel_x', 'channel_y',
    'roleid_x', 'roleid_y',
    'create_time_x', 'create_time_y',
])

In [None]:
# Feature: trade amount related.
# Unit price of each trade, the mean unit price of its item, and the gap
# between the two.
samples['price'] = samples['trade_money'].div(samples['trade_cnt'])
tmp = (
    samples.groupby('trade_item', as_index=False)['price']
    .mean()
    .rename(columns={'price': 'avg_price'})
)
samples = samples.merge(tmp, on='trade_item')
samples['diff_price'] = samples['price'].sub(samples['avg_price'])

In [None]:
# Feature: trade pattern related.

# Number of log rows per (src, dst) pair.
both_samples_4 = (
    merger.groupby(['roleid_src', 'roleid_dst'])['label']
    .count()
    .reset_index()
    .rename(columns={'label': 'trade_n_times'})
)
print(both_samples_4.columns)

# Number of rows per pair that share the same amount.
both_samples_5 = (
    merger.groupby(['roleid_src', 'roleid_dst', 'trade_money'])['label']
    .count()
    .reset_index()
    .rename(columns={'label': 'trade_money_times'})
)
print(both_samples_5.columns)

# Number of rows per pair that share the same item.
both_samples_6 = (
    merger.groupby(['roleid_src', 'roleid_dst', 'trade_item'])['label']
    .count()
    .reset_index()
    .rename(columns={'label': 'trade_item_times'})
)
print(both_samples_6.columns)

# Seconds elapsed since the previous row of the same pair (NaN for the first
# row of each pair, which has no predecessor).
previous_ts = merger.groupby(['roleid_src', 'roleid_dst'])['ts'].shift(1)
merger['ts_between'] = (
    pd.to_datetime(merger['ts']) - pd.to_datetime(previous_ts)
).dt.total_seconds()

merger.sort_values(by=['roleid_src', 'roleid_dst', 'ts'], inplace=True)
# Back-fill the leading NaN of each pair from the following row. The result is
# assigned back explicitly: the previous
# `merger['ts_between'].fillna(method='bfill', inplace=True)` operated on a
# temporary column object, which does not update `merger` under pandas
# Copy-on-Write, and `fillna(method=...)` is deprecated.
# NOTE(review): the fill is not group-aware, so a pair with a single row
# borrows the value of the next pair; such pairs appear to be filtered out
# later via trade_n_times > 1 — confirm.
merger['ts_between'] = merger['ts_between'].bfill()
print(merger.columns)

In [None]:
# Feature combination for transaction logs.
samples_1 = samples.merge(
    merger[['roleid_src', 'roleid_dst', 'ts', 'ts_between']],
    how='left',
    on=['roleid_src', 'roleid_dst', 'ts'],
)
# Pair-level counters, each joined on its own key set.
for counts, keys in (
    (both_samples_4, ['roleid_src', 'roleid_dst']),
    (both_samples_5, ['roleid_src', 'roleid_dst', 'trade_money']),
    (both_samples_6, ['roleid_src', 'roleid_dst', 'trade_item']),
):
    samples_1 = samples_1.merge(counts, how='left', on=keys)
# Net money of the source and then of the destination role; the second join
# is what creates the Id_x/Id_y and net_money_x/net_money_y suffixes.
for endpoint in ('roleid_src', 'roleid_dst'):
    samples_1 = samples_1.merge(node_money, how='left', left_on=endpoint, right_on='Id')
samples_1 = samples_1.drop(columns=['Id_x', 'Id_y'])
print(samples_1.columns)

# Move both label columns to the end of the frame (label, then label2).
samples_1['label'] = samples_1.pop('label')
samples_label = samples_1.pop('label2')
samples_1['label2'] = samples_label
print(samples_1.head(5))
print(samples_1.info())

# Filter the transaction sequences with length > 1.
samples_1 = samples_1.sort_values(by='ts')
samples_1 = samples_1[samples_1['trade_n_times'] > 1]
print(len(samples_1))

## 2.2 transaction sequences

In [None]:
# Aggregate the transaction sequences between pairs of traders.
# agg_dict maps every feature column to the list of reductions applied to it.
agg_dict = {}
for column in samples_1.columns:
    if 'roleid' in column or column in ('ts', 'trade_item'):
        continue  # identifiers and raw keys are not aggregated
    if column in ('trade_money', 'price', 'trade_money_times'):
        reductions = ['max', 'sum']
    elif column == 'trade_item_times':
        reductions = ['max']
    elif column == 'ts_between':
        reductions = ['min', 'mean']
    elif column == 'reverse':
        reductions = ['sum']
    else:
        reductions = ['mean']
    agg_dict[column] = reductions
print(agg_dict)


def f(x):
    """Reduce one (src, dst) group to a Series of '<column>_<how>' values.

    The columns and reductions come from the module-level ``agg_dict``; only
    'mean', 'min', 'max' and 'sum' are recognised, anything else is ignored.
    """
    supported = ('mean', 'min', 'max', 'sum')
    d = {
        column + '_' + how: getattr(x[column], how)()
        for column, how_list in agg_dict.items()
        for how in how_list
        if how in supported
    }
    return pd.Series(d)


samples_2 = samples_1.groupby(['roleid_src', 'roleid_dst']).apply(f).reset_index()
# A pair is positive when at least one of its trades is (mean label > 0);
# the binary columns are appended at the end and the means are removed.
samples_2['label'] = samples_2.pop('label_mean').gt(0).astype(int)
samples_2['label2'] = samples_2.pop('label2_mean').gt(0).astype(int)
samples_2 = samples_2.drop(columns=['is_mask_mean'])
print(samples_2.head(5))

In [None]:
print(len(samples_2))
# Reserve the samples from the buyer to the seller (inner join on the pairs
# kept in merger_1).
samples_2 = samples_2.merge(merger_1, on=['roleid_src', 'roleid_dst'])
print(len(samples_2))
print(samples_2.columns)
print(samples_2['label'].sum()) #label for training
print(samples_2['label2'].sum()) #label for new_recall test

# Drop the player-profile aggregates of both endpoints (cold-start reason).
profile_fields = (
    'level1', 'level2', 'level3', 'level4',
    'score1', 'score2',
    'exp1', 'exp2', 'exp3',
    'online_time',
)
samples_3 = samples_2.drop(columns=[
    '{}_{}_mean'.format(field, side)
    for side in ('x', 'y')
    for field in profile_fields
])

# 3. Reweighted XGBoost

## 3.1 train&valid split

In [None]:
# Column layout of samples_3: two role-id columns, the features, then
# 'label' and 'label2' as the last two columns.
train_1 = samples_3.iloc[:, :-2] #reserve the role IDs
train = train_1.iloc[:, 2:]
target = samples_3.iloc[:, -2]
print(train.shape)
print(target.shape)

X_train_1, X_test_1, y_train, y_test = train_test_split(
    train_1, target, test_size=0.2, random_state=7
)
# Feature-only views (role ids stripped) of both splits.
X_train, X_test = X_train_1.iloc[:, 2:], X_test_1.iloc[:, 2:]

# Pairs that are positive only according to label2: used for the new-recall test.
new_pos = samples_3[(samples_3['label2'] == 1) & (samples_3['label'] == 0)]
new_pos_X = new_pos.iloc[:, 2:-2]

In [None]:
# Labeled-positive (label == 1) and unlabeled (label == 0) parts of each
# split; the *_1 variants reserve the role IDs.
pos_X_train, un_X_train = X_train[y_train.eq(1)], X_train[y_train.eq(0)]
pos_X_train_1, un_X_train_1 = X_train_1[y_train.eq(1)], X_train_1[y_train.eq(0)]
pos_X_test, un_X_test = X_test[y_test.eq(1)], X_test[y_test.eq(0)]
pos_X_test_1, un_X_test_1 = X_test_1[y_test.eq(1)], X_test_1[y_test.eq(0)]

for part in (pos_X_train, pos_X_test, un_X_train, un_X_test):
    print(part.shape)

In [None]:
# A transaction graph constructed from the labeled-positive pairs of both
# splits; edges are weighted by trade_n_times_mean.
G0 = nx.Graph() #undirected graph
for positives in (pos_X_train_1, pos_X_test_1):
    edge_rows = positives[['roleid_src', 'roleid_dst', 'trade_n_times_mean']]
    for _, edge in edge_rows.iterrows():
        G0.add_edge(edge['roleid_src'], edge['roleid_dst'], weight=edge['trade_n_times_mean'])

print(G0.number_of_nodes())
print(G0.number_of_edges())

# Louvain partition of the positive graph.
partition = community.best_partition(G0)
print("partition ok")

size = len(set(partition.values())) #number of communities
print(size)
mod = community.modularity(partition, G0)
print(mod) #modularity

## 3.2 training

In [None]:
_TRAIN_FMT = (
    "Train Accuary: %.2f%%",
    "Train Precision: %.2f%%",
    "Train Recall: %.2f%%",
    "Train f1: %.4f",
    "Train AUC: %.4f",
)
_TEST_FMT = (
    "Test Accuracy: %.2f%%",
    "Test Precision: %.2f%%",
    "Test Recall: %.2f%%",
    "Test f1: %.4f",
    "Test AUC: %.4f",
)


def _score(y_true, probs, cutoff):
    """Threshold ``probs`` at ``cutoff`` and score them against ``y_true``.

    Returns ``(labels, accuracy, precision, recall, f1, auc)``. AUC uses the
    raw probabilities; the other metrics use the thresholded labels.
    """
    labels = [1 if p >= cutoff else 0 for p in probs]
    return (
        labels,
        accuracy_score(y_true, labels),
        precision_score(y_true, labels),
        recall_score(y_true, labels),
        f1_score(y_true, labels),
        roc_auc_score(y_true, probs),
    )


def _report(formats, accuracy, precision, recall, f1, auc):
    """Print the five metrics; the first three are shown as percentages."""
    values = (accuracy * 100.0, precision * 100.0, recall * 100.0, f1, auc)
    for fmt, value in zip(formats, values):
        print(fmt % value)


# Warm start for an epoch: a plain XGBoost fit on the labeled data, with the
# positive class re-weighted by the unlabeled/positive count ratio.
bst = XGBClassifier(
    max_depth=6,
    min_child_weight=2,
    learning_rate=0.01,
    n_estimators=100,
    objective='binary:logistic',
    scale_pos_weight=len(un_X_train) // len(pos_X_train),
) #sklearn api
bst.fit(X_train, y_train)
xgb_normal = copy.deepcopy(bst)  # frozen snapshot of the warm-start model

# Training-split metrics at a 0.4 decision threshold.
train_preds = bst.predict_proba(X_train)[:, 1]
(train_predictions, train_accuracy, train_precision,
 train_recall, train_f1, train_auc) = _score(y_train, train_preds, 0.4)
_report(_TRAIN_FMT, train_accuracy, train_precision, train_recall, train_f1, train_auc)

# Test-split metrics.
preds = bst.predict_proba(X_test)[:, 1]
(predictions, test_accuracy, test_precision,
 test_recall, test_f1, test_auc) = _score(y_test, preds, 0.4)
_report(_TEST_FMT, test_accuracy, test_precision, test_recall, test_f1, test_auc)

# Recall on the pairs that are positive only according to label2.
preds = bst.predict_proba(new_pos_X)[:, 1]
predictions = [1 if value >= 0.4 else 0 for value in preds]
new_test_recall = recall_score([1] * len(predictions), predictions)
print("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  #new_recall

#feature importance
plot_importance(xgb_normal)
plt.show()



In [None]:
# Extend the positive graph G0 with every pair the warm-start model predicts
# as positive (score >= thr) and re-run Louvain on the result.
thr = 0.4
preds = xgb_normal.predict_proba(train)[:, 1]
predictions = pd.Series([int(score >= thr) for score in preds])
G_new = copy.deepcopy(G0)

# Rows of the full sample table predicted positive (role ids reserved).
fp_X_1 = train_1[predictions.eq(1)]
print(len(fp_X_1))

predicted_edges = fp_X_1[['roleid_src', 'roleid_dst', 'trade_n_times_mean']]
for _, edge in predicted_edges.iterrows():
    G_new.add_edge(edge['roleid_src'], edge['roleid_dst'], weight=edge['trade_n_times_mean'])

print(G_new.number_of_nodes())
print(G_new.number_of_edges())
partition = community.best_partition(G_new)
print("partition ok")

size = len(set(partition.values())) #number of communities
print(size)
mod = community.modularity(partition, G_new)
print(mod) #modularity

In [None]:
#save the model
xgb_normal.save_model('./data_gnn/xgb_normal.json')

predict_prob = xgb_normal.predict_proba(train)[:, 1]
predict_label = [1 if value >= thr else 0 for value in predict_prob]

# Per-pair sum of trade_money * reverse, computed on a temporary copy so that
# merger itself keeps its current amounts; only non-negative sums are kept.
tmp = (
    merger.assign(trade_money=merger['trade_money'] * merger['reverse'])
    .groupby(['roleid_src', 'roleid_dst'])['trade_money']
    .sum()
    .reset_index()
)
tmp = tmp[tmp['trade_money'] >= 0]

# Pairs predicted positive, joined with that per-pair amount. samples_2 is
# left without the two prediction columns.
samples_pos = (
    samples_2.assign(predict_label=predict_label, predict_prob=predict_prob)
    .loc[lambda scored: scored['predict_label'] > 0]
    .merge(tmp, on=['roleid_src', 'roleid_dst'])
)

#save the edges(predicted RMTs) for graph construction
ns = samples_pos[['roleid_src', 'roleid_dst', 'trade_n_times_mean', 'trade_money']].rename(
    columns={'roleid_src': 'Source', 'roleid_dst': 'Target', 'trade_n_times_mean': 'Weight'}
)
ns.to_csv('./data_gnn/test_graph_1.csv')



# 4. TEDn+ without GAT embeddings

## 4.1 TEDn functions

In [None]:
#compute the confidence interval
def DKW_bound(x,y,t,m,n,delta=0.1, gamma= 0.01):
    """Return ``(t, t - bound, t + bound)`` for the ratio estimate ``t``.

    ``bound`` is ``(sqrt(log(1/delta)/(2n)) + sqrt(log(1/delta)/(2m)))
    * (1 + gamma) / (y / n)``.

    Args:
        x: accepted for signature compatibility; not used in the computation.
        y: count that is divided by ``n`` in the denominator of the bound.
        t: point estimate; returned unchanged as the first element.
        m, n: sample sizes entering the two square-root terms.
        delta: confidence parameter inside the logarithm.
        gamma: relative slack multiplied into the bound.
    """
    log_term = np.log(1/delta)
    deviation = np.sqrt(log_term/2/n) + np.sqrt(log_term/2/m)
    bound = deviation*(1+gamma)/(y/n)
    return t, t - bound, t + bound

In [None]:
#Mixture Proportion Estimation (BBE)
def top_bin_estimator_count(pdata_probs, udata_probs, udata_targets):
    """Estimate the mixture proportion from classifier scores.

    Both score arrays are sorted in descending order and the unlabeled scores
    are swept as candidate thresholds. At every position ``i`` whose score is
    strictly larger than the next one, ``j`` is advanced to the number of
    positive scores ``>=`` that threshold, and the ratio
    ``i * len(pdata_probs) / j / len(udata_probs)`` is recorded together with
    the interval returned by ``DKW_bound`` (called with ``delta=0.2``).

    Args:
        pdata_probs: scores of the labeled-positive samples. It is indexed
            with an ``argsort`` result, so presumably a 1-D NumPy array.
        udata_probs: scores of the unlabeled samples (same assumption).
        udata_targets: 0/1 values aligned with ``udata_probs``; they only
            feed the ``ideal_plot_arr`` diagnostic.

    Returns:
        ``(mpe_estimate, idx, plot_arr[idx], lower_cfb, upper_cfb, plot_arr,
        ideal_plot_arr)``, where ``mpe_estimate`` is the smallest upper
        confidence bound and ``idx`` its position. When no threshold
        qualifies, the placeholder ``(1.0, 1.0, 1.0, [1.0], [1.0], [1.0],
        [1.0])`` is returned.
    """
    p_indices = np.argsort(pdata_probs)
    sorted_p_probs = pdata_probs[p_indices]
    u_indices = np.argsort(udata_probs)
    sorted_u_probs = udata_probs[u_indices]
    sorted_u_targets = udata_targets[u_indices]

    # argsort is ascending; reversing gives descending order (highest score first)
    sorted_u_probs = sorted_u_probs[::-1]
    sorted_p_probs = sorted_p_probs[::-1]
    sorted_u_targets = sorted_u_targets[::-1]
    num = len(sorted_u_probs)

    plot_arr = []        # point estimates returned by DKW_bound
    plot_ratio = []      # the raw ratio t at each evaluated threshold
    j = 0                # number of positive scores >= the current threshold
    num_u_samples = 0    # evaluated positions whose target equals 1
    ideal_plot_arr = []  # ratio recomputed with num_u_samples subtracted from i

    upper_cfb = []
    lower_cfb = []            

    i = 0
    while (i < num):
        start_interval =  sorted_u_probs[i]   
        k = i  # not read anywhere below
        # Evaluate only where the score strictly drops at the next position;
        # this skips all but the last element of a run of ties and also the
        # final element.
        if (i<num-1 and start_interval> sorted_u_probs[i+1]): 
            pass
        else: 
            i += 1
            continue
        if (sorted_u_targets[i]==1):
            num_u_samples += 1

        # j only moves forward: thresholds decrease as i grows
        while ( j<len(sorted_p_probs) and sorted_p_probs[j] >= start_interval):
            j += 1

        if j>1 and i > 1:
            t = (i)*1.0*len(sorted_p_probs)/j/len(sorted_u_probs)
            plot_ratio.append(t)
            ideal_plot_arr.append( (i-num_u_samples)*1.0*len(sorted_p_probs)/j/len(sorted_u_probs))
            estimate, lower , upper = DKW_bound(i, j, t, len(sorted_u_probs), len(sorted_p_probs),0.2)
            plot_arr.append(estimate)
            upper_cfb.append( upper)
            lower_cfb.append( lower)

        i+=1
    if (len(upper_cfb) != 0): 
        # the estimate is the minimum of the upper confidence bounds
        mpe_estimate = np.min(upper_cfb)
        idx = np.argmin(upper_cfb)

        # DKW_bound returns t unchanged as its estimate, so these two lists
        # hold the same values; the message is a consistency check.
        if (plot_arr[idx] != plot_ratio[idx]): 
            print("fallback...")

        return mpe_estimate, idx, plot_arr[idx], lower_cfb, upper_cfb, plot_arr, ideal_plot_arr
    else: 
        # no threshold qualified: return placeholders
        return 1.0, 1.0, 1.0, [1.0], [1.0], [1.0], [1.0]

In [None]:
#determine the reliable negative and suspicious positive samples
def rank_inputs(classifier, u_x, alpha):
    """Split the unlabeled samples by classifier score.

    The ``int(alpha * len(u_x))`` highest-scoring samples are marked 0
    (suspicious positive); all others are marked 1 (reliable negative).

    Args:
        classifier: model exposing ``predict_proba``; column 1 is the score.
        u_x: unlabeled feature rows.
        alpha: fraction of the unlabeled samples to mark as suspicious.

    Returns:
        Array with one 0/1 flag per row of ``u_x``, in the input order and
        with the dtype of the scores.
    """
    scores = classifier.predict_proba(u_x)[:, 1]
    n_unlabeled = len(u_x)
    n_suspicious = int(alpha * n_unlabeled)

    # positions ordered from lowest to highest score
    ascending = np.argsort(scores)

    keep_samples = np.ones_like(scores)
    # the tail of the ascending order holds the highest scores
    keep_samples[ascending[n_unlabeled - n_suspicious:]] = 0
    return keep_samples

## 4.2 without RAR

In [None]:
alpha_default = 0.8  # fallback mixture proportion when the estimate is >= 1
epochs = 20
thr = 0.4            # decision threshold used for every metric below
if_pu = True

_TRAIN_FMT = (
    "Train Accuary: %.2f%%",
    "Train Precision: %.2f%%",
    "Train Recall: %.2f%%",
    "Train f1: %.4f",
    "Train AUC: %.4f",
)
_TEST_FMT = (
    "Test Accuracy: %.2f%%",
    "Test Precision: %.2f%%",
    "Test Recall: %.2f%%",
    "Test f1: %.4f",
    "Test AUC: %.4f",
)


def _score(y_true, probs, cutoff):
    """Threshold ``probs`` at ``cutoff`` and score them against ``y_true``.

    Returns ``(labels, accuracy, precision, recall, f1, auc)``. AUC uses the
    raw probabilities; the other metrics use the thresholded labels.
    """
    labels = [1 if p >= cutoff else 0 for p in probs]
    return (
        labels,
        accuracy_score(y_true, labels),
        precision_score(y_true, labels),
        recall_score(y_true, labels),
        f1_score(y_true, labels),
        roc_auc_score(y_true, probs),
    )


def _report(formats, accuracy, precision, recall, f1, auc):
    """Print the five metrics; the first three are shown as percentages."""
    values = (accuracy * 100.0, precision * 100.0, recall * 100.0, f1, auc)
    for fmt, value in zip(formats, values):
        print(fmt % value)


# Start from the warm-start snapshot so that this section always begins
# "after warm start", also when the cell is re-run or executed after another
# PU-training section has already refitted `bst`.
bst = copy.deepcopy(xgb_normal)

# Labeled-positive / unlabeled parts of the test split; they do not change
# between epochs, so they are computed once.
pos_X_test = X_test[y_test==1]
un_X_test = X_test[y_test==0]

#after warm start
for epoch in range(epochs):

    pos_probs = np.array(bst.predict_proba(pos_X_test)[:,1])
    unlabeled_probs = np.array(bst.predict_proba(un_X_test)[:,1])
    unlabeled_targets = np.array([1 if p>=thr else 0 for p in unlabeled_probs])

    #step1: MPE (mixture proportion estimated from the test-split scores)
    (our_mpe_estimate, index, alpha_estimate, lower_bound,
     upper_bound, plot_arr, ideal_plot_arr) = top_bin_estimator_count(
        pos_probs, unlabeled_probs, unlabeled_targets)
    alpha_used = our_mpe_estimate
    if alpha_used >= 1:
        alpha_used = alpha_default

    #step2: PvN training
    #select the reliable negative samples and suspicious positive samples
    keep_samples = rank_inputs(bst, un_X_train, alpha_used)
    unlabeled_new = un_X_train[keep_samples == 1]
    pos_new = un_X_train[keep_samples == 0]

    # Relabeled training set: labeled positives + suspicious positives get 1,
    # reliable negatives get 0 (labels follow the concat order).
    X_train_new = pd.concat([pos_X_train,pos_new,unlabeled_new],axis=0)
    y_train_new = pd.DataFrame([1]*(len(pos_X_train)+len(pos_new))+[0]*len(unlabeled_new)).iloc[:,0]

    bst.fit(X_train_new,y_train_new)

    print("epoch{}, alpha = {}\n".format(epoch,alpha_used))

    # Metrics against the original training labels.
    train_preds = bst.predict_proba(X_train)[:,1]
    (train_predictions, train_accuracy, train_precision,
     train_recall, train_f1, train_auc) = _score(y_train, train_preds, thr)
    _report(_TRAIN_FMT, train_accuracy, train_precision, train_recall, train_f1, train_auc)

    # Metrics against the relabeled training set.
    train_preds = bst.predict_proba(X_train_new)[:,1]
    (train_predictions, train_accuracy, train_precision,
     train_recall, train_f1, train_auc) = _score(y_train_new, train_preds, thr)
    print("New positive samples added:")
    _report(_TRAIN_FMT, train_accuracy, train_precision, train_recall, train_f1, train_auc)

    # Metrics against the original test labels.
    preds = bst.predict_proba(X_test)[:,1]
    (predictions, test_accuracy, test_precision,
     test_recall, test_f1, test_auc) = _score(y_test, preds, thr)
    _report(_TEST_FMT, test_accuracy, test_precision, test_recall, test_f1, test_auc)

    # Recall on the pairs that are positive only according to label2; uses
    # `thr` like every other metric instead of a separate literal.
    preds = bst.predict_proba(new_pos_X)[:,1]
    predictions = [1 if value>=thr else 0 for value in preds]
    new_test_recall = recall_score([1]*len(predictions), predictions)
    print ("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  #new recall

    # Relabel the test split the same way and score against those labels.
    keep_samples_test = rank_inputs(bst, un_X_test, alpha_used)
    unlabeled_new_test = un_X_test[keep_samples_test == 1]
    pos_new_test = un_X_test[keep_samples_test == 0]

    X_test_new = pd.concat([pos_X_test,pos_new_test,unlabeled_new_test],axis=0)
    y_test_new = pd.DataFrame([1]*(len(pos_X_test)+len(pos_new_test))+[0]*len(unlabeled_new_test)).iloc[:,0]

    preds = bst.predict_proba(X_test_new)[:,1]
    (predictions, test_accuracy, test_precision,
     test_recall, test_f1, test_auc) = _score(y_test_new, preds, thr)
    print("New positive samples added:")
    _report(_TEST_FMT, test_accuracy, test_precision, test_recall, test_f1, test_auc)


xgb_pu = copy.deepcopy(bst)
#feature importance
plot_importance(xgb_pu)
plt.show()

In [None]:
#save the model
xgb_pu.save_model('./data_gnn/xgb_pu.json')

predict_prob = xgb_pu.predict_proba(train)[:, 1]
predict_label = [1 if value >= thr else 0 for value in predict_prob]

# Per-pair sum of trade_money * reverse, computed on a temporary copy so that
# merger itself keeps its current amounts; only non-negative sums are kept.
tmp = (
    merger.assign(trade_money=merger['trade_money'] * merger['reverse'])
    .groupby(['roleid_src', 'roleid_dst'])['trade_money']
    .sum()
    .reset_index()
)
tmp = tmp[tmp['trade_money'] >= 0]

# Pairs predicted positive, joined with that per-pair amount. samples_2 is
# left without the two prediction columns.
samples_pos = (
    samples_2.assign(predict_label=predict_label, predict_prob=predict_prob)
    .loc[lambda scored: scored['predict_label'] > 0]
    .merge(tmp, on=['roleid_src', 'roleid_dst'])
)

#save the edges(predicted RMTs) for graph construction
ns = samples_pos[['roleid_src', 'roleid_dst', 'trade_n_times_mean', 'trade_money']].rename(
    columns={'roleid_src': 'Source', 'roleid_dst': 'Target', 'trade_n_times_mean': 'Weight'}
)
ns.to_csv('./data_gnn/test_graph_2.csv')



In [None]:
# Extend the positive graph G0 with every pair the PU model (without RAR)
# predicts as positive (score >= thr) and re-run Louvain on the result.
thr = 0.4
preds = xgb_pu.predict_proba(train)[:, 1]
predictions = pd.Series([int(score >= thr) for score in preds])
G_new = copy.deepcopy(G0)

# Rows of the full sample table predicted positive (role ids reserved).
fp_X_1 = train_1[predictions.eq(1)]
print(len(fp_X_1))

predicted_edges = fp_X_1[['roleid_src', 'roleid_dst', 'trade_n_times_mean']]
for _, edge in predicted_edges.iterrows():
    G_new.add_edge(edge['roleid_src'], edge['roleid_dst'], weight=edge['trade_n_times_mean'])

print(G_new.number_of_nodes())
print(G_new.number_of_edges())
partition = community.best_partition(G_new)
print("partition ok")

size = len(set(partition.values())) #number of communities
print(size)
mod = community.modularity(partition, G_new)
print(mod) #modularity

## 4.3 with RAR

In [None]:
#Reliability Assessment Rules
def postCut(keep_samples, un_X, before_G, classifier):
    """Filter the suspicious-positive candidates with graph-based rules.

    A deep copy of ``before_G`` is extended with one edge per row of ``un_X``
    whose flag in ``keep_samples`` is 0, and Louvain communities are computed
    on the extended graph. A candidate (flag 0) keeps its flag if ANY of the
    following holds; otherwise its flag becomes -1:

    * its classifier score is >= 0.9;
    * both endpoints lie in communities with fewer than 50 nodes;
    * either endpoint is already a node of ``before_G``;
    * either endpoint has degree >= 30 in the extended graph.

    Args:
        keep_samples: per-row flags aligned positionally with ``un_X``
            (1 = reliable negative, 0 = suspicious positive). Modified in place.
        un_X: DataFrame with ``roleid_src``, ``roleid_dst`` and
            ``trade_n_times_mean`` columns. Columns from position 2 onwards
            are passed to the classifier (presumably the first two columns
            are the role ids — confirm against the caller).
        before_G: graph to extend; it is copied, not modified.
        classifier: model exposing ``predict_proba``.

    Returns:
        The same ``keep_samples`` object, with rejected candidates set to -1.
    """
    G1 = copy.deepcopy(before_G)
    edge_rows = un_X[['roleid_src','roleid_dst','trade_n_times_mean']]
    for pos, (_, row) in enumerate(edge_rows.iterrows()):
        if keep_samples[pos] == 0:  #suspicious positive
            G1.add_edge(row['roleid_src'], row['roleid_dst'], weight=row['trade_n_times_mean'])

    #Louvain community detection
    partition = community.best_partition(G1)
    print("partition ok")
    print(len(set(partition.values())))  #number of communities
    print(community.modularity(partition, G1))  #modularity

    #count node numbers of each community
    cnt = collections.Counter(partition.values())

    score = classifier.predict_proba(un_X.iloc[:,2:])[:,1]
    known_nodes = before_G.nodes()

    #if meets any of the 4 conditions, select as reliable positive
    for pos, (_, row) in enumerate(un_X.iterrows()):
        if keep_samples[pos] == 1:
            continue  # reliable negatives are not assessed

        src, dst = row['roleid_src'], row['roleid_dst']
        if score[pos] >= 0.9:
            continue
        if cnt[partition[src]] < 50 and cnt[partition[dst]] < 50:
            continue
        if src in known_nodes or dst in known_nodes:
            continue
        if G1.degree(src) >= 30 or G1.degree(dst) >= 30:
            continue

        keep_samples[pos] = -1  #drop the unqualified suspicious positive samples

    return keep_samples

In [None]:
alpha_default = 0.8  # fallback mixture proportion when the estimate is >= 1
epochs = 20
thr = 0.4            # decision threshold used for every metric below
if_pu = True

_TRAIN_FMT = (
    "Train Accuary: %.2f%%",
    "Train Precision: %.2f%%",
    "Train Recall: %.2f%%",
    "Train f1: %.4f",
    "Train AUC: %.4f",
)
_TEST_FMT = (
    "Test Accuracy: %.2f%%",
    "Test Precision: %.2f%%",
    "Test Recall: %.2f%%",
    "Test f1: %.4f",
    "Test AUC: %.4f",
)


def _score(y_true, probs, cutoff):
    """Threshold ``probs`` at ``cutoff`` and score them against ``y_true``.

    Returns ``(labels, accuracy, precision, recall, f1, auc)``. AUC uses the
    raw probabilities; the other metrics use the thresholded labels.
    """
    labels = [1 if p >= cutoff else 0 for p in probs]
    return (
        labels,
        accuracy_score(y_true, labels),
        precision_score(y_true, labels),
        recall_score(y_true, labels),
        f1_score(y_true, labels),
        roc_auc_score(y_true, probs),
    )


def _report(formats, accuracy, precision, recall, f1, auc):
    """Print the five metrics; the first three are shown as percentages."""
    values = (accuracy * 100.0, precision * 100.0, recall * 100.0, f1, auc)
    for fmt, value in zip(formats, values):
        print(fmt % value)


# Start from the warm-start snapshot. Without this reset `bst` would still be
# the model left behind by the last epoch of the "without RAR" section, so the
# first epoch here would not run "after warm start" and the two variants
# would not be compared from the same starting point.
bst = copy.deepcopy(xgb_normal)

# Labeled-positive / unlabeled parts of the test split; they do not change
# between epochs, so they are computed once.
pos_X_test = X_test[y_test==1]
un_X_test = X_test[y_test==0]

#after warm start
for epoch in range(epochs):

    pos_probs = np.array(bst.predict_proba(pos_X_test)[:,1])
    unlabeled_probs = np.array(bst.predict_proba(un_X_test)[:,1])
    unlabeled_targets = np.array([1 if p>=thr else 0 for p in unlabeled_probs])

    #step1: MPE (mixture proportion estimated from the test-split scores)
    (our_mpe_estimate, index, alpha_estimate, lower_bound,
     upper_bound, plot_arr, ideal_plot_arr) = top_bin_estimator_count(
        pos_probs, unlabeled_probs, unlabeled_targets)
    alpha_used = our_mpe_estimate
    if alpha_used >= 1:
        alpha_used = alpha_default

    #step2: PvN training
    #select the reliable negative samples and suspicious positive samples
    keep_samples = rank_inputs(bst, un_X_train, alpha_used)
    # Reliability Assessment Rules: candidates that fail every rule are set
    # to -1 and therefore end up in neither group below.
    keep_samples = postCut(keep_samples, un_X_train_1, G0, bst)
    unlabeled_new = un_X_train[keep_samples == 1]
    pos_new = un_X_train[keep_samples == 0]

    # Relabeled training set: labeled positives + accepted suspicious
    # positives get 1, reliable negatives get 0 (labels follow the concat order).
    X_train_new = pd.concat([pos_X_train,pos_new,unlabeled_new],axis=0)
    y_train_new = pd.DataFrame([1]*(len(pos_X_train)+len(pos_new))+[0]*len(unlabeled_new)).iloc[:,0]

    bst.fit(X_train_new,y_train_new)

    print("epoch{}, alpha = {}\n".format(epoch,alpha_used))

    # Metrics against the original training labels.
    train_preds = bst.predict_proba(X_train)[:,1]
    (train_predictions, train_accuracy, train_precision,
     train_recall, train_f1, train_auc) = _score(y_train, train_preds, thr)
    _report(_TRAIN_FMT, train_accuracy, train_precision, train_recall, train_f1, train_auc)

    # Metrics against the relabeled training set.
    train_preds = bst.predict_proba(X_train_new)[:,1]
    (train_predictions, train_accuracy, train_precision,
     train_recall, train_f1, train_auc) = _score(y_train_new, train_preds, thr)
    print("New positive samples added:")
    _report(_TRAIN_FMT, train_accuracy, train_precision, train_recall, train_f1, train_auc)

    # Metrics against the original test labels.
    preds = bst.predict_proba(X_test)[:,1]
    (predictions, test_accuracy, test_precision,
     test_recall, test_f1, test_auc) = _score(y_test, preds, thr)
    _report(_TEST_FMT, test_accuracy, test_precision, test_recall, test_f1, test_auc)

    # Recall on the pairs that are positive only according to label2; uses
    # `thr` like every other metric instead of a separate literal.
    preds = bst.predict_proba(new_pos_X)[:,1]
    predictions = [1 if value>=thr else 0 for value in preds]
    new_test_recall = recall_score([1]*len(predictions), predictions)
    print ("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  #new recall

    # Relabel the test split by score rank only (no reliability rules) and
    # score against those labels.
    keep_samples_test = rank_inputs(bst, un_X_test, alpha_used)
    unlabeled_new_test = un_X_test[keep_samples_test == 1]
    pos_new_test = un_X_test[keep_samples_test == 0]

    X_test_new = pd.concat([pos_X_test,pos_new_test,unlabeled_new_test],axis=0)
    y_test_new = pd.DataFrame([1]*(len(pos_X_test)+len(pos_new_test))+[0]*len(unlabeled_new_test)).iloc[:,0]

    preds = bst.predict_proba(X_test_new)[:,1]
    (predictions, test_accuracy, test_precision,
     test_recall, test_f1, test_auc) = _score(y_test_new, preds, thr)
    print("New positive samples added:")
    _report(_TEST_FMT, test_accuracy, test_precision, test_recall, test_f1, test_auc)


xgb_test = copy.deepcopy(bst)
#feature importance
plot_importance(xgb_test)
plt.show()


In [None]:
#save the model
xgb_test.save_model('./data_gnn/xgb_test.json')

predict_prob = xgb_test.predict_proba(train)[:, 1]
predict_label = [1 if value >= thr else 0 for value in predict_prob]

# Per-pair sum of trade_money * reverse, computed on a temporary copy so that
# merger itself keeps its current amounts; only non-negative sums are kept.
tmp = (
    merger.assign(trade_money=merger['trade_money'] * merger['reverse'])
    .groupby(['roleid_src', 'roleid_dst'])['trade_money']
    .sum()
    .reset_index()
)
tmp = tmp[tmp['trade_money'] >= 0]

# Pairs predicted positive, joined with that per-pair amount. samples_2 is
# left without the two prediction columns.
samples_pos = (
    samples_2.assign(predict_label=predict_label, predict_prob=predict_prob)
    .loc[lambda scored: scored['predict_label'] > 0]
    .merge(tmp, on=['roleid_src', 'roleid_dst'])
)

#save the edges(predicted RMTs) for graph construction
ns = samples_pos[['roleid_src', 'roleid_dst', 'trade_n_times_mean', 'trade_money']].rename(
    columns={'roleid_src': 'Source', 'roleid_dst': 'Target', 'trade_n_times_mean': 'Weight'}
)
ns.to_csv('./data_gnn/test_graph_3.csv')



In [None]:
thr = 0.4
# Threshold the positive-class probability of every sample into a 0/1 label
preds = xgb_test.predict_proba(train)[:,1]
predictions = pd.Series([int(score >= thr) for score in preds])
G_new = copy.deepcopy(G0)

# Rows (with role ids) that the model predicts as positive
fp_X_1 = train_1[predictions == 1]
print(len(fp_X_1))

# Add one edge per predicted-positive pair to the copy of G0, weighted by trade_n_times_mean
edge_rows = fp_X_1[['roleid_src','roleid_dst','trade_n_times_mean']]
for _, rec in edge_rows.iterrows():
    G_new.add_edge(rec['roleid_src'], rec['roleid_dst'], weight=rec['trade_n_times_mean'])

print(G_new.number_of_nodes())
print(G_new.number_of_edges())
partition = community.best_partition(G_new)
print("partition ok")

# number of communities
size = len(set(partition.values()))
print(size)
# modularity of the partition
mod = community.modularity(partition, G_new)
print(mod)

# 5. GAT Embedding

## 5.1 graph construction

In [None]:
# Give every distinct role id (source or destination) a consecutive integer index
ids = np.unique(np.concatenate([samples_2['roleid_src'], samples_2['roleid_dst']]))
id_dict = {role_id: position for position, role_id in enumerate(ids)}

# One random feature row of length `input_size` per id, moved to GPU when available
input_size = 8
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
features = torch.randn(len(ids), input_size).to(device)

In [None]:
# Positive-class probability of every row of `train`
predict_prob = xgb_test.predict_proba(train)[:,1]
# Translate the source / destination role ids into integer node indices via id_dict.
# NOTE(review): samples_3 is only created in a later cell of this notebook - confirm execution order.
id1 = torch.from_numpy(np.array(list(map(id_dict.__getitem__, samples_3['roleid_src']))))
id2 = torch.from_numpy(np.array(list(map(id_dict.__getitem__, samples_3['roleid_dst']))))


In [None]:
g = dgl.DGLGraph()
# Edge weight is the RMT prediction score (trade amount / trade frequency were also tried)
edge_weight = torch.from_numpy(predict_prob)
g.add_edges(id1, id2, data={'weight': edge_weight})
# Self-loop on every node with weight 1.0
node_ids = g.nodes()
self_loop_weight = torch.ones_like(node_ids, dtype=torch.float32)
g.add_edges(node_ids, node_ids, data={'weight': self_loop_weight})
g = g.to(device)
print(g)

## 5.2 model definition

In [None]:
class GATLayer(nn.Module):
    """Single graph-attention head whose attention score also sees the edge weight.

    The graph `g` is stored on the layer; `forward` writes the intermediate
    tensors 'z' / 'h' into g.ndata and 'e' into g.edata.
    """

    def __init__(self, g, in_dim, out_dim, device):
        super().__init__()
        self.g = g
        # equation (1): linear projection of the node features
        self.fc = nn.Linear(in_dim, out_dim, bias=False)
        # equation (2): scores [z_src, z_dst, weight]; the +1 is the edge-weight column
        self.attn_fc = nn.Linear(2 * out_dim + 1, 1, bias=False)
        self.device = device

    def edge_attention(self, edges):
        """Edge UDF for equation (2): unnormalised attention score per edge."""
        edge_weight = edges.data['weight'].unsqueeze(1)
        pair = torch.cat([edges.src['z'], edges.dst['z'], edge_weight], dim=1)
        return {'e': F.leaky_relu(self.attn_fc(pair))}

    def message_func(self, edges):
        """Message UDF for equations (3) & (4): forward source embedding and score."""
        return {'z': edges.src['z'], 'e': edges.data['e']}

    def reduce_func(self, nodes):
        """Reduce UDF: softmax over incoming scores (3), then weighted sum (4)."""
        attention = F.softmax(nodes.mailbox['e'], dim=1)
        return {'h': (attention * nodes.mailbox['z']).sum(dim=1)}

    def forward(self, h):
        graph = self.g
        # equation (1)
        graph.ndata['z'] = self.fc(h)
        # equation (2)
        graph.apply_edges(self.edge_attention)
        # equation (3) & (4)
        graph.update_all(self.message_func, self.reduce_func)
        return graph.ndata['h']


In [None]:
class MultiHeadGATLayer(nn.Module):
    """Run `num_heads` independent GATLayer heads on the same graph and merge them.

    merge='cat' concatenates the head outputs along the feature dimension;
    any other value averages them head-wise.
    """

    def __init__(self, g, in_dim, out_dim, num_heads, device, merge='cat'):
        super().__init__()
        self.device = device
        self.heads = nn.ModuleList(
            [GATLayer(g, in_dim, out_dim, self.device) for _ in range(num_heads)]
        )
        self.heads = self.heads.to(self.device) #to device!
        self.merge = merge

    def forward(self, h):
        head_outs = [attn_head(h) for attn_head in self.heads]
        if self.merge == 'cat':
            # concat on the output feature dimension (dim=1)
            return torch.cat(head_outs, dim=1)
        # Average over the head axis only (dim 0 of the stack). Without `dim`
        # torch.mean reduces every element to a single scalar, which throws
        # away the per-node output.
        return torch.mean(torch.stack(head_outs), dim=0)


In [None]:
#2-layer multi-head GAT
class GAT(nn.Module):
    """Two stacked MultiHeadGATLayer blocks with a ReLU in between."""

    def __init__(self, g, in_dim, hidden_dim, out_dim, num_heads, device):
        super().__init__()
        self.layer1 = MultiHeadGATLayer(g, in_dim, hidden_dim, num_heads, device)
        # The first layer concatenates its heads, so the second layer receives
        # hidden_dim * num_heads features; the output layer uses a single head.
        self.layer2 = MultiHeadGATLayer(g, hidden_dim * num_heads, out_dim, 1, device)

    def forward(self, h):
        hidden = F.relu(self.layer1(h))
        return self.layer2(hidden)


## 5.3 training

In [None]:
# Build the 2-layer GAT (16 hidden units per head, 8-d output, 2 heads) and smoke-test one forward pass
gat = GAT(g, in_dim=features.shape[1], hidden_dim=16, out_dim=8, num_heads=2, device=device)
print(gat)

test = gat(features)
# NOTE(review): g.adjacency_matrix is not called here, so the last item prints the bound method itself
for shown in (test, test.shape, g, g.adjacency_matrix):
    print(shown)

In [None]:
#reconstruction loss
def reloss(features,g):

    adj = np.array(torch.zeros([len(g.nodes()),len(g.nodes())]))  #adj matrix
    weight = g.edata['weight'].cpu().numpy()
    cpu_features = features.detach().cpu().numpy()

    i = 0
    for src,dst in zip(g.edges()[0].cpu().numpy(),g.edges()[1].cpu().numpy()):
        adj[src][dst] = weight[i]
        i+=1
    #dot-product as similarity
    sim = cpu_features.dot(cpu_features.T)
    min_max_scaler = MinMaxScaler() 
    sim = min_max_scaler.fit_transform(sim)  #normalize

    return torch.norm(torch.from_numpy(sim-adj))/len(g.nodes())

In [None]:
optimizer = optim.Adam(gat.parameters(), lr=0.0001) 
epochs = 20
for epoch in range(epochs):
    gat.train()
    optimizer.zero_grad()
    features = gat(features)
    loss = reloss(features,g).to(device)
    loss.requires_grad_(True)
    loss.backward()
    optimizer.step()
    print('Train Epoch: {} \tloss: {:.6f}'.format(epoch,loss.data.cpu().numpy()))
    

In [None]:
#save the gat model and trained embeddings
torch.save(gat, './data_gnn/gat.pkl')
# Open in write mode: with append mode every re-run stacked another copy of
# the embedding matrix onto the same file.
with open('./data_gnn/embeddings', "w") as emb_file:
    np.savetxt(emb_file, features.detach().cpu().numpy())

# 6. TEDn+ with GAT Embedding

## 6.1 dataset & reweighted

In [None]:
# gat embeddings: one row per role id, columns roleid, emb_0, emb_1, ...
embedding_matrix = features.detach().cpu().numpy()
columns = ['roleid'] + ["emb_"+str(i) for i in range(embedding_matrix.shape[1])]
node_embeddings = pd.concat([pd.Series(ids), pd.DataFrame(embedding_matrix)], axis = 1)
node_embeddings.columns = columns

In [None]:
# Attach the embedding of the source role, then of the destination role;
# the second merge suffixes the clashing columns with _x / _y
samples_3 = (
    samples_2
    .merge(node_embeddings, left_on='roleid_src', right_on='roleid')
    .merge(node_embeddings, left_on='roleid_dst', right_on='roleid')
)

# Move the two label columns to the end and discard the duplicated id columns
label = samples_3.label
label_2 = samples_3.label2
samples_3.drop(columns=['roleid_x','roleid_y','label','label2'], inplace=True)
samples_3['label'] = label
samples_3['label2'] = label_2

#drop the player profile
profile_fields = ['level1', 'level2', 'level3', 'level4', 'score1', 'score2',
                  'exp1', 'exp2', 'exp3', 'online_time']
profile_columns = [field + '_' + side + '_mean' for side in ('x', 'y') for field in profile_fields]
samples_3.drop(columns=profile_columns, inplace=True)
print(samples_3.columns)

In [None]:
# The last two columns are label / label2; the first two are kept only in the *_1 variants
train_1 = samples_3.iloc[:, :-2] #reserve the role IDs
train = samples_3.iloc[:, 2:-2]
target = samples_3.iloc[:, -2]
print(train.shape)
print(target.shape)
X_train_1, X_test_1, y_train, y_test = train_test_split(train_1, target, test_size = 0.2, random_state = 7)
X_train, X_test = X_train_1.iloc[:, 2:], X_test_1.iloc[:, 2:]

# Rows that are positive under label2 but not under label
hidden_pos_mask = (samples_3['label2']==1) & (samples_3['label']==0)
new_pos = samples_3[hidden_pos_mask]
new_pos_X = new_pos.iloc[:, 2:-2]

In [None]:
# Split train and test into labelled-positive (1) and unlabelled (0) rows;
# the *_1 variants keep the role-id columns
train_is_pos, train_is_un = (y_train == 1), (y_train == 0)
test_is_pos, test_is_un = (y_test == 1), (y_test == 0)

pos_X_train, pos_X_train_1 = X_train[train_is_pos], X_train_1[train_is_pos]
un_X_train, un_X_train_1 = X_train[train_is_un], X_train_1[train_is_un]
pos_X_test, pos_X_test_1 = X_test[test_is_pos], X_test_1[test_is_pos]
un_X_test, un_X_test_1 = X_test[test_is_un], X_test_1[test_is_un]

for part in (pos_X_train, pos_X_test, un_X_train, un_X_test):
    print(part.shape)

In [None]:
#warm start for an epoch
bst = XGBClassifier(
    max_depth=6, min_child_weight=2, learning_rate=0.01, n_estimators=100,
    objective='binary:logistic',
    # unlabelled-to-positive ratio (integer division)
    scale_pos_weight=len(un_X_train)//len(pos_X_train),
) #sklearn api

bst.fit(X_train, y_train)
xgb_normal = copy.deepcopy(bst)

# Training-set metrics at a fixed 0.4 threshold
train_preds = bst.predict_proba(X_train)[:,1]
train_predictions = [int(score >= 0.4) for score in train_preds]

train_accuracy, train_precision, train_recall, train_f1 = (
    scorer(y_train, train_predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
train_auc = roc_auc_score(y_train, train_preds)

print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
print ("Train Precision: %.2f%%" % (train_precision * 100.0))
print ("Train Recall: %.2f%%" % (train_recall * 100.0))
print ("Train f1: %.4f" % (train_f1))
print ("Train AUC: %.4f" % (train_auc))


# Test-set metrics at the same threshold
preds = bst.predict_proba(X_test)[:,1]
predictions = [int(score >= 0.4) for score in preds]

test_accuracy, test_precision, test_recall, test_f1 = (
    scorer(y_test, predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
test_auc = roc_auc_score(y_test, preds)

print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
print ("Test Precision: %.2f%%" % (test_precision * 100.0))
print ("Test Recall: %.2f%%" % (test_recall * 100.0))
print ("Test f1: %.4f" % (test_f1))
print ("Test AUC: %.4f" % (test_auc))


# Recall on the rows that are positive only under label2 (all treated as positives)
preds = bst.predict_proba(new_pos_X)[:,1]
predictions = [int(score >= 0.4) for score in preds]
new_test_recall = recall_score([1]*len(predictions), predictions)
print ("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  #new_recall

#feature importance
plot_importance(xgb_normal)
plt.show()



In [None]:
thr = 0.4
# Threshold the positive-class probability of every sample into a 0/1 label
preds = xgb_normal.predict_proba(train)[:,1]
predictions = pd.Series([int(score >= thr) for score in preds])
G_new = copy.deepcopy(G0)

# Rows (with role ids) that the model predicts as positive
fp_X_1 = train_1[predictions == 1]
print(len(fp_X_1))

# Add one edge per predicted-positive pair to the copy of G0, weighted by trade_n_times_mean
edge_rows = fp_X_1[['roleid_src','roleid_dst','trade_n_times_mean']]
for _, rec in edge_rows.iterrows():
    G_new.add_edge(rec['roleid_src'], rec['roleid_dst'], weight=rec['trade_n_times_mean'])

print(G_new.number_of_nodes())
print(G_new.number_of_edges())
partition = community.best_partition(G_new)
print("partition ok")

# number of communities
size = len(set(partition.values()))
print(size)
# modularity of the partition
mod = community.modularity(partition, G_new)
print(mod)

In [None]:
# Persist the classifier as JSON
xgb_normal.save_model('./data_gnn/xgb_normal_1.json')

# Positive-class score for every row of `train`, binarised at `thr`
predict_prob = xgb_normal.predict_proba(train)[:,1]
predict_label = [int(score >= thr) for score in predict_prob]

# The predictions live on samples_2 only until the positive rows are selected below.
# NOTE(review): `train` is sliced from samples_3; this assumes samples_3 has exactly
# the rows of samples_2, in the same order - confirm.
samples_2['predict_label'] = predict_label
samples_2['predict_prob'] = predict_prob

# Multiply by `reverse`, sum the money per (src, dst) pair and keep non-negative totals ...
merger['trade_money'] = merger['trade_money'] * merger['reverse']
pair_totals = merger.groupby(['roleid_src','roleid_dst'])['trade_money'].agg('sum').reset_index()
tmp = pair_totals[pair_totals['trade_money'] >= 0]
# ... then multiply by `reverse` again (presumably reverse is +/-1, which restores the column - verify)
merger['trade_money'] = merger['trade_money'] * merger['reverse']

predicted_positive = samples_2['predict_label'] > 0
samples_pos = samples_2[predicted_positive]
samples_2.drop(columns=['predict_label','predict_prob'], inplace=True)
samples_pos = samples_pos.merge(tmp, on=['roleid_src','roleid_dst'])

# Save the edges (predicted RMTs) for graph construction
edge_columns = ['roleid_src','roleid_dst','trade_n_times_mean','trade_money']
ns = samples_pos[edge_columns]
ns.columns = ['Source','Target','Weight','trade_money']
ns.to_csv('./data_gnn/test_graph_4.csv')



## 6.2 without RAR

In [None]:
alpha_default = 0.8
epochs = 20
thr = 0.4
if_pu = True

#after warm start
for epoch in range(epochs):

    # Scores of the current model on the labelled-positive and unlabelled test rows
    pos_probs = np.array(bst.predict_proba(pos_X_test)[:,1])
    unlabeled_probs = np.array(bst.predict_proba(un_X_test)[:,1])
    unlabeled_targets = np.array([int(score >= thr) for score in unlabeled_probs])

    #step1:MPE
    (our_mpe_estimate, index, alpha_estimate, lower_bound,
     upper_bound, plot_arr, ideal_plot_arr) = top_bin_estimator_count(pos_probs, unlabeled_probs, unlabeled_targets)
    # fall back to the default when the estimate is >= 1
    alpha_used = alpha_default if our_mpe_estimate >= 1 else our_mpe_estimate

    #step2: PvN training
    # rank_inputs flags each unlabelled training row: 1 = keep as reliable negative,
    # 0 = treat as suspicious positive and relabel it
    keep_samples = rank_inputs(bst, un_X_train, alpha_used)
    unlabeled_new = un_X_train[keep_samples == 1]
    pos_new = un_X_train[keep_samples == 0]

    X_train_new = pd.concat([pos_X_train, pos_new, unlabeled_new], axis=0)
    n_positive = len(pos_X_train) + len(pos_new)
    y_train_new = pd.Series([1]*n_positive + [0]*len(unlabeled_new), name=0)

    bst.fit(X_train_new, y_train_new)

    print("epoch{}, alpha = {}\n".format(epoch,alpha_used))

    # Metrics against the original training labels
    train_preds = bst.predict_proba(X_train)[:,1]
    train_predictions = [int(score >= thr) for score in train_preds]
    train_accuracy, train_precision, train_recall, train_f1 = (
        scorer(y_train, train_predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    train_auc = roc_auc_score(y_train, train_preds)

    print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    print ("Train Precision: %.2f%%" % (train_precision * 100.0))
    print ("Train Recall: %.2f%%" % (train_recall * 100.0))
    print ("Train f1: %.4f" % (train_f1))
    print ("Train AUC: %.4f" % (train_auc))

    # Metrics against the relabelled training set
    train_preds = bst.predict_proba(X_train_new)[:,1]
    train_predictions = [int(score >= thr) for score in train_preds]
    train_accuracy, train_precision, train_recall, train_f1 = (
        scorer(y_train_new, train_predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    train_auc = roc_auc_score(y_train_new, train_preds)
    print("New positive samples added:")
    print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    print ("Train Precision: %.2f%%" % (train_precision * 100.0))
    print ("Train Recall: %.2f%%" % (train_recall * 100.0))
    print ("Train f1: %.4f" % (train_f1))
    print ("Train AUC: %.4f" % (train_auc))

    # Metrics against the original test labels
    preds = bst.predict_proba(X_test)[:,1]
    predictions = [int(score >= thr) for score in preds]
    test_accuracy, test_precision, test_recall, test_f1 = (
        scorer(y_test, predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    test_auc = roc_auc_score(y_test, preds)

    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    print ("Test Precision: %.2f%%" % (test_precision * 100.0))
    print ("Test Recall: %.2f%%" % (test_recall * 100.0))
    print ("Test f1: %.4f" % (test_f1))
    print ("Test AUC: %.4f" % (test_auc))

    # Recall on the rows that are positive only under label2 (fixed 0.4 threshold)
    preds = bst.predict_proba(new_pos_X)[:,1]
    predictions = [int(score >= 0.4) for score in preds]
    new_test_recall = recall_score([1]*len(predictions), predictions)
    print ("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  #new recall

    # Relabel the unlabelled test rows the same way and score against those labels
    pos_X_test = X_test[y_test==1]
    un_X_test = X_test[y_test==0]
    keep_samples_test = rank_inputs(bst, un_X_test, alpha_used)
    unlabeled_new_test = un_X_test[keep_samples_test == 1]
    pos_new_test = un_X_test[keep_samples_test == 0]

    X_test_new = pd.concat([pos_X_test, pos_new_test, unlabeled_new_test], axis=0)
    n_positive_test = len(pos_X_test) + len(pos_new_test)
    y_test_new = pd.Series([1]*n_positive_test + [0]*len(unlabeled_new_test), name=0)

    preds = bst.predict_proba(X_test_new)[:,1]
    predictions = [int(score >= thr) for score in preds]
    test_accuracy, test_precision, test_recall, test_f1 = (
        scorer(y_test_new, predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    test_auc = roc_auc_score(y_test_new, preds)
    print("New positive samples added:")
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    print ("Test Precision: %.2f%%" % (test_precision * 100.0))
    print ("Test Recall: %.2f%%" % (test_recall * 100.0))
    print ("Test f1: %.4f" % (test_f1))
    print ("Test AUC: %.4f" % (test_auc))


# Keep the final model under its own name
xgb_pu = copy.deepcopy(bst)
#feature importance
plot_importance(xgb_pu)
plt.show()

In [None]:
# Persist the classifier as JSON
xgb_pu.save_model('./data_gnn/xgb_pu_1.json')

# Positive-class score for every row of `train`, binarised at `thr`
predict_prob = xgb_pu.predict_proba(train)[:,1]
predict_label = [int(score >= thr) for score in predict_prob]

# The predictions live on samples_2 only until the positive rows are selected below.
# NOTE(review): `train` is sliced from samples_3; this assumes samples_3 has exactly
# the rows of samples_2, in the same order - confirm.
samples_2['predict_label'] = predict_label
samples_2['predict_prob'] = predict_prob

# Multiply by `reverse`, sum the money per (src, dst) pair and keep non-negative totals ...
merger['trade_money'] = merger['trade_money'] * merger['reverse']
pair_totals = merger.groupby(['roleid_src','roleid_dst'])['trade_money'].agg('sum').reset_index()
tmp = pair_totals[pair_totals['trade_money'] >= 0]
# ... then multiply by `reverse` again (presumably reverse is +/-1, which restores the column - verify)
merger['trade_money'] = merger['trade_money'] * merger['reverse']

predicted_positive = samples_2['predict_label'] > 0
samples_pos = samples_2[predicted_positive]
samples_2.drop(columns=['predict_label','predict_prob'], inplace=True)
samples_pos = samples_pos.merge(tmp, on=['roleid_src','roleid_dst'])

# Save the edges (predicted RMTs) for graph construction
edge_columns = ['roleid_src','roleid_dst','trade_n_times_mean','trade_money']
ns = samples_pos[edge_columns]
ns.columns = ['Source','Target','Weight','trade_money']
ns.to_csv('./data_gnn/test_graph_5.csv')



In [None]:
thr = 0.4
# Threshold the positive-class probability of every sample into a 0/1 label
preds = xgb_pu.predict_proba(train)[:,1]
predictions = pd.Series([int(score >= thr) for score in preds])
G_new = copy.deepcopy(G0)

# Rows (with role ids) that the model predicts as positive
fp_X_1 = train_1[predictions == 1]
print(len(fp_X_1))

# Add one edge per predicted-positive pair to the copy of G0, weighted by trade_n_times_mean
edge_rows = fp_X_1[['roleid_src','roleid_dst','trade_n_times_mean']]
for _, rec in edge_rows.iterrows():
    G_new.add_edge(rec['roleid_src'], rec['roleid_dst'], weight=rec['trade_n_times_mean'])

print(G_new.number_of_nodes())
print(G_new.number_of_edges())
partition = community.best_partition(G_new)
print("partition ok")

# number of communities
size = len(set(partition.values()))
print(size)
# modularity of the partition
mod = community.modularity(partition, G_new)
print(mod)

# 7. Other PU Methods

## 7.1 TEDn

In [None]:
alpha_default = 0.8
epochs = 20
thr = 0.4
if_pu = True
# start again from the warm-start model
bst = copy.deepcopy(xgb_normal)

#after warm start
for epoch in range(epochs):

    # Scores of the current model on the labelled-positive and unlabelled test rows
    pos_probs = np.array(bst.predict_proba(pos_X_test)[:,1])
    unlabeled_probs = np.array(bst.predict_proba(un_X_test)[:,1])
    unlabeled_targets = np.array([int(score >= thr) for score in unlabeled_probs])

    #step1:MPE
    (our_mpe_estimate, index, alpha_estimate, lower_bound,
     upper_bound, plot_arr, ideal_plot_arr) = top_bin_estimator_count(pos_probs, unlabeled_probs, unlabeled_targets)
    # fall back to the default when the estimate is >= 1
    alpha_used = alpha_default if our_mpe_estimate >= 1 else our_mpe_estimate

    #step2: PvN training
    # rank_inputs flags each unlabelled training row: 1 = keep as reliable negative,
    # 0 = suspicious positive, which this variant drops from the training set
    keep_samples = rank_inputs(bst, un_X_train, alpha_used)
    unlabeled_new = un_X_train[keep_samples == 1]
    pos_new = un_X_train[keep_samples == 0]

    X_train_new = pd.concat([pos_X_train, unlabeled_new], axis=0)
    # One label per row of X_train_new: 1 for the labelled positives, 0 for the
    # kept unlabelled rows. (The previous expression had unbalanced parentheses
    # and tried to add an int to a list.)
    y_train_new = pd.Series([1]*len(pos_X_train) + [0]*len(unlabeled_new), name=0)

    bst.fit(X_train_new, y_train_new)

    print("epoch{}, alpha = {}\n".format(epoch,alpha_used))

    # Metrics against the original training labels
    train_preds = bst.predict_proba(X_train)[:,1]
    train_predictions = [int(score >= thr) for score in train_preds]
    train_accuracy, train_precision, train_recall, train_f1 = (
        scorer(y_train, train_predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    train_auc = roc_auc_score(y_train, train_preds)

    print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    print ("Train Precision: %.2f%%" % (train_precision * 100.0))
    print ("Train Recall: %.2f%%" % (train_recall * 100.0))
    print ("Train f1: %.4f" % (train_f1))
    print ("Train AUC: %.4f" % (train_auc))

    # Metrics against the reduced training set actually used for fitting
    train_preds = bst.predict_proba(X_train_new)[:,1]
    train_predictions = [int(score >= thr) for score in train_preds]
    train_accuracy, train_precision, train_recall, train_f1 = (
        scorer(y_train_new, train_predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    train_auc = roc_auc_score(y_train_new, train_preds)
    print("New positive samples added:")
    print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    print ("Train Precision: %.2f%%" % (train_precision * 100.0))
    print ("Train Recall: %.2f%%" % (train_recall * 100.0))
    print ("Train f1: %.4f" % (train_f1))
    print ("Train AUC: %.4f" % (train_auc))

    # Metrics against the original test labels
    preds = bst.predict_proba(X_test)[:,1]
    predictions = [int(score >= thr) for score in preds]
    test_accuracy, test_precision, test_recall, test_f1 = (
        scorer(y_test, predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    test_auc = roc_auc_score(y_test, preds)

    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    print ("Test Precision: %.2f%%" % (test_precision * 100.0))
    print ("Test Recall: %.2f%%" % (test_recall * 100.0))
    print ("Test f1: %.4f" % (test_f1))
    print ("Test AUC: %.4f" % (test_auc))

    # Recall on the rows that are positive only under label2 (fixed 0.4 threshold)
    preds = bst.predict_proba(new_pos_X)[:,1]
    predictions = [int(score >= 0.4) for score in preds]
    new_test_recall = recall_score([1]*len(predictions), predictions)
    print ("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  #new recall

    # Relabel the unlabelled test rows (suspicious positives become 1) and score against those labels
    pos_X_test = X_test[y_test==1]
    un_X_test = X_test[y_test==0]
    keep_samples_test = rank_inputs(bst, un_X_test, alpha_used)
    unlabeled_new_test = un_X_test[keep_samples_test == 1]
    pos_new_test = un_X_test[keep_samples_test == 0]

    X_test_new = pd.concat([pos_X_test, pos_new_test, unlabeled_new_test], axis=0)
    n_positive_test = len(pos_X_test) + len(pos_new_test)
    y_test_new = pd.Series([1]*n_positive_test + [0]*len(unlabeled_new_test), name=0)

    preds = bst.predict_proba(X_test_new)[:,1]
    predictions = [int(score >= thr) for score in preds]
    test_accuracy, test_precision, test_recall, test_f1 = (
        scorer(y_test_new, predictions)
        for scorer in (accuracy_score, precision_score, recall_score, f1_score)
    )
    test_auc = roc_auc_score(y_test_new, preds)
    print("New positive samples added:")
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    print ("Test Precision: %.2f%%" % (test_precision * 100.0))
    print ("Test Recall: %.2f%%" % (test_recall * 100.0))
    print ("Test f1: %.4f" % (test_f1))
    print ("Test AUC: %.4f" % (test_auc))


# Keep the final model under its own name
xgb_tmp = copy.deepcopy(bst)
#feature importance
plot_importance(xgb_tmp)
plt.show()

In [None]:
thr = 0.4
# Threshold the positive-class probability of every sample into a 0/1 label
preds = xgb_tmp.predict_proba(train)[:,1]
predictions = pd.Series([int(score >= thr) for score in preds])
G_new = copy.deepcopy(G0)

# Rows (with role ids) that the model predicts as positive
fp_X_1 = train_1[predictions == 1]
print(len(fp_X_1))

# Add one edge per predicted-positive pair to the copy of G0, weighted by trade_n_times_mean
edge_rows = fp_X_1[['roleid_src','roleid_dst','trade_n_times_mean']]
for _, rec in edge_rows.iterrows():
    G_new.add_edge(rec['roleid_src'], rec['roleid_dst'], weight=rec['trade_n_times_mean'])

print(G_new.number_of_nodes())
print(G_new.number_of_edges())
partition = community.best_partition(G_new)
print("partition ok")

# number of communities
size = len(set(partition.values()))
print(size)
# modularity of the partition
mod = community.modularity(partition, G_new)
print(mod)

## 7.2 DEDPUL

In [None]:
def rolling_apply(diff, k_neighbours):
    """Smooth `diff` with a centred rolling median of width 2*k_neighbours + 1.

    The interior uses the full centred window. The first and last
    k_neighbours points, where the window does not fit, use expanding medians
    that grow from the respective end (every second expanding value is kept so
    the window stays symmetric around the point).

    :param diff: 1-d sequence of numbers
    :param k_neighbours: number of neighbours on each side of a point
    :return: np.ndarray of smoothed values
    """
    s = pd.Series(diff)
    edge = 2 * k_neighbours
    if edge > 0:
        # `expanding()` is called without `center`: that argument no longer
        # exists in pandas >= 2.0 and False was the default behaviour anyway.
        head = s.iloc[:edge].expanding().median().iloc[::2].values
        # same construction mirrored: reverse, expand, thin out, reverse back
        tail = s.iloc[-edge:].iloc[::-1].expanding().median().iloc[::2].iloc[::-1].values
    else:
        # With no neighbours there are no border points. Slicing with
        # iloc[-0:] would select the whole series and double the output.
        head = tail = np.array([], dtype=float)
    middle = s.rolling(edge + 1, center=True).median().dropna().values
    return np.concatenate((head, middle, tail))


class GaussianMixtureNoFit(GaussianMixture):
    """GaussianMixture whose parameters are fixed at construction time.

    The fitted attributes (weights_, means_, precisions_, ...) are filled
    directly from the *_init arguments, and `fit`, `_initialize` and `_m_step`
    are no-ops, so the mixture is never re-estimated from data.

    `weights_init`, `means_init` and `precisions_init` must be arrays despite
    their None defaults. If `max_components` is given and there are more
    components than that, a random subset of `max_components` components is
    kept and the weights are renormalised.
    """

    def __init__(self, n_components=1, covariance_type='full', tol=1e-3,
                 reg_covar=1e-6, max_iter=100, n_init=1, init_params='kmeans',
                 weights_init=None, means_init=None, precisions_init=None,
                 random_state=None, warm_start=False,
                 verbose=0, verbose_interval=10, max_components=None):

        self.max_components = max_components

        idx = np.arange(means_init.shape[0])
        if (max_components is not None) and (means_init.shape[0] > max_components):
            n_components = min(n_components, max_components)
            np.random.shuffle(idx)
            idx = idx[:self.max_components]

        # Indexing with an index array copies, so the in-place normalisation
        # below does not touch the caller's array.
        weights_init = weights_init[idx]
        weights_init /= weights_init.sum()
        means_init = means_init[idx]
        precisions_init = precisions_init[idx]

        # Pass everything by keyword: scikit-learn >= 1.0 accepts only
        # n_components positionally.
        super().__init__(n_components=n_components, covariance_type=covariance_type, tol=tol,
                         reg_covar=reg_covar, max_iter=max_iter, n_init=n_init,
                         init_params=init_params,
                         weights_init=weights_init, means_init=means_init,
                         precisions_init=precisions_init,
                         random_state=random_state, warm_start=warm_start,
                         verbose=verbose, verbose_interval=verbose_interval)

        self.weights = weights_init
        self.means = means_init
        self.precisions = precisions_init
        self.weights_ = weights_init
        self.means_ = means_init
        self.precisions_ = precisions_init
        # element-wise sqrt / reciprocal of the precisions
        self.precisions_cholesky_ = np.sqrt(precisions_init)
        self.covariances_ = 1 / precisions_init
        self.covariances = 1 / precisions_init

    def _initialize(self, X, resp):
        """No-op: parameters are already set."""
        pass

    def _m_step(self, X, log_resp):
        """No-op: parameters are never updated."""
        pass

    def fit(self, X, y=None):
        """Do nothing and return self."""
        return self
    

def maximize_log_likelihood(preds, kde_inner_fun, kde_outer_fun, n_folds=5, kde_type='kde', bw_low=0.01, bw_high=0.4,
                            n_gauss_low=1, n_gauss_high=50, bins_low=20, bins_high=250, n_steps=25):
    """Pick the density-estimator hyperparameter with the best cross-validated log-likelihood.

    `n_steps` candidates are tried: log-spaced bandwidths for 'kde', linearly
    spaced component counts for 'GMM' and linearly spaced bin counts for
    'hist'. For each candidate the density is fitted on the training folds and
    the value of `compute_log_likelihood` on the held-out folds is summed.

    :param preds: 1-d np.array of predictions
    :param kde_inner_fun: transform applied to preds before fitting ('kde' and 'GMM')
    :param kde_outer_fun: passed to compute_log_likelihood; replaced by a plain call for 'hist'
    :param kde_type: 'kde', 'GMM' or 'hist'
    :return: best bandwidth ('kde'), number of components ('GMM') or number of bins ('hist')
    :raises ValueError: if kde_type is none of the three supported values
    """
    # Imported locally because neither name is among the imports visible at the top of the file.
    from scipy.stats import gaussian_kde
    from sklearn.model_selection import KFold

    if kde_type not in ('kde', 'GMM', 'hist'):
        raise ValueError("kde_type must be 'kde', 'GMM' or 'hist', got %r" % (kde_type,))

    kf = KFold(n_folds, shuffle=True)
    # Start from -inf, not 0: when every score is negative, a 0 baseline is
    # never beaten and the first candidate would always be returned.
    idx_best, like_best = 0, -np.inf
    bws = np.exp(np.linspace(np.log(bw_low), np.log(bw_high), n_steps))
    n_gauss = np.linspace(n_gauss_low, n_gauss_high, n_steps).astype(int)
    bins = np.linspace(bins_low, bins_high, n_steps).astype(int)
    for idx, (bw, n_g, n_bins) in enumerate(zip(bws, n_gauss, bins)):
        like = 0
        for train_idx, test_idx in kf.split(preds):
            if kde_type == 'kde':
                kde = gaussian_kde(np.apply_along_axis(kde_inner_fun, 0, preds[train_idx]), bw)
            elif kde_type == 'GMM':
                GMM = GaussianMixture(n_g, covariance_type='spherical').fit(
                    np.apply_along_axis(kde_inner_fun, 0, preds[train_idx]).reshape(-1, 1))
                kde = lambda x: np.exp(GMM.score_samples(x.reshape(-1, 1)))
            else:  # 'hist'
                bars = np.histogram(preds[train_idx], bins=n_bins, range=(0, 1), density=True)[0]
                # look up the bar each value falls into, clipped to the valid bin range
                kde = lambda x: bars[np.clip((x // (1 / n_bins)).astype(int), 0, n_bins - 1)]
                kde_outer_fun = lambda kde, x: kde(x)

            like += compute_log_likelihood(preds[test_idx], kde, kde_outer_fun)
        if like > like_best:
            like_best, idx_best = like, idx
    if kde_type == 'kde':
        return bws[idx_best]
    if kde_type == 'GMM':
        return n_gauss[idx_best]
    return bins[idx_best]
    
class MonotonizingTrends:
    """Turn a sequence into a (mostly) non-decreasing one, point by point.

    Each incoming point is replaced by max(point, highest tracked value).
    Tracked values live in `counter`: a value gains 1 whenever a later point is
    >= it, loses `MT_coef` whenever a later point is below it, and is forgotten
    once its count drops to 0 or less.
    """

    def __init__(self, a=None, MT_coef=1):
        self.counter = dict()
        self.array_new = []
        if a is None:
            self.array_old = []
        else:
            self.add_array(a)
        self.MT_coef = MT_coef

    def add_array(self, a):
        """Queue the points of `a` for processing (the input is copied, never consumed)."""
        if isinstance(a, (np.ndarray, pd.Series)):
            a = a.tolist()
        else:
            # copy so that popping queued points does not empty the caller's list
            a = list(a)
        self.array_old = a

    def reset(self):
        """Forget all tracked values, queued points and results."""
        self.counter = dict()
        self.array_old = []
        self.array_new = []

    def get_highest_point(self):
        """Return the largest tracked value, or NaN when nothing is tracked."""
        if self.counter:
            return max(self.counter)
        # np.nan rather than np.NaN: the capitalised alias is gone in NumPy 2.0
        return np.nan

    def add_point_to_counter(self, point):
        """Start tracking `point` with a count of 1 unless it is already tracked."""
        if point not in self.counter:
            self.counter[point] = 1

    def change_counter_according_to_point(self, point):
        """Reward tracked values that are <= `point`, penalise the others by MT_coef."""
        for key in self.counter:
            if key <= point:
                self.counter[key] += 1
            else:
                self.counter[key] -= self.MT_coef

    def clear_counter(self):
        """Drop tracked values whose count is no longer positive."""
        for key, value in list(self.counter.items()):
            if value <= 0:
                self.counter.pop(key)

    def update_counter_with_point(self, point):
        self.change_counter_according_to_point(point)
        self.clear_counter()
        self.add_point_to_counter(point)

    def monotonize_point(self, point=None):
        """Process one point (the next queued one if `point` is None) and return its new value."""
        if point is None:
            point = self.array_old.pop(0)
        # max(point, nan) evaluates to `point`, so an empty counter leaves it unchanged
        new_point = max(point, self.get_highest_point())
        self.array_new.append(new_point)
        self.update_counter_with_point(point)
        return new_point

    def monotonize_array(self, a=None, reset=False, decay_MT_coef=False):
        """Process all queued points (or those of `a`, if given) and return the results.

        :param a: optional new sequence replacing the queued points
        :param reset: if True, return a copy of the results and clear the state
        :param decay_MT_coef: if True, reduce MT_coef linearly to zero over the run
        :return: list of monotonized values
        """
        if a is not None:
            self.add_array(a)

        # Take the queue as a whole and walk it once, instead of pop(0) per point.
        pending, self.array_old = self.array_old, []

        decay_by = 0
        if decay_MT_coef and pending:
            # use the queued length: `a` may be None when the array was given to __init__
            decay_by = self.MT_coef / len(pending)

        for point in pending:
            self.monotonize_point(point)
            if decay_MT_coef:
                self.MT_coef -= decay_by

        if not reset:
            return self.array_new
        array_new = self.array_new[:]
        self.reset()
        return array_new

In [None]:
def estimate_preds_cv_xgboost(data, target, random_state=None, n_networks=1,
                               cv=3, n_early_stop=10, verbose=False):
    """Out-of-fold XGBoost probabilities for every row, averaged over repeated CV runs.

    For each of `n_networks` runs the data is split into `cv` stratified folds;
    a classifier is fitted on the other folds (the held-out fold is the
    early-stopping eval set) and its class-1 probability is stored for the
    held-out rows. If `random_state` is given it is incremented between runs.

    :param data: feature array, indexed by integer row arrays
    :param target: binary label array, same length as data
    :return: np.ndarray with one averaged probability per row
    """
    # NOTE(review): StratifiedKFold is not among the imports visible at the top of the file - confirm it is imported
    fold_preds = np.zeros((n_networks, data.shape[0]))
    seed = random_state
    for run in range(n_networks):
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=seed)
        for fit_idx, held_idx in splitter.split(data, target):
            # ratio of zeros to ones in the full target (integer division)
            pos_weight = (len(target)-target.sum())//target.sum()
            clf = XGBClassifier(max_depth=6, min_child_weight=2, learning_rate=0.01, n_estimators=100,
                                objective='binary:logistic', scale_pos_weight=pos_weight) #sklearn api
            held_out = (data[held_idx], target[held_idx])
            clf.fit(data[fit_idx], target[fit_idx],
                    eval_set=[held_out], verbose=verbose, early_stopping_rounds=n_early_stop)
            fold_preds[run, held_idx] = clf.predict_proba(data[held_idx])[:, 1]
        if seed is not None:
            seed += 1
    return fold_preds.mean(axis=0)

In [None]:
def estimate_poster_cv(df, target, estimator='dedpul', bayes=False, alpha=None, estimate_poster_options=None,
                       estimate_diff_options=None, estimate_preds_cv_options=None, train_nn_options=None):
    """
    Estimates posteriors and priors alpha (if not provided) of N in U; f_u(x) = (1 - alpha) * f_p(x) + alpha * f_n(x)
    Predictions are obtained with estimate_preds_cv_xgboost(df, target) using its default settings.
    :param df: features, np.array (n_instances, n_features) or pd.DataFrame
    :param target: binary vector, 0 if positive, 1 if unlabeled, np.array with shape (n,) or pd.Series
    :param estimator: 'dedpul', 'baseline_dedpul', 'en', 'em_en', or 'nnre';
        'ntc_methods' for every estimate but 'nnre'
    :param bayes: if True, the predictions are unpacked as (preds, means, variances) and
        estimate_diff_bayes is used instead of estimate_diff
    :param alpha: share of N in U; is estimated if not provided ('nnre' returns it unchanged)
    :param estimate_poster_options: parameters for estimate_poster... functions
    :param estimate_diff_options: parameters for estimate_diff
    :param estimate_preds_cv_options: kept for interface compatibility; not used by the xgboost predictor
    :param train_nn_options: kept for interface compatibility; not used by the xgboost predictor
    :return: if estimator != 'ntc_methods':
        tuple (alpha, poster), e.g. (priors, posteriors) of N in U for the U sample df[target == 1]
        if estimator == 'ntc_methods':
        dictionary with such (alpha, poster) tuples as values and method names as keys
    :raises ValueError: if estimator is not one of the supported names
    """

    # validate before the (expensive) cross-validated training; an unknown name used to
    # surface only afterwards as an UnboundLocalError on `poster`
    known_estimators = {'dedpul', 'baseline_dedpul', 'en', 'em_en', 'nnre', 'ntc_methods'}
    if estimator not in known_estimators:
        raise ValueError("Unknown estimator %r; expected one of %s" % (estimator, sorted(known_estimators)))

    if isinstance(df, pd.DataFrame):
        df = df.values
    if isinstance(target, pd.Series):
        target = target.values

    if estimate_poster_options is None:
        estimate_poster_options = dict()

    if estimate_diff_options is None:
        estimate_diff_options = dict()

    # predictions come from the xgboost cross-validation helper; the neural-network variant
    # (estimate_preds_cv with training_mode / train_nn_options) is not used here
    preds = estimate_preds_cv_xgboost(df, target)
    if bayes:
        # NOTE(review): estimate_preds_cv_xgboost returns a single array, so this unpacking
        # presumably only works with a predictor that returns three arrays - confirm before using bayes=True
        preds, means, variances = preds
    if estimator in {'dedpul', 'baseline_dedpul', 'ntc_methods'}:
        if bayes:
            diff = estimate_diff_bayes(means, variances, target, **estimate_diff_options)
        else:
            diff = estimate_diff(preds, target, **estimate_diff_options)

    if estimator == 'dedpul':
        alpha, poster = estimate_poster_em(diff=diff, mode='dedpul', alpha=alpha, **estimate_poster_options)

    elif estimator == 'baseline_dedpul':
        alpha, poster = estimate_poster_dedpul(diff=diff, alpha=alpha, **estimate_poster_options)

    elif estimator == 'en':
        alpha, poster = estimate_poster_en(preds, target, alpha=alpha, **estimate_poster_options)

    elif estimator == 'em_en':
        alpha, poster = estimate_poster_em(preds=preds, target=target, mode='en', alpha=alpha, **estimate_poster_options)

    elif estimator == 'nnre':
        poster = preds[target == 1]

    elif estimator == 'ntc_methods':
        res = dict()
        # estimates with alpha inferred by every method
        res['dedpul'] = estimate_poster_em(diff=diff, mode='dedpul', alpha=None, **estimate_poster_options)
        res['baseline_dedpul'] = estimate_poster_dedpul(diff=diff, alpha=None, **estimate_poster_options)
        res['e1_en'] = estimate_poster_en(preds, target, alpha=None, estimator='e1', **estimate_poster_options)
        res['e3_en'] = estimate_poster_en(preds, target, alpha=None, estimator='e3', **estimate_poster_options)
        res['em_en'] = estimate_poster_em(preds=preds, target=target, mode='en', alpha=None, **estimate_poster_options)

        # posteriors of every method for the alpha passed by the caller
        res['dedpul_poster'] = estimate_poster_em(diff=diff, mode='dedpul', alpha=alpha, **estimate_poster_options)
        res['baseline_dedpul_poster'] = estimate_poster_dedpul(diff=diff, alpha=alpha, **estimate_poster_options)
        res['e1_en_poster'] = estimate_poster_en(preds, target, alpha=alpha, estimator='e1', **estimate_poster_options)
        res['e3_en_poster'] = estimate_poster_en(preds, target, alpha=alpha, estimator='e3', **estimate_poster_options)
        res['em_en_poster'] = estimate_poster_em(preds=preds, target=target, mode='en', alpha=alpha, **estimate_poster_options)

        return res

    return alpha, poster


In [None]:
def estimate_diff(preds, target, bw_mix=0.05, bw_pos=0.1, kde_mode='logit', threshold=None, k_neighbours=None,
                  tune=False, MT=True, MT_coef=0.2, decay_MT_coef=False, kde_type='kde',
                  n_gauss_mix=20, n_gauss_pos=10, bins_mix=20, bins_pos=20):
    """
    Estimates densities of predictions y(x) for P and U and ratio between them f_p / f_u for U sample;
        the densities are estimated with kde, a gaussian mixture or a histogram (see kde_type);
        post-processes the ratio - caps it at 50, smooths it with rolling_apply, imposes monotonicity on
        predictions not exceeding threshold and, if MT is True, passes it through MonotonizingTrends;
        finally the values are sorted in descending order and assigned to U samples by the rank of their prediction
    :param preds: predictions of NTC y(x), probability of belonging to U rather than P, np.array with shape (n,)
    :param target: binary vector, 0 if positive, 1 if unlabeled, np.array with shape (n,)
    :param bw_mix: bandwidth for kde of U (replaced by the tuned value if tune is True)
    :param bw_pos: bandwidth for kde of P (replaced by the tuned value if tune is True)
    :param kde_mode: 'prob', 'log_prob' or 'logit'; default is 'logit' (None is treated as 'logit')
    :param threshold: monotonicity is imposed on the ratio for predictions not exceeding this number;
        None or 'mid' - average of the mean predictions of U and P, 'low' - mean prediction of P,
        'high' - mean prediction of U; any other value is used as is
    :param k_neighbours: window parameter passed to the second rolling_apply call,
        default = int(preds[target == 1].shape[0] // 20)
    :param tune: if True, bandwidths / numbers of gaussians / numbers of bins are chosen with maximize_log_likelihood
    :param MT: if True, the ratio is additionally processed with MonotonizingTrends
    :param MT_coef: MT_coef passed to MonotonizingTrends
    :param decay_MT_coef: decay_MT_coef passed to MonotonizingTrends.monotonize_array
    :param kde_type: 'kde', 'GMM' or 'hist'
    :param n_gauss_mix: number of gaussians for U if kde_type is 'GMM'
    :param n_gauss_pos: number of gaussians for P if kde_type is 'GMM'
    :param bins_mix: number of histogram bins for U if kde_type is 'hist'
    :param bins_pos: number of histogram bins for P if kde_type is 'hist'

    :return: difference of densities f_p / f_u for U sample, in the order of preds[target == 1]
    """

    if kde_mode is None:
        kde_mode = 'logit'

    if (threshold is None) or (threshold == 'mid'):
        threshold = preds[target == 1].mean() / 2 + preds[target == 0].mean() / 2
    elif threshold == 'low':
        threshold = preds[target == 0].mean()
    elif threshold == 'high':
        threshold = preds[target == 1].mean()

    if k_neighbours is None:
        k_neighbours = int(preds[target == 1].shape[0] // 20)

    # kde_inner_fun transforms predictions before the density is fitted;
    # kde_outer_fun evaluates a fitted density at a prediction x
    if kde_mode == 'prob':
        kde_inner_fun = lambda x: x
        kde_outer_fun = lambda dens, x: dens(x)
    elif kde_mode == 'log_prob':
        kde_inner_fun = lambda x: np.log(x)
        kde_outer_fun = lambda dens, x: dens(np.log(x)) / (x + 10 ** -5)
    elif kde_mode == 'logit':
        kde_inner_fun = lambda x: np.log(x / (1 - x + 10 ** -5))
        kde_outer_fun = lambda dens, x: dens(np.log(x / (1 - x + 10 ** -5))) / (x * (1 - x) + 10 ** -5)

    if kde_type == 'kde':
        if tune:
            bw_mix = maximize_log_likelihood(preds[target == 1], kde_inner_fun, kde_outer_fun, kde_type=kde_type)
            bw_pos = maximize_log_likelihood(preds[target == 0], kde_inner_fun, kde_outer_fun, kde_type=kde_type)

        kde_mix = gaussian_kde(np.apply_along_axis(kde_inner_fun, 0, preds[target == 1]), bw_mix)
        kde_pos = gaussian_kde(np.apply_along_axis(kde_inner_fun, 0, preds[target == 0]), bw_pos)

    elif kde_type == 'GMM':
        if tune:
            n_gauss_mix = maximize_log_likelihood(preds[target == 1], kde_inner_fun, kde_outer_fun, kde_type=kde_type)
            n_gauss_pos = maximize_log_likelihood(preds[target == 0], kde_inner_fun, kde_outer_fun, kde_type=kde_type)

        GMM_mix = GaussianMixture(n_gauss_mix, covariance_type='spherical').fit(
            np.apply_along_axis(kde_inner_fun, 0, preds[target == 1]).reshape(-1, 1))
        GMM_pos = GaussianMixture(n_gauss_pos, covariance_type='spherical').fit(
            np.apply_along_axis(kde_inner_fun, 0, preds[target == 0]).reshape(-1, 1))

        # score_samples returns log-densities, hence the exponent
        kde_mix = lambda x: np.exp(GMM_mix.score_samples(x.reshape(-1, 1)))
        kde_pos = lambda x: np.exp(GMM_pos.score_samples(x.reshape(-1, 1)))

    elif kde_type == 'hist':
        if tune:
            bins_mix = maximize_log_likelihood(preds[target == 1], kde_inner_fun, lambda kde, x: kde(x),
                                               kde_type=kde_type)
            bins_pos = maximize_log_likelihood(preds[target == 0], kde_inner_fun, lambda kde, x: kde(x),
                                               kde_type=kde_type)
        bars_mix = np.histogram(preds[target == 1], bins=bins_mix, range=(0, 1), density=True)[0]
        bars_pos = np.histogram(preds[target == 0], bins=bins_pos, range=(0, 1), density=True)[0]

        # look up the bar a prediction falls into; the index is clipped so that x == 1 maps to the last bin
        kde_mix = lambda x: bars_mix[np.clip((x // (1 / bins_mix)).astype(int), 0, bins_mix-1)]
        kde_pos = lambda x: bars_pos[np.clip((x // (1 / bins_pos)).astype(int), 0, bins_pos-1)]
        # histograms are built on raw predictions, so kde_mode's transform is bypassed
        kde_outer_fun = lambda kde, x: kde(x)

    # sorting to relax and impose monotonicity
    sorted_mixed = np.sort(preds[target == 1])

    # ratio f_p / f_u at the sorted predictions of U; 10 ** -5 guards against division by zero
    diff = np.apply_along_axis(lambda x: kde_outer_fun(kde_pos, x) / (kde_outer_fun(kde_mix, x) + 10 ** -5), axis=0,
                               arr=sorted_mixed)
    diff[diff > 50] = 50
    diff = rolling_apply(diff, 5)
    # below the threshold: running maximum taken from right to left makes the ratio non-increasing there
    diff = np.append(
        np.flip(np.maximum.accumulate(np.flip(diff[sorted_mixed <= threshold], axis=0)), axis=0),
        diff[sorted_mixed > threshold])
    diff = rolling_apply(diff, k_neighbours)

    if MT:
        # MonotonizingTrends processes the ratio from the highest prediction to the lowest
        MTrends = MonotonizingTrends(MT_coef=MT_coef)
        diff = np.flip(np.array(MTrends.monotonize_array(np.flip(diff, axis=0), reset=True, decay_MT_coef=decay_MT_coef)), axis=0)

    # enforce a strictly ordered (descending) ratio along the sorted predictions
    diff.sort()
    diff = np.flip(diff, axis=0)

    # desorting: argsort of argsort gives the rank of every U prediction
    diff = diff[np.argsort(np.argsort(preds[target == 1]))]

    return diff


def estimate_diff_bayes(means, variances, target, threshold=None, k_neighbours=None):
    """
    Estimates ratio of densities f_p / f_u for the U sample from per-instance means and variances;
        each density is a GaussianMixtureNoFit with one equally weighted spherical component per instance
    :param means: per-instance means, np.array with shape (n,)
    :param variances: per-instance variances, np.array with shape (n,); inverted to get precisions
    :param target: binary vector; instances with 1 form the mixture (U), instances with 0 the positives (P)
    :param threshold: 'mid', 'low', 'high' or a number; None is treated as 'low';
        monotonicity is imposed for means not exceeding it
    :param k_neighbours: window parameter passed to rolling_apply, default = int(means[target == 1].shape[0] // 20)
    :return: ratio f_p / f_u capped at 50, in the order of means[target == 1]
    """
    mix_means = means[target == 1]
    pos_means = means[target == 0]

    if (threshold is None) or (threshold == 'low'):
        threshold = pos_means.mean()
    elif threshold == 'mid':
        threshold = mix_means.mean() / 2 + pos_means.mean() / 2
    elif threshold == 'high':
        threshold = mix_means.mean()

    if k_neighbours is None:
        k_neighbours = int(mix_means.shape[0] // 20)

    def _density(centers, center_variances):
        # one component per instance, initialised at the instance itself
        n_components = centers.shape[0]
        mixture = GaussianMixtureNoFit(n_components, covariance_type='spherical', max_iter=1, n_init=1,
                                       weights_init=np.ones(n_components) / n_components,
                                       means_init=centers.reshape(-1, 1),
                                       precisions_init=1 / center_variances).fit(centers.reshape(-1, 1))
        return lambda x: np.exp(mixture.score_samples(x))

    kde_mix = _density(mix_means, variances[target == 1])
    kde_pos = _density(pos_means, variances[target == 0])

    sorted_means = np.sort(mix_means)

    # the ratio is evaluated in batches of 1000 points
    batch_size = 1000
    pieces = [np.array([])]
    for start in range(0, len(sorted_means), batch_size):
        batch = sorted_means[start: start + batch_size].reshape(-1, 1)
        pieces.append(np.ravel(kde_pos(batch) / kde_mix(batch)))
    diff = np.concatenate(pieces)
    diff[diff > 50] = 50

    diff = rolling_apply(diff, k_neighbours)
    # running maximum from right to left for means not exceeding the threshold
    lower_part = np.flip(np.maximum.accumulate(np.flip(diff[sorted_means <= threshold], axis=0)), axis=0)
    upper_part = diff[sorted_means > threshold]
    diff = np.append(lower_part, upper_part)

    # back to the original order of the U sample
    ranks = np.argsort(np.argsort(means[target == 1]))
    return diff[ranks]


def estimate_poster_dedpul(diff, alpha=None, quantile=0.05, alpha_as_mean_poster=False, max_it=100, **kwargs):
    """
    Estimates posteriors and priors alpha (if not provided) of N in U with dedpul method
    :param diff: difference of densities f_p / f_u for the sample U, np.array (n,), output of estimate_diff()
    :param alpha: priors, share of N in U (estimated if None)
    :param quantile: if alpha is None, relaxation of the estimate of alpha;
        alpha is computed from the (1 - quantile) quantile of diff instead of its maximum,
        which lets a share of posteriors be negative (they are zeroed out afterwards)
    :param alpha_as_mean_poster: if True and alpha is provided, a bisection searches for the prior
        whose mean posterior equals alpha; that prior is returned in place of alpha
    :param max_it: maximal number of bisection iterations
    :param kwargs: 'tol' - tolerance of the bisection (default 10**-5); other keys are ignored

    :return: tuple (alpha, poster), e.g. (priors, posteriors) of N in U for the U sample, represented by diff
    """
    def _posterior(prior):
        # 1 - (1 - prior) * f_p / f_u with negative values zeroed out
        values = 1 - diff * (1 - prior)
        values[values < 0] = 0
        return values

    if not (alpha_as_mean_poster and (alpha is not None)):
        # plain estimate: infer alpha from the upper quantile of diff when it is missing
        if alpha is None:
            alpha = 1 - 1 / max(np.quantile(diff, 1 - quantile, interpolation='higher'), 1)
        return alpha, _posterior(alpha)

    poster = _posterior(alpha)
    cur_alpha = np.mean(poster)

    # choose the interval of the bisection
    if cur_alpha < alpha:
        left_border, right_border = alpha, 1
    else:
        left_border, right_border = 0, alpha
        if np.mean(_posterior(0)) > alpha:
            # even a zero prior gives too large a mean posterior: search among negative priors
            left_border, right_border = -50, 0

    n_steps = 0
    try_alpha = cur_alpha
    while (abs(cur_alpha - alpha) > kwargs.get('tol', 10**-5)) and (n_steps < max_it):
        try_alpha = left_border + (right_border - left_border) / 2
        poster = _posterior(try_alpha)
        cur_alpha = np.mean(poster)
        if cur_alpha > alpha:
            right_border = try_alpha
        else:
            left_border = try_alpha
        n_steps += 1

    if n_steps >= max_it:
        print('Exceeded maximal number of iterations in finding mean_poster=alpha')
    return try_alpha, poster


def estimate_poster_en(preds, target, alpha=None, estimator='e1', quantile=0.05, **kwargs):
    """
    Estimates posteriors and priors alpha (if not provided) of N in U with en [Elkan-Noto, 2008] method
    :param preds: predictions of classifier, np.array with shape (n,)
    :param target: binary vector, 0 if positive, 1 if unlabeled, np.array with shape (n,)
    :param alpha: priors, share of N in U (estimated if None; the estimate is floored at 0)
    :param estimator: 'e1' or 'e3' - from [Elkan-Noto, 2008]; only used when alpha is None
    :param quantile: if alpha is None and estimator is 'e3', the quantile of preds / (1 - preds) used for alpha
    :param kwargs: dummy
    :return: tuple (alpha, poster), e.g. (priors, posteriors) of N in U for the U sample preds[target == 1]
    """
    if alpha is None:
        if estimator == 'e1':
            # c: one minus the mean prediction on the positive sample
            c = 1 - np.mean(preds[target == 0])
            alpha = 1 - (1 - c) / c
        elif estimator == 'e3':
            odds = preds / (1 - preds)
            alpha = 1 - min(np.quantile(odds, quantile, interpolation='lower'), 1)
        alpha = max(alpha, 0)

    unlabeled_preds = preds[target == 1]
    poster = 1 - (1 - alpha) * (1 - unlabeled_preds) / unlabeled_preds
    # negative posteriors are zeroed out
    poster[poster < 0] = 0
    return alpha, poster


def estimate_poster_em(diff=None, preds=None, target=None, mode='dedpul', converge=True, tol=10**-5,
                       max_iterations=1000, nonconverge=True, step=0.001, max_diff=0.05, plot=False, disp=False,
                       alpha=None, alpha_as_mean_poster=True, **kwargs):
    """
    Performs Expectation-Maximization to estimate posteriors and priors alpha (if not provided) of N in U
        with either of 'en' or 'dedpul' methods; both 'converge' and 'nonconverge' are recommended to be set True for
        better estimate
    :param diff: difference of densities f_p/f_u for the sample U, np.array (n,), output of estimate_diff()
    :param preds: predictions of classifier, np.array with shape (n,)
    :param target: binary vector, 0 if positive, 1 if unlabeled, np.array with shape (n,)
    :param mode: 'en' or a name ending with 'dedpul'; for dedpul, diff needs to be provided;
        for 'en', preds and target need to be provided
    :param converge: True or False; only checked together with nonconverge - the convergence estimate
        is always computed
    :param tol: tolerance of error between priors and mean posteriors, indicator of convergence
    :param max_iterations: if exceeded, search of converged alpha stops even if tol is not reached
    :param nonconverge: True or False; if False and the convergence estimate is rejected, (None, None) is returned
    :param step: gap between points of the [0, 1, step] grid to choose best alpha from
    :param max_diff: alpha with difference of mean posteriors and priors bigger than max_diff cannot be chosen;
        an heuristic to choose bigger alpha
    :param plot: True or False, if True - plots ([0, 1, grid], mean posteriors - alpha) and
        ([0, 1, grid], second lag of (mean posteriors - alpha))
    :param disp: True or False, if True - displays if the algorithm didn't converge
    :param alpha: proportions of N in U; is estimated if None
    :param alpha_as_mean_poster: passed to estimate_poster_dedpul when alpha is provided
    :param kwargs: passed to estimate_poster_dedpul / estimate_poster_en
    :return: tuple (alpha, poster), e.g. (priors, posteriors) of N in U for the U sample
    :raises ValueError: if mode is neither 'en' nor a dedpul mode
    """
    assert converge + nonconverge, "At least one of 'converge' and 'nonconverge' has to be set to 'True'"

    # a single mode test for the whole function; the later branches used to compare with
    # mode == 'dedpul' while the loops used endswith('dedpul'), so for a mode such as
    # 'baseline_dedpul' the posterior of the last grid point could be returned by mistake
    dedpul_mode = mode.endswith('dedpul')
    if not dedpul_mode and mode != 'en':
        raise ValueError("Unknown mode %r; expected 'en' or a mode ending with 'dedpul'" % (mode,))

    def _poster_for(prior):
        # posteriors of the chosen method for a fixed prior
        if dedpul_mode:
            return estimate_poster_dedpul(diff, alpha=prior, **kwargs)[1]
        return estimate_poster_en(preds, target, alpha=prior, **kwargs)[1]

    if alpha is not None:
        if dedpul_mode:
            alpha, poster = estimate_poster_dedpul(diff, alpha=alpha, alpha_as_mean_poster=alpha_as_mean_poster,
                                                   tol=tol, **kwargs)
        else:
            _, poster = estimate_poster_en(preds, target, alpha=alpha, **kwargs)
        return alpha, poster

    # convergence estimate: iterate alpha <- mean posterior starting from 0
    alpha_converge = 0
    for _ in range(max_iterations):
        poster_converge = _poster_for(alpha_converge)
        mean_poster = np.mean(poster_converge)
        error = mean_poster - alpha_converge

        if np.abs(error) < tol:
            break
        if np.min(poster_converge) > 0:
            break
        alpha_converge = mean_poster
    else:
        # reached only when the loop ran out of iterations without a break
        if disp:
            print('max iterations exceeded')

    # non-convergence estimate: error between mean posteriors and priors on a grid of alphas
    grid = np.arange(0, 1, step)
    errors = np.array([np.mean(_poster_for(grid_alpha)) - grid_alpha for grid_alpha in grid])

    # among inner grid points with a small enough error take the one with the largest second difference
    second_lag = np.diff(np.diff(errors))
    admissible = errors[1: -1] < max_diff
    idx = np.argmax(second_lag[admissible])
    alpha_nonconverge = grid[1: -1][admissible][idx]

    if plot:
        fig, axs = plt.subplots(2, 1, sharex=False, sharey=False, figsize=(6, 10))
        axs[0].plot(grid, errors)
        axs[1].plot(grid[1: -1], second_lag)

    if ((alpha_nonconverge >= alpha_converge) or
            (((errors < 0).sum() > 1) and (alpha_converge < 1 - step))):
        return alpha_converge, poster_converge

    if nonconverge:
        poster_nonconverge = _poster_for(alpha_nonconverge)
        if disp:
            print('didn\'t converge')
        return alpha_nonconverge, poster_nonconverge

    if disp:
        print('didn\'t converge')
    return None, None



In [None]:
# Run all 'ntc_methods' estimators on the training split; the result is a dictionary
# {method name: (alpha, poster)} (see estimate_poster_cv)
d = estimate_poster_cv(X_train,y_train, estimator='ntc_methods', alpha=None,
                                         estimate_poster_options={'disp': False},
                                         estimate_diff_options={})

# print the estimate of every method
for key,value in d.items():
    print(key,value)

In [None]:
# classifier with the positive class weighted by the unlabeled / positive count ratio
bst = XGBClassifier(max_depth=6,min_child_weight=2,learning_rate=0.01, n_estimators=100, 
                   objective='binary:logistic',scale_pos_weight=len(un_X_train)//len(pos_X_train)) #sklearn api

# prior: fit the classifier, then derive a rescaling ratio from the 'dedpul' alpha estimate
bst.fit(X_train, y_train)
alpha_used = 1.0 - d['dedpul'][0] 
ratio = 1.0*(len(pos_X_train)+alpha_used*len(un_X_train))/len(pos_X_train)
pre_probs = bst.predict_proba(train)[:,1]
# post-processing: rescale the probabilities by the ratio and threshold at 0.5
preds = pre_probs*ratio
predictions = pd.Series([1 if value>=0.5 else 0 for value in preds]) 
# work on a copy so that the original graph G0 stays untouched
G_new = copy.deepcopy(G0) 

# rows with target 0 that are predicted as 1
# NOTE(review): predictions has a fresh integer index; assumes target and train_1 are indexed the same way - confirm
fp_X_1 = train_1[(target==0)&(predictions==1)]
print(len(fp_X_1))

# add an edge for each of these rows, weighted by trade_n_times_mean
for index, row in fp_X_1[['roleid_src','roleid_dst','trade_n_times_mean']].iterrows():
    G_new.add_edge(row['roleid_src'],row['roleid_dst'],weight=row['trade_n_times_mean'])
    
print(G_new.number_of_nodes())
print(G_new.number_of_edges())
# community detection on the extended graph
partition = community.best_partition(G_new) 
print("partition ok")
# statistics
size = len(set(partition.values())) #number of communities
print(size)
mod = community.modularity(partition,G_new)
print(mod) #modularity 

# recall on new_pos_X: every true label is 1, so recall is the share predicted as 1
preds = bst.predict_proba(new_pos_X)[:,1]
preds = preds*ratio
predictions = [1 if value>=0.5 else 0 for value in preds] 
new_test_recall = recall_score([1]*len(predictions), predictions)
print ("New Test Recall: %.2f%%" % (new_test_recall * 100.0)) #recall_new

## 7.3 DEDPUL + CVIR

In [None]:
# classifier with the positive class weighted by the unlabeled / positive count ratio
bst = XGBClassifier(max_depth=6,min_child_weight=2,learning_rate=0.01, n_estimators=100, 
                   objective='binary:logistic',scale_pos_weight=len(un_X_train)//len(pos_X_train)) #sklearn api


# fit on the full training split
bst.fit(X_train, y_train)

In [None]:
alpha_default = 0.8  # used when the estimated alpha_used is not below 1
epochs = 20
thr = 0.4  # decision threshold applied to predicted probabilities
if_pu = True
# start from a copy of xgb_normal (defined elsewhere in the notebook)
bst = copy.deepcopy(xgb_normal)

#after warm start
# NOTE(review): pos_X_test / un_X_test are only assigned at the end of the loop body, so they
# presumably have to exist before the first epoch - confirm
for epoch in range(epochs):

    pos_probs = np.array(bst.predict_proba(pos_X_test)[:,1])
    unlabeled_probs = np.array(bst.predict_proba(un_X_test)[:,1])
    unlabeled_targets = np.array([1 if i>=thr else 0 for i in unlabeled_probs])

    #step1:MPE (mixture proportion estimation with DEDPUL)
    ratio,_ = estimate_poster_cv(X_test,y_test, estimator='dedpul', alpha=None,
                                             estimate_poster_options={'disp': False},
                                             estimate_diff_options={})
    alpha_used = 1.0-ratio
    if alpha_used >=1:
        alpha_used = alpha_default
    
    #step2: PvN training
    #select the reliable negative samples and drop the suspicious positive samples
    keep_samples = rank_inputs(bst, un_X_train, alpha_used)
    unlabeled_new = un_X_train[keep_samples == 1]
    pos_new = un_X_train[keep_samples ==0]
    
    X_train_new = pd.concat([pos_X_train,unlabeled_new],axis=0)
    # fixed: the parentheses were unbalanced (SyntaxError); labels are 1 for every row of
    # pos_X_train followed by 0 for every row of unlabeled_new, matching the concat order above
    y_train_new = pd.DataFrame([1]*len(pos_X_train)+[0]*len(unlabeled_new)).iloc[:,0]

    bst.fit(X_train_new,y_train_new)
   
    
    print("epoch{}, alpha = {}\n".format(epoch,alpha_used))
    # metrics on the original training split
    train_preds = bst.predict_proba(X_train)[:,1]
    train_predictions = [1 if value>=thr else 0 for value in train_preds]

    train_accuracy = accuracy_score(y_train, train_predictions)
    train_precision = precision_score(y_train, train_predictions)
    train_recall = recall_score(y_train, train_predictions)
    train_f1 = f1_score(y_train, train_predictions)
    train_auc = roc_auc_score(y_train,train_preds)

    print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    print ("Train Precision: %.2f%%" % (train_precision * 100.0))
    print ("Train Recall: %.2f%%" % (train_recall * 100.0))
    print ("Train f1: %.4f" % (train_f1))
    print ("Train AUC: %.4f" % (train_auc))


    # metrics on the re-labelled training set the model was just fitted on
    train_preds = bst.predict_proba(X_train_new)[:,1]
    train_predictions = [1 if value>=thr else 0 for value in train_preds]

    train_accuracy = accuracy_score(y_train_new, train_predictions)
    train_precision = precision_score(y_train_new, train_predictions)
    train_recall = recall_score(y_train_new, train_predictions)
    train_f1 = f1_score(y_train_new, train_predictions)
    train_auc = roc_auc_score(y_train_new,train_preds)
    print("New positive samples added:")
    print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    print ("Train Precision: %.2f%%" % (train_precision * 100.0))
    print ("Train Recall: %.2f%%" % (train_recall * 100.0))
    print ("Train f1: %.4f" % (train_f1))
    print ("Train AUC: %.4f" % (train_auc))

    # make prediction on the original test split
    preds = bst.predict_proba(X_test)[:,1]
    predictions = [1 if value>=thr else 0 for value in preds] 

    test_accuracy = accuracy_score(y_test, predictions)
    test_precision = precision_score(y_test, predictions)
    test_recall = recall_score(y_test, predictions)
    test_f1=f1_score(y_test, predictions)
    test_auc=roc_auc_score(y_test,preds)

    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    print ("Test Precision: %.2f%%" % (test_precision * 100.0))
    print ("Test Recall: %.2f%%" % (test_recall * 100.0))
    print ("Test f1: %.4f" % (test_f1))
    print ("Test AUC: %.4f" % (test_auc))


    # make prediction on new_pos_X; every true label is 1, so only recall is reported
    # (thr replaces a hard-coded 0.4 so that all thresholds follow the same setting)
    preds = bst.predict_proba(new_pos_X)[:,1]
    predictions = [1 if value>=thr else 0 for value in preds] 
    new_test_recall = recall_score([1]*len(predictions), predictions)
    print ("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  #new recall
    
    # re-label the unlabeled part of the test split the same way as the training split
    pos_X_test = X_test[y_test==1]
    un_X_test = X_test[y_test==0]
    keep_samples_test = rank_inputs(bst, un_X_test, alpha_used)
    unlabeled_new_test = un_X_test[keep_samples_test == 1]
    pos_new_test = un_X_test[keep_samples_test ==0]
    
    # here the samples not kept as negatives are counted as positives
    X_test_new = pd.concat([pos_X_test,pos_new_test,unlabeled_new_test],axis=0)
    y_test_new = pd.DataFrame([1]*(len(pos_X_test)+len(pos_new_test))+[0]*len(unlabeled_new_test)).iloc[:,0]
    
    preds = bst.predict_proba(X_test_new)[:,1]
    predictions = [1 if value>=thr else 0 for value in preds]
    test_accuracy = accuracy_score(y_test_new, predictions)
    test_precision = precision_score(y_test_new, predictions)
    test_recall = recall_score(y_test_new, predictions)
    test_f1=f1_score(y_test_new, predictions)
    test_auc=roc_auc_score(y_test_new,preds)
    print("New positive samples added:")
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    print ("Test Precision: %.2f%%" % (test_precision * 100.0))
    print ("Test Recall: %.2f%%" % (test_recall * 100.0))
    print ("Test f1: %.4f" % (test_f1))
    print ("Test AUC: %.4f" % (test_auc))


xgb_tmp = copy.deepcopy(bst)
#feature importance
plot_importance(xgb_tmp)
plt.show()

In [None]:
# split the full data by target: rows with target 0 and rows with target > 0
un_X_1 = train_1[target==0]
un_X = train[target==0]
un = samples_3[target==0]
pos_X_1 = train_1[target>0]
pos_X = train[target>0]

# target-0 rows whose predicted probability reaches 0.4
preds = bst.predict_proba(un_X)[:,1]
fp = un[preds>=0.4]

# add an edge for each of these rows to a copy of G0, weighted by trade_n_times_mean
G_new = copy.deepcopy(G0) 
for index, row in fp[['roleid_src','roleid_dst','trade_n_times_mean']].iterrows():
    G_new.add_edge(row['roleid_src'],row['roleid_dst'],weight=row['trade_n_times_mean'])
    
print(G_new.number_of_nodes())
print(G_new.number_of_edges())
# community detection on the extended graph
partition = community.best_partition(G_new) 
print("partition ok") 

size = len(set(partition.values())) #number of communities
print(size)
mod = community.modularity(partition,G_new)
print(mod) #modularity 


## 7.4 EN + CVIR

In [None]:
# classifier with the positive class weighted by the unlabeled / positive count ratio
bst = XGBClassifier(max_depth=6,min_child_weight=2,learning_rate=0.01, n_estimators=100, 
                   objective='binary:logistic',scale_pos_weight=len(un_X_train)//len(pos_X_train)) #sklearn api


# fit on the full training split
bst.fit(X_train, y_train)

In [None]:
alpha_default = 0.8  # used when the estimated alpha_used is not below 1
epochs = 20
thr = 0.4  # decision threshold applied to predicted probabilities
if_pu = True
# start from a copy of xgb_normal (defined elsewhere in the notebook)
bst = copy.deepcopy(xgb_normal)

#after warm start
# NOTE(review): pos_X_test / un_X_test are only assigned at the end of the loop body, so they
# presumably have to exist before the first epoch - confirm
for epoch in range(epochs):

    pos_probs = np.array(bst.predict_proba(pos_X_test)[:,1])
    unlabeled_probs = np.array(bst.predict_proba(un_X_test)[:,1])
    unlabeled_targets = np.array([1 if i>=thr else 0 for i in unlabeled_probs])

    #step1:MPE (mixture proportion estimation with the EM variant of Elkan-Noto)
    ratio,_ = estimate_poster_cv(X_test,y_test, estimator='em_en', alpha=None,
                                             estimate_poster_options={'disp': False},
                                             estimate_diff_options={})
    alpha_used = 1.0-ratio
    if alpha_used >=1:
        alpha_used = alpha_default
    
    #step2: PvN training
    #select the reliable negative samples and drop the suspicious positive samples
    keep_samples = rank_inputs(bst, un_X_train, alpha_used)
    unlabeled_new = un_X_train[keep_samples == 1]
    pos_new = un_X_train[keep_samples ==0]
    
    X_train_new = pd.concat([pos_X_train,unlabeled_new],axis=0)
    # fixed: the parentheses were unbalanced (SyntaxError); labels are 1 for every row of
    # pos_X_train followed by 0 for every row of unlabeled_new, matching the concat order above
    y_train_new = pd.DataFrame([1]*len(pos_X_train)+[0]*len(unlabeled_new)).iloc[:,0]

    bst.fit(X_train_new,y_train_new)
   
    
    print("epoch{}, alpha = {}\n".format(epoch,alpha_used))
    # metrics on the original training split
    train_preds = bst.predict_proba(X_train)[:,1]
    train_predictions = [1 if value>=thr else 0 for value in train_preds]

    train_accuracy = accuracy_score(y_train, train_predictions)
    train_precision = precision_score(y_train, train_predictions)
    train_recall = recall_score(y_train, train_predictions)
    train_f1 = f1_score(y_train, train_predictions)
    train_auc = roc_auc_score(y_train,train_preds)

    print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    print ("Train Precision: %.2f%%" % (train_precision * 100.0))
    print ("Train Recall: %.2f%%" % (train_recall * 100.0))
    print ("Train f1: %.4f" % (train_f1))
    print ("Train AUC: %.4f" % (train_auc))


    # metrics on the re-labelled training set the model was just fitted on
    train_preds = bst.predict_proba(X_train_new)[:,1]
    train_predictions = [1 if value>=thr else 0 for value in train_preds]

    train_accuracy = accuracy_score(y_train_new, train_predictions)
    train_precision = precision_score(y_train_new, train_predictions)
    train_recall = recall_score(y_train_new, train_predictions)
    train_f1 = f1_score(y_train_new, train_predictions)
    train_auc = roc_auc_score(y_train_new,train_preds)
    print("New positive samples added:")
    print ("Train Accuary: %.2f%%" % (train_accuracy * 100.0))
    print ("Train Precision: %.2f%%" % (train_precision * 100.0))
    print ("Train Recall: %.2f%%" % (train_recall * 100.0))
    print ("Train f1: %.4f" % (train_f1))
    print ("Train AUC: %.4f" % (train_auc))

    # make prediction on the original test split
    preds = bst.predict_proba(X_test)[:,1]
    predictions = [1 if value>=thr else 0 for value in preds] 

    test_accuracy = accuracy_score(y_test, predictions)
    test_precision = precision_score(y_test, predictions)
    test_recall = recall_score(y_test, predictions)
    test_f1=f1_score(y_test, predictions)
    test_auc=roc_auc_score(y_test,preds)

    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    print ("Test Precision: %.2f%%" % (test_precision * 100.0))
    print ("Test Recall: %.2f%%" % (test_recall * 100.0))
    print ("Test f1: %.4f" % (test_f1))
    print ("Test AUC: %.4f" % (test_auc))


    # make prediction on new_pos_X; every true label is 1, so only recall is reported
    # (thr replaces a hard-coded 0.4 so that all thresholds follow the same setting)
    preds = bst.predict_proba(new_pos_X)[:,1]
    predictions = [1 if value>=thr else 0 for value in preds] 
    new_test_recall = recall_score([1]*len(predictions), predictions)
    print ("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  #new recall
    
    # re-label the unlabeled part of the test split the same way as the training split
    pos_X_test = X_test[y_test==1]
    un_X_test = X_test[y_test==0]
    keep_samples_test = rank_inputs(bst, un_X_test, alpha_used)
    unlabeled_new_test = un_X_test[keep_samples_test == 1]
    pos_new_test = un_X_test[keep_samples_test ==0]
    
    # here the samples not kept as negatives are counted as positives
    X_test_new = pd.concat([pos_X_test,pos_new_test,unlabeled_new_test],axis=0)
    y_test_new = pd.DataFrame([1]*(len(pos_X_test)+len(pos_new_test))+[0]*len(unlabeled_new_test)).iloc[:,0]
    
    preds = bst.predict_proba(X_test_new)[:,1]
    predictions = [1 if value>=thr else 0 for value in preds]
    test_accuracy = accuracy_score(y_test_new, predictions)
    test_precision = precision_score(y_test_new, predictions)
    test_recall = recall_score(y_test_new, predictions)
    test_f1=f1_score(y_test_new, predictions)
    test_auc=roc_auc_score(y_test_new,preds)
    print("New positive samples added:")
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    print ("Test Precision: %.2f%%" % (test_precision * 100.0))
    print ("Test Recall: %.2f%%" % (test_recall * 100.0))
    print ("Test f1: %.4f" % (test_f1))
    print ("Test AUC: %.4f" % (test_auc))


xgb_tmp = copy.deepcopy(bst)
#feature importance
plot_importance(xgb_tmp)
plt.show()

In [None]:
# Split every sample table into unlabeled (target == 0) and positive
# (target > 0) rows.
un_X_1 = train_1[target == 0]
un_X = train[target == 0]
un = samples_3[target == 0]
pos_X_1 = train_1[target > 0]
pos_X = train[target > 0]

# Unlabeled rows that the current model scores at or above 0.4.
preds = bst.predict_proba(un_X)[:, 1]
fp = un[preds >= 0.4]

# Extend a copy of the base graph with one weighted edge per such row.
G_new = copy.deepcopy(G0)
for src, dst, weight in zip(fp['roleid_src'], fp['roleid_dst'], fp['trade_n_times_mean']):
    G_new.add_edge(src, dst, weight=weight)

print(G_new.number_of_nodes())
print(G_new.number_of_edges())

# Partition the extended graph into communities and report the result.
partition = community.best_partition(G_new)
print("partition ok")

size = len(set(partition.values()))  # number of communities
print(size)
mod = community.modularity(partition, G_new)  # modularity of the partition
print(mod)

## 7.5 C-CRE with GMM

In [None]:
# Fit an 8-component diagonal-covariance GMM on `samples`.
gm = GaussianMixture(
    n_components=8,
    random_state=0,
    covariance_type='diag',
)
y_0 = gm.fit_predict(samples)  # hard component assignment per row

# The score is the summed posterior of components 4 and 5; it is thresholded
# at `thr` to obtain a binary label.
# NOTE(review): the component indices are hard-coded (a previous choice was
# components 0, 1 and 2) -- presumably picked from the per-cluster positive
# ratios; confirm after any refit.
component_probs = gm.predict_proba(samples)
y_pred = component_probs[:, 4] + component_probs[:, 5]
y = (y_pred >= thr).astype(int)

In [None]:
x1 = []
x2 = []
print(len(X_train))
print(gm.weights_)

# For every mixture component, report how many rows were assigned to it and
# how many of those rows are labelled positive in `target`.
for i, _ in enumerate(gm.weights_):
    part = target[y_0 == i]
    part_size = len(part)
    part_pos = np.sum(part)
    print("cluster_" + str(i) + ": " + str(part_size))
    print("Positive Cnt：" + str(part_pos) + " Ratio：" + str(1.0 * part_pos / part_size))


In [None]:
# Score the GMM-derived labels `y` (and the continuous score `y_pred`) against
# `target`. Every metric here is computed on the same full set, so every line
# is reported under the "Whole" prefix (the AUC used to be printed as "Test").
print(np.sum(y))  # number of rows labelled positive by the GMM rule
test_accuracy = accuracy_score(target, y)
test_precision = precision_score(target, y)
test_recall = recall_score(target, y)
test_f1 = f1_score(target, y)
test_auc = roc_auc_score(target, y_pred)  # AUC uses the score, not the label

print("Whole Accuracy: %.2f%%" % (test_accuracy * 100.0))
print("Whole Precision: %.2f%%" % (test_precision * 100.0))
print("Whole Recall: %.2f%%" % (test_recall * 100.0))
print("Whole f1: %.4f" % (test_f1))
print("Whole AUC: %.4f" % (test_auc))

In [None]:
# Hyperparameter search over beta: the fraction of unlabeled training rows
# (those with the lowest GMM score) that are kept as negatives. Each candidate
# classifier is judged by the modularity and size of the graph obtained after
# adding its predicted-positive unlabeled edges to G0.
best_mod = -0.5
best_clf = None
max_size = 0
best_X_train_new = None
best_y_train_new = None

# The GMM scores of the unlabeled rows and their ranking do not depend on
# beta, so they are computed once instead of twice per iteration.
gm_un_probs = gm.predict_proba(un_X_train)
un_probs = gm_un_probs[:,4]+gm_un_probs[:,5]
sorted_idx = np.argsort(un_probs)
u_size = len(sorted_idx)

for beta in np.arange(0.1,0.2,0.01):
    # Keep only the int(beta*u_size) lowest-scoring unlabeled rows as negatives.
    keep_samples = np.ones_like(un_probs)
    keep_samples[sorted_idx[int(beta*u_size):]] = 0
    neg_X_train = un_X_train[keep_samples==1]
    y_train_new = [1]*len(pos_X_train)+[0]*len(neg_X_train)
    X_train_new = pd.concat([pos_X_train,neg_X_train])

    clf = XGBClassifier(max_depth=6,min_child_weight=2,learning_rate=0.01, n_estimators=100,
                   objective='binary:logistic',scale_pos_weight=len(neg_X_train)//len(pos_X_train)) #sklearn api
    clf.fit(X_train_new, y_train_new)

    preds = clf.predict_proba(samples)[:,1]
    # Plain array rather than a freshly indexed Series, so the mask below is
    # combined with `target` by position and never by index label.
    predictions = (preds>=thr).astype(int)
    G_new = copy.deepcopy(G0)

    # Unlabeled rows that this classifier predicts as positive.
    fp_X_1 = train_1[(target==0)&(predictions==1)]

    for index, row in fp_X_1[['roleid_src','roleid_dst','trade_n_times_mean']].iterrows():
        G_new.add_edge(row['roleid_src'],row['roleid_dst'],weight=row['trade_n_times_mean'])

    print(G_new.number_of_nodes())
    print(G_new.number_of_edges())
    size = G_new.number_of_nodes()

    partition = community.best_partition(G_new)
    print("partition ok")
    num_com = len(set(partition.values()))
    aver_size = size/num_com
    print(num_com)
    if aver_size<2: # too many partitions: communities are almost single nodes
        continue

    mod = community.modularity(partition,G_new)
    print(mod) #modularity
    # Accept a candidate that is clearly better in modularity, or roughly as
    # good (within 0.01) while producing a larger graph.
    if (mod+0.01>best_mod and size>max_size) or mod>best_mod+0.01:
        print(f"The best parameter beta is updated: beta is {beta}.")
        best_mod = mod
        best_clf = clf
        max_size = max(max_size,size)
        # Remember the training set this classifier was fitted on.
        best_X_train_new = X_train_new
        best_y_train_new = y_train_new

# Leave X_train_new / y_train_new pointing at the data of the selected
# classifier (not at the last beta tried), so that train metrics computed
# afterwards with best_clf refer to its own training set.
if best_clf is not None:
    X_train_new = best_X_train_new
    y_train_new = best_y_train_new

print(best_mod)
xgb_gmm = copy.deepcopy(best_clf)


In [None]:
# --- Training-set metrics of the selected classifier (threshold 0.4) ---
train_preds = best_clf.predict_proba(X_train_new)[:, 1]
train_predictions = [int(score >= 0.4) for score in train_preds]

train_accuracy = accuracy_score(y_train_new, train_predictions)
train_precision = precision_score(y_train_new, train_predictions)
train_recall = recall_score(y_train_new, train_predictions)
train_f1 = f1_score(y_train_new, train_predictions)
train_auc = roc_auc_score(y_train_new, train_preds)

for fmt, metric in (
    ("Train Accuary: %.2f%%", train_accuracy * 100.0),
    ("Train Precision: %.2f%%", train_precision * 100.0),
    ("Train Recall: %.2f%%", train_recall * 100.0),
    ("Train f1: %.4f", train_f1),
    ("Train AUC: %.4f", train_auc),
):
    print(fmt % metric)


# --- Held-out test metrics (same threshold) ---
preds = best_clf.predict_proba(X_test)[:, 1]
predictions = [int(score >= 0.4) for score in preds]

test_accuracy = accuracy_score(y_test, predictions)
test_precision = precision_score(y_test, predictions)
test_recall = recall_score(y_test, predictions)
test_f1 = f1_score(y_test, predictions)
test_auc = roc_auc_score(y_test, preds)

for fmt, metric in (
    ("Test Accuracy: %.2f%%", test_accuracy * 100.0),
    ("Test Precision: %.2f%%", test_precision * 100.0),
    ("Test Recall: %.2f%%", test_recall * 100.0),
    ("Test f1: %.4f", test_f1),
    ("Test AUC: %.4f", test_auc),
):
    print(fmt % metric)


# --- Recall on `new_pos_X`, scored against an all-ones reference ---
preds = best_clf.predict_proba(new_pos_X)[:, 1]
predictions = [int(score >= 0.4) for score in preds]
all_positive = [1] * len(predictions)
new_test_recall = recall_score(all_positive, predictions)
print("New Test Recall: %.2f%%" % (new_test_recall * 100.0))  # recall_new

## 7.6 PE

In [None]:
# Stage 1: fit an initial classifier on the original training labels, with the
# positive class weighted by the unlabeled/positive count ratio.
bst = XGBClassifier(
    max_depth=6,
    min_child_weight=2,
    learning_rate=0.01,
    n_estimators=100,
    objective='binary:logistic',
    scale_pos_weight=len(un_X_train) // len(pos_X_train),
)  # sklearn api
bst.fit(X_train, y_train)

# Proportions handed to rank_inputs; chosen the same way as for C-CRE.
lamda1 = 1.0
lamda2 = 0.3

# Stage 2: split the unlabeled training rows at the 0.4 score threshold and
# let rank_inputs pick the rows to keep from each side (rows flagged 0 are used).
preds = bst.predict_proba(un_X_train)[:, 1]

fp_X_train = un_X_train[preds >= 0.4]
keep_samples = rank_inputs(bst, fp_X_train, lamda1)
rp_X_train = fp_X_train[keep_samples == 0]

tn_X_train = un_X_train[preds < 0.4]
keep_samples = rank_inputs(bst, tn_X_train, 1 - lamda2)
rn_X_train = tn_X_train[keep_samples == 0]

# Stage 3: refit on known positives + selected pseudo-positives (label 1)
# and selected negatives (label 0).
n_label_1 = len(pos_X_train) + len(rp_X_train)
n_label_0 = len(rn_X_train)
X_train_new = pd.concat([pos_X_train, rp_X_train, rn_X_train], axis=0)
y_train_new = pd.DataFrame([1] * n_label_1 + [0] * n_label_0).iloc[:, 0]

bst.fit(X_train_new, y_train_new)


In [None]:
# Recall of the refitted model on `new_pos_X`: every row is scored against an
# all-ones reference, using 0.4 as the decision threshold.
preds = bst.predict_proba(new_pos_X)[:, 1]
predictions = [int(score >= 0.4) for score in preds]
all_positive = [1] * len(predictions)
new_test_recall = recall_score(all_positive, predictions)
print("New Test Recall: %.2f%%" % (new_test_recall * 100.0))

In [None]:
# Unlabeled (target == 0) and positive (target > 0) views of each sample table.
un_X_1 = train_1[target == 0]
un_X = train[target == 0]
un = samples_3[target == 0]
pos_X_1 = train_1[target > 0]
pos_X = train[target > 0]

# Unlabeled rows scored at or above 0.4 by the current model.
preds = bst.predict_proba(un_X)[:, 1]
fp = un[preds >= 0.4]

# Add one weighted edge per such row to a copy of the base graph.
G_new = copy.deepcopy(G0)
for src, dst, weight in zip(fp['roleid_src'], fp['roleid_dst'], fp['trade_n_times_mean']):
    G_new.add_edge(src, dst, weight=weight)

print(G_new.number_of_nodes())
print(G_new.number_of_edges())

# Community partition of the extended graph and its quality.
partition = community.best_partition(G_new)
print("partition ok")

size = len(set(partition.values()))  # number of communities
print(size)
mod = community.modularity(partition, G_new)  # modularity of the partition
print(mod)

## 7.7 PGPU

In [None]:
# Stage 1: fit an initial classifier on the original training labels.
bst = XGBClassifier(max_depth=6,min_child_weight=2,learning_rate=0.01, n_estimators=100,
                   objective='binary:logistic',scale_pos_weight=len(un_X_train)//len(pos_X_train)) #sklearn api


bst.fit(X_train,y_train)

# Unlabeled rows scored at or above 0.4 are treated as pseudo-positives.
preds = bst.predict_proba(un_X_train)[:,1]
fp_X_train = un_X_train[preds>=0.4]

# Choose the reliable negative samples: unlabeled rows scoring below the lowest
# positive-class score of any known positive. predict_proba returns one column
# per class, so only column 1 is used; taking the minimum over the whole array
# would mix in the class-0 column and is not comparable with `preds`.
min_score = np.min(bst.predict_proba(pos_X_train)[:,1])
print(min_score)
# NOTE(review): when min_score exceeds 0.4, a row can satisfy both
# `preds>=0.4` and `preds<min_score` and would then enter the training set
# with both labels -- confirm whether that is intended.
rn_X_train = un_X_train[preds<min_score]

# Stage 2: refit on positives + pseudo-positives (label 1) and reliable
# negatives (label 0).
X_train_new = pd.concat([pos_X_train,fp_X_train,rn_X_train],axis=0)
y_train_new = pd.DataFrame([1]*(len(pos_X_train)+len(fp_X_train))+[0]*len(rn_X_train)).iloc[:,0]

bst.fit(X_train_new,y_train_new)

In [None]:
# Recall of the refitted model on `new_pos_X`, scored against an all-ones
# reference; 0.4 is used as the decision threshold.
preds = bst.predict_proba(new_pos_X)[:, 1]
predictions = [int(score >= 0.4) for score in preds]
all_positive = [1] * len(predictions)
new_test_recall = recall_score(all_positive, predictions)
print("New Test Recall: %.2f%%" % (new_test_recall * 100.0))

In [None]:
# Unlabeled (target == 0) and positive (target > 0) views of each sample table.
un_X_1 = train_1[target == 0]
un_X = train[target == 0]
un = samples_3[target == 0]
pos_X_1 = train_1[target > 0]
pos_X = train[target > 0]

# Unlabeled rows scored at or above 0.4 by the current model.
preds = bst.predict_proba(un_X)[:, 1]
fp = un[preds >= 0.4]

# Add one weighted edge per such row to a copy of the base graph.
G_new = copy.deepcopy(G0)
for src, dst, weight in zip(fp['roleid_src'], fp['roleid_dst'], fp['trade_n_times_mean']):
    G_new.add_edge(src, dst, weight=weight)

print(G_new.number_of_nodes())
print(G_new.number_of_edges())

# Community partition of the extended graph and its quality.
partition = community.best_partition(G_new)
print("partition ok")

size = len(set(partition.values()))  # number of communities
print(size)
mod = community.modularity(partition, G_new)  # modularity of the partition
print(mod)

# 8. Case Study

In [None]:
# Split the samples by label and take the profile snapshot of 2021-05-01.
pos_samples = samples_2[samples_2['label'] == 1]
unlabeled_samples = samples_2[samples_2['label'] == 0]
profile_51 = profile[profile['ds'] == '2021-05-01']
print(profile_51.describe())

# Scatter the two main trading features: all positives against the first
# 5000 unlabeled rows.
font_size_1 = 25
font_size_2 = 20
sns.set(style='ticks', context='notebook', font_scale=1.2)
unlabeled_head = unlabeled_samples.head(5000)
for frame, legend_label, colour in (
    (pos_samples, 'Positive', 'r'),
    (unlabeled_head, 'Unlabeled', 'g'),
):
    plt.scatter(frame['diff_price_mean'], frame['ts_between_mean'],
                label=legend_label, s=2, c=colour)
plt.xticks(fontsize=font_size_2)
plt.yticks(fontsize=font_size_2)
plt.legend(fontsize=font_size_1)
plt.xlabel("Mean of Item Price Gap", fontsize=font_size_1)
plt.ylabel("Mean of Interval (second)", fontsize=font_size_1)
plt.savefig("./pictures_tmp/labels_x.pdf", bbox_inches='tight')
plt.show()

In [None]:
# the central node of the biggest community
target_id = '7bbd84581e618d1d3cecc2ea25410254'
tmp_1 = samples_2[samples_2['roleid_src']==target_id]
tmp_2 = samples_2[samples_2['roleid_dst']==target_id]
tp_fp = pd.read_csv('data_gnn/test_fp_tp_classes_5.csv')[['Id']]
tmp_1 = tp_fp.merge(tmp_1,left_on='Id',right_on='roleid_dst')
tmp_2 = tp_fp.merge(tmp_2,left_on='Id',right_on='roleid_src')
print(tmp_1)
print(tmp_2)
bins=np.arange(-10000,2000,100)
plt.hist(tmp_1['net_money_y_mean'],bins,color='b',alpha=0.5)
sns.distplot(tmp_1['net_money_y_mean'],color='b',kde=False, norm_hist=False)
plt.show()
sns.distplot(tmp_2['net_money_x_mean'],color='b',kde=False, norm_hist=False)
plt.show()
sns.distplot(node_money['net_money'],color='b',kde=False, norm_hist=False)

In [None]:
#determine the role of edge nodes using player profile (3 types)
sns.set(style='ticks',context='notebook',font_scale=1.2)
i = 1
bins_dict = {'level1':np.arange(-1,1,0.01),'score2':np.arange(-1,1,0.02),'online_time':np.arange(0,86400,1000)}
font_size_1 = 25
font_size_2 = 20
#for col in ['level1','score2','online_time']:
for col in ['score2','online_time']:
    bins = bins_dict[col]

    plt.hist(profile_51[col],bins,color='g',alpha=0.5)
    if col=='online_time':
        plt.xlabel('Weekly Online Time (second)',fontsize = font_size_1)
    elif col=='score2':
        plt.xlabel('Character score',fontsize = font_size_1)
    plt.ylabel('Frequency',fontsize = font_size_1)
    plt.xticks(fontsize = font_size_2)
    plt.yticks(fontsize = font_size_2)
    plt.legend(["All Players"],fontsize = font_size_1)
    plt.savefig("./pictures_tmp/profile_"+str(i)+".pdf",bbox_inches = 'tight')
    i += 1
    plt.show()

    plt.hist(tmp_1[col+'_y_mean'],bins,color='b',alpha=0.5)
    if col=='online_time':
        plt.xlabel('Weekly Online Time (second)',fontsize = font_size_1)
    elif col=='score2':
        plt.xlabel('Character score',fontsize = font_size_1)
    plt.ylabel('Frequency',fontsize = font_size_1)
    plt.xticks(fontsize = font_size_2)
    plt.yticks(fontsize = font_size_2)
    plt.legend(["Profitable Edge Players"],fontsize = font_size_1)
    plt.savefig("./pictures_tmp/profile_"+str(i)+".pdf",bbox_inches = 'tight')
    i += 1
    plt.show()

    plt.hist(tmp_2[col+'_x_mean'],bins,color='r',alpha=0.5)
    if col=='online_time':
        plt.xlabel('Weekly Online Time (second)',fontsize = font_size_1)
    elif col=='score2':
        plt.xlabel('Character score',fontsize = font_size_1)
    plt.ylabel('Frequency',fontsize = font_size_1)
    plt.xticks(fontsize = font_size_2)
    plt.yticks(fontsize = font_size_2)
    plt.legend(["Losing Edge Players"],fontsize = font_size_1)
    plt.savefig("./pictures_tmp/profile_"+str(i)+".pdf",bbox_inches = 'tight')
    i += 1
    plt.show()