In [283]:
import numpy as np
from sklearn.cluster import k_means
from sklearn.metrics import log_loss, roc_auc_score
import pickle
import pandas as pd
from sklearn.utils import shuffle
def read_part_data(part_name,
                   data_dir='/data/LeiLixiang/pycharmproject/Fi_GNN/data/Criteo/',
                   features_min_path='/data/LeiLixiang/pycharmproject/DeepCTR-Torch/data/features_min.pkl'):
    """Load one data part into a single DataFrame of features plus label.

    Three arrays are read from ``<data_dir>/<part_name>/``:

    * ``train_x2.npy`` -- supplies the dense columns ``I1`` .. ``I13``
    * ``train_i.npy``  -- supplies the sparse columns ``C1`` .. ``C26``
    * ``train_y.npy``  -- supplies the ``label`` column

    Both feature arrays are labelled with all 39 column names, but only the
    dense columns of the first and the sparse columns of the second are kept.
    Each sparse column is then shifted by subtracting ``features_min[feat]``,
    where ``features_min`` is the mapping unpickled from ``features_min_path``.

    Parameters
    ----------
    part_name : str
        Name of the sub-directory of ``data_dir`` to read (e.g. ``'part2'``).
    data_dir : str, optional
        Directory containing the part sub-directories. Defaults to the
        location that was previously hard-coded.
    features_min_path : str, optional
        Pickle file holding a mapping from sparse feature name to the value
        subtracted from that column. Defaults to the previously hard-coded
        location.

    Returns
    -------
    pandas.DataFrame
        Columns ``I1..I13``, ``C1..C26`` and ``label``, in that order.
    """
    import os  # local import keeps this block self-contained

    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    features = dense_features + sparse_features

    # NOTE(security): pickle.load can execute arbitrary code -- only point
    # features_min_path at files you trust.
    # The context manager closes the handle, which the previous version leaked.
    with open(features_min_path, 'rb') as pkl_file:
        features_min = pickle.load(pkl_file)

    part_dir = os.path.join(data_dir, part_name)
    x = pd.DataFrame(np.load(os.path.join(part_dir, 'train_x2.npy')), columns=features)
    index = pd.DataFrame(np.load(os.path.join(part_dir, 'train_i.npy')), columns=features)
    y = pd.DataFrame(np.load(os.path.join(part_dir, 'train_y.npy')), columns=['label'])

    # Dense values come from the x array, sparse values from the index array.
    x = pd.concat([x[dense_features], index[sparse_features]], axis=1)

    for feat in sparse_features:
        x[feat] -= features_min[feat]

    return pd.concat([x, y], axis=1)

def read_label(part_name):
    """Return the array stored in ``label_<part_name>.npy``.

    The file name carries no directory component, so it is looked up relative
    to the current working directory.
    """
    label_path = 'label_{}.npy'.format(part_name)
    return np.load(label_path)


def get_position(series, threshold):
    """Split rows by how often their value occurs in ``series``.

    Parameters
    ----------
    series : pandas.Series
        Column of values to analyse.
    threshold : int
        A value counts as frequent when it occurs strictly more than
        ``threshold`` times in ``series``.

    Returns
    -------
    (pandas.Series, pandas.Series)
        ``position`` is a boolean mask that is True for rows whose value is
        frequent; ``position2`` is its element-wise negation. Both masks carry
        the index of ``series`` so they stay aligned with it.
    """
    counts = series.value_counts()
    frequent_values = counts[counts > threshold].index

    # A single vectorised membership test replaces OR-ing one full-column
    # comparison per distinct value, which scaled with (#values x #rows).
    # Re-using series.index (rather than a fresh RangeIndex) keeps the mask
    # aligned when the caller's index is not 0..n-1.
    position = pd.Series(series.isin(frequent_values).values, index=series.index)
    position2 = ~position
    return position, position2

def get_auc_loss(position, position2, data_pre):
    """Score two row subsets of ``data_pre`` with AUC and log loss.

    ``position`` and ``position2`` are used to index ``data_pre``; for each
    resulting subset the ``label`` column is compared against the
    ``label_pre`` column.

    Returns ``((auc1, auc2), (loss1, loss2))`` where index 1 refers to the
    ``position`` subset and index 2 to the ``position2`` subset. Every number
    is rounded to 4 decimal places.
    """
    subsets = [data_pre[mask] for mask in (position, position2)]

    aucs = tuple(
        round(roc_auc_score(part['label'].values, part['label_pre'].values), 4)
        for part in subsets
    )
    losses = tuple(
        round(log_loss(part['label'].values, part['label_pre'].values), 4)
        for part in subsets
    )
    return aucs, losses


def get_auc_loss2(series, threshold1, threshold2, data_pre):
    """AUC and log loss on rows whose value frequency lies in an open band.

    Parameters
    ----------
    series : pandas.Series
        Column whose value frequencies select the rows. It must have one
        entry per row of ``data_pre``, in the same order.
    threshold1, threshold2 : int
        A value is selected when its number of occurrences in ``series`` is
        strictly greater than ``threshold1`` and strictly less than
        ``threshold2``.
    data_pre : pandas.DataFrame
        Frame providing the ``label`` and ``label_pre`` columns.

    Returns
    -------
    (float, float)
        ``(auc, loss)`` on the selected rows, each rounded to 4 decimals.
    """
    counts = series.value_counts()
    selected_values = counts[(counts > threshold1) & (counts < threshold2)].index

    # One vectorised membership test instead of one column scan per value.
    # The mask is applied positionally (.values) so it does not depend on
    # how series / data_pre happen to be indexed.
    mask = series.isin(selected_values).values
    data_tmp = data_pre[mask]

    auc = round(roc_auc_score(data_tmp['label'].values, data_tmp['label_pre'].values), 4)
    loss = round(log_loss(data_tmp['label'].values, data_tmp['label_pre'].values), 4)
    return auc, loss

def get_auc_list(series, data_pre, interval_num):
    """Compute AUC and log loss per frequency-ordered bin of values.

    The distinct values of ``series`` are ordered by ascending occurrence
    count and cut into ``interval_num`` consecutive bins of
    ``int(n_values / interval_num) + 1`` values each (trailing bins may hold
    fewer values, or none). For every bin, the rows of ``data_pre`` whose
    value falls in the bin are scored.

    Parameters
    ----------
    series : pandas.Series
        Column whose values are binned. It must have one entry per row of
        ``data_pre``, in the same order.
    data_pre : pandas.DataFrame
        Frame providing the ``label`` and ``label_pre`` columns.
    interval_num : int
        Number of bins.

    Returns
    -------
    (list, list)
        ``(auc_list, loss_list)``, one entry per bin, rounded to 4 decimals.

    Notes
    -----
    A bin that selects no rows is passed to the sklearn metrics unchanged;
    how they react to that is up to sklearn.
    """
    auc_list = []
    loss_list = []
    if interval_num <= 0:
        # range(interval_num) would be empty: nothing to score.
        return auc_list, loss_list

    counts = series.value_counts().sort_values()
    feat_num = counts.shape[0]
    # Bin width does not change between iterations, so compute it once.
    interval = int(feat_num / interval_num) + 1

    for bin_idx in range(interval_num):
        # Slice the index positionally; the labels themselves may be integers.
        bin_values = counts.index[bin_idx * interval:(bin_idx + 1) * interval]
        # Single vectorised membership test instead of one scan per value.
        mask = series.isin(bin_values).values
        data_tmp = data_pre[mask]
        auc = round(roc_auc_score(data_tmp['label'].values, data_tmp['label_pre'].values), 4)
        loss = round(log_loss(data_tmp['label'].values, data_tmp['label_pre'].values), 4)
        auc_list.append(auc)
        loss_list.append(loss)
    return auc_list, loss_list

In [12]:
# Load one data part together with the scores stored for the same part.
part_name = 'part2'  # single source of truth so features and scores cannot diverge
data = read_part_data(part_name)
label_pre = pd.DataFrame(read_label(part_name), columns=['label_pre'])
# concat(axis=1) aligns on the index and would silently pad with NaN if the
# two inputs had different lengths, so fail loudly instead.
assert len(label_pre) == len(data), 'label_pre and data have different numbers of rows'
data_pre = pd.concat([data, label_pre], axis=1)

In [215]:
# Column whose value frequencies drive the breakdowns computed below.
series = data_pre['C21']

In [237]:
# Masks for rows whose C21 value occurs more than 100 times (position)
# and for all remaining rows (position2).
position, position2 = get_position(series, 100)

In [238]:
# NOTE: despite its name, `auc` holds ((auc1, auc2), (loss1, loss2)):
# AUC and log loss for the `position` rows (1) and the `position2` rows (2).
auc = get_auc_loss(position, position2, data_pre)

In [239]:
# Display ((auc1, auc2), (loss1, loss2)) as returned by get_auc_loss.
auc

((0.8005, 0.8055), (0.4482, 0.453))

In [241]:
# (AUC, log loss) on rows whose C21 value occurs more than 1000 and fewer than 1000000 times.
get_auc_loss2(series, 1000, 1000000, data_pre)

(0.7987, 0.4491)

In [291]:
# Per-bin AUC / log loss over 40 bins of C21 values ordered by ascending occurrence count.
auc_list, loss_list = get_auc_list(series, data_pre, 40)

In [294]:
# Print the slice of the ascending value-count table that makes up each of
# the 40 bins, using the same bin width as get_auc_list(series, data_pre, 40).
counts = series.value_counts().sort_values()
interval = counts.shape[0] // 40 + 1
for i in range(40):
    lo = i * interval
    print(counts.iloc[lo:lo + interval])

67316     1
116338    1
169478    1
149775    1
166167    1
         ..
72990     1
130127    1
117833    1
113735    1
118934    1
Name: C21, Length: 3784, dtype: int64
131307    1
115191    1
171043    1
143597    1
164091    1
         ..
161318    1
140844    1
76060     1
55586     1
132647    1
Name: C21, Length: 3784, dtype: int64
153122    1
116992    1
166248    1
144929    1
59685     1
         ..
98845     1
61519     1
93528     1
97218     1
155573    1
Name: C21, Length: 3784, dtype: int64
60847     1
147529    1
74249     1
142653    1
97690     1
         ..
131278    1
53396     1
77172     1
117856    1
122218    1
Name: C21, Length: 3784, dtype: int64
118120    1
164061    1
82085     1
81270     1
127441    1
         ..
152391    1
160579    1
64399     1
146376    1
58128     1
Name: C21, Length: 3784, dtype: int64
53098     1
138188    1
166878    1
25344     1
164699    1
         ..
119981    1
76377     1
74931     1
96851     1
144593    1
Name: C21, Length:

In [298]:
# Mean AUC over the first 32 entries of auc_list, i.e. the bins holding the
# least frequent values (bins are ordered by ascending occurrence count).
sum(auc_list[0:32])/32

0.8022874999999999