In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

from product_nets_master.python import utils
from product_nets_master.python.models import LR, FM, PNN1, PNN2, FNN, CCPM

import os
import sys
# Make the parent of the current working directory importable.
# abspath('main.py') is resolved against the cwd (the file does not need to
# exist); the two dirname() calls strip the file name and then the cwd's own
# directory name.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('main.py'))))
import handset_model_current as handset_model # used for OverSamplingBatchGenerator; later cells also read handset_model.LABEL_COL

from tqdm import tqdm
import pickle

import os
# Set TensorFlow's native-log verbosity variable to '2'.
# NOTE(review): this assignment runs after the model imports above, and the
# cell output already shows "Using TensorFlow backend." -- confirm whether it
# has to be set before those imports to have any effect.
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

Using TensorFlow backend.


In [2]:
# Load and format the data through handset_model; utils.read_data() fills the
# module-level globals in utils (SX_TRAIN, Y_TRAIN, FIELD_SIZES, ...) that the
# following cells read.
# NOTE: the chdir is relative, so re-running this cell without restarting the
# kernel moves one more directory up each time.
os.chdir(os.pardir)
print("current directory: %s" % os.getcwd())
args = utils.setArgs()  # use split_id=2
utils.read_data(args)

current directory: /Users/ChentianJiang/dev/Telenor_handset_model
reading and formatting data (with handset_model)...
generating dictionary with levels of catagorical variables...
Reusing data split with id=2
Loading previously pre-processed numerical data...
Loading previously pre-processed categorical data...
SX_TRAIN:  <class 'scipy.sparse.csr.csr_matrix'> (466632, 235)
Y_TRAIN:  <class 'numpy.ndarray'> (466632,)
SX_TEST:  <class 'scipy.sparse.csr.csr_matrix'> (116659, 235)
Y_TEST:  <class 'numpy.ndarray'> (116659,)
FIELD_SIZES:  [7, 7, 25, 17, 8, 16, 77, 78]
FIELD_OFFSETS:  [0, 7, 14, 39, 56, 64, 80, 157]
INPUT_DIM:  235
INPUT_DIM_NUM:  7


In [3]:
# Dense train/test dictionaries produced by handset_model (keys used later in
# this notebook: 'cat', 'num', 'labels').
data_train_dict = utils.DATA_TRAIN_DICT
data_test_dict = utils.DATA_TEST_DICT

# (features, labels) pairs; the X part holds the categorical data only.
train_data = utils.SX_TRAIN, utils.Y_TRAIN
train_data = utils.shuffle(train_data)
test_data = utils.SX_TEST, utils.Y_TEST

# Fail loudly on malformed labels. The previous print + exit(0) would shut the
# notebook kernel down while reporting a *successful* exit status.
if train_data[1].ndim > 1:
    raise ValueError('label must be 1-dim')
print('read finish')

# These sizes describe the categorical data only.
train_size = train_data[0].shape[0]
test_size = test_data[0].shape[0]
num_feas = len(utils.FIELD_SIZES)
field_sizes = utils.FIELD_SIZES
field_offsets = utils.FIELD_OFFSETS
input_dim = utils.INPUT_DIM
# NOTE: relative chdir -- this cell is not safe to re-run without first
# re-running the cell that moved to the project root.
os.chdir("pnn")
print("current directory: %s" % (os.getcwd()))

# SETTINGS
# Rounds are kept small so that experiments do not take too long.
min_round = 1
num_round = 20
early_stop_round = 3  # look-back window of the convergence check
batch_size = 256

algo = 'pnn1'
ratio = 3

# When True the numerical features are fed to the model next to the
# categorical ones.
include_num = True

read finish
current directory: /Users/ChentianJiang/dev/Telenor_handset_model/pnn


In [5]:
# Pull the categorical / numerical feature frames and the label column out of
# the training dictionary, then display their shapes as a sanity check.
X_train_cat, X_train_num = data_train_dict['cat'], data_train_dict['num']
y_train = data_train_dict['labels'][handset_model.LABEL_COL]

tuple(part.shape for part in (X_train_cat, X_train_num, y_train))

((466632, 235), (466632, 7), (466632,))

In [6]:
# Group the categorical rows by their label value so each class can be
# selected separately in the next cell.
group = X_train_cat.groupby(by=y_train)

In [7]:
# Split every training frame into the label-0 and label-1 rows.
#
# `GroupBy.indices` maps each group key to integer *positions*, not index
# labels, so the rows must be taken with .iloc. The previous .loc lookup was
# only correct while the frames carried a default 0..n-1 index.
# NOTE(review): assumes X_train_num and y_train are row-aligned with
# X_train_cat (same order) -- confirm against how DATA_TRAIN_DICT is built.
pos_0 = group.indices[0]
pos_1 = group.indices[1]

X_train_cat_0 = X_train_cat.iloc[pos_0, :]
X_train_cat_1 = X_train_cat.iloc[pos_1, :]

X_train_num_0 = X_train_num.iloc[pos_0, :]
X_train_num_1 = X_train_num.iloc[pos_1, :]

y_train_0 = y_train.iloc[pos_0]
y_train_1 = y_train.iloc[pos_1]

X_train_cat_0.shape, X_train_cat_1.shape, X_train_num_0.shape, X_train_num_1.shape, y_train_0.shape, y_train_1.shape

((464306, 235), (2326, 235), (464306, 7), (2326, 7), (464306,), (2326,))

In [8]:
# Cut the label-0 rows into n_subset contiguous [start, stop) position ranges,
# keyed 1..n_subset. All ranges have `size` rows except the last one, which
# also absorbs the `remainder` rows left over by the integer division.
# NOTE(review): the output saved under this cell lists 30 ranges, so it was
# produced with a different n_subset than the value set here; re-run the cell
# to refresh it.
n_subset = 20
size, remainder = divmod(y_train_0.shape[0], n_subset)

MJ_subset_dex = {}
for k in range(1, n_subset + 1):
    start = (k - 1) * size
    stop = start + size
    if k == n_subset:
        stop += remainder
    MJ_subset_dex[k] = (start, stop)

MJ_subset_dex

{1: (0, 15476),
 2: (15476, 30952),
 3: (30952, 46428),
 4: (46428, 61904),
 5: (61904, 77380),
 6: (77380, 92856),
 7: (92856, 108332),
 8: (108332, 123808),
 9: (123808, 139284),
 10: (139284, 154760),
 11: (154760, 170236),
 12: (170236, 185712),
 13: (185712, 201188),
 14: (201188, 216664),
 15: (216664, 232140),
 16: (232140, 247616),
 17: (247616, 263092),
 18: (263092, 278568),
 19: (278568, 294044),
 20: (294044, 309520),
 21: (309520, 324996),
 22: (324996, 340472),
 23: (340472, 355948),
 24: (355948, 371424),
 25: (371424, 386900),
 26: (386900, 402376),
 27: (402376, 417852),
 28: (417852, 433328),
 29: (433328, 448804),
 30: (448804, 464306)}

In [9]:
# Per-phase iteration budgets read by the training loop:
#   t1 -- passes on "MN" and on "MN U MJ subset"
#   t2 -- passes on a single "MJ subset"
#   t3 -- passes on the final "MN U MJ" step
t1, t2, t3 = 10, 7, 10

In [10]:
# Build the model selected by `algo`. Only 'pnn1' has a parameter set in this
# notebook; any other value now fails with an explicit error instead of the
# NameError that the unguarded PNN1(**pnn1_params) call used to raise.
if algo == 'pnn1':
    pnn1_params = {
        'layer_sizes': [field_sizes, 10, 1],
        'layer_acts': ['tanh', 'none'],
        'drop_out': [0, 0],
        'opt_algo': 'gd',
        # lr: changed from 0.1 to 0.01 because 0.1 is too large
        # --> diverges for cat+num data!
        'learning_rate': 0.01,
        'layer_l2': [0, 0],
        'kernel_l2': 0,
        'random_seed': 0,
        'include_num': include_num,
    }
    model = PNN1(**pnn1_params)
else:
    raise ValueError("unsupported algo %r: only 'pnn1' is configured here" % (algo,))

# Convert both data sets with utils.split_data before they are fed to the model.
# NOTE: these rebind train_data/test_data to the converted form, so running
# this cell twice would convert already-converted data.
train_data = utils.split_data(train_data)
test_data = utils.split_data(test_data)

In [11]:
# Give every per-class frame/series a fresh 0..n-1 index so that the positional
# ranges in MJ_subset_dex line up with the rows.
X_train_cat_0 = X_train_cat_0.reset_index(drop=True)
X_train_cat_1 = X_train_cat_1.reset_index(drop=True)

X_train_num_0 = X_train_num_0.reset_index(drop=True)
X_train_num_1 = X_train_num_1.reset_index(drop=True)

y_train_0 = y_train_0.reset_index(drop=True)
y_train_1 = y_train_1.reset_index(drop=True)

In [12]:
def _to_sparse_triplet(dense):
    """Describe the entries of `dense` that equal 1 as (indices, values, shape).

    `indices` is an (n, 2) int32 array of [row, col] pairs in row-major order,
    `values` is a vector of n ones and `shape` is `dense.shape`. Vectorised
    replacement for the per-element Python list building used before.
    Expects a 2-D numpy array.
    """
    indices = np.argwhere(dense == 1).astype('int32')
    values = np.ones(len(indices), dtype=int)
    return indices, values, dense.shape


def _predict_prob(split_data, num_frame):
    """Evaluate model.y_prob on `split_data`, passing the numerical features
    from `num_frame` as well when `include_num` is set."""
    X_cat = utils.slice(split_data)[0]
    if include_num:
        return model.run(model.y_prob, X=X_cat, X_num=np.array(num_frame))
    return model.run(model.y_prob, X_cat)


# Phase index `j` selects the training input of each outer pass:
#   j == 0            -> minority rows only ("MN")
#   1 <= j < n_subset -> majority subset j, or MN U subset j when `union` is set
#   j == n_subset     -> MN U all majority rows ("last step")
# The loop ends once j exceeds n_subset.
print("training model with %s" % (algo))
union = False
j = 0
fetches = [model.optimizer, model.loss]
while j < (n_subset + 1):
    history_score = []
    ls = []
    if j == 0:
        union = False
        print("\ninput: MN")
        t = t1
        n_cat = X_train_cat_1.values
        n_num = X_train_num_1.values
        y_i = y_train_1.values.astype(int)
    elif j == n_subset:
        print("\ninput: MN U MJ (last step %d)" % (j))
        t = t3
        n_cat = pd.concat([X_train_cat_1, X_train_cat_0], ignore_index=True).values
        n_num = pd.concat([X_train_num_1, X_train_num_0], ignore_index=True).values
        y_i = np.concatenate([y_train_1.values, y_train_0.values]).astype(int)
    else:
        dex_start, dex_end = MJ_subset_dex[j]
        # Positional slices of the majority-class rows for this subset.
        cat_part = X_train_cat_0.iloc[dex_start:dex_end, :]
        num_part = X_train_num_0.iloc[dex_start:dex_end, :]
        y_part = y_train_0.iloc[dex_start:dex_end]
        if union:
            t = t1
            print("\ninput: MN U MJ subset %d" % (j))
            n_cat = pd.concat([X_train_cat_1, cat_part], ignore_index=True).values
            n_num = pd.concat([X_train_num_1, num_part], ignore_index=True).values
            y_i = np.concatenate([y_train_1.values, y_part.values]).astype(int)
            union = False
        else:
            t = t2
            print("\ninput: MJ subset %d" % (j))
            n_cat = cat_part.values
            n_num = num_part.values
            y_i = y_part.values.astype(int)

    # Field-based models take one sparse triplet per field; the others take a
    # single triplet for the whole categorical matrix.
    if algo in {'fnn', 'ccpm', 'pnn1', 'pnn2'}:
        # split_data_gen: slight modification of utils.split_data
        X_i = [_to_sparse_triplet(f) for f in utils.split_data_gen(n_cat)]
    else:
        X_i = _to_sparse_triplet(n_cat)

    X_i_num = n_num if include_num else None

    print("t=%d" % (t))
    for i in range(t):
        # One optimisation step on the whole current input.
        _, l = model.run(fetches, X_i, y_i, X_i_num)
        ls.append(l)

        test_preds = _predict_prob(test_data, utils.DATA_TEST_DICT['num'])
        p_test = precision_score(test_data[1], np.rint(test_preds))
        history_score.append(p_test)  # score in terms of eval precision

        # Early stopping is only considered after the warm-up rounds
        # (prevents "convergence" at i=0).
        if i > min_round and i > early_stop_round:
            # Converged: the best score lies exactly early_stop_round passes
            # back and the score has not improved since.
            if (np.argmax(history_score) == i - early_stop_round) and \
                    (history_score[i] - history_score[i - early_stop_round] < 1e-5):
                print('early stop\nbest iteration:\n[%d]\teval-precision: %f' % \
                      (np.argmax(history_score), np.max(history_score)))
                print("aka converged")
                j += 1
                break

            elif i == t - 1:
                print("reached t=%d" % (t))
                if j == 0:
                    j += 1
                elif j == n_subset:
                    j = 0
                else:
                    # NOTE(review): `union` is always False at this point (it is
                    # cleared at j == 0 and as soon as the union branch above is
                    # entered), so the `if union` arm cannot run and every
                    # non-converged subset pass steps j back by one. The saved
                    # output shows the resulting MN <-> "MJ subset 1"
                    # alternation. The intended schedule needs confirming
                    # before this is changed; behaviour is kept as-is.
                    if union:
                        j += 1
                        union = False
                    else:
                        union = True
                        j -= 1

    # Full train/test evaluation after each outer pass.
    # NOTE(review): train_data went through utils.shuffle earlier while
    # DATA_TRAIN_DICT['num'] did not; if that shuffle reorders rows, the
    # categorical and numerical rows are misaligned for the train metrics --
    # confirm.
    train_preds = _predict_prob(train_data, utils.DATA_TRAIN_DICT['num'])
    test_preds = _predict_prob(test_data, utils.DATA_TEST_DICT['num'])

    roc_auc_train = roc_auc_score(train_data[1], train_preds)
    roc_auc_test = roc_auc_score(test_data[1], test_preds)

    # Hard 0/1 predictions for the threshold-based metrics.
    train_labels_pred = np.rint(train_preds)
    test_labels_pred = np.rint(test_preds)

    a_train = accuracy_score(train_data[1], train_labels_pred)
    a_test = accuracy_score(test_data[1], test_labels_pred)

    p_train = precision_score(train_data[1], train_labels_pred)
    p_test = precision_score(test_data[1], test_labels_pred)

    r_train = recall_score(train_data[1], train_labels_pred)
    r_test = recall_score(test_data[1], test_labels_pred)

    m_train = confusion_matrix(train_data[1], train_labels_pred)
    m_test = confusion_matrix(test_data[1], test_labels_pred)
    true_pos_rate_train = m_train[1][1]/(m_train[1][1]+m_train[1][0])
    true_pos_rate_test = m_test[1][1]/(m_test[1][1]+m_test[1][0])

    print('loss (with l2 norm):%f' % (np.mean(ls)))
    print('train-auc: %f\teval-auc: %f' % (roc_auc_train, roc_auc_test))
    print('train-accuracy: %f\teval-accuracy: %f' % (a_train, a_test))
    print('train-precision: %f\teval-precision: %f' % (p_train, p_test))
    print('train-recall: %f\teval-recall: %f' % (r_train, r_test))

    print('train-confusion-matrix:\n', m_train)
    print('test-confusion-matrix:\n', m_test)
    print('train-true-pos-rate: %f\teval-true-pos-rate: %f' % (true_pos_rate_train, true_pos_rate_test))

print("finished")

training model with pnn1

input: MN
t=10
reached t=10
loss (with l2 norm):6.847435
train-auc: 0.500000	eval-auc: 0.500000
train-accuracy: 0.004985	eval-accuracy: 0.004980
train-precision: 0.004985	eval-precision: 0.004980
train-recall: 1.000000	eval-recall: 1.000000
train-confusion-matrix:
 [[     0 464306]
 [     0   2326]]
test-confusion-matrix:
 [[     0 116078]
 [     0    581]]
train-true-pos-rate: 1.000000	eval-true-pos-rate: 1.000000

input: MJ subset 1
t=7


  'precision', 'predicted', average, warn_for)


reached t=7
loss (with l2 norm):1182.439575
train-auc: 0.500000	eval-auc: 0.500000
train-accuracy: 0.995015	eval-accuracy: 0.995020
train-precision: 0.000000	eval-precision: 0.000000
train-recall: 0.000000	eval-recall: 0.000000
train-confusion-matrix:
 [[464306      0]
 [  2326      0]]
test-confusion-matrix:
 [[116078      0]
 [   581      0]]
train-true-pos-rate: 0.000000	eval-true-pos-rate: 0.000000

input: MN
t=10
early stop
best iteration:
[1]	eval-precision: 0.004980
aka converged
loss (with l2 norm):1424.757935
train-auc: 0.500000	eval-auc: 0.500000
train-accuracy: 0.004985	eval-accuracy: 0.004980
train-precision: 0.004985	eval-precision: 0.004980
train-recall: 1.000000	eval-recall: 1.000000
train-confusion-matrix:
 [[     0 464306]
 [     0   2326]]
test-confusion-matrix:
 [[     0 116078]
 [     0    581]]
train-true-pos-rate: 1.000000	eval-true-pos-rate: 1.000000

input: MJ subset 1
t=7
reached t=7
loss (with l2 norm):196.252594
train-auc: 0.500188	eval-auc: 0.494502
train-ac

KeyboardInterrupt: 