In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys
import tempfile

from six.moves import urllib

import pandas as pd
import tensorflow as tf
import numpy as np

import pickle

# file
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'  # get rid of some tf warnings
os.chdir("..")

# IMPORTANT:
import handset_model_current as handset_model  # set F_CLASSIF=True in file
# use the following code in preprocess_data()
#  elif cat_encoding == 'one-hot':
#     df_cat = df[CATEGORICAL_COLS]
#     # instead of df_cat = _one_hot_encode(df, cat_levels, binary_enc=binary_enc)


os.chdir("wide&deep")

Using TensorFlow backend.


In [2]:
def bool_arg(string):
    value = string.lower()
    if value == 'true':
        return True
    elif value == 'false':
        return False
    else:
        raise argparse.ArgumentTypeError("Expected True or False, but got {}".format(string))

# minimal preprocessing
parser = argparse.ArgumentParser()

# model hyperparameters
# small number of epochs for experimentation
parser.add_argument('--epochs', default=10, type=int,
                    help="Nr of epochs. Default is 100", dest="epochs")
parser.add_argument('--batch_size', default=256, type=int,
                    help="Batch size. Default is 32", dest="batch_size")
parser.add_argument('--earlystop', default=3, type=int,
                    help="Number of epochs with no improvement after which training will be stopped.",
                    dest="earlystop")
parser.add_argument('--verbose', default=True, type=bool_arg, help="If True (default), verbose output",
                    dest="verbose")

# cross_val is not ready to be used
parser.add_argument('--cross_val', default=0, type=int,
                    help="Number of folds (if bigger than 0) to use for cross validation. Default is 0.",
                    dest="cross_val")

# no applying class weights
parser.add_argument('--apply_class_weights', default=False, type=bool_arg,
                    help="If True, apply different loss weights (based on frequency of samples) to different "
                         "classes.",
                    dest="apply_class_weights")

# no smooth factor
parser.add_argument('--smooth_factor', default=0, type=float,
                    help="Smooth factor to be used when calculating class weights, so that highly unfrequent "
                    "classes do not get huge weights.",
                    dest="smooth_factor")

# oversampling with neg to pos ratio=3
parser.add_argument('--oversample', default=True, type=bool_arg,
                    help="If True (default), apply oversampling to generate balanced batches.",
                    dest="oversample")
parser.add_argument('--ratio', default=3, type=int,
                    help="Ratio of negative to positive samples to use for balanced batch generation "
                         "(if oversample=True)",
                    dest="ratio")

# activation: prelu
parser.add_argument('--activation', default='prelu',
                    help="NN activation to be used. Default is prelu",
                    dest="activation")

# no x_vars
parser.add_argument('--x_vars', default=False, type=bool_arg, help="If True (default), include X variables",
                    dest="x_vars")

# standardize numerical data
parser.add_argument('--std', default=True, type=bool_arg, help="If True (default), standardize data.",
                    dest="std")

# no pca
parser.add_argument('--pca_whiten', default=False, type=bool_arg, help="If True (default), PCA-whiten data.",
                    dest="pca_whiten")
parser.add_argument('--pca_reduce', default=0, type=float,
                    help="{0, 1, 0<x<1} If 0, no dimensionality reduction is done. If 1, Thomas P. Minka's method "
                         "('Automatic Choice of Dimensionality for PCA'. NIPS 2000) is used to determine the "
                         "number of dimensions to keep. If 0 < pca_reduce < 1, enough number of dimensions will "
                         "be kept to keep 'pca_reduce' percentage of variance explained. Default is 0.9.",
                    dest="pca_reduce")

# one-hot encode cat data (embeddings are not used)
parser.add_argument('--cat_enc', default='one-hot',
                    help="Encoding to be used for categorical variables. Default is 'integer' "
                         "(embedding layers will then be used). Other alternatives: 'hashing_char', "
                         "'hashing_all', 'one-hot'.",
                    dest="cat_enc")

# no log transform
parser.add_argument('--log_xform', default=False, type=bool_arg, help="If True (default), log-transform data.",
                    dest="log_xform")

# encode categorical and binary data as 1/0
parser.add_argument('--binary_enc', default=True, type=bool_arg,
                    help="If False (default), the negative cases of binary variables will be represented as -1 "
                         "instead of 0.", dest="binary_enc")

# id for saving/ loading
parser.add_argument('--data_split_id', default=5, type=int,
                    help="Id for the train-test data split to be used. If a new id is given, a new data split "
                         "will be generated and saved to disk with the given id. If id is 0 (default), a new "
                         "split will be generated, but not saved to disk. If a previously used id is given, "
                         "a previously generated and saved data split with that id will be used.",
                    dest="data_split_id")
parser.add_argument("-f")
args = parser.parse_args()

In [3]:
os.chdir('..')
data_train, data_test, cat_levels = handset_model.load_and_preprocess_data(args)  # use split_id=5
os.chdir('wide&deep')

generating dictionary with levels of catagorical variables...
Reusing data split with id=5
Loading previously pre-processed numerical data...
Loading previously pre-processed categorical data...


In [4]:
data_train['num'].shape, data_train['cat'].shape  # 2nd dim should be 7 and 8

((466632, 7), (466632, 8))

In [5]:
for k,v in cat_levels.items():
    print(k, len(v))

CU_MAP_SEGMENT 7
CLM_LIVSFASE_SEGMENT 7
CU_U_MAIN_DEV_OS_TYPE 25
CU_U_MAIN_DEV_PRODUCERNAME 17
MPP_DEVICE_OS_TYPE 8
MPP_DEVICE_PRODUCERNAME 16
CU_U_MAIN_DEV_MODELNAME 77
MPP_DEVICE_MODELNAME 78


In [6]:
# wide and deep example:
# https://github.com/tensorflow/tensorflow/blob/r1.3/tensorflow/examples/learn/wide_n_deep_tutorial.py

In [7]:
COLUMNS = handset_model.COLS
len(COLUMNS)  # should be 7+8+1=16

16

In [8]:
COLUMNS.remove(handset_model.LABEL_COL)
len(COLUMNS)

15

In [9]:
LABEL_COLUMN = handset_model.LABEL_COL
CATEGORICAL_COLUMNS = handset_model.CATEGORICAL_COLS
CONTINUOUS_COLUMNS = [i for i in COLUMNS if i not in CATEGORICAL_COLUMNS]

CATEGORICAL_COLUMNS, CONTINUOUS_COLUMNS

(['CU_MAP_SEGMENT',
  'CLM_LIVSFASE_SEGMENT',
  'CU_U_MAIN_DEV_OS_TYPE',
  'CU_U_MAIN_DEV_PRODUCERNAME',
  'MPP_DEVICE_OS_TYPE',
  'MPP_DEVICE_PRODUCERNAME',
  'CU_U_MAIN_DEV_MODELNAME',
  'MPP_DEVICE_MODELNAME'],
 ['CU_AGE',
  'CU_U_NET_REV_AVG_3MO',
  'CU_U_MB_AVG_3MO',
  'MPP_MB_AVG_3MO',
  'MPP_NO_VOICE_DOM_LAST2',
  'MPP_GROSS_PERIODIC_FEE_FULL',
  'MPP_NET_REVENUE'])

In [10]:
def build_estimator(model_dir, model_type):
    """Build an estimator."""
    # Sparse base columns.
    cu_map = tf.contrib.layers.sparse_column_with_keys(
        column_name="CU_MAP_SEGMENT", keys=cat_levels["CU_MAP_SEGMENT"])
    
    clm_livsfase = tf.contrib.layers.sparse_column_with_keys(
      column_name="CLM_LIVSFASE_SEGMENT", keys=cat_levels['CLM_LIVSFASE_SEGMENT'])
    
    cu_os_type = tf.contrib.layers.sparse_column_with_keys(
      column_name="CU_U_MAIN_DEV_OS_TYPE", keys=cat_levels['CU_U_MAIN_DEV_OS_TYPE'])
    
    cu_producername = tf.contrib.layers.sparse_column_with_keys(
      column_name="CU_U_MAIN_DEV_PRODUCERNAME", keys=cat_levels['CU_U_MAIN_DEV_PRODUCERNAME'])
    
    mpp_os_type = tf.contrib.layers.sparse_column_with_keys(
      column_name="MPP_DEVICE_OS_TYPE", keys=cat_levels['MPP_DEVICE_OS_TYPE'])
    
    mpp_producername = tf.contrib.layers.sparse_column_with_keys(
      column_name="MPP_DEVICE_PRODUCERNAME", keys=cat_levels['MPP_DEVICE_PRODUCERNAME'])
    
    cu_modelname = tf.contrib.layers.sparse_column_with_keys(
      column_name="CU_U_MAIN_DEV_MODELNAME", keys=cat_levels['CU_U_MAIN_DEV_MODELNAME'])
    
    mpp_modelname = tf.contrib.layers.sparse_column_with_keys(
      column_name="MPP_DEVICE_MODELNAME", keys=cat_levels['MPP_DEVICE_MODELNAME'])

    # Continuous base columns.
    cu_age = tf.contrib.layers.real_valued_column('CU_AGE')
    cu_rev = tf.contrib.layers.real_valued_column('CU_U_NET_REV_AVG_3MO')
    cu_mb = tf.contrib.layers.real_valued_column('CU_U_MB_AVG_3MO')
    mpp_mb = tf.contrib.layers.real_valued_column('MPP_MB_AVG_3MO')
    mpp_no_voice = tf.contrib.layers.real_valued_column('MPP_NO_VOICE_DOM_LAST2')
    mpp_fee = tf.contrib.layers.real_valued_column('MPP_GROSS_PERIODIC_FEE_FULL')
    mpp_rev = tf.contrib.layers.real_valued_column('MPP_NET_REVENUE')
    
    # will not do transformations since numerical data is standardized
    # Transformations.
#     age_buckets = tf.contrib.layers.bucketized_column(age,
#                                                     boundaries=[
#                                                         18, 25, 30, 35, 40, 45,
#                                                         50, 55, 60, 65
#                                                     ])

    # Wide columns and deep columns.
    cat_cols = [cu_map, clm_livsfase, cu_os_type, cu_producername, mpp_os_type,
                  mpp_producername, cu_modelname, mpp_modelname]
    
    wide_columns = cat_cols
    
    np.random.seed(0)
    # make 5 random crossed cols
    i = 0
    while i < 5:
        a = np.random.randint(0, len(cat_cols))
        b = np.random.randint(0, len(cat_cols))
        if cat_cols[a] != cat_cols[b]:
            cross_col = tf.contrib.layers.crossed_column([cat_cols[a], cat_cols[b]], hash_bucket_size=int(1e4))
            wide_columns.append(cross_col)
            i += 1
    
    # according to the wide and deep tensorflow tutorial:
    # "Empirically, a more informed decision for the number of dimensions is to 
    # start with a value on the order of log(n) (base 2) or kn^(1/4), 
    # where n is the number of unique features in a feature column and 
    # k is a small constant (usually smaller than 10)."
    
    # I will try log(n)
    def log2_unique(sparse_column_with_keys):
        return np.floor(np.log2(len(sparse_column_with_keys[3][1])))
    
    deep_columns = [
        tf.contrib.layers.embedding_column(cu_map, dimension=log2_unique(cu_map)),
        tf.contrib.layers.embedding_column(clm_livsfase, dimension=log2_unique(clm_livsfase)),
        tf.contrib.layers.embedding_column(cu_os_type, dimension=log2_unique(cu_os_type)),
        tf.contrib.layers.embedding_column(cu_producername, dimension=log2_unique(cu_producername)),
        tf.contrib.layers.embedding_column(mpp_os_type, dimension=log2_unique(mpp_os_type)),
        tf.contrib.layers.embedding_column(mpp_producername, dimension=log2_unique(mpp_producername)),
        tf.contrib.layers.embedding_column(cu_modelname, dimension=log2_unique(cu_modelname)),
        tf.contrib.layers.embedding_column(mpp_modelname, dimension=log2_unique(mpp_modelname)),

        cu_age,
        cu_rev,
        cu_mb,
        mpp_mb,
        mpp_no_voice,
        mpp_fee,
        mpp_rev
        ]

    if model_type == "wide":
        m = tf.contrib.learn.LinearClassifier(model_dir=model_dir,
                                              feature_columns=wide_columns)
    elif model_type == "deep":
        m = tf.contrib.learn.DNNClassifier(model_dir=model_dir,
                                           feature_columns=deep_columns,
                                           hidden_units=[128, 128, 128])
    else:
        m = tf.contrib.learn.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=[128, 128, 128],
            fix_global_step_increment_bug=True)
    return m

In [11]:
def input_fn(df):
    """Input builder function."""
    # Creates a dictionary mapping from each continuous feature column name (k) to
    # the values of that column stored in a constant Tensor.
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
    # Creates a dictionary mapping from each categorical feature column name (k)
    # to the values of that column stored in a tf.SparseTensor.
    categorical_cols = {
      k: tf.SparseTensor(
          indices=[[i, 0] for i in range(df[k].size)],
          values=df[k].values,
          dense_shape=[df[k].size, 1])
      for k in CATEGORICAL_COLUMNS}
    # Merges the two dictionaries into one.
    feature_cols = dict(continuous_cols)
    feature_cols.update(categorical_cols)
    # Converts the label column into a constant Tensor.
    label = tf.constant(df[LABEL_COLUMN].values)
    # Returns the feature columns and the label.
    return feature_cols, label

In [12]:
# def train_and_eval(model_dir, model_type, train_steps, train_data, test_data):
model_dir = "wide&deep_model"
model_type = "wide_n_deep"
train_steps = 10

"""Train and evaluate the model."""

df_train = pd.concat([data_train['num'], data_train['cat'], data_train['labels']], axis=1)
df_test = pd.concat([data_test['num'], data_test['cat'], data_test['labels']], axis=1)

#     # remove NaN elements
#     df_train = df_train.dropna(how='any', axis=0)
#     df_test = df_test.dropna(how='any', axis=0)

model_dir = tempfile.mkdtemp() if not model_dir else model_dir
print("model directory = %s" % model_dir)

tf.logging.set_verbosity(tf.logging.ERROR)

m = build_estimator(model_dir, model_type)
m.fit(input_fn=lambda: input_fn(df_train), steps=train_steps)

model directory = wide&deep_model


DNNLinearCombinedClassifier(params={'head': <tensorflow.contrib.learn.python.learn.estimators.head._BinaryLogisticHead object at 0x117489f28>, 'linear_feature_columns': (_SparseColumnKeys(column_name='CU_MAP_SEGMENT', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('1', '0', '4', '6', '5', '3', '2'), num_oov_buckets=0, vocab_size=7, default_value=-1), combiner='sum', dtype=tf.string), _SparseColumnKeys(column_name='CLM_LIVSFASE_SEGMENT', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('godt voksen', 'småbarnsfamilie', 'senior', 'etablert barnefamilie', 'ung voksen', 'voksen uten barn', 'ungdom'), num_oov_buckets=0, vocab_size=7, default_value=-1), combiner='sum', dtype=tf.string), _SparseColumnKeys(column_name='CU_U_MAIN_DEV_OS_TYPE', is_integerized=False, bucket_size=None, lookup_config=_SparseIdLookupConfig(vocabulary_file=None, keys=('iphone os', 'unknown', 'proprietary os

In [13]:
# predictions for test data
results = m.evaluate(input_fn=lambda: input_fn(df_test), steps=1)
for key in sorted(results):
    print("%s: %s" % (key, results[key]))

accuracy: 0.99502
accuracy/baseline_label_mean: 0.00498033
accuracy/threshold_0.500000_mean: 0.99502
auc: 0.351565
global_step: 30
labels/actual_label_mean: 0.00498033
labels/prediction_mean: 0.0240634
loss: 0.045544
precision/positive_threshold_0.500000_mean: 0.0
recall/positive_threshold_0.500000_mean: 0.0
