In [1]:
import os
import argparse
import pandas as pd
import pickle

In [2]:
# Move to the parent directory so that `handset_model_current` can be imported.
# The starting directory is recorded once per kernel session, so this cell is
# safe to re-run: it always lands in the same parent directory instead of
# climbing one level higher on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))
import handset_model_current

Using TensorFlow backend.


In [2]:
def bool_arg(string):
    """argparse `type` callable turning 'true'/'false' (any casing) into a bool.

    Raises:
        argparse.ArgumentTypeError: if the lower-cased input is neither
            'true' nor 'false'.
    """
    recognised = {'true': True, 'false': False}
    key = string.lower()
    if key in recognised:
        return recognised[key]
    raise argparse.ArgumentTypeError("Expected True or False, but got {}".format(string))

# Build the argument namespace expected by `handset_model_current`, using the
# defaults chosen for this notebook (minimal preprocessing). Help strings use
# argparse's `%(default)s` placeholder so the documented default can never
# drift from the real one.
parser = argparse.ArgumentParser()

# model hyperparameters
# small number of epochs for experimentation
parser.add_argument('--epochs', default=10, type=int,
                    help="Nr of epochs. Default is %(default)s.", dest="epochs")
parser.add_argument('--batch_size', default=256, type=int,
                    help="Batch size. Default is %(default)s.", dest="batch_size")
parser.add_argument('--earlystop', default=3, type=int,
                    help="Number of epochs with no improvement after which training will be stopped. "
                         "Default is %(default)s.",
                    dest="earlystop")
parser.add_argument('--verbose', default=True, type=bool_arg, help="If True (default), verbose output",
                    dest="verbose")

# cross_val is not ready to be used
parser.add_argument('--cross_val', default=0, type=int,
                    help="Number of folds (if bigger than 0) to use for cross validation. Default is 0.",
                    dest="cross_val")

# no applying class weights
parser.add_argument('--apply_class_weights', default=False, type=bool_arg,
                    help="If True, apply different loss weights (based on frequency of samples) to different "
                         "classes.",
                    dest="apply_class_weights")

# no smooth factor
parser.add_argument('--smooth_factor', default=0, type=float,
                    help="Smooth factor to be used when calculating class weights, so that highly unfrequent "
                    "classes do not get huge weights.",
                    dest="smooth_factor")

# oversampling with neg to pos ratio=1
parser.add_argument('--oversample', default=True, type=bool_arg,
                    help="If True (default), apply oversampling to generate balanced batches.",
                    dest="oversample")
parser.add_argument('--ratio', default=1, type=int,
                    help="Ratio of negative to positive samples to use for balanced batch generation "
                         "(if oversample=True)",
                    dest="ratio")

# activation: prelu
parser.add_argument('--activation', default='prelu',
                    help="NN activation to be used. Default is %(default)s",
                    dest="activation")

# no x_vars
parser.add_argument('--x_vars', default=False, type=bool_arg,
                    help="If True, include X variables. Default is %(default)s.",
                    dest="x_vars")

# standardize numerical data
parser.add_argument('--std', default=True, type=bool_arg, help="If True (default), standardize data.",
                    dest="std")

# no pca
parser.add_argument('--pca_whiten', default=False, type=bool_arg,
                    help="If True, PCA-whiten data. Default is %(default)s.",
                    dest="pca_whiten")
parser.add_argument('--pca_reduce', default=0, type=float,
                    help="{0, 1, 0<x<1} If 0, no dimensionality reduction is done. If 1, Thomas P. Minka's method "
                         "('Automatic Choice of Dimensionality for PCA'. NIPS 2000) is used to determine the "
                         "number of dimensions to keep. If 0 < pca_reduce < 1, enough number of dimensions will "
                         "be kept to keep 'pca_reduce' percentage of variance explained. "
                         "Default is %(default)s.",
                    dest="pca_reduce")

# one-hot encode cat data (embeddings are not used)
parser.add_argument('--cat_enc', default='one-hot',
                    help="Encoding to be used for categorical variables. Alternatives: 'integer' "
                         "(embedding layers will then be used), 'hashing_char', 'hashing_all', 'one-hot'. "
                         "Default is '%(default)s'.",
                    dest="cat_enc")

# no log transform
parser.add_argument('--log_xform', default=False, type=bool_arg,
                    help="If True, log-transform data. Default is %(default)s.",
                    dest="log_xform")

# encode categorical and binary data as 1/-1 (for fair comparison with standardized numerical data)
parser.add_argument('--binary_enc', default=False, type=bool_arg,
                    help="If False (default), the negative cases of binary variables will be represented as -1 "
                         "instead of 0.", dest="binary_enc")

# id for saving
parser.add_argument('--data_split_id', default=1, type=int,
                    help="Id for the train-test data split to be used. If a new id is given, a new data split "
                         "will be generated and saved to disk with the given id. If id is 0, a new "
                         "split will be generated, but not saved to disk. If a previously used id is given, "
                         "a previously generated and saved data split with that id will be used. "
                         "Default is %(default)s.",
                    dest="data_split_id")

# Accept (and otherwise ignore) a `-f <value>` option so that parse_args() does
# not fail on the `-f <connection file>` argument present in sys.argv when this
# code runs inside a Jupyter kernel.
parser.add_argument("-f", help="Ignored; absorbs the kernel connection-file argument.")
args = parser.parse_args()

In [3]:
# Load the pre-processed train/test data and the categorical levels
# (split_id=1 for all features)
(data_train,
 data_test,
 cat_levels) = handset_model_current.load_and_preprocess_data(args)

generating dictionary with levels of catagorical variables...
Reusing data split with id=1
Loading previously pre-processed numerical data...
Loading previously pre-processed categorical data...


In [5]:
# Shapes of the 'num', 'cat' and 'labels' frames for the train and test splits
(
    data_train['num'].shape, data_train['cat'].shape,
    data_test['num'].shape, data_test['cat'].shape,
    data_train['labels'].shape, data_test['labels'].shape,
)

((466632, 306),
 (466632, 756),
 (116659, 306),
 (116659, 756),
 (466632, 1),
 (116659, 1))

In [6]:
# Put the 'num' and 'cat' features side by side for each split
X_train = pd.concat([data_train['num'], data_train['cat']], axis='columns')
X_test = pd.concat([data_test['num'], data_test['cat']], axis='columns')

# Stack train on top of test with a fresh 0..n-1 index.
# NOTE(review): the statistics computed below therefore also use the test rows
# and their labels - confirm this is intended for feature selection.
X_final = pd.concat([X_train, X_test], ignore_index=True)

# Labels stacked in the same train-then-test order, also re-indexed from 0
y = pd.concat(
    [
        data_train['labels'][handset_model_current.LABEL_COL],
        data_test['labels'][handset_model_current.LABEL_COL],
    ],
    ignore_index=True,
)

In [7]:
# Remove the columns listed in `drop_cols` from the feature matrix.
# Reassigning (instead of inplace=True) together with errors='ignore' makes
# this cell idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
drop_cols = ['Unnamed: 0', 'ID']
X_final = X_final.drop(drop_cols, axis=1, errors='ignore')

In [9]:
# Shapes of the per-split feature frames, the combined frame and the labels
(X_train.shape, X_test.shape, X_final.shape, y.shape)

((466632, 1062), (116659, 1062), (583291, 1060), (583291,))

In [11]:
# Collect (and print) every feature whose absolute correlation with the label
# is above 0.04
corrDict = {}
for feature in X_final.columns:
    strength = abs(X_final[feature].corr(y))
    if strength > 0.04:
        corrDict[feature] = strength
        print(feature, strength)

CU_AGE 0.0432861057372
CU_U_MB_AVG_3MO 0.0455189620728
MPP_MB_SUM_3MO 0.0425954516754
MPP_MB_AVG_3MO 0.0424083809852
MPP_GROSS_PERIODIC_FEE_FULL 0.0489477804739
CU_MAP_SEGMENT_6 0.0464689746653
CLM_LIVSFASE_SEGMENT_ung voksen 0.04190183543
CU_U_MAIN_DEV_OS_TYPE_iphone os 0.0541875972373
CU_U_MAIN_DEV_PRODUCERNAME_apple 0.0541875972373
MPP_DEVICE_OS_TYPE_iphone os 0.042598699569
MPP_DEVICE_PRODUCERNAME_apple 0.042598699569


In [23]:
# Switch to the "feature_selection" directory, where the pickles below are
# written. The guard makes the cell safe to re-run: once we are already inside
# a directory with that name, a second chdir would fail (or descend further).
if os.path.basename(os.getcwd()) != "feature_selection":
    os.chdir("feature_selection")

In [24]:
# Persist the selected correlations in the current working directory
with open('corrDict.pickle', 'wb') as pickle_file:
    pickle.dump(corrDict, pickle_file)

In [25]:
# Group the feature rows by their label value
group = X_final.groupby(by=y)

In [26]:
# Feature rows for label 0 and label 1.
# `group.indices` maps each group key to integer *positions*, so the rows are
# selected with the position-based `.iloc`. The previous label-based `.loc`
# returned the right rows only while X_final's index was exactly 0..n-1.
X0 = X_final.iloc[group.indices[0], :]
X1 = X_final.iloc[group.indices[1], :]

In [28]:
# Labels for the label-0 and label-1 rows.
# `group.indices` holds integer positions, so use the position-based `.iloc`;
# the label-based `.loc` is correct only while y's index is exactly 0..n-1.
y0 = y.iloc[group.indices[0]]
y1 = y.iloc[group.indices[1]]

In [30]:
# Column-wise mean of every feature within each label group
mean0, mean1 = X0.mean(), X1.mean()

In [31]:
# Absolute gap between the two groups' means, per feature
meanDiff = (mean1 - mean0).abs()

In [32]:
# Collect (and print) the features whose group means differ by more than 0.5
meanDict = {}
for feature in meanDiff.index.values:
    gap = meanDiff[feature]
    if gap > 0.5:
        meanDict[feature] = gap
        print(feature, gap)

CU_AGE 0.614539254253
CU_U_NET_REV_AVG_3MO 0.514587359183
CU_U_MB_AVG_3MO 0.641625300575
MPP_MB_LAST1 0.521722536441
MPP_MB_LAST2 0.551535217464
MPP_MB_LAST3 0.567267376188
MPP_MB_SUM_3MO 0.60158436149
MPP_MB_AVG_3MO 0.599217503742
MPP_GROSS_PERIODIC_FEE_FULL 0.693419478547
MPP_NET_REVENUE 0.56731436173
CU_MAP_SEGMENT_6 0.55108643158
CU_U_MAIN_DEV_OS_TYPE_iphone os 0.761851886577
CU_U_MAIN_DEV_OS_TYPE_android 0.505456057002
CU_U_MAIN_DEV_PRODUCERNAME_apple 0.761851886577
MPP_DEVICE_OS_TYPE_iphone os 0.604549872621
MPP_DEVICE_OS_TYPE_android 0.547848345531
MPP_DEVICE_PRODUCERNAME_apple 0.604549872621


In [34]:
# Persist the selected mean differences in the current working directory
with open('meanDict.pickle', 'wb') as pickle_file:
    pickle.dump(meanDict, pickle_file)

In [35]:
# Column-wise variance of every feature within each label group
var0, var1 = X0.var(), X1.var()

In [36]:
# Absolute gap between the two groups' variances, per feature
varDiff = (var1 - var0).abs()

In [37]:
# Collect (and print) the features whose group variances differ by more than 1
varDict = {}
for feature in varDiff.index.values:
    gap = varDiff[feature]
    if gap > 1:
        varDict[feature] = gap
        print(feature, gap)

CU_MPR_NO_MMS_DOM_LAST1 1.42333386142
CU_MPR_NO_SMS_INT_LAST1 1.06722139832
CU_FIX_NO_VOICE_INT_LAST2 2.49542246679
CU_FIX_NO_VOICE_INT_LAST3 3.05030083975
CU_U_MB_AVG_3MO 1.56430575587
MPP_BANKID_USED_LAST1 1.57393625331
MPP_BANKID_USED_LAST2 1.54462117929
MPP_BANKID_USED_LAST3 1.55374375763
MPP_MB_LAST1 2.3390980876
MPP_MB_LAST2 1.48972499955
MPP_MB_LAST3 1.71293747125
MPP_MB_SUM_3MO 1.65071812047
MPP_MB_AVG_3MO 1.65009006214
MPP_KR_SMS_INT_LAST3 2.54310807937
MPP_NO_VOICE_DOM_LAST3 1.00714918042
MPP_NO_VOICE_INT_LAST1 1.17235679347
MPP_NO_VOICE_INT_LAST2 1.11125668268
MPP_NO_VOICE_INT_LAST3 1.12647016259
MPP_NET_OTHER_FEE 1.30841506924


In [39]:
# Persist the selected variance differences in the current working directory
with open('varDict.pickle', 'wb') as pickle_file:
    pickle.dump(varDict, pickle_file)

In [41]:
# Column-wise median of every feature within each label group
med0, med1 = X0.median(), X1.median()

In [42]:
# Absolute gap between the two groups' medians, per feature
medDiff = (med1 - med0).abs()

In [43]:
# Collect (and print) the features whose group medians differ by more than 0.5
medDict = {}
for feature in medDiff.index.values:
    gap = medDiff[feature]
    if gap > 0.5:
        medDict[feature] = gap
        print(feature, gap)

CU_AGE 0.60820436842
HH_ANT_VOKSEN 1.20883448015
CU_U_MB_AVG_3MO 0.501910253248
CU_U_MAIN_DEV_MODEL_ID 0.79163023219
MPP_GROSS_PERIODIC_FEE_FULL 0.52300250517
MPP_NET_REVENUE 0.525811311234
CU_GENDER_m 2.0
CU_GENDER_k 2.0
CU_ADSL_OK_RESULT_verify 2.0
CU_U_MAIN_DEV_OS_TYPE_iphone os 2.0
CU_U_MAIN_DEV_PRODUCERNAME_apple 2.0
CU_U_MAIN_DEV_CATEGORY_smartphone lte 2.0
MPP_DEVICE_CATEGORY_smartphone lte 2.0
MPP_BINDING_TYPE_binding terminal 2.0


In [45]:
# Persist the selected median differences in the current working directory
with open('medDict.pickle', 'wb') as pickle_file:
    pickle.dump(medDict, pickle_file)