In [1]:
import os
import argparse
import pandas as pd
import pickle

In [2]:
# Move to the parent of the directory the kernel started in (presumably the
# project root that holds handset_model_current - confirm).
# The target is resolved to an absolute path on the first run only and kept in
# PROJECT_ROOT, so re-running this cell returns to the same directory instead
# of climbing one level higher every time.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)
import handset_model_current

Using TensorFlow backend.


In [3]:
def bool_arg(string):
    """Convert a command-line word into a bool, for use as an argparse ``type``.

    The words "true" and "false" are accepted in any letter case and map to
    True and False respectively. Any other input raises
    argparse.ArgumentTypeError carrying the original, unmodified text.
    """
    parsed = {'true': True, 'false': False}.get(string.lower())
    if parsed is None:
        raise argparse.ArgumentTypeError("Expected True or False, but got {}".format(string))
    return parsed

# Build the argument namespace that the project code expects.
# The defaults below are this notebook's experiment settings (minimal
# preprocessing). Help texts use argparse's %(default)s placeholder so the
# documented default can never drift from the real one.
parser = argparse.ArgumentParser()

# model hyperparameters
# small number of epochs for experimentation
parser.add_argument('--epochs', default=10, type=int,
                    help="Nr of epochs. Default is %(default)s", dest="epochs")
parser.add_argument('--batch_size', default=256, type=int,
                    help="Batch size. Default is %(default)s", dest="batch_size")
parser.add_argument('--earlystop', default=3, type=int,
                    help="Number of epochs with no improvement after which training will be stopped. "
                         "Default is %(default)s.",
                    dest="earlystop")
parser.add_argument('--verbose', default=True, type=bool_arg, help="If True (default), verbose output",
                    dest="verbose")

# cross_val is not ready to be used
parser.add_argument('--cross_val', default=0, type=int,
                    help="Number of folds (if bigger than 0) to use for cross validation. Default is %(default)s.",
                    dest="cross_val")

# no applying class weights
parser.add_argument('--apply_class_weights', default=False, type=bool_arg,
                    help="If True, apply different loss weights (based on frequency of samples) to different "
                         "classes. Default is %(default)s.",
                    dest="apply_class_weights")

# no smooth factor
parser.add_argument('--smooth_factor', default=0, type=float,
                    help="Smooth factor to be used when calculating class weights, so that highly infrequent "
                         "classes do not get huge weights. Default is %(default)s.",
                    dest="smooth_factor")

# oversampling with neg to pos ratio=1
parser.add_argument('--oversample', default=True, type=bool_arg,
                    help="If True (default), apply oversampling to generate balanced batches.",
                    dest="oversample")
parser.add_argument('--ratio', default=1, type=int,
                    help="Ratio of negative to positive samples to use for balanced batch generation "
                         "(if oversample=True). Default is %(default)s.",
                    dest="ratio")

# activation: prelu
parser.add_argument('--activation', default='prelu',
                    help="NN activation to be used. Default is %(default)s",
                    dest="activation")

# no x_vars
parser.add_argument('--x_vars', default=False, type=bool_arg,
                    help="If True, include X variables. Default is %(default)s.",
                    dest="x_vars")

# standardize numerical data
parser.add_argument('--std', default=True, type=bool_arg, help="If True (default), standardize data.",
                    dest="std")

# no pca
parser.add_argument('--pca_whiten', default=False, type=bool_arg,
                    help="If True, PCA-whiten data. Default is %(default)s.",
                    dest="pca_whiten")
parser.add_argument('--pca_reduce', default=0, type=float,
                    help="{0, 1, 0<x<1} If 0, no dimensionality reduction is done. If 1, Thomas P. Minka's method "
                         "('Automatic Choice of Dimensionality for PCA'. NIPS 2000) is used to determine the "
                         "number of dimensions to keep. If 0 < pca_reduce < 1, enough number of dimensions will "
                         "be kept to keep 'pca_reduce' percentage of variance explained. Default is %(default)s.",
                    dest="pca_reduce")

# one-hot encode cat data (embeddings are not used)
parser.add_argument('--cat_enc', default='one-hot',
                    help="Encoding to be used for categorical variables: 'integer' "
                         "(embedding layers will then be used), 'hashing_char', "
                         "'hashing_all' or 'one-hot'. Default is '%(default)s'.",
                    dest="cat_enc")

# no log transform
parser.add_argument('--log_xform', default=False, type=bool_arg,
                    help="If True, log-transform data. Default is %(default)s.",
                    dest="log_xform")

# encode categorical and binary data as 1/-1 (for fair comparison with standardized numerical data)
parser.add_argument('--binary_enc', default=False, type=bool_arg,
                    help="If False (default), the negative cases of binary variables will be represented as -1 "
                         "instead of 0.", dest="binary_enc")

# id for saving
parser.add_argument('--data_split_id', default=1, type=int,
                    help="Id for the train-test data split to be used. If a new id is given, a new data split "
                         "will be generated and saved to disk with the given id. If id is 0, a new "
                         "split will be generated, but not saved to disk. If a previously used id is given, "
                         "a previously generated and saved data split with that id will be used. "
                         "Default is %(default)s.",
                    dest="data_split_id")

# Jupyter starts the kernel with "-f <connection file>" in sys.argv; declaring
# the option here lets parse_args() accept it instead of aborting. The value is
# not used by the notebook, so it is hidden from --help.
parser.add_argument("-f", help=argparse.SUPPRESS)
args = parser.parse_args()

In [4]:
# Load the train/test splits and the categorical levels using the settings in `args`.
# Prerequisites inside handset_model_current (manual edits, not visible from this notebook):
#   - load_data() should read every column, i.e. drop the `usecols` argument:
#         df = pd.read_csv("handset_data_train_wo_X.csv")
#   - the module-level (global) variables should list all categorical columns and all binary columns.
data_train, data_test, cat_levels = handset_model_current.load_and_preprocess_data(args)

generating dictionary with levels of catagorical variables...
Reusing data split with id=1
Loading previously pre-processed numerical data...
Loading previously pre-processed categorical data...


In [25]:
# Shapes, in order: train num, train cat, test num, test cat, train labels, test labels.
tuple(
    split[part].shape
    for split, part in (
        (data_train, 'num'), (data_train, 'cat'),
        (data_test, 'num'), (data_test, 'cat'),
        (data_train, 'labels'), (data_test, 'labels'),
    )
)

((466632, 306),
 (466632, 756),
 (116659, 306),
 (116659, 756),
 (466632, 1),
 (116659, 1))

In [26]:
# Put the numerical and categorical blocks side by side for each split.
X_train, X_test = (
    pd.concat([split['num'], split['cat']], axis=1)
    for split in (data_train, data_test)
)
# Stack train on top of test with a fresh 0..n-1 index.
X_final = pd.concat([X_train, X_test], ignore_index=True)

# Labels are stacked in the same train-then-test order so rows stay aligned with X_final.
label_col = handset_model_current.LABEL_COL
y = pd.concat(
    [data_train['labels'][label_col], data_test['labels'][label_col]],
    ignore_index=True,
)

In [27]:
# Columns excluded from the feature-selection input.
drop_cols = ['Unnamed: 0', 'ID', 'MPP_NET_DISCOUNT_OTHER_FEE']
# Rebind instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError after the first execution.
X_final = X_final.drop(drop_cols, axis=1, errors='ignore')

In [28]:
# Shapes of the per-split matrices, the combined matrix and the label vector.
tuple(obj.shape for obj in (X_train, X_test, X_final, y))

((466632, 1062), (116659, 1062), (583291, 1059), (583291,))

In [29]:
# Boolean Series indexed by column name: True where the column holds at least one null value.
findNull = X_final.isnull().any()

In [45]:
# Quick inspection of the container type returned by isnull().any().
type(findNull)

pandas.core.series.Series

In [30]:
# Print the name of every column flagged as containing nulls (prints nothing if there are none).
for column_name in findNull[findNull].index:
    print(column_name)

In [35]:
from sklearn.feature_selection import SelectKBest, f_classif

# Univariate selection: score each column with f_classif and keep the 20 best.
# The names `test` and `fit` are kept because the following cell reads `fit`.
test = SelectKBest(f_classif, k=20)
fit = test.fit(X_final, y)

In [36]:
# Integer positions (within the columns passed to fit) of the features the selector kept.
dex = fit.get_support(indices=True)
dex

array([  0, 200, 201, 238, 239, 240, 241, 242, 273, 274, 297, 298, 315,
       323, 456, 459, 481, 689, 690, 697])

In [37]:
# Keep only the columns picked by the selector (dex holds their integer positions).
features_df = X_final.iloc[:, dex]
# Preview the first rows instead of asking the notebook to render the whole frame.
features_df.head(10)

Unnamed: 0,CU_AGE,CU_U_NET_REV_AVG_3MO,CU_U_MB_AVG_3MO,MPP_MB_LAST1,MPP_MB_LAST2,MPP_MB_LAST3,MPP_MB_SUM_3MO,MPP_MB_AVG_3MO,MPP_NO_VOICE_DOM_LAST2,MPP_NO_VOICE_DOM_LAST3,MPP_GROSS_PERIODIC_FEE_FULL,MPP_NET_REVENUE,CU_MAP_SEGMENT_6,CLM_LIVSFASE_SEGMENT_ung voksen,CU_U_MAIN_DEV_OS_TYPE_iphone os,CU_U_MAIN_DEV_OS_TYPE_android,CU_U_MAIN_DEV_PRODUCERNAME_apple,MPP_DEVICE_OS_TYPE_iphone os,MPP_DEVICE_OS_TYPE_android,MPP_DEVICE_PRODUCERNAME_apple
0,-0.017146,-0.014346,-0.500524,-0.502154,-0.502828,-0.511420,-0.557909,-0.525738,-0.917582,-0.916409,-1.453054,-1.104742,-1,-1,1,-1,1,1,-1,1
1,1.334419,0.601262,-0.486885,-0.467515,-0.502828,-0.511420,-0.543249,-0.478039,-0.917582,-0.916409,-0.870397,-0.139646,-1,-1,-1,-1,-1,-1,-1,-1
2,0.050432,0.348043,0.412221,-0.384625,1.501685,-0.225265,0.330468,0.281098,-0.257238,-0.441928,-0.043485,-0.408667,-1,-1,-1,1,-1,-1,1,-1
3,1.942624,-0.270907,-0.434906,-0.418049,-0.440092,-0.474167,-0.487376,-0.484320,-0.634578,-0.467576,-0.043485,-0.259819,-1,-1,-1,1,-1,-1,1,-1
4,-1.436290,0.612599,1.505880,1.534630,1.319394,1.482100,1.598811,1.468138,1.373407,-0.159805,3.635314,1.747751,1,1,1,-1,1,1,-1,1
5,1.469576,-0.407010,-0.448016,-0.452270,-0.463441,-0.444414,-0.501468,-0.497508,-0.297667,-0.300866,-0.335296,-0.447246,-1,-1,1,-1,1,1,-1,1
6,-0.017146,-0.385357,-0.210643,-0.134429,-0.110846,-0.481611,-0.246311,-0.258708,-0.675007,-0.429104,-0.378530,-0.524835,-1,-1,1,-1,1,1,-1,1
7,-1.503868,0.582610,1.447587,1.003641,2.215667,0.916762,1.536150,1.409494,0.470488,0.930218,1.625041,1.024308,1,1,-1,1,-1,-1,1,-1
8,0.320745,-0.435762,-0.472766,-0.494908,-0.485167,-0.445786,-0.528072,-0.522407,-0.297667,-0.300866,-0.385231,-0.540972,-1,-1,1,-1,1,1,-1,1
9,-0.557772,3.892312,1.467996,-0.506537,-0.502828,-0.511420,-0.559764,-0.552068,-0.917582,-0.916409,-0.378530,-0.524835,1,-1,1,-1,1,1,-1,1


In [40]:
# Names of the selected columns as a NumPy array.
features_df.columns.values

array(['CU_AGE', 'CU_U_NET_REV_AVG_3MO', 'CU_U_MB_AVG_3MO', 'MPP_MB_LAST1',
       'MPP_MB_LAST2', 'MPP_MB_LAST3', 'MPP_MB_SUM_3MO', 'MPP_MB_AVG_3MO',
       'MPP_NO_VOICE_DOM_LAST2', 'MPP_NO_VOICE_DOM_LAST3',
       'MPP_GROSS_PERIODIC_FEE_FULL', 'MPP_NET_REVENUE',
       'CU_MAP_SEGMENT_6', 'CLM_LIVSFASE_SEGMENT_ung voksen',
       'CU_U_MAIN_DEV_OS_TYPE_iphone os', 'CU_U_MAIN_DEV_OS_TYPE_android',
       'CU_U_MAIN_DEV_PRODUCERNAME_apple', 'MPP_DEVICE_OS_TYPE_iphone os',
       'MPP_DEVICE_OS_TYPE_android', 'MPP_DEVICE_PRODUCERNAME_apple'], dtype=object)

In [46]:
# One selected feature name per line, for easy copy/paste.
for column_name in features_df.columns:
    print(column_name)

CU_AGE
CU_U_NET_REV_AVG_3MO
CU_U_MB_AVG_3MO
MPP_MB_LAST1
MPP_MB_LAST2
MPP_MB_LAST3
MPP_MB_SUM_3MO
MPP_MB_AVG_3MO
MPP_NO_VOICE_DOM_LAST2
MPP_NO_VOICE_DOM_LAST3
MPP_GROSS_PERIODIC_FEE_FULL
MPP_NET_REVENUE
CU_MAP_SEGMENT_6
CLM_LIVSFASE_SEGMENT_ung voksen
CU_U_MAIN_DEV_OS_TYPE_iphone os
CU_U_MAIN_DEV_OS_TYPE_android
CU_U_MAIN_DEV_PRODUCERNAME_apple
MPP_DEVICE_OS_TYPE_iphone os
MPP_DEVICE_OS_TYPE_android
MPP_DEVICE_PRODUCERNAME_apple


In [42]:
# Switch into the feature_selection directory (relative to the current working
# directory) so that the pickle written next lands there. The guard makes the
# cell safe to re-run: a second unguarded chdir would look for
# feature_selection/feature_selection and fail.
if os.path.basename(os.getcwd()) != "feature_selection":
    os.chdir("feature_selection")

In [44]:
# Persist the selected column names (a NumPy array) to fArray.pickle in the current directory.
selected_columns = features_df.columns.values
with open('fArray.pickle', 'wb') as pickle_file:
    pickle.dump(selected_columns, pickle_file)

In [51]:
# Count the rows where MPP_BANKID_USED_LAST3 equals 0, using a vectorised
# comparison instead of a Python-level loop over every row. int() keeps
# `count` a plain Python integer, as the loop produced.
count = int((X_final['MPP_BANKID_USED_LAST3'] == 0).sum())
count

0