### Data Preprocessing

In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
data = pd.read_csv('data/emb_data4.csv')
data = shuffle(data)

In [3]:
X1_raw = data['master_id']
X2_raw = data.drop(['master_id', 'assigne_state'], axis=1)

# Need to save
X2_mean = X2_raw.mean(axis = 0)
X2_std = X2_raw.std(axis = 0)

# Scaling
X2 = (X2_raw - X2_mean)/X2_std
X2 = X2.fillna(0)

y = data['assigne_state']

In [4]:
X1_lookup = [-1]
for i in tqdm(range(len(X1_raw))):
    if X1_raw[i] not in X1_lookup:
        X1_lookup.append(X1_raw[i])
# X1_lookup need to save

100%|██████████| 113184/113184 [00:04<00:00, 26572.74it/s]


In [5]:
X1 = pd.DataFrame(columns=['master_idx'], index = X1_raw.index)

for i in tqdm(range(len(X1_raw))):
    if X1_raw[i] in X1_lookup:
        X1['master_idx'][i] = X1_lookup.index(X1_raw[i])
    else:
        X1['master_idx'][i] = 0

100%|██████████| 113184/113184 [02:34<00:00, 733.89it/s]


In [6]:
X1_tst = X1[0:20000].as_matrix()
X2_tst = X2[0:20000].as_matrix()
y_tst = y[0:20000]#.as_matrix().reshape(-1,1)

X1_val = X1[20001:50000].as_matrix()
X2_val = X2[20001:50000].as_matrix()
y_val = y[20001:50000]#.as_matrix().reshape(-1,1)

X1_trn = X1[50001:].as_matrix()
X2_trn = X2[50001:].as_matrix()
y_trn = y[50001:]#.as_matrix().reshape(-1,1)

In [7]:
# X1_trn, X1_val, X2_trn, X2_val, y_trn, y_val = train_test_split(X1, X2, y, test_size=0.2, random_state=42)

### Parallel Model

In [7]:
from keras.layers import Input, Embedding, merge
from keras.layers.core import Dense, Dropout, Flatten
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, CSVLogger
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.layers.merge import concatenate
from sklearn.metrics import roc_auc_score
n_masters = len(X1_lookup)
n_factors = 50

Using Theano backend.


In [8]:
master_in = Input(shape=(1,), dtype='int64', name='master_in')
m1 = Embedding(n_masters, n_factors, input_length=1)(master_in)
m2 = Flatten()(m1)
order_in = Input(shape=(12,), dtype='float32', name='order_in')

In [9]:
x = concatenate([m2, order_in])
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(200, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(100, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(1, activation='sigmoid')(x)

In [10]:
model = Model([master_in, order_in], x)
model.compile(loss='binary_crossentropy', optimizer=SGD(lr=0.1), metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
master_in (InputLayer)           (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1, 50)         147450      master_in[0][0]                  
____________________________________________________________________________________________________
flatten_1 (Flatten)              (None, 50)            0           embedding_1[0][0]                
____________________________________________________________________________________________________
order_in (InputLayer)            (None, 12)            0                                            
___________________________________________________________________________________________

In [11]:
class_weight = {1: 1.0, 0: y[y == 1].size / y[y == 0].size}

model.fit([X1_trn, X2_trn], y_trn, batch_size = 512, epochs=1000,
          validation_data=([X1_val, X2_val], y_val),
          class_weight = class_weight,
          callbacks = [EarlyStopping(monitor='val_acc', patience=30, verbose=1, min_delta=1e-4, mode='max'),
                       #ReduceLROnPlateau(monitor='val_acc', factor=0.3, patience=5, verbose=1, epsilon=1e-4, mode='max')
                        CSVLogger('data/learning_log.csv', separator=',', append=False),
                        ReduceLROnPlateau(monitor='val_acc', factor=0.3, patience=8, min_lr=0.00001, verbose = 1)
                           ])

Train on 63183 samples, validate on 29999 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 00032: reducing learning rate to 0.030000000447034835.
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 00040: reducing learning rate to 0.009000000357627868.
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 00048: reducing learning rate to 0.002700000163167715.
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 

<keras.callbacks.History at 0x12327df60>

In [12]:
y_pred = model.predict([X1_tst, X2_tst])
print(roc_auc_score(y_tst, y_pred))

0.753977008509


In [17]:
model.save_weights('data/model_01.h5')

## Dump preprocessing to pickle

In [19]:
from six.moves import cPickle as pickle

pickle_file = 'data/preprocessing.pickle'
try:
    f = open(pickle_file, 'wb')
    save = {
        'X1_lookup': X1_lookup,
        'X2_mean': X2_mean,
        'X2_std': X2_std
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

### Bins

In [39]:
y_pred2 = y_pred.reshape((20000,))

In [40]:
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

categories = pd.cut(y_pred2, bins, labels=group_names)

df = pd.DataFrame(columns=['y_true', 'y_pred', 'bin'])
df['y_true'] = tst_y
df['y_pred'] = y_pred2
df['bin'] = categories

df.to_csv('bins.csv')

### Sequential Model

In [None]:
n_masters = X.nunique()
n_factors = 100

In [None]:
from keras.layers import Input, Embedding
from keras.layers.core import Dense, Dropout, Flatten
from keras.optimizers import SGD
from keras.models import Sequential, Model

In [None]:
emb_model = Sequential([
    Embedding(n_masters, n_factors, input_length=1),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [None]:
emb_model.compile(loss='binary_crossentropy', optimizer=SGD(lr=0.001), metrics=['accuracy'])
emb_model.summary()

In [None]:
emb_model.fit(X_new['master_idx'], y, batch_size=256, epochs=100)