### Data Preprocessing

In [1]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

In [2]:
feat = pd.read_csv('data/emb_data4.csv')
feat = shuffle(feat)

In [3]:
X1 = feat['master_id']
X2 = feat.drop(['master_id', 'assigne_state'], axis=1)
X2 = scale(X2, axis=0)
y = feat['assigne_state']

In [4]:
u_masters = []

for i in range(len(X1)):
    if X1[i] not in u_masters:
        u_masters.append(X1[i])

In [5]:
from tqdm import tqdm
X_new = pd.DataFrame(columns=['master_idx'], index = X1.index)
for i in tqdm(range(len(X1))):
    X_new['master_idx'][i] = u_masters.index(X1[i])
        
print(X_new.shape)

100%|██████████| 113184/113184 [02:13<00:00, 849.89it/s]

(113184, 1)





In [12]:
tst_X_new = X_new[0:20000].as_matrix()
tst_X2 = X2[0:20000]
tst_y = y[0:20000]

val_X_new = X_new[20001:50000].as_matrix()
val_X2 = X2[20001:50000]
val_y = y[20001:50000]

trn_X_new = X_new[50001:].as_matrix()
trn_X2 = X2[50001:]
trn_y = y[50001:]

In [7]:
trn_X_new, val_X_new, trn_X2, val_X2, trn_y, val_y = train_test_split(X_new, X2, y, test_size=0.2, random_state=42)

### Parallel Model

In [13]:
from keras.layers import Input, Embedding, merge
from keras.layers.core import Dense, Dropout, Flatten
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, CSVLogger
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.layers.merge import concatenate
n_masters = X1.nunique()
n_factors = 40

In [18]:
master_in = Input(shape=(1,), dtype='int64', name='master_in')
m1 = Embedding(n_masters, n_factors, input_length=1)(master_in)
m2 = Flatten()(m1)
order_in = Input(shape=(12,), dtype='float32', name='order_in')

In [19]:
#x =merge([m2, order_in], mode = 'concat')
x = concatenate([m2, order_in])
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(200, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(100, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(1, activation='sigmoid')(x)

In [20]:
model = Model([master_in, order_in], x)
model.compile(loss='binary_crossentropy', optimizer=SGD(lr=0.1), metrics=['accuracy'])
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
master_in (InputLayer)           (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 1, 40)         117920      master_in[0][0]                  
____________________________________________________________________________________________________
flatten_3 (Flatten)              (None, 40)            0           embedding_3[0][0]                
____________________________________________________________________________________________________
order_in (InputLayer)            (None, 12)            0                                            
___________________________________________________________________________________________

In [21]:
#model.optimizer.lr = 0.01

In [26]:
model.fit([trn_X_new, trn_X2], trn_y, batch_size = 1024, epochs=1000,
          validation_data=([val_X_new, val_X2], val_y),
          callbacks = [EarlyStopping(monitor='val_acc', patience=30, verbose=1, min_delta=1e-4, mode='max'),
                           #ReduceLROnPlateau(monitor='val_acc', factor=0.3, patience=5, verbose=1, epsilon=1e-4, mode='max')
                            CSVLogger('data/learning_log.csv', separator=',', append=False),
                            ReduceLROnPlateau(monitor='val_acc', factor=0.3, patience=8, min_lr=0.00001, verbose = 1)
                           ])

Train on 63183 samples, validate on 29999 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 00021: reducing learning rate to 0.0008100000210106373.
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 00029: reducing learning rate to 0.00024299999931827186.
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 00037: reducing learning rate to 7.290000066859647e-05.
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 00043: early stopping


<keras.callbacks.History at 0x113291b70>

In [27]:
y_pred = model.predict([tst_X_new, tst_X2])

In [28]:
from sklearn.metrics import roc_auc_score    

In [29]:
roc_auc_score(tst_y, y_pred)

0.76589642311042894

### Bins

In [39]:
y_pred2 = y_pred.reshape((20000,))

In [40]:
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
group_names = ['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100']

categories = pd.cut(y_pred2, bins, labels=group_names)

df = pd.DataFrame(columns=['y_true', 'y_pred', 'bin'])
df['y_true'] = tst_y
df['y_pred'] = y_pred2
df['bin'] = categories

df.to_csv('bins.csv')

### Sequential Model

In [None]:
n_masters = X.nunique()
n_factors = 100

In [None]:
from keras.layers import Input, Embedding
from keras.layers.core import Dense, Dropout, Flatten
from keras.optimizers import SGD
from keras.models import Sequential, Model

In [None]:
emb_model = Sequential([
    Embedding(n_masters, n_factors, input_length=1),
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])

In [None]:
emb_model.compile(loss='binary_crossentropy', optimizer=SGD(lr=0.001), metrics=['accuracy'])
emb_model.summary()

In [None]:
emb_model.fit(X_new['master_idx'], y, batch_size=256, epochs=100)