In [6]:
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from keras.models import *
from keras.layers import *
from keras.layers.core import Dense
from keras.utils import plot_model


In [7]:
data = pd.read_csv("non-trans_10000_5.csv") 
df = pd.DataFrame(data)

In [8]:
(trainX, testX) = train_test_split(df, test_size=0.2, random_state=42)
trainY = trainX['picked_node']
testY = testX['picked_node']


# Iteration 1: one hot encoded

In [9]:
def process_structured_data(df, train, test):

    continuous = ['left_node', 'right_node']
    cs = MinMaxScaler()
    trainX = cs.fit_transform(train[continuous])
    testX = cs.transform(test[continuous])
    
    colourBin = LabelBinarizer().fit(df['left_colour'])
    trainLcolourX = colourBin.transform(train['left_colour'])
    testLcolourX = colourBin.transform(test['left_colour'])
    trainRcolourX = colourBin.transform(train['right_colour'])
    testRcolourX = colourBin.transform(test['right_colour'])
    trainX = np.hstack([trainX, trainLcolourX, trainRcolourX])
    testX = np.hstack([testX, testLcolourX, testRcolourX])
    
    return (trainX, testX)

In [10]:
(proc_trainX, proc_testX) = process_structured_data(df, trainX, testX)

In [11]:
def create_mlp(dim, regularizer=None):
    """Creates a simple two-layer MLP with inputs of the given dimension"""
    model = Sequential()
    model.add(Dense(8, input_dim=dim, activation="relu", kernel_regularizer=regularizer))
    model.add(Dense(1, activation="sigmoid", kernel_regularizer=regularizer))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [12]:
one_model = create_mlp(proc_trainX.shape[1])

In [13]:
one_model.fit(proc_trainX, trainY, batch_size=16, epochs=10, validation_data=(proc_testX, testY))

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x2a24b5addc8>

# Iteration 2: ordinal encoded

In [14]:
def process_integer_data(df, train, test):
    continuous = ['left_node', 'right_node']
    cs = MinMaxScaler()
    trainX = cs.fit_transform(train[continuous])
    testX = cs.transform(test[continuous])
    
    colourencoder = OrdinalEncoder().fit(np.array(df['left_colour']).reshape(-1,1))
    trainLcolourX = colourencoder.transform(np.array(train['left_colour']).reshape(-1,1))
    testLcolourX = colourencoder.transform(np.array(test['left_colour']).reshape(-1,1))
    trainRcolourX = colourencoder.transform(np.array(train['right_colour']).reshape(-1,1))
    testRcolourX = colourencoder.transform(np.array(test['right_colour']).reshape(-1,1))
    trainX = np.hstack([trainX, trainLcolourX, trainRcolourX])
    testX = np.hstack([testX, testLcolourX, testRcolourX])
    
    return (trainX, testX)

In [15]:
(proc_trainX, proc_testX) = process_integer_data(df, trainX, testX)

In [16]:
int_model = create_mlp(proc_trainX.shape[1])

In [17]:
int_model.fit(proc_trainX, trainY, batch_size=10, epochs=10, validation_data=(proc_testX, testY))

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x2a24db879c8>

# Iteration 3: Embedded

In [18]:
(proc_trainX, proc_testX) = process_integer_data(df, trainX, testX)

In [20]:
def emb(n):

    embedding_size = int(np.ceil(n/2))
    vocab  = n + 1

    model1 = Sequential()
    model1.add(Embedding(vocab ,embedding_size, input_length=1))
    model1.add(Reshape(target_shape=(embedding_size,)))

    model2 = Sequential()
    model2.add(Embedding(vocab ,embedding_size, input_length=1))
    model2.add(Reshape(target_shape=(embedding_size,)))

    model_rest = Sequential()
    model_rest.add(Dense(2, input_shape=(2,)))

    combined = concatenate([model_rest.output, model1.output, model2.output])

    x = Dense(8, activation="relu", kernel_initializer='he_normal')(combined)
    x = Dense(1, activation="sigmoid")(x)

    full_model = Model(inputs=[model_rest.input, model1.input, model2.input], outputs = x)
    full_model.compile(loss="binary_crossentropy", metrics=['acc'], optimizer='adam')
    
    return full_model

emb_model = emb(len(np.unique(proc_trainX[:, 2])))

continous = proc_trainX[:, :2]
cat1 = proc_trainX[:, 2]
cat2 = proc_trainX[:, 3]

tcontinous = proc_testX[:, :2]
tcat1 = proc_testX[:, 2]
tcat2 = proc_testX[:, 3]

emb_model.fit([continous, cat1, cat2], trainY, epochs=20, batch_size=16, validation_data=([tcontinous, tcat1, tcat2], testY))

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x2a24de28d08>

# Evaluation

In [27]:
def evaluate(integer=True, embed=False, ep=10, function='linear'):
    for i in range(2, 9):
        
        data = pd.read_csv(str(function) + '_1000_' + str(i) + '.csv') 
        df = pd.DataFrame(data)
        
        (trainX, valX) = train_test_split(df, test_size=0.2, random_state=42)
        trainY = trainX['picked_node']
        valY = valX['picked_node']
        
        if integer:
            (proc_trainX, proc_valX) = process_integer_data(df, trainX, valX)
        else:
            (proc_trainX, proc_valX) = process_structured_data(df, trainX, valX)
        
        if embed:
            model = emb(len(np.unique(proc_trainX[:, 2])))
            continous = proc_trainX[:, :2]
            cat1 = proc_trainX[:, 2]
            cat2 = proc_trainX[:, 3]

            vcontinous = proc_valX[:, :2]
            vcat1 = proc_valX[:, 2]
            vcat2 = proc_valX[:, 3]

            model.fit([continous, cat1, cat2], epochs=ep, batch_size=16, validation_data=([vcontinous, vcat1, vcat2], valY))
        else:
            model = create_mlp(proc_trainX.shape[1])
            model.fit(proc_trainX, trainY, batch_size=10, epochs=ep, validation_data=(proc_valX, valY))
            
        data = pd.read_csv(str(function) + '_200_' + str(i) + '.csv') 
        df= pd.DataFrame(data)
        
        (trainX, testX) = train_test_split(df, test_size=0.01, random_state=42)
        trainY = trainX['picked_node']
        testY = testX['picked_node']
        
        if integer:
            (proc_testX, proc_test2X) = process_integer_data(df, trainX, valX)
        else:
            (proc_testX, proc_test2X) = process_structured_data(df, trainX, valX)
        
        
        _, accuracy = model.evaluate(proc_testX, trainY, batch_size=128)
        
        print('#####################################################\n\n')
        print(str(i) + " colours accuracy = " + str(accuracy))
        print('\n\n#####################################################')

evaluate(integer=False, ep=10,function='non-trans')
        
        
        

Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
#####################################################


2 colours accuracy = 0.8636363744735718


#####################################################
Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
#####################################################


3 colours accuracy = 0.8232323527336121


#####################################################
Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
#####################################################


4 colours accuracy = 0.7121211886405945


#####################################################
Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch

#####################################################


6 colours accuracy = 0.7676767706871033


#####################################################
Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
#####################################################


7 colours accuracy = 0.5959596037864685


#####################################################
Train on 800 samples, validate on 200 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
#####################################################


8 colours accuracy = 0.7171717286109924


#####################################################
