In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm, rankdata

from tensorflow import one_hot
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import text, sequence

#from sklearn.metrics import roc_auc_score

In [None]:
# Vocabulary cap handed to the tokenizer (only the most frequent words are kept).
MAX_NB_WORDS = 35000
# Token length every comment is padded / truncated to.
MAX_SEQUENCE_LENGTH = 200
# Number of times to duplicate identity_hate rows.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
repeats = 1
# Dimensionality of each word-embedding vector.
EMBED_SIZE = 300

In [None]:
# --- Load -------------------------------------------------------------------
# Read the spell-checked comments; NaN cells are replaced by empty strings.
train = pd.read_csv('spellcheckdata/spellcheck.csv')
train = train.replace(np.nan, '', regex=True)

# --- Vocabulary -------------------------------------------------------------
# The tokenizer is fitted on every comment, i.e. before the train/val split.
X_train_text = train["comment_text"].values
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(list(X_train_text))
word_index = tokenizer.word_index

# (word, count) pairs as reported by the tokenizer, plus the bare counts.
count_list = list(tokenizer.word_counts.items())
counts = [occurrences for _, occurrences in count_list]

# --- Split ------------------------------------------------------------------
# Shuffle once, then take the last tenth as validation and the first nine
# tenths as training data.
train = train.sample(frac=1, random_state=13)
val = train.tail(int(len(train)*1/10))
train = train.head(int(len(train)*9/10))

# --- Oversample rare labels -------------------------------------------------
# Append 500 extra identity_hate rows and 300 extra threat rows, both drawn
# from the training split, then reshuffle.
train_idhate = train[train['identity_hate'] == 1].sample(n=500, random_state=13)
train_threat = train[train['threat'] == 1].sample(n=300, random_state=13)
train = pd.concat([train, train_idhate, train_threat])
train = train.sample(frac=1, random_state=25)

# --- Encode -----------------------------------------------------------------
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

y_train = train[list_classes].to_numpy()
X_train = sequence.pad_sequences(
    tokenizer.texts_to_sequences(train["comment_text"].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)

y_val = val[list_classes].to_numpy()
X_val = sequence.pad_sequences(
    tokenizer.texts_to_sequences(val["comment_text"].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [None]:
# Parse the pre-trained GloVe file into {word: float32 vector}.
embeddings_index = {}
# `with` closes the file even if a line fails to parse.
with open('glove.6B.300d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# The matrix is indexed directly by the integer ids stored in word_index,
# hence the extra row (len(word_index) + 1).
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [None]:
def macro_roc_auc(y_true, y_score):
    """Return the ROC AUC averaged (unweighted) over label columns.

    Each column's AUC is computed from the rank-sum (Mann-Whitney U)
    identity, with tied scores receiving their average rank.

    Parameters
    ----------
    y_true : array-like, shape (n_samples,) or (n_samples, n_labels)
        Binary indicators; entries equal to 1 are treated as positives.
    y_score : array-like, same shape as ``y_true``
        Predicted scores (higher means more likely positive).

    Returns
    -------
    float
        Mean of the per-column AUC values.

    Raises
    ------
    ValueError
        If any column contains only positives or only negatives, because the
        AUC is undefined there.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score, dtype=float)
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
        y_score = y_score.reshape(-1, 1)

    per_label = []
    for col in range(y_true.shape[1]):
        positives = y_true[:, col] == 1
        n_pos = int(positives.sum())
        n_neg = positives.size - n_pos
        if n_pos == 0 or n_neg == 0:
            raise ValueError(
                "ROC AUC is undefined for label column %d: only one class present" % col
            )
        ranks = rankdata(y_score[:, col])  # average ranks, so ties count as 1/2
        u_stat = ranks[positives].sum() - n_pos * (n_pos + 1) / 2
        per_label.append(u_stat / (n_pos * n_neg))
    return float(np.mean(per_label))


def black_box_function(rnn_size, dropout):
    """Train one BiLSTM classifier and return its validation ROC AUC.

    The model is Embedding (initialised from ``embedding_matrix``) ->
    SpatialDropout1D -> Bidirectional LSTM -> GlobalMaxPooling1D ->
    Dense(6, sigmoid), fitted on the notebook-level ``X_train`` / ``y_train``
    and validated on ``X_val`` / ``y_val``.

    Parameters
    ----------
    rnn_size : number
        LSTM units per direction; cast with ``int()`` before use.
    dropout : float
        Rate passed to ``SpatialDropout1D``.

    Returns
    -------
    float
        Macro-averaged ROC AUC of the fitted model on the validation set
        (also printed).
    """
    embedding_layer = layers.Embedding(len(word_index) + 1,
                                       EMBED_SIZE,
                                       weights=[embedding_matrix],
                                       input_length=MAX_SEQUENCE_LENGTH)

    model = keras.Sequential()
    model.add(embedding_layer)
    model.add(layers.SpatialDropout1D(dropout))
    model.add(layers.Bidirectional(layers.LSTM(int(rnn_size), return_sequences=True)))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(6, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy','AUC'])

    # patience=0 with min_delta=0.0030: stop as soon as an epoch fails to lower
    # val_loss by at least 0.003.
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, min_delta=0.0030,restore_best_weights=False)

    # NOTE(review): `epochs` and `batch_size` are read from notebook globals
    # that no visible cell defines - they must be set before this is called.
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
              validation_data=(X_val, y_val), callbacks=[early_stopping])

    # `roc_auc_score` is not imported in this notebook (its import line is
    # commented out), so the score is computed with the local helper instead.
    out = macro_roc_auc(y_val, model.predict(X_val))
    print(out)

    return out
    
def kernel(X1, X2, l=100, sigma_f=2.0):
    """Squared-exponential kernel matrix between the rows of X1 and X2.

    Parameters
    ----------
    X1 : ndarray, shape (m, d)
    X2 : ndarray, shape (n, d)
    l : number
        Length scale.
    sigma_f : float
        Signal standard deviation; the kernel is scaled by ``sigma_f**2``.

    Returns
    -------
    ndarray, shape (m, n)
        ``sigma_f**2 * exp(-0.5 * ||x1 - x2||**2 / l**2)`` for every row pair.
    """
    # ||a - b||^2 expanded as ||a||^2 + ||b||^2 - 2 a.b
    norms_1 = np.sum(X1**2, 1).reshape(-1, 1)
    norms_2 = np.sum(X2**2, 1)
    cross = np.dot(X1, X2.T)
    sqdist = norms_1 + norms_2 - 2 * cross

    exponent_scale = -0.5 / l**2
    return sigma_f**2 * np.exp(exponent_scale * sqdist)


def posterior(X_s, X_train, Y_train, l=100, sigma_f=2.0, sigma_y=.01):
    """Gaussian-process posterior mean and covariance at the points X_s.

    Parameters
    ----------
    X_s : ndarray, shape (m, d)
        Points at which to evaluate the posterior.
    X_train : ndarray, shape (n, d)
        Observed inputs.
    Y_train : ndarray, shape (n,) or (n, 1)
        Observed targets.
    l, sigma_f : number
        Kernel hyper-parameters forwarded to ``kernel``.
    sigma_y : float
        Observation-noise standard deviation; ``sigma_y**2`` is added to the
        diagonal of the training kernel matrix.

    Returns
    -------
    list
        ``[mu_s, cov_s]``: posterior mean (length m) and the full (m, m)
        posterior covariance.
    """
    K = kernel(X_train, X_train, l, sigma_f) + sigma_y**2 * np.eye(X_train.shape[0])
    K_s = kernel(X_train, X_s, l, sigma_f)
    # Small jitter on the diagonal of the test-point kernel matrix.
    K_ss = kernel(X_s, X_s, l, sigma_f) + 1e-8 * np.eye(X_s.shape[0])

    # Solve K @ V = K_s once rather than forming inv(K): this avoids the
    # explicit inverse and reuses the result for both mean and covariance.
    # K is symmetric, so K_s.T @ inv(K) == V.T.
    V = np.linalg.solve(K, K_s)
    mu_s = np.matmul(V.T, Y_train)
    cov_s = K_ss - np.matmul(K_s.T, V)
    return [mu_s, cov_s]

def expected_improvement(x_proposed, X_train, Y_train, l=100, sigma_f=2.0, sigma_y=.01):
    """Expected-improvement acquisition value at each proposed point.

    Parameters
    ----------
    x_proposed : ndarray, shape (m, d)
        Candidate points.
    X_train, Y_train : ndarray
        Observations so far; forwarded to ``posterior``.
    l, sigma_f, sigma_y : number
        GP hyper-parameters forwarded to ``posterior``.

    Returns
    -------
    ndarray, shape (m, 1)
        ``delta * Phi(z) + std * phi(z)`` with ``delta = mu - max(Y_train)``
        and ``z = delta / std``. Where the posterior std is zero the value is
        ``max(delta, 0)``, the limit of the same expression.
    """
    mu, cov = posterior(x_proposed, X_train, Y_train, l, sigma_f, sigma_y)
    mu = mu.reshape(-1, 1)

    # Round-off can push a variance slightly below zero; clipping keeps sqrt
    # from returning NaN (np.argmax would otherwise select the NaN entry).
    var = np.maximum(np.diag(cov), 0.0)
    std = np.sqrt(var).reshape(-1, 1)

    y_current = np.max(Y_train)
    delta = mu - y_current

    has_spread = std > 0
    z = np.divide(delta, std, out=np.zeros_like(delta, dtype=float), where=has_spread)
    ei = delta * norm.cdf(z) + std * norm.pdf(z)
    return np.where(has_spread, ei, np.maximum(delta, 0.0))


In [None]:
# Candidate grid: the BO posterior mean/cov is only evaluated on these points.
# First coordinate runs 0, 2, ..., 150; second coordinate runs 30, 31, ..., 149.
Xs = np.array([[i, j] for i in range(0, 151, 2) for j in range(30, 150)])

In [None]:
# Dropout rates are represented as 200 * dropout rate so the search space is
# integer valued. In addition, .982 is subtracted from every score to
# pseudo-center the targets.
DROPOUT_SCALE = 200
SCORE_OFFSET = .982

# Initial design points, each [200 * dropout, rnn_size]; the GP needs some
# observations before expected improvement can be evaluated.
X_t = [[20, 90], [74, 90], [130, 90], [74, 130], [74, 50]]
Y_t = np.array([])
for t in X_t:
    rnn_size = t[1]
    dropout = t[0]
    out = black_box_function(rnn_size, dropout / DROPOUT_SCALE)
    Y_t = np.append(Y_t, out - SCORE_OFFSET)
X_t = np.array(X_t)

for i in range(10):
    # Choose the grid point with the highest expected improvement.
    new = Xs[np.argmax(expected_improvement(Xs, X_t, Y_t))]
    rnn_size = new[1]
    dropout = new[0]
    X_t = np.vstack((X_t, [dropout, rnn_size]))  # record the new x point
    # Train a model with these parameters to obtain the objective value.
    out = black_box_function(rnn_size, dropout / DROPOUT_SCALE)
    Y_t = np.append(Y_t, out - SCORE_OFFSET)  # record the new y point

# Grid point with the highest posterior mean given all observations collected
# above (the original referenced X_t50 / Y_t50, which are never defined).
Xs[np.argmax(posterior(Xs, X_t, Y_t)[0])]