## Lexdecomp_model (semantic matching + CNN)

In [None]:
import numpy as np
import keras.backend as K
from keras.models import Model
from keras.layers import Input, Embedding, Dot, Lambda, Conv2D
from keras.layers import MaxPooling2D, Flatten, Concatenate, Dense
from keras.layers import Activation, BatchNormalization, Dropout

##################################### start of algorithm ###############################################

def semantic_match(X, Y, A, window):
    """Local-window ("local-w") semantic match in direction X -> Y.

    For every word of X, the best-matching word of Y (the pivot) is the
    argmax of the corresponding row of A. The vectors of Y located within
    ``window`` positions of that pivot are then summed, each weighted by its
    score in A.

    Parameters
    ----------
    X : array of shape (s, n, d). Not read by this function; kept so both
        sentences appear in the signature.
    Y : array of shape (s, m, d).
    A : array of shape (s, n, m), word-to-word matching scores.
    window : int, half-width w of the window around the pivot.

    Returns
    -------
    Array of shape (s, n, d): the matched representation of X.
    """
    n_candidates = A.shape[-1]

    # Pivot and window limits keep a trailing axis of length 1 so that they
    # broadcast against the candidate positions 0..m-1 below.
    best = np.expand_dims(np.argmax(A, axis=-1), axis=-1)
    first = np.maximum(0, best - window)
    last = np.minimum(n_candidates, best + window)

    # Boolean window of shape (s, n, m), obtained by broadcasting.
    # NOTE: To replicate "mcrisc" implementation in github use: positions < last
    positions = np.arange(n_candidates)
    in_window = (positions >= first) & (positions <= last)
    weights = A * in_window.astype(np.float32)

    # Score-weighted sum of the Y vectors inside each window
    # (line 7 of the algorithm).
    return np.matmul(weights, Y)

def decompose(X, X_hat, method="linear"):
    """Split X into a similar (pos) and a dissimilar (neg) component.

    Each word vector of X is decomposed with respect to its semantic match
    X_hat, and the two components are stacked as channels on a new last axis.

    Parameters
    ----------
    X, X_hat : arrays of shape (s, n, d).
    method : "linear" or "orthogonal".
        * "linear": alpha is the cosine similarity between X and X_hat;
          pos = alpha * X and neg = (1 - alpha) * X.
        * "orthogonal": pos is the projection of X onto X_hat and
          neg = X - pos.

    Returns
    -------
    Array of shape (s, n, d, 2); channel 0 is pos, channel 1 is neg.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ("linear", "orthogonal"):
        raise ValueError(
            "method must be 'linear' or 'orthogonal', got {!r}".format(method))

    X = np.asarray(X)
    X_hat = np.asarray(X_hat)

    # shape dot, denom, ratio: (s,n,1)
    dot = np.sum(X * X_hat, axis=-1, keepdims=True)
    if method == "linear":
        denom = (np.linalg.norm(X, axis=-1, keepdims=True) *
                 np.linalg.norm(X_hat, axis=-1, keepdims=True))
    else:
        denom = np.sum(X_hat * X_hat, axis=-1, keepdims=True)

    # With ``where=`` and no ``out=``, numpy leaves the skipped entries
    # uninitialised. Padding words (all-zero vectors) have denom == 0, so an
    # explicit zero-filled output is required to get a well-defined result.
    # float16 in result_type only forces a floating dtype for integer input;
    # it never widens float32/float64.
    ratio = np.divide(
        dot, denom,
        out=np.zeros(dot.shape,
                     dtype=np.result_type(dot.dtype, denom.dtype, np.float16)),
        where=denom != 0)

    # shape X_pos, X_neg: (s,n,d)
    if method == "linear":
        X_pos = ratio * X
        X_neg = (1 - ratio) * X
    else:  # orthogonal: the method selected by the training script (line 8)
        X_pos = ratio * X_hat
        X_neg = X - X_pos

    # shape X_decomp: (s,n,d,2)
    return np.stack([X_pos, X_neg], axis=-1)


def decompose_data(X, Y, window=3, method="linear"):
    """Decompose X and Y into positive/negative channels w.r.t. each other.

    Steps: cosine similarity between every word of X and every word of Y,
    local-window semantic matching in both directions, then decomposition of
    each sentence against its matched version.

    Parameters
    ----------
    X : array of shape (s, n, d).
    Y : array of shape (s, m, d).
    window : int, half-width of the matching window (see semantic_match).
    method : "linear" or "orthogonal" (see decompose).

    Returns
    -------
    (X_decomp, Y_decomp) with shapes (s, n, d, 2) and (s, m, d, 2).
    """
    # Unit-normalised word vectors. The zero-filled ``out`` matters: with
    # ``where=`` alone numpy leaves zero-norm rows (padding words)
    # uninitialised, which would leak arbitrary values into A and from there
    # into the argmax pivots.
    norm_X = np.linalg.norm(X, axis=-1, keepdims=True)
    norm_Y = np.linalg.norm(Y, axis=-1, keepdims=True)
    X_unit = np.divide(X, norm_X,
                       out=np.zeros_like(X, dtype=norm_X.dtype),
                       where=norm_X != 0)
    Y_unit = np.divide(Y, norm_Y,
                       out=np.zeros_like(Y, dtype=norm_Y.dtype),
                       where=norm_Y != 0)

    # Cosine similarity, computed once
    # shape A: (s,n,m)
    A = np.matmul(X_unit, np.swapaxes(Y_unit, -1, -2))

    # Semantic matching
    # shape X_hat: (s,n,d), Y_hat: (s,m,d)
    X_hat = semantic_match(X, Y, A, window=window)
    Y_hat = semantic_match(Y, X, np.swapaxes(A, -1, -2), window=window)

    # Decomposition (pos, neg)
    X_decomp = decompose(X, X_hat, method=method)
    Y_decomp = decompose(Y, Y_hat, method=method)

    return X_decomp, Y_decomp  # lines 8 and 12 of the algorithm


##################################### end of algorithm ###############################################


def transform_data(X, embedding_matrix):
    """Replace every word index in X by its embedding vector.

    Parameters
    ----------
    X : integer array of any shape, holding row indices into
        ``embedding_matrix``.
    embedding_matrix : array of shape (vocabulary_size, embeddings_dim).

    Returns
    -------
    float64 array of shape ``X.shape + (embeddings_dim,)``.
    """
    # A single fancy-indexing lookup replaces the per-element Python loop.
    # Indexing already returns a fresh array, so copy=False only avoids a
    # second copy when the matrix is float64 to begin with.
    looked_up = np.asarray(embedding_matrix)[np.asarray(X)]
    return looked_up.astype(np.float64, copy=False)


def CNN_encoder(input_shape, embeddings_dim, max_seq_length, filters):
    """Build a Keras model encoding one sentence with parallel conv branches.

    ``filters`` is a sequence of ``(filter_size, number_of_filters)`` pairs.
    Each pair produces one branch: a Conv2D whose kernel spans
    ``filter_size`` rows and ``embeddings_dim`` columns, a tanh activation,
    a max-pooling over ``max_seq_length - filter_size + 1`` rows and a
    Flatten. The branch outputs are concatenated when there is more than one.

    Assumes ``input_shape`` is (max_seq_length, embeddings_dim, channels),
    which is how the training script calls it; in that case every branch
    collapses to a vector of ``number_of_filters`` values per sample.
    """
    X_input = Input(input_shape)

    def conv_branch(branch_id, filter_size, number_of_filters):
        # Convolution across the full embedding width, "valid" padding
        features = Conv2D(filters=number_of_filters,
                          kernel_size=(filter_size, embeddings_dim),
                          strides=1,
                          padding="valid",
                          data_format="channels_last",
                          name="conv"+str(branch_id))(X_input)
        features = Activation("tanh")(features)

        # Max-over-time pooling: the pool covers every remaining row
        pooled_rows = max_seq_length - filter_size + 1
        features = MaxPooling2D(pool_size=(pooled_rows, 1),
                                name="maxpool"+str(branch_id))(features)

        # Only one conv layer per branch, so the result is flattened directly
        return Flatten()(features)

    # One branch per filter size, all reading the same input
    branches = [conv_branch(i, filter_size, number_of_filters)
                for i, (filter_size, number_of_filters) in enumerate(filters)]

    # Concatenating the outputs of different filter sizes
    encoded = Concatenate()(branches) if len(filters) > 1 else branches[0]

    return Model(inputs=X_input, outputs=encoded)


def lexdecomp_model(input_shape, embeddings_dim, max_seq_length, filters, dropout=0.5, model_type="other"):
    """Build the two-sentence classifier on top of CNN sentence encoders.

    Both inputs have shape ``input_shape``. Each is encoded by a CNN_encoder,
    the two encodings are concatenated, passed through Dropout(``dropout``)
    and a single sigmoid unit.

    With ``model_type == "siamese"`` one encoder (shared weights) processes
    both inputs; with any other value each input gets its own encoder.
    """
    S_input = Input(input_shape)
    T_input = Input(input_shape)

    def new_encoder():
        return CNN_encoder(input_shape, embeddings_dim, max_seq_length, filters)

    # The first sentence always gets a fresh encoder; the second either
    # reuses it (Siamese weight sharing) or gets an independent one.
    source_encoder = new_encoder()
    S_encoded = source_encoder(S_input)
    target_encoder = source_encoder if model_type == "siamese" else new_encoder()
    T_encoded = target_encoder(T_input)

    merged = Concatenate()([S_encoded, T_encoded])
    regularised = Dropout(dropout)(merged)
    score = Dense(1, activation="sigmoid")(regularised)

    return Model(inputs=[S_input, T_input], outputs=score, name="lexdecomp_model")


## Wang_lexcomp_approach (Training + test)

In [None]:
import datetime
import os
import pickle

import numpy as np
from keras import backend as K
from keras import optimizers
from keras.callbacks import EarlyStopping, Callback
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score, f1_score, log_loss, confusion_matrix

import lexdecomp_model
import utils

###################
# DATA PARAMETERS #
###################
# These values select the input files: they are substituted into the pickle
# file names built by utils (parsed corpus) and below (embeddings).
version = "20180427"
# Pre-processing variant: myprep, kimprep
pp_name = "kimprep"
lower_opt = "nolower"
emb_opt = "embcap"
# Embedding set: word2vec, glove, paragram
emb_name = "word2vec"
# Duplicate training by switching pairs
reverse_train=False
# Number of randomly generated negative samples added to the training set
autoneg = 0
#######################
# END DATA PARAMETERS #
#######################

# Generating dataset from parsed MSRPC.
# max_seq_length=-1 lets utils derive the length from the longest parsed
# text; padding=True yields fixed-width index matrices.
(index_to_word, word_to_index,
 X_train1, X_train2, Y_train,
 X_test1, X_test2, Y_test) = utils.generate_dataset(pp_name, lower_opt, version,
                                      max_seq_length=-1,
                                      reverse_train_pairs=reverse_train,
                                      padding=True,
                                      autoneg=autoneg)
# Sequence length actually used, read back from the padded training matrix
# (a fixed value of 39 was used at some point).
max_seq_length = X_train1.shape[1]
print("Max seq length:", max_seq_length)
print("X_train:", X_train1.shape)
print("Y_train:", Y_train.shape)
print("X_test:", X_test1.shape)
print("Y_test:", Y_test.shape)

# Loading embeddings matrix
emb_fn = "msrpc_{}_{}_{}_{}_{}.pickle".format(pp_name, lower_opt, emb_name, emb_opt, version)
# The context manager closes the file handle once the pickle is read.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("./data/"+emb_fn, 'rb') as emb_file:
    [embedding_matrix, unknown_words] = pickle.load(emb_file)
# Width of one embedding vector (second axis of the matrix)
embeddings_dim = embedding_matrix.shape[1]
print("Embeddings dim:", embeddings_dim)


####################
# MODEL PARAMETERS #
####################
epochs = 30
batch_size = 64
# Half-width of the semantic matching window
window = 3
# Decomposition method: linear or orthogonal
method = "orthogonal"
# One (filter_size, number_of_filters) pair per convolution branch
filters = [(1,500), (2,500), (3,500)]   # NOTE (original author's reminder): change to 200
use_class_weight = False
############################
### END MODEL PARAMETERS ###
############################

# Transforming train and test data from sequences of indices to
# sequences of embeddings
# shape input: (samples, max_seq_length)
#      output: (samples, max_seq_length, embeddings_dim)
X_train1 = lexdecomp_model.transform_data(X_train1, embedding_matrix)
X_train2 = lexdecomp_model.transform_data(X_train2, embedding_matrix)
X_test1 = lexdecomp_model.transform_data(X_test1, embedding_matrix)
X_test2 = lexdecomp_model.transform_data(X_test2, embedding_matrix)

# Decomposing train and test data: each sentence of a pair is split into
# positive/negative channels with respect to the other sentence of the pair
# shape output: (samples, max_seq_length, embeddings_dim, 2)
print("Decomposing training data")
X_train1, X_train2 = lexdecomp_model.decompose_data(X_train1, X_train2, window, method)
print("Decomposing test data")
X_test1, X_test2 = lexdecomp_model.decompose_data(X_test1, X_test2, window, method)
print("Decomposed data")
print("X_train:", X_train1.shape)
print("Y_train:", Y_train.shape)
print("X_test:", X_test1.shape)
print("Y_test:", Y_test.shape)

# Selecting the model. model_type is left at its default, so each sentence
# gets its own (non-shared) encoder.
model = lexdecomp_model.lexdecomp_model((max_seq_length, embeddings_dim, 2),
                                        embeddings_dim, max_seq_length, filters)
# Printing summaries
model.summary(line_length=100)

# Compiling model: binary classification with a single sigmoid output
model.compile(optimizer="Adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

# Defining class weights for unbalanced datasets: the majority class keeps
# weight 1.0 and the minority class is weighted by the majority/minority ratio
if not use_class_weight:
    class_weight = None
else:
    n_pos = Y_train[Y_train == 1].size
    n_neg = Y_train[Y_train == 0].size
    if n_pos > n_neg:
        class_weight = {1:1.0, 0: n_pos / n_neg}
    else:
        class_weight = {1:n_neg / n_pos, 0: 1.0}
    print("class_weight", class_weight)

# Callback to store prediction scores for each epoch
class prediction_history(Callback):
    """Keras callback that scores the model on the test split after each epoch.

    After every epoch it predicts on the module-level ``X_test1``/``X_test2``
    arrays, thresholds the scores at 0.5, prints accuracy, F1 and the
    confusion matrix against ``Y_test`` and appends them to ``acchis``,
    ``f1his`` and ``cmhis`` respectively.
    """

    def __init__(self):
        # Let the Keras base class initialise its own state first.
        super().__init__()
        self.acchis = []
        self.f1his = []
        self.cmhis = []

    def on_epoch_end(self, epoch, logs=None):
        # ``logs`` is unused; None replaces the shared mutable default {}.
        pred = self.model.predict([X_test1, X_test2])
        predclass = np.where(pred > 0.5, 1, 0).reshape(-1)
        acc = accuracy_score(Y_test, predclass)
        print(acc)
        self.acchis.append(acc)
        f1 = f1_score(Y_test, predclass)
        print(f1)
        self.f1his.append(f1)
        cm = confusion_matrix(Y_test, predclass)
        print(cm)
        self.cmhis.append(cm)

# Callback instance collecting per-epoch test metrics
per_epoch_preds = prediction_history()

# Training model
print("Training model ...")
# Only the per-epoch prediction callback is active
my_calls = [per_epoch_preds]

# NOTE(review): the test split is passed as validation data, so the
# validation metrics reported during training are test-set metrics.
history = model.fit(x=[X_train1, X_train2],
                    y=Y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    # (alternative kept by the author: validation_split=0.1)
                    validation_data=([X_test1, X_test2], Y_test),
                    class_weight=class_weight,
                    callbacks=my_calls)


# Final evaluation on the test split
print("Evaluation (loss, acc)")
loss, acc = model.evaluate(x=[X_test1, X_test2], y=Y_test)
print("loss: {:.4f}   acc: {:.4f}".format(loss, acc))
# Scratch copy of the raw training history, written before the extra
# metrics below are added
with open("tmp.p", "wb") as fid:
    pickle.dump(model.history.history, fid)
# Hard predictions: sigmoid scores thresholded at 0.5
pred = np.where(model.predict(x=[X_test1, X_test2])>0.5, 1, 0).reshape(-1)
f1 = f1_score(Y_test, pred)
print("f1: {:.4f}".format(f1))
print("confusion matrix")
cf_mat = confusion_matrix(Y_test, pred)
print(cf_mat)
# Attach the final test metrics and predictions to the history dict so that
# one pickle holds the whole run
history.history["test_loss"] = loss
history.history["test_acc"] = acc
history.history["f1"] = f1
history.history["cf_mat"] = cf_mat
history.history["pred"] = pred

hdir = "./runs/wang_lexdecomp/"
# Create the output directory if needed; otherwise open() below fails and
# the results of the whole training run are lost.
os.makedirs(hdir, exist_ok=True)
date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
hfname = hdir + "hist_" + date + "_wang_lexdecopm.p"
with open(hfname, "wb") as fid:
    pickle.dump(history.history, fid)


## utils

In [None]:
import pickle
import numpy as np

def convert_to_sequence(texts, word_to_index, padding=False, size_limit=10):
    """Map every tokenised text to an int32 array of vocabulary indices.

    ``texts`` maps a text id to its token list and ``word_to_index`` maps a
    token to its index. With ``padding`` the result is truncated to the
    first ``size_limit`` tokens and right-padded with 0 up to ``size_limit``;
    without it every token is converted and ``size_limit`` is ignored.

    Returns a dict with the same keys as ``texts``.
    """
    def encode(tokens):
        if not padding:
            return np.array([word_to_index[token] for token in tokens],
                            dtype=np.int32)
        # Only tokens before the limit are looked up, then zeros fill the rest
        kept = [word_to_index[token]
                for position, token in enumerate(tokens)
                if position < size_limit]
        filler = [0]*(max(0, size_limit-len(tokens)))
        return np.array(kept + filler, dtype=np.int32)

    return {idx: encode(tokens) for idx, tokens in texts.items()}

def generate_dataset(pp_name, lower_opt, version, max_seq_length=-1,
            reverse_train_pairs=False, padding=True, autoneg=0):
    """Build the train/test arrays, padded or not depending on ``padding``.

    Delegates to generate_dataset_with_padding when ``padding`` is truthy and
    to generate_dataset_without_padding otherwise, forwarding every other
    argument unchanged, and returns whatever the selected builder returns.
    """
    builder = (generate_dataset_with_padding if padding
               else generate_dataset_without_padding)
    return builder(pp_name, lower_opt, version, max_seq_length,
                   reverse_train_pairs, autoneg)


def generate_dataset_with_padding(pp_name, lower_opt, version, max_seq_length=-1,
            reverse_train_pairs=False, autoneg=0):
    """Load the parsed corpus and build fixed-width index matrices.

    Parameters
    ----------
    pp_name, lower_opt, version : substituted into the corpus file name
        ``./data/msrpc_{pp_name}_{lower_opt}_{version}.pickle``.
    max_seq_length : row width; a negative value means "length of the
        longest parsed text".
    reverse_train_pairs : when true every training pair is emitted twice,
        once in each order.
    autoneg : number of randomly generated negative pairs (label 0) appended
        to the training set.

    Returns
    -------
    (index_to_word, word_to_index, X_train1, X_train2, Y_train,
     X_test1, X_test2, Y_test); the X arrays are int32 matrices of shape
    (samples, max_seq_length) and the Y arrays are int32 vectors.
    """
    parsed_fn = "msrpc_{}_{}_{}.pickle".format(pp_name, lower_opt, version)

    # Loading pre-processed corpus; the context manager closes the file.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open("./data/"+parsed_fn, 'rb') as corpus_file:
        [parsed_texts,
         index_to_word,
         word_to_index,
         pairs_train,
         Y_train_list,
         pairs_test,
         Y_test_list] = pickle.load(corpus_file)

    # Computing the max_seq_length if not provided
    if max_seq_length < 0:
        max_seq_length = np.max([len(tokens) for tokens in parsed_texts.values()])

    # Transforming list of tokens to sequence of indices
    sequences = convert_to_sequence(parsed_texts, word_to_index,
                                    padding=True, size_limit=max_seq_length)

    # Training data: one row per pair, or two (both orders) when reversing.
    # The trailing ``autoneg`` rows are filled by the negative sampling below.
    rows_per_pair = 2 if reverse_train_pairs else 1
    n_real = len(pairs_train) * rows_per_pair
    X_train1 = np.zeros((n_real + autoneg, max_seq_length), dtype=np.int32)
    X_train2 = np.zeros((n_real + autoneg, max_seq_length), dtype=np.int32)
    for i, (x1, x2) in enumerate(pairs_train):
        row = i * rows_per_pair
        X_train1[row, :] = sequences[x1]
        X_train2[row, :] = sequences[x2]
        if reverse_train_pairs:
            X_train1[row + 1, :] = sequences[x2]
            X_train2[row + 1, :] = sequences[x1]
    labels = [label for label in Y_train_list for _ in range(rows_per_pair)]
    Y_train = np.array(labels + [0]*autoneg, dtype=np.int32)

    # Adding automatically generated negative samples from sentences in
    # positive samples. Skipped entirely when none are requested, so a corpus
    # without positive pairs no longer fails on the unpacking below.
    if autoneg > 0:
        left, right = zip(*[pair for pair, label in zip(pairs_train, Y_train_list)
                            if label == 1])
        pos_ids = np.array(list(set(left+right)), dtype=np.int32)
        selected_pos_ids = np.random.choice(pos_ids, size=autoneg)
        known_pairs = set(pairs_train) | set(pairs_test)
        all_ids = np.array(list(parsed_texts.keys()), dtype=np.int32)
        for i, pos_id in enumerate(selected_pos_ids, start=n_real):
            while True:
                paired_id = np.random.choice(all_ids)
                if paired_id == pos_id:
                    # A sentence paired with itself must not be labelled as
                    # a negative example; draw again.
                    continue
                # Reject pairs already present in the train or test set
                if ((pos_id, paired_id) in known_pairs or
                        (paired_id, pos_id) in known_pairs):
                    print("Ignoring randomly generated sample that already exists")
                    continue
                X_train1[i, :] = sequences[pos_id]
                X_train2[i, :] = sequences[paired_id]
                break

    # Test Data
    X_test1 = np.zeros((len(pairs_test), max_seq_length), dtype=np.int32)
    X_test2 = np.zeros((len(pairs_test), max_seq_length), dtype=np.int32)
    for i, (x1, x2) in enumerate(pairs_test):
        X_test1[i, :] = sequences[x1]
        X_test2[i, :] = sequences[x2]
    Y_test = np.array(Y_test_list, dtype=np.int32)

    return index_to_word, word_to_index, X_train1, X_train2, Y_train, X_test1, X_test2, Y_test


def _stack_sequences(seqs):
    """Pack index sequences into one array, tolerating unequal lengths."""
    if len({len(seq) for seq in seqs}) <= 1:
        # Empty or uniform-length input: the plain conversion is well defined.
        return np.array(seqs)
    # Ragged input: recent numpy versions refuse to build this implicitly,
    # so the 1-D object array is filled element by element.
    ragged = np.empty(len(seqs), dtype=object)
    for i, seq in enumerate(seqs):
        ragged[i] = seq
    return ragged


def generate_dataset_without_padding(pp_name, lower_opt, version, max_seq_length=-1,
            reverse_train_pairs=False, autoneg=0):
    """Load the parsed corpus and build variable-length index sequences.

    Parameters
    ----------
    pp_name, lower_opt, version : substituted into the corpus file name
        ``./data/msrpc_{pp_name}_{lower_opt}_{version}.pickle``.
    max_seq_length : forwarded to convert_to_sequence as ``size_limit``.
    reverse_train_pairs : when true every training pair is emitted twice,
        once in each order.
    autoneg : number of randomly generated negative pairs (label 0) appended
        to the training set.

    Returns
    -------
    (index_to_word, word_to_index, X_train1, X_train2, Y_train,
     X_test1, X_test2, Y_test). Each X is an array with one int32 sequence
    per sample: a 1-D object array when the lengths differ, a regular 2-D
    array when they are all equal. The Y arrays are int32 vectors.
    """
    parsed_fn = "msrpc_{}_{}_{}.pickle".format(pp_name, lower_opt, version)

    # Loading pre-processed corpus; the context manager closes the file.
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open("./data/"+parsed_fn, 'rb') as corpus_file:
        [parsed_texts,
         index_to_word,
         word_to_index,
         pairs_train,
         Y_train_list,
         pairs_test,
         Y_test_list] = pickle.load(corpus_file)

    # Computing the max_seq_length if not provided
    if max_seq_length < 0:
        max_seq_length = np.max([len(tokens) for tokens in parsed_texts.values()])

    # Transforming list of tokens to sequence of indices
    sequences = convert_to_sequence(parsed_texts, word_to_index,
                                    padding=False, size_limit=max_seq_length)

    def as_int32(text_id):
        return np.array(sequences[text_id], dtype=np.int32)

    # Training data: one entry per pair, or two (both orders) when reversing
    rows_per_pair = 2 if reverse_train_pairs else 1
    X_train1 = []
    X_train2 = []
    for x1, x2 in pairs_train:
        X_train1.append(as_int32(x1))
        X_train2.append(as_int32(x2))
        if reverse_train_pairs:
            X_train1.append(as_int32(x2))
            X_train2.append(as_int32(x1))
    labels = [label for label in Y_train_list for _ in range(rows_per_pair)]
    Y_train = np.array(labels + [0]*autoneg, dtype=np.int32)

    # Adding automatically generated negative samples from sentences in
    # positive samples. Skipped entirely when none are requested, so a corpus
    # without positive pairs no longer fails on the unpacking below.
    if autoneg > 0:
        left, right = zip(*[pair for pair, label in zip(pairs_train, Y_train_list)
                            if label == 1])
        pos_ids = np.array(list(set(left+right)), dtype=np.int32)
        selected_pos_ids = np.random.choice(pos_ids, size=autoneg)
        known_pairs = set(pairs_train) | set(pairs_test)
        all_ids = np.array(list(parsed_texts.keys()), dtype=np.int32)
        for pos_id in selected_pos_ids:
            while True:
                paired_id = np.random.choice(all_ids)
                if paired_id == pos_id:
                    # A sentence paired with itself must not be labelled as
                    # a negative example; draw again.
                    continue
                # Reject pairs already present in the train or test set
                if ((pos_id, paired_id) in known_pairs or
                        (paired_id, pos_id) in known_pairs):
                    print("Ignoring randomly generated sample that already exists")
                    continue
                X_train1.append(as_int32(pos_id))
                X_train2.append(as_int32(paired_id))
                break

    # Test Data
    X_test1 = [as_int32(x1) for x1, _ in pairs_test]
    X_test2 = [as_int32(x2) for _, x2 in pairs_test]
    Y_test = np.array(Y_test_list, dtype=np.int32)

    return (index_to_word, word_to_index,
            _stack_sequences(X_train1), _stack_sequences(X_train2), Y_train,
            _stack_sequences(X_test1), _stack_sequences(X_test2), Y_test)
