In [1]:
import os
import sys
# Put the directory two levels above the working directory on sys.path
# (presumably the repository root that holds the `lmmnn` package imported below).
module_path = os.path.abspath(os.path.join('../..'))
if sys.path.count(module_path) == 0:
    # Only append once so re-running the cell does not grow sys.path.
    sys.path.append(module_path)

In [2]:
import time
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold

from lmmnn.layers import NLL
from lmmnn.calc_b_hat import calc_b_hat
from lmmnn.nn import process_one_hot_encoding
from lmmnn.callbacks import EarlyStoppingWithSigmasConvergence
from lmmnn.menet import menet_fit, menet_predict
from lmmnn.simulation import Count

from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Reshape, Concatenate, Input, Layer, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, Callback
import tensorflow.keras.backend as K

In [3]:
# Enable memory growth on every GPU TensorFlow can see; the loop is a no-op on CPU-only machines.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, enable=True)

In [4]:
# drugs_df.csv is obtained by simply binding together the train and test TSVs from
# Gräßer et al. (2018), which are available in the UCI ML repository; see our paper.
RE_col = 'z0'
# The `drug_name` column is renamed to the random-effect column name used throughout the notebook.
drugs = pd.read_csv('drugs_df.csv').rename(columns={'drug_name': RE_col})

In [5]:
# Preview the first rows of the reviews table.
drugs.head()

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount,drug_name
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,"May 20, 2012",27,3428
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,"April 27, 2010",192,1542
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,"December 14, 2009",17,1989
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,"November 3, 2015",10,2456
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,"November 27, 2016",37,553


In [6]:
# --- Text / network hyper-parameters -------------------------------------------------
max_features = 10000      # vocabulary size kept by the tokenizer (num_words)
batch_size = 20
epochs = 100
patience = 5              # early-stopping patience, in epochs
seq_len = 100             # reviews are padded/truncated to this many tokens
words_embed_dim = 100     # width of the word embedding
Z_embed_dim = 10          # width of the drug (categorical) embedding
lstm_kernels = 64         # number of LSTM units

# --- Random-effects configuration ----------------------------------------------------
# `qs` holds one cardinality per categorical feature; category codes are assumed to be
# 0-based, hence max + 1.
qs = [drugs['z0'].max() + 1]
# Scalar category count for code that needs an integer (e.g. an Embedding's input_dim).
# Previously this aliased the list `qs`.
n_cats = qs[0]
q_spatial = None
Z_non_linear = False
mode = 'intercepts'
Z_embed_dim_pct = 10
n_sig2bs = 1
n_sig2bs_spatial = 0
est_cors = []
dist_matrix = None
time2measure_dict = None
spatial_embed_neurons = None
# NOTE(review): `resultion` looks like a typo for `resolution`; the name is kept because
# code outside this notebook section may reference it.
resultion = None
verbose = True

In [7]:
# Sanity check: show the category count configured above.
n_cats

3671

In [8]:
# Smallest category code in the random-effect column.
drugs[RE_col].min()

0

In [9]:
# Largest category code in the random-effect column (qs uses this value + 1).
drugs[RE_col].max()

3670

In [10]:
# Fit a tokenizer limited to `max_features` words and encode every review as word indices.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(drugs['review'])
text_sequences = tokenizer.texts_to_sequences(drugs['review'])

# One column per token position: X0 ... X{seq_len-1}.
x_cols = ['X{}'.format(i) for i in range(seq_len)]
padded = sequence.pad_sequences(text_sequences, padding='post', maxlen=seq_len)

# Feature table = padded token indices plus the random-effect (drug) column.
X = pd.concat([pd.DataFrame(padded, columns=x_cols), drugs[RE_col]], axis=1)

In [11]:
# Token indices of the first review, including the trailing padding.
X.loc[0, x_cols].values

array([   5,   38,   28,   35,  198,    1,   45,    5,   15,  832,   12,
       2948,   99,  149,    2, 3852, 1585,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int64)

In [12]:
# Share of reviews whose seq_len positions are all non-zero, i.e. rows with no zero
# (padding) entry — presumably reviews with at least seq_len in-vocabulary tokens.
((X[x_cols] > 0).astype(int).sum(axis=1) == seq_len).mean()

0.427009759930811

In [13]:
# Raw text of the first review, for comparison with its token sequence.
drugs.loc[0, 'review']

'"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"'

In [14]:
# Index the tokenizer assigned to the word 'it'.
tokenizer.word_index['it']

5

In [15]:
def lstm_ignore():
    """Build the text-only LSTM regressor (the drug identity is not an input).

    Architecture: int32 token sequence -> word Embedding -> LSTM -> Dense(1).

    Returns:
        An uncompiled Keras ``Model`` with a single input and a single scalar output.
    """
    tokens = Input(shape=(None, ), dtype=tf.int32)
    embedded = Embedding(max_features + 1, words_embed_dim)(tokens)
    encoded = LSTM(lstm_kernels)(embedded)
    rating = Dense(1)(encoded)
    return Model(inputs=[tokens], outputs=rating)

def lstm_lmmnn():
    """Build the LSTM whose output is the lmmnn ``NLL`` layer.

    The model takes three inputs: the token sequence (length ``seq_len``), the true
    target, and the drug category code. The LSTM's Dense(1) prediction is passed to
    the ``NLL`` layer together with the true target and the category input, and the
    ``NLL`` layer's result is the model output.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[tokens, y_true, Z]``.
    """
    tokens = Input(shape=(seq_len, ), dtype=tf.int32)
    y_true_input = Input(shape=(1, ),)
    Z_input = Input(shape=(1, ), dtype=tf.int64)

    embedded = Embedding(max_features + 1, words_embed_dim)(tokens)
    encoded = LSTM(lstm_kernels)(embedded)
    y_pred_output = Dense(1)(encoded)

    # NLL is constructed with the 'intercepts' mode and 1.0 / [1.0] as its other two arguments.
    nll_layer = NLL('intercepts', 1.0, [1.0])
    nll = nll_layer(y_true_input, y_pred_output, [Z_input])
    return Model(inputs=[tokens, y_true_input, Z_input], outputs=nll)

def lstm_embed():
    """Build the LSTM regressor that also embeds the drug category.

    Architecture: the token sequence goes through word Embedding -> LSTM; the category
    code goes through its own Embedding (width ``Z_embed_dim``) and is reshaped to a
    flat vector; both are concatenated and fed to Dense(1).

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[tokens, Z]`` and a scalar output.
    """
    # Take the category count from `qs` (a list with one cardinality per categorical
    # feature) so the Embedding always receives an integer input_dim.
    n_levels = qs[0]

    input_layer = Input(shape=(None, ), dtype=tf.int32)
    Z_input = Input(shape=(1,))
    embed = Embedding(n_levels, Z_embed_dim, input_length = 1)(Z_input)
    embed = Reshape(target_shape = (Z_embed_dim, ))(embed)
    x = Embedding(max_features + 1, words_embed_dim)(input_layer)
    x = LSTM(lstm_kernels)(x)
    concat = Concatenate()([x, embed])
    output = Dense(1)(concat)
    return Model(inputs=[input_layer, Z_input], outputs=output)

def lstm_ohe(p):
    """Build the LSTM regressor that receives extra features as a dense vector.

    Args:
        p: width of the second (one-hot) input.

    Returns:
        An uncompiled Keras ``Model`` with inputs ``[tokens, ohe]``; the LSTM encoding
        is concatenated with the ``p``-wide input before the final Dense(1).
    """
    tokens = Input(shape=(None, ), dtype=tf.int32)
    ohe_input = Input(shape=(p, ))
    embedded = Embedding(max_features + 1, words_embed_dim)(tokens)
    encoded = LSTM(lstm_kernels)(embedded)
    merged = Concatenate()([encoded, ohe_input])
    rating = Dense(1)(merged)
    return Model(inputs=[tokens, ohe_input], outputs=rating)

In [18]:
def reg_nn_ignore(X_train, X_test, y_train, y_test, n_cats, batch_size, epochs, patience, verbose=False):
    """Fit the text-only LSTM with MSE loss and predict on the test fold.

    ``y_test`` and ``n_cats`` are accepted for signature parity with the other
    ``reg_nn_*`` functions and are not used here.

    Returns:
        ``(y_pred, (None, None), n_epochs)`` where ``y_pred`` is clipped to [1, 10]
        and ``n_epochs`` is the number of epochs actually run.
    """
    # With patience=None early stopping is effectively disabled (patience == epochs).
    stop_after = epochs if patience is None else patience
    early_stop = EarlyStopping(monitor='val_loss', patience=stop_after)

    net = lstm_ignore()
    net.compile(loss='mse', optimizer='adam')
    fit_log = net.fit(X_train[x_cols], y_train, batch_size=batch_size, epochs=epochs,
                      validation_split=0.1, callbacks=[early_stop], verbose=verbose)

    raw_pred = net.predict(X_test[x_cols]).reshape(X_test.shape[0])
    return np.clip(raw_pred, 1, 10), (None, None), len(fit_log.history['loss'])

def reg_nn_ohe(X_train, X_test, y_train, y_test, n_cats, batch_size, epochs, patience, verbose=False):
    """Fit the LSTM + one-hot model with MSE loss and predict on the test fold.

    The frames are first passed through ``process_one_hot_encoding``; every column
    that is not a token column is then fed to the model's second input.

    Returns:
        ``(y_pred, (None, None), n_epochs)`` with ``y_pred`` clipped to [1, 10].
    """
    X_train, X_test = process_one_hot_encoding(X_train, X_test, x_cols)
    extra_train = X_train.drop(x_cols, axis=1)
    extra_test = X_test.drop(x_cols, axis=1)

    net = lstm_ohe(extra_train.shape[1])
    net.compile(loss='mse', optimizer='adam')

    # With patience=None early stopping is effectively disabled (patience == epochs).
    stop_after = epochs if patience is None else patience
    early_stop = EarlyStopping(monitor='val_loss', patience=stop_after)
    fit_log = net.fit([X_train[x_cols], extra_train], y_train, batch_size=batch_size, epochs=epochs,
                      validation_split=0.1, callbacks=[early_stop], verbose=verbose)

    raw_pred = net.predict([X_test[x_cols], extra_test]).reshape(X_test.shape[0])
    return np.clip(raw_pred, 1, 10), (None, None), len(fit_log.history['loss'])

def reg_nn_lmm(X_train, X_test, y_train, y_test, n_cats, batch_size, epochs, patience, verbose=False):
    """Fit the LMMNN LSTM (NLL output) and predict on the test fold.

    The training rows are ordered by the random-effect column before fitting. That
    re-ordering is done on local copies: the previous in-place ``reset_index`` /
    ``sort_values`` calls modified the caller's ``X_train`` and ``y_train``, so any
    model fitted afterwards on the same objects saw sorted features paired with
    unsorted targets.

    Args:
        X_train, X_test: frames holding the token columns ``x_cols`` and ``RE_col``.
        y_train, y_test: target Series; ``y_test`` is only used for its shape.
        n_cats: unused here; kept for signature parity with the other ``reg_nn_*``.
        batch_size, epochs, patience, verbose: passed to ``model.fit`` / early stopping.

    Returns:
        ``(y_pred, (sig2e_est, sig2b_ests), n_epochs)`` with ``y_pred`` clipped to [1, 10].
    """
    model = lstm_lmmnn()
    model.compile(optimizer= 'adam')

    patience = epochs if patience is None else patience
    # Plain early stopping on the default monitored quantity; the imported
    # EarlyStoppingWithSigmasConvergence callback is not used here.
    callbacks = [EarlyStopping(patience=patience)]

    # Same re-ordering as before, but on new objects so the caller's data is untouched.
    # reset_index() keeps the old index as an 'index' column, as the in-place call did.
    X_train = X_train.reset_index().sort_values(by=[RE_col])
    y_train = y_train.reset_index(drop=True)[X_train.index]

    # NOTE(review): validation_split takes the trailing 10% of rows, which after the
    # sort are the rows with the highest category codes — confirm this is intended.
    history = model.fit([X_train[x_cols], y_train, X_train[RE_col]], None,
                        batch_size=batch_size, epochs=epochs, validation_split=0.1,
                        callbacks=callbacks, verbose=verbose)

    sig2e_est, sig2b_ests, rho_ests, weibull_ests = model.layers[-1].get_vars()
    sig2b_spatial_ests = []
    ls = None
    y_pred_tr = model.predict([X_train[x_cols], y_train, X_train[RE_col]]).reshape(X_train.shape[0])
    y_pred_tr = np.clip(y_pred_tr, 1, 10)
    b_hat = calc_b_hat(X_train, y_train, y_pred_tr, qs, q_spatial, sig2e_est, sig2b_ests, sig2b_spatial_ests,
                Z_non_linear, model, ls, mode, rho_ests, est_cors, dist_matrix, weibull_ests)

    # The model requires a y_true input even at prediction time; random values are
    # supplied as a placeholder.
    dummy_y_test = np.random.normal(size=y_test.shape)
    # Final prediction = network output + the b_hat entry of each test row's category.
    y_pred = model.predict([X_test[x_cols], dummy_y_test, X_test[RE_col]]).reshape(X_test.shape[0]) + b_hat[X_test[RE_col]]
    y_pred = np.clip(y_pred, 1, 10)
    return y_pred, (sig2e_est, sig2b_ests), len(history.history['loss'])

def reg_nn_embed(X_train, X_test, y_train, y_test, n_cats, batch_size, epochs, patience, verbose=False):
    """Fit the LSTM + category-embedding model with MSE loss and predict on the test fold.

    ``y_test`` and ``n_cats`` are accepted for signature parity and are not used here.

    Returns:
        ``(y_pred, (None, None), n_epochs)`` with ``y_pred`` clipped to [1, 10].
    """
    # With patience=None early stopping is effectively disabled (patience == epochs).
    stop_after = epochs if patience is None else patience
    early_stop = EarlyStopping(monitor='val_loss', patience=stop_after)

    net = lstm_embed()
    net.compile(loss='mse', optimizer='adam')

    train_inputs = [X_train[x_cols], X_train[RE_col]]
    fit_log = net.fit(train_inputs, y_train,
                      batch_size=batch_size, epochs=epochs, validation_split=0.1,
                      callbacks=[early_stop], verbose=verbose)

    test_inputs = [X_test[x_cols], X_test[RE_col]]
    raw_pred = net.predict(test_inputs).reshape(X_test.shape[0])
    return np.clip(raw_pred, 1, 10), (None, None), len(fit_log.history['loss'])

def reg_nn_menet(X_train, X_test, y_train, y_test, n_cats, batch_size, epochs, patience, verbose=False):
    """Fit the text-only LSTM through ``menet_fit`` and predict with ``menet_predict``.

    The frames are split into plain arrays first: token columns as features and the
    random-effect column as cluster labels.

    Returns:
        ``(y_pred, (sig2e_est, None), n_epochs)`` with ``y_pred`` clipped to [1, 10].
    """
    clusters_train = X_train[RE_col].values
    clusters_test = X_test[RE_col].values
    tokens_train = X_train[x_cols].values
    tokens_test = X_test[x_cols].values
    targets_train = y_train.values

    net = lstm_ignore()
    net.compile(loss='mse', optimizer='adam')

    # menet_fit returns five values; the last one is not needed here.
    net, b_hat, sig2e_est, n_epochs, _ = menet_fit(net, tokens_train, targets_train, clusters_train, n_cats, batch_size, epochs, patience, verbose=verbose)
    raw_pred = menet_predict(net, tokens_test, clusters_test, n_cats, b_hat)
    return np.clip(raw_pred, 1, 10), (sig2e_est, None), n_epochs

def reg_nn(X_train, X_test, y_train, y_test, n_cats, batch=batch_size, epochs=epochs, patience=patience, reg_type='ohe', verbose=False):    
    """Fit one model variant, time it and score it on the test fold.

    Args:
        reg_type: one of 'ohe', 'lmm', 'ignore', 'embed', 'menet'.
        batch, epochs, patience: training settings forwarded to the chosen fitter
            (defaults are the notebook-level values at definition time).

    Returns:
        ``(mse, sigmas, n_epochs, seconds)`` where ``mse`` is the mean squared error
        of the predictions against ``y_test``.

    Raises:
        ValueError: if ``reg_type`` is not one of the known variants.
    """
    start = time.time()
    fitters = {
        'ohe': reg_nn_ohe,
        'lmm': reg_nn_lmm,
        'ignore': reg_nn_ignore,
        'embed': reg_nn_embed,
        'menet': reg_nn_menet,
    }
    if reg_type not in fitters:
        raise ValueError(reg_type + ' is an unknown reg_type')
    y_pred, sigmas, n_epochs = fitters[reg_type](X_train, X_test, y_train, y_test, n_cats, batch, epochs, patience, verbose)
    end = time.time()

    squared_errors = (y_pred - y_test)**2
    return np.mean(squared_errors), sigmas, n_epochs, end - start

In [22]:
# One row per (fold, model variant).
res = pd.DataFrame(columns=['experiment', 'exp_type', 'mse', 'sigma_e_est', 'sigma_b_est', 'n_epochs', 'time'])

def iterate_reg_types(X_train, X_test, y_train, y_test, counter, verbose=False, experiment=None):
    """Fit every model variant on one fold and append the results to ``res``.

    Args:
        X_train, X_test, y_train, y_test: data of the current fold.
        counter: generator yielding the next row label of ``res``.
        verbose: forwarded to ``reg_nn``.
        experiment: identifier stored in the 'experiment' column. When omitted, the
            notebook-level loop variable ``i`` is used, as older call sites relied on.
    """
    if experiment is None:
        experiment = i

    # Fit in the original order. Every variant gets its own copy of the fold so that
    # in-place changes made by one fitter cannot leak into the next.
    outcomes = {}
    for reg_type in ('ignore', 'lmm', 'ohe', 'embed', 'menet'):
        outcomes[reg_type] = reg_nn(X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy(),
                                    qs[0], reg_type=reg_type, verbose=verbose)
        print(' finished %s, mse: %.2f' % (reg_type, outcomes[reg_type][0]))

    # Record in the original row order; variance estimates a variant does not produce
    # are stored as NaN.
    for reg_type in ('ohe', 'lmm', 'ignore', 'embed', 'menet'):
        mse, sigmas, n_epochs, elapsed = outcomes[reg_type]
        sigma_e = np.nan if sigmas[0] is None else sigmas[0]
        sigma_b = np.nan if sigmas[1] is None else sigmas[1][0]
        res.loc[next(counter)] = [experiment, reg_type, mse, sigma_e, sigma_b, n_epochs, elapsed]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
counter = Count().gen()
y = drugs['rating']

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    print('iteration %d' % i)
    X_train, X_test, y_train, y_test = X.loc[train_index], X.loc[test_index], y[train_index], y[test_index]
    iterate_reg_types(X_train, X_test, y_train, y_test, counter, verbose, experiment=i)

In [21]:
# Display the accumulated results table.
res

In [None]:
# Write the results table to CSV; the path is relative to the working directory.
res.to_csv('../../results/res_drugs.csv')