In [1]:
import cPickle
import sys
sys.path.insert(0, '../../preprocess')
import vectorizer
# Load the pre-fitted vectorizer object. The file is opened in a `with` block
# so the handle is closed as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code -- only load this file when it
# comes from a trusted source.
with open('../../yelpdata/total_vec_120K_embed.p', 'rb') as vec_file:
    vec = cPickle.load(vec_file)

Using TensorFlow backend.


[]


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the review table; the code below reads its 'stars' and 'domain' columns.
ds = pd.read_csv('../../yelpdata/total_data_120K.csv')
aspect_columns = ['bit', 'domain']

# Binary aspect: True when the review has more than 3 stars
# (vectorized comparison instead of a per-row lambda).
ds['bit'] = ds['stars'] > 3.0
train_idxs, val_idxs = train_test_split(ds.index, stratify=ds[['bit', 'domain']], train_size=0.9, random_state=1337)

# H[aspect][i, j] == 1 iff held-out rows i and j (positions within `idxs`)
# share the same value of `aspect`; the diagonal (self-pairs) is forced to 0.
idxs = val_idxs
H = {}
for aspect in aspect_columns :
    H[aspect] = np.zeros((len(idxs), len(idxs)))
    # Aspect value of every held-out row, in the same order as `idxs`, so a
    # position in this array is directly a row/column of H[aspect].
    held_out_vals = ds.loc[idxs, aspect].values
    for val in ds[aspect].unique() :
        # Positions of the held-out rows carrying this value. A boolean mask
        # replaces the repeated list(idxs).index(...) linear scans.
        members = np.flatnonzero(held_out_vals == val)
        H[aspect][np.ix_(members, members)] = 1

    np.fill_diagonal(H[aspect], 0)



In [3]:
def _binary_bow(token_ids, vocab_size, n_reserved=2):
    '''Turn rows of token ids into a binary bag-of-words matrix.

    token_ids  : 2-D integer array, one row of vocabulary ids per document.
    vocab_size : number of columns allocated before the reserved ids are cut.
    n_reserved : how many leading vocabulary columns to drop from the result
                 (presumably padding / unknown ids -- confirm against the
                 vectorizer).

    Returns a float array of shape (n_docs, vocab_size - n_reserved) holding
    1. wherever the document contains the corresponding token and 0. elsewhere.
    '''
    bow = np.zeros((token_ids.shape[0], vocab_size))
    for row, ids in enumerate(token_ids) :
        # Fancy-index assignment: repeated ids within a row still yield 1.
        bow[row, ids] = 1.
    return bow[:, n_reserved:]


# The same encoding is applied to both splits through the single helper above.
train_X = vec.X[train_idxs]
train_Xtf = _binary_bow(train_X, vec.vocab_size)

val_X = vec.X[val_idxs]
val_Xtf = _binary_bow(val_X, vec.vocab_size)

In [4]:
from keras.layers import Input, Dense, Lambda
from keras.models import Model
from keras import backend as K
from keras import objectives
from keras import optimizers

# --- hyperparameters ---------------------------------------------------------
learning_rate = 5e-5
batch_size = 64
# Input width: the vectorizer vocabulary minus the two leading columns that
# are sliced off when the bag-of-words matrices are built (X_tf[:, 2:]).
vocab_size = vec.vocab_size - 2
intermediate_dim = 500
latent_dim = 200
# Upper bound on epochs; the fit call also uses early stopping.
epochs = 1000
# Presumably the standard deviation of the sampling noise -- confirm usage.
epsilon_std = 1.0
activation = 'tanh'

# --- encoder / inference network ---------------------------------------------
# x -> h -> (mu, log_sigma2): one hidden layer followed by two linear heads.
x = Input(shape=(vocab_size,), name='x')
h = Dense(intermediate_dim, activation=activation, name='h')(x)
mu = Dense(latent_dim, name='mu')(h)
# Consumed as a log-variance by sample_norm and kl_loss.
log_sigma2 = Dense(latent_dim, name='l')(h)
# Deterministic encoder: maps an input straight to `mu` (no sampling).
encoder = Model(x, mu)

# reparameterized sampler for normal distributions
def sample_norm(args):
    '''Reparameterized sampling from a normal distribution.

    args : pair (mu, log_var) of tensors with `latent_dim` columns.

    Returns mu + exp(0.5 * log_var) * epsilon, where epsilon is drawn with
    mean 0 and standard deviation `epsilon_std`.
    '''
    mu, log_var = args
    # Pass the epsilon_std hyperparameter explicitly so that changing it takes
    # effect, rather than silently falling back to the backend default.
    epsilon = K.random_normal(shape=(K.shape(mu)[0], latent_dim,), mean=0., stddev=epsilon_std)
    return mu + K.exp(0.5 * log_var) * epsilon

# decoder / generative network
# z: latent sample drawn by sample_norm from the (mu, log_sigma2) heads.
z = Lambda(sample_norm, output_shape=(latent_dim,), name='z')([mu, log_sigma2])
# e: linear layer over the vocabulary with no activation; cross_ent_loss
# applies log_softmax to it, so these are unnormalized scores.
e = Dense(vocab_size, name='e')(z)

def log_softmax(x, axis=None):
    '''Log of the softmax of `x` along `axis`.

    The maximum along `axis` is subtracted before exponentiating so that
    large inputs do not overflow.
    '''
    peak = K.max(x, axis=axis, keepdims=True)
    shifted = x - peak
    return shifted - K.log(K.sum(K.exp(shifted), axis=axis, keepdims=True))

def kl_loss(x, e):
    '''Per-sample KL term computed from the `mu` and `log_sigma2` tensors.

    Both arguments are ignored; the value depends only on the module-level
    encoder outputs `mu` and `log_sigma2`.
    '''
    per_dimension = 1 + log_sigma2 - K.square(mu) - K.exp(log_sigma2)
    return - 0.5 * K.sum(per_dimension, axis=-1)


def cross_ent_loss(x, e):
    '''Per-sample cross entropy between targets `x` and softmax(`e`).

    `e` is passed through log_softmax along the last axis, weighted by `x`,
    summed over that axis and negated.
    '''
    log_probs = log_softmax(e, axis=-1)
    weighted = x * log_probs
    return - K.sum(weighted, axis=-1)
    

def vae_loss(x, e):
    '''Training objective: reconstruction cross entropy plus the KL term.'''
    return cross_ent_loss(x, e) + kl_loss(x, e)


# Use the canonical `Adam` class; the lowercase `adam` alias is not available
# in every Keras release.
opt = optimizers.Adam(lr=learning_rate)
# Full model: input x -> decoder output e, trained with the combined
# reconstruction + KL objective.
vae = Model(x, e)
vae.compile(optimizer=opt,
            loss=vae_loss)

In [5]:
from keras import callbacks
# Stop as soon as the validation loss fails to improve (no grace epochs).
patience = 0
earlyStopping = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=patience,
    verbose=1
)

# The model reconstructs its own input, so the input matrix is also the
# target. 10% of the training rows are held back for the monitored val_loss.
vae.fit(x=train_Xtf,
        y=train_Xtf,
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True,
        validation_split=0.1,
        callbacks=[earlyStopping],
        verbose=1)

Train on 102941 samples, validate on 11438 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000

KeyboardInterrupt: 

In [6]:
# Embed the held-out documents with the deterministic encoder (outputs `mu`).
embedds = encoder.predict(val_Xtf)

In [7]:
from sklearn.preprocessing import normalize
# Cosine similarity between every pair of held-out embeddings: rows are
# L2-normalized first, so the dot product of two rows is their cosine.
embedds_n = normalize(embedds, norm='l2')
scores = embedds_n.dot(embedds_n.T)
nb_studies = len(val_idxs)
# Push every self-similarity far below any real cosine value.
np.fill_diagonal(scores, -1000)

In [8]:
from sklearn.metrics import roc_auc_score
# `scores` is already an ndarray (result of np.dot); asarray avoids copying
# the full n x n matrix a second time.
scores = np.asarray(scores)
for aspect_j in H :
    # For each held-out document: AUC of ranking all documents by similarity
    # against the "same aspect value" labels in row i of H[aspect_j].
    # NOTE(review): the self-pair stays in as a negative (H diagonal is 0)
    # with score -1000, so it always ranks last -- confirm this is intended.
    aucs = [roc_auc_score(H[aspect_j][i], scores[i]) for i in range(nb_studies)]
    # Formatted print: same text as the Python 2 print statement, and also
    # valid syntax under Python 3.
    print('%s %s' % (aspect_j, np.mean(aucs)))

bit 0.634551691273
domain 0.788111027467
