In [4]:
# Model training for DRMM
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical
from collections import defaultdict
import numpy as np
import make_test_data
import pickle as pkl
import os
import pdb
from sklearn.linear_model import LogisticRegression

#TODO: fix root, remove key restrictions

# Placeholder data parsing
def get_data(histograms, training_year=2014, training=True):
    """Convert per-(document, topic) histograms into model inputs and binary labels.

    Args:
        histograms: mapping from a ``(doc_id, topic_id)`` key to a sequence of
            histogram bin counts. Both key components are passed through
            ``int()``, so numeric strings are accepted.
        training_year: forwarded to ``make_test_data.make_truth`` to choose
            which year's annotations are loaded (e.g. 2014 or 2015).
        training: when True, only keys whose label is ``>= 0`` are kept (the
            original note describes this as down-sampling to data with
            judgments). When False, every key is kept.

    Returns:
        A tuple ``(X, Y, key_array)``:
          * ``X`` -- array with one row per key; each positive bin count is
            replaced by its natural log, other values are left unchanged.
          * ``Y`` -- array holding 1 where the label is ``> 0`` and 0 otherwise.
          * ``key_array`` -- list of the keys, in the same order as the rows.
    """
    # Format: label_dict[doc_id][topic_id] = ground truth.
    label_dict = make_test_data.make_truth(training_year)

    def _label(key):
        # The topic component is shifted by one before the lookup --
        # presumably keys are 0-based and judgments 1-based; TODO confirm.
        return label_dict[int(key[0])][int(key[1]) + 1]

    # Snapshot the keys into a real list so the row order is fixed and the
    # returned value is indexable (dict.keys() is a live view on Python 3).
    key_array = list(histograms.keys())
    if training:
        key_array = [key for key in key_array if _label(key) >= 0]

    # NOTE(review): log(1) == 0, so a bin count of 1 becomes indistinguishable
    # from an empty bin -- confirm this is intended.
    X = np.array([[np.log(val) if val > 0 else val for val in histograms[key]] for key in key_array])
    Y = np.array([int(_label(key) > 0) for key in key_array])
    return X, Y, key_array

def get_dict(training_year=2014, data_root=None):
    """Load the pickled term-histogram dictionary for one year.

    Args:
        training_year: year used to build the file name
            ``term_histograms_<year>``.
        data_root: directory that contains the pickle file. When omitted, the
            original hard-coded location is used so existing calls keep working.

    Returns:
        The object stored in the pickle file.
    """
    if data_root is None:
        # TODO: fix root -- machine-specific default kept for backward compatibility.
        data_root = '/Users/Dan/class/deep_ir/project/data'

    histogram_path = os.path.join(data_root, 'term_histograms_%d' % training_year)

    # Pickles are binary data: text mode ('r') fails on Python 3 and can
    # corrupt the stream on platforms that translate newlines.
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(histogram_path, 'rb') as f:
        histograms = pkl.load(f)
    return histograms


def get_fake_data(n_samples=1000, n_features=29, seed=None):
    """Generate random stand-in data for smoke-testing the training pipeline.

    Args:
        n_samples: number of rows to generate (default 1000).
        n_features: number of columns per row (default 29).
        seed: optional integer seed. When given, a private ``RandomState`` is
            used so the result is reproducible; when omitted, numbers are
            drawn from NumPy's global generator exactly as before.

    Returns:
        A tuple ``(X_train, Y_train)`` where ``X_train`` has shape
        ``(n_samples, n_features)`` with values in ``[0, 1)`` and ``Y_train``
        has shape ``(n_samples,)`` with values drawn from ``{0, 0.5, 1}``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    X_train = rng.rand(n_samples, n_features)

    # floor(3 * u) / 2 maps u in [0, 1) onto 0, 0.5 and 1 with roughly equal
    # probability.
    Y_train = np.floor(rng.rand(n_samples) * 3) / 2.0
    return X_train, Y_train

In [5]:
# Load both years' histogram dictionaries once, up front, so that the cells
# below can be re-run quickly while iterating.
# NOTE: this is the slow step of the notebook.
train_dict = get_dict(2014)
test_dict = get_dict(2015)

In [6]:
# Training split: 2014 annotations, filtered by get_data's training mode; the
# returned key list is not needed here.
X_train, Y_train, _ = get_data(train_dict, 2014, True)
# To smoke-test without real data, use instead:
#X_train, Y_train = get_fake_data()
# Test split: 2015 annotations, every key kept; test_keys records the row order.
X_test, Y_test, test_keys = get_data(test_dict, 2015, False)

qrels2014.txt
qrels-treceval-2015.txt


In [57]:
# ASIDE - this is how you make your own loss function. Not necessary but fun to play with.
from keras import backend as K

def supercool_loss(y_true, y_pred):
    """Toy custom loss: mean absolute difference between 10 * y_true and y_pred."""
    scaled_targets = y_true * 10
    abs_error = K.abs(scaled_targets - y_pred)
    return K.mean(abs_error)
    

In [2]:
from keras.regularizers import l1, activity_l1, l2
from keras.layers import Input, LSTM, Dense, merge, Dropout
from keras.models import Model

In [25]:
# Build a multi-input functional model: each of `num_inputs` inputs is scored
# by the same (weight-shared) feed-forward stack, weighted by a shared
# term-weighting layer, and the two vectors are combined with a dot product.
# NOTE(review): the original note called this model a regression, but it is
# compiled below with binary_crossentropy / binary_accuracy -- confirm intent.

# Shared layers of the feed-forward scoring network (one instance each, so
# the weights are reused for every input).
num_inputs = 3
shared_dense = Dense(5, activation='relu')
shared_dropout = Dropout(0.5)
shared_score = Dense(1, activation='relu')

# Construct the scoring path: Input(29) -> Dense(5) -> Dropout -> Dense(1) for
# every input, then concatenate the per-input scores along the last axis.
# (29 matches the second dimension of X_train printed later in the notebook.)
inputs = [Input(shape=(29,)) for val in range(num_inputs)]
output_1 = [shared_dense(val) for val in inputs]
output_d = [shared_dropout(val) for val in output_1]
output_score = [shared_score(val) for val in output_d]
concat_vals = merge(output_score, mode='concat', concat_axis=-1)

# Term-weighting network: one shared Dense(1) applied directly to each raw
# input, concatenated the same way as the scores.
shared_term_weight = Dense(1, activation='relu')
term_weight_list = [shared_term_weight(val) for val in inputs]
term_weights = merge(term_weight_list, mode='concat', concat_axis=-1)

# Dot product over terms to weight them properly
output = merge([concat_vals, term_weights], mode='dot')


model = Model(input=inputs, output=output)
# Compile model with SGD (Nesterov momentum, gradient-norm clipping at 1).
# NOTE(review): both factors of the dot product come from relu layers, so the
# output is non-negative and not limited to [0, 1]; binary_crossentropy is
# normally paired with a probability-like output -- confirm this is intended.
sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=1.)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['binary_accuracy'])

In [26]:
# The model was built with num_inputs input tensors, so it must be fed a list
# holding one array per input; the same histogram matrix is reused for each.
X_new_train = [X_train] * num_inputs
model.fit(X_new_train, Y_train, nb_epoch=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x16dfc0dd0>

In [51]:
from keras.utils.np_utils import to_categorical

# One-hot encode the binary labels (nb_classes=None leaves the class count to
# Keras) and show the first ten rows.
categorical_labels = to_categorical(Y_train, nb_classes=None)
print(categorical_labels[:10])

[[ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 1.  0.]
 [ 0.  1.]
 [ 1.  0.]]


In [44]:
# Test model
# The model was built with num_inputs input tensors, so predict() needs one
# array per input, mirroring how the training cell builds its input list.
pred_ranks = model.predict([X_test] * num_inputs)
print(max(pred_ranks))

[ 0.22982793]


In [46]:


# Group the predictions by topic and write them out as a ranked run file.
# Each test key is (doc_id, topic_id); each prediction row holds one score.
ranks_and_keys = zip(test_keys, pred_ranks)
result_dict = defaultdict(list)
for (doc_id, topic_id), score_row in ranks_and_keys:
    result_dict[topic_id].append((score_row[0], doc_id))

# Text mode: the lines written are str, which a binary ('wb') handle rejects
# on Python 3.
with open('cat_query_results', 'w') as f:
    for cur_topic in range(30):
        # Keys store the topic as a 0-based string; the file uses topic + 1.
        topic_results = result_dict[str(cur_topic)]
        # Highest score first; ties fall back to the doc id (tuple ordering).
        topic_results.sort(reverse=True)

        # Keep at most the top 1000 documents per topic.
        for result_rank, (score, doc_id) in enumerate(topic_results[:1000]):
            fields = [str(cur_topic + 1), '0', str(doc_id), str(result_rank + 1), str(score), 'test_run']
            # Append the newline after joining so no stray space precedes it.
            f.write(" ".join(fields) + "\n")

In [45]:
# Largest predicted score (each prediction row is a one-element array).
print(max(pred_ranks))

[ 0.22982793]


In [8]:
# Sanity check: int() of a boolean, the conversion get_data uses to build Y.
print(int(True))

1


In [15]:
# Peek at the first ten keys of the training histogram dictionary.
print(list(train_dict.keys())[:10])

[('2794284', '20'), ('3510867', '23'), ('2275225', '21'), ('3658214', '26'), ('2830982', '4'), ('2533397', '5'), ('3838404', '12'), ('1570135', '7'), ('3565924', '6'), ('3274658', '12')]


In [16]:
# Show the histogram stored under each of the first ten keys.
for key in list(train_dict.keys())[:10]:
    print(train_dict[key])

[  0   0   0   0   0   0   0   0   0   0   1  10  93 301 428 335  90  39
   9   5   4   0   0   0   0   0   0   0   1]
[  0   0   0   0   0   0   0   0   0   0   4  27 189 504 524 399 188  85
  35  25   5   1   0   0   1   0   0   0   2]
[  0   0   0   0   0   0   0   0   0   0   2  20  84  96 111  62  25  13
   2   0   0   0   0   0   0   0   0   0   0]
[ 0  0  0  0  0  0  0  0  0  0  0  0  9 57 82 56 34  8  8  4  1  0  0  0  0
  0  0  0  2]
[  0   0   0   0   0   0   0   0   0   0   1  35 150 406 558 502 233 134
  54  13   4   3   0   2   1   1   0   0   2]
[  0   0   0   0   0   0   0   0   0   0   3  27 110 245 226 179  82  58
   8  10   3   0   0   0   0   0   0   0   0]
[  0   0   0   0   0   0   0   0   0   0   3  34 117 232 273 135  86  42
   6  12   1   0   0   1   1   1   0   0   1]
[  0   0   0   0   0   0   0   0   0   0  16  75 258 451 560 394 219  84
  54   2   2   0   0   0   0   0   0   0   0]
[  0   0   0   0   0   0   0   0   0   0   2   8  66 371 435 424 195  75
  60

In [18]:
# Repeat every training row three times and inspect the resulting array shape.
thing = [[val] * 3 for val in X_train]
print(np.array(thing).shape)

(31403, 3, 29)


In [20]:
# Shape of the training matrix: (samples, features).
print(X_train.shape)

(31403, 29)


In [13]:
# Apply the single shared dense layer to every input tensor.
output_1 = list(map(shared_dense, inputs))

In [14]:
# Inspect the tensors produced by the shared dense layer.
print(output_1)

[<tf.Tensor 'Relu_2:0' shape=(?, 5) dtype=float32>, <tf.Tensor 'Relu_3:0' shape=(?, 5) dtype=float32>, <tf.Tensor 'Relu_4:0' shape=(?, 5) dtype=float32>]
