In [4]:
import tensorflow as tf
import numpy as np
import seaborn
import matplotlib.pyplot as plt
import time
from collections import defaultdict
% pylab inline

Populating the interactive namespace from numpy and matplotlib


### Functions to extract pairs

In [33]:
# Functions to extrac the documents, query and rank information
# We put everything in a dictionary (key,value)= (query_id,[features,rank])
def extractFeatures(split):
    features = []
    for i in xrange(2, 138):
        features.append(float(split[i].split(':')[1]))
    return features

def extractQueryData(split):
    queryFeatures = [split[1].split(':')[1]]
    return queryFeatures

def readDataset(path):
    dictio_quid= defaultdict(list)
    print('Reading training data from file...')

    with open(path, 'r') as file:
        for line in file:
            split = line.split()
            rank=int(split[0])
            features=extractFeatures(split)
            query=extractQueryData(split)
            dictio_quid[query[0]].append((features,rank))

    print('Number of query ID %d' %(len(dictio_quid)))
    return dictio_quid

# Given a query ID, we separate on: [Xi,Xj,P_true] where P_true is either 0,0.5 or 1
def get_pairs_features(dictio_quid_featsRank):
    data=[]
    k=0
    for key in dictio_quid_featsRank.keys():
        # Temporary list of features,rank
        temp_list = dictio_quid_featsRank[key]

        for i in xrange(0, len(temp_list)):
            X1 = temp_list[i][0]
            rank1 = temp_list[i][1]
            for j in xrange(i + 1, len(temp_list)):
                X2=temp_list[j][0]
                rank2=temp_list[j][1]

                # Only look at queries with different id:
                if (rank1==rank2):
                    break
                #    data.append((X1,X2,0.5))
                if (rank1>rank2):
                    data.append((X1,X2,int(1)))
                else:
                    data.append((X1, X2,int(0)))
        k+=1
        if k%100==0:
            print "number of keys transformed: %d finished"%int(k)
    return data

def sampling_data(training_data,batch_size):
    N=len(training_data)
    indices = np.random.choice(N, batch_size)
    print "%d indices Selected"%batch_size
    return [training_data[i] for i in indices]
    
def separate(data):
    Xi=[]
    Xj=[]
    P_target=[]
    for instance in data:
        Xi.append(instance[0])
        Xj.append(instance[1])
        P_target.append(instance[2])
    return (np.array(Xi),np.array(Xj),np.array(P_target))

In [6]:
#Read training data
dictio_query = readDataset('./MSLR-WEB10K/Fold1/train.txt')
#dictio_query_val = readDataset('./MSLR-WEB10K/Fold1/vali.txt')
# Extract document pairs
training_data=get_pairs_features(dictio_query)


Reading training data from file...
Number of query ID 6000
number of keys transformed: 100 finished
number of keys transformed: 200 finished
number of keys transformed: 300 finished
number of keys transformed: 400 finished
number of keys transformed: 500 finished
number of keys transformed: 600 finished
number of keys transformed: 700 finished
number of keys transformed: 800 finished
number of keys transformed: 900 finished
number of keys transformed: 1000 finished
number of keys transformed: 1100 finished
number of keys transformed: 1200 finished
number of keys transformed: 1300 finished
number of keys transformed: 1400 finished
number of keys transformed: 1500 finished
number of keys transformed: 1600 finished
number of keys transformed: 1700 finished
number of keys transformed: 1800 finished
number of keys transformed: 1900 finished
number of keys transformed: 2000 finished
number of keys transformed: 2100 finished
number of keys transformed: 2200 finished
number of keys transformed

In [150]:
# Sampling data:
n_samples=150000
data=sampling_data(training_data,n_samples)

150000 indices Selected


In [151]:
# Constant, variables and place holders:
nDim=136
N=len(data)
first_layer=40
output_layer=1
A = tf.placeholder(tf.float32, [None, nDim])
B = tf.placeholder(tf.float32, [None, nDim])

In [152]:
P_AB = tf.placeholder(tf.float32, [None, output_layer])
P_true = tf.placeholder(tf.float32, [None, output_layer]) # float32 before

In [153]:
# Weights for the first layer to hidden layer:
weights1 = tf.Variable(tf.random_normal([nDim, first_layer]))
biases1 = tf.Variable(tf.random_normal([first_layer]))

In [154]:
# Hidden Layer nodes:
hiddenA = tf.matmul(A, weights1) + biases1
hiddenB = tf.matmul(B, weights1) + biases1

In [155]:
# Activations hidden layer
act_hiddenA = tf.nn.sigmoid(hiddenA)
act_hiddenB = tf.nn.sigmoid(hiddenB)

In [156]:
# Weights from hidden layer to output layer:
weights2 = tf.Variable(tf.random_normal([first_layer,output_layer]))
biases2 = tf.Variable(tf.random_normal([output_layer]))

In [157]:
# Output layer:
outputA = tf.matmul(act_hiddenA, weights2) + biases2
outputB = tf.matmul(act_hiddenB, weights2) + biases2

Oi = tf.nn.sigmoid(outputA)
Oj = tf.nn.sigmoid(outputB)
Oij=Oi-Oj

In [158]:
# Probability:
Pij=tf.exp(Oij)/(1+tf.exp(Oij))

In [159]:
# Cross entropy and cost:
cross_entropy = tf.reduce_sum(-P_true*Oij + tf.log(1+tf.exp(Oij)))
#cross_entropy = -tf.reduce_sum(tf.log(Pij))

#cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=Pij,labels=P_true)
#cost = tf.reduce_mean(cross_entropy)

In [166]:
# Optimizer:
#optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.5).minimize(cost)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.00001).minimize(cross_entropy)

In [167]:
# Start session:
batch_size = 100
session = tf.Session()
session.run(tf.global_variables_initializer())

In [168]:
Xi,Xj,P_target=separate(data)
P_target=P_target.reshape(P_target.shape[0],1)

In [169]:
batch_size = 100

In [170]:
def optimize(num_iterations):
    for i in range(num_iterations):
        # Get a batch of training examples.
        # x_batch now holds a batch of images and
        # y_true_batch are the true labels for those images.
#        A_batch, y_true_batch = data.train.next_batch(batch_size)
        indices = np.random.choice(len(data), n_samples)
        A_batch, B_batch, target_batch = Xi[indices], Xj[indices] , P_target[indices]
        
        # Put the batch into a dict with the proper names
        # for placeholder variables in the TensorFlow graph.
        # Note that the placeholder for y_true_cls is not set
        # because it is not used during training.
        #
        #feed_dict_train = {A: data1,B:data2, P_true: target} --> working
        feed_dict_train = {A: A_batch,B:B_batch, P_true: P_target} 
        # Run the optimizer using this batch of training data.
        # TensorFlow assigns the variables in feed_dict_train
        # to the placeholder variables and then runs the optimizer.
        session.run(optimizer, feed_dict=feed_dict_train)
        #oi = session.run(Oi, feed_dict= feed_dict_train)
        #oj = session.run(Oj, feed_dict= feed_dict_train)
        #oij = session.run(Oij, feed_dict= feed_dict_train)
        #pij = session.run(Pij, feed_dict= feed_dict_train)
        c_e = session.run(cross_entropy, feed_dict= feed_dict_train)
        #error = session.run(cost, feed_dict= feed_dict_train)
        print('Epoch', i, "loss:  ",c_e)
        


In [171]:
optimize(num_iterations=100)

('Epoch', 0, 'loss:  ', 104300.79)
('Epoch', 1, 'loss:  ', 104275.17)
('Epoch', 2, 'loss:  ', 104239.87)
('Epoch', 3, 'loss:  ', 104252.09)
('Epoch', 4, 'loss:  ', 104221.68)
('Epoch', 5, 'loss:  ', 104193.38)
('Epoch', 6, 'loss:  ', 104168.0)
('Epoch', 7, 'loss:  ', 104117.72)
('Epoch', 8, 'loss:  ', 104163.95)
('Epoch', 9, 'loss:  ', 104112.41)
('Epoch', 10, 'loss:  ', 104172.22)
('Epoch', 11, 'loss:  ', 104154.53)
('Epoch', 12, 'loss:  ', 104124.66)
('Epoch', 13, 'loss:  ', 104126.11)
('Epoch', 14, 'loss:  ', 104121.61)
('Epoch', 15, 'loss:  ', 104102.13)
('Epoch', 16, 'loss:  ', 104101.17)
('Epoch', 17, 'loss:  ', 104081.03)
('Epoch', 18, 'loss:  ', 104059.56)
('Epoch', 19, 'loss:  ', 104067.92)
('Epoch', 20, 'loss:  ', 104088.89)
('Epoch', 21, 'loss:  ', 104082.86)
('Epoch', 22, 'loss:  ', 104063.69)
('Epoch', 23, 'loss:  ', 104054.72)
('Epoch', 24, 'loss:  ', 104067.52)
('Epoch', 25, 'loss:  ', 104077.92)
('Epoch', 26, 'loss:  ', 104062.02)
('Epoch', 27, 'loss:  ', 104063.95)
('E

In [None]:
prediction = tf.round(Pij * 2) / 2
prediction= tf.cast(prediction, tf.float64)
correct_prediction = tf.equal(P_target, prediction)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

### Validation:

In [None]:
#Read training data
X_val, y_val, Query_val = readDataset('./MSLR-WEB10K/Fold1/vali.txt')

# Extract document pairs
pairs_val = extractPairsOfRatedSites(y_val, Query_val)
X_val_array=np.array(X_val)
data1_val,data2_val=separate_training(X_val_array,pairs_val)

In [None]:
P_target_val=np.ones([data1_val.shape[0],1])

In [None]:
len_test=500

In [None]:
feed_dict_test = {A: data1_val[0:len_test],B:data2_val[0:len_test] ,P_true: P_target_val[0:len_test]} 

In [None]:
def print_accuracy():
    # Use TensorFlow to compute the accuracy.
    acc = session.run(accuracy, feed_dict=feed_dict_test)
    
    # Print the accuracy.
    print("Accuracy on test-set: {0:.1%}".format(acc))

In [None]:
print_accuracy()