In [1]:
import math
import numpy as np
import os
from collections import defaultdict
from collections import Counter
import tensorflow as tf 
from sklearn import preprocessing

In [2]:
# Functions to extrac the documents, query and rank information
def extractFeatures(split):
    features = []
    for i in range(2, 138):
        features.append(float(split[i].split(':')[1]))
    # Convert to tuples:
    return features

def extractQueryData(split):
    # Add tuples:
    queryFeatures = split[1].split(':')[1]
    return queryFeatures

def readDataset(path):
    print('Reading training data from file...')
    with open(path, 'r') as file:
        #k=0
        features_list=[]
        rank_list=[]
        query_list=[]
        for line in file:
            split = line.split()
            features_list.append(extractFeatures(split))
            rank_list.append(int(split[0]))
            query_list.append(extractQueryData(split))
            #k+=1
            #if k==100:
            #    break
    #print('Number of query ID %d' %(len(features_list)))
    return features_list,rank_list,query_list


# Normalisation:
def normalize_features(features):
    features=np.array(features)

    # Substracting the mean:
    mean_features = np.mean(features, axis=0)
    features = features - mean_features

    # Dividing by the std:
    std_features = np.std(features, axis=0)
    features = features / std_features
    #print "features normalized"
    return features


# We put everything in a dictionary (key,value)= (query_id,[features,rank])
def make_dictionary(features,ranks,queries):
    dictio_quid=defaultdict(list)
    for feature_vec,rank,query in zip(features,ranks,queries):
        dictio_quid[query].append((feature_vec, rank))
    return dictio_quid

# Given a query ID, we separate on: [Xi,Xj,P_true] where P_true is either 0,0.5 or 1
def get_pairs_features(dictio_quid_featsRank):
    data = []
    #k = 0
    for key in dictio_quid_featsRank.keys():
        # Temporary list of features,rank
        temp_list = dictio_quid_featsRank[key]

        for i in range(0, len(temp_list)):
            X1 = temp_list[i][0]
            rank1 = temp_list[i][1]
            for j in range(i + 1, len(temp_list)):
                X2 = temp_list[j][0]
                rank2 = temp_list[j][1]

                # Only look at queries with different id:
                if (rank1 == rank2):
                    data.append((X1, X2, 0.5))
                if (rank1 > rank2):
                    data.append((X1, X2, int(1)))
                else:
                    data.append((X1, X2, int(0)))
        #k += 1
        #if k % 100 == 0:
           # print "number of keys transformed: %d finished" % int(k)
    return data


# Putting in the good format for tensorflow:


def separate(data):
    Xi = []
    Xj = []
    P_target = []
    for instance in data:
        Xi.append(instance[0])
        Xj.append(instance[1])
        P_target.append(instance[2])
    return (np.array(Xi), np.array(Xj), np.array(P_target))

# Sampling:
def sampling_data(training_data, batch_size):
    N = len(training_data)
    indices = np.random.choice(N, batch_size)
    #print ("%d indices Selected" ) % batch_size
    return [training_data[i] for i in indices]

# TensorFlow save model 
def save_model(session):
    if not os.path.exists('./model/'):
        os.mkdir('./model/')
    saver = tf.train.Saver()
    saver.save(session, './model/model.checkpoint')

In [22]:
#Read training data
features,ranks,queries = readDataset('./MSLR-WEB10K/Fold1/train.txt')
features=normalize_features(features)

# Making a dictionary:
dictio_quid = make_dictionary(features, ranks, queries)

# Getting the paris of features vectors:
training_data = get_pairs_features(dictio_quid)

# Sampling:
sampled_data = sampling_data(training_data, 10000)

# Separating into array to put in tensorflow
Xi, Xj, P_target = separate(sampled_data)
P_target_r= np.reshape(P_target, (-1, 1))

Reading training data from file...


In [4]:
# Validation set
val_features, val_ranks, val_queries = readDataset('./MSLR-WEB10K/Fold1/vali.txt')
val_features= normalize_features(val_features)
val_dictio_quid = make_dictionary(val_features, val_ranks, val_queries)
val_training_data = get_pairs_features(val_dictio_quid)
val_sampled_data = sampling_data(val_training_data, 10000)
val_Xi, val_Xj, val_P_target = separate(val_sampled_data)
val_P_target_r= np.reshape(val_P_target, (-1, 1))

Reading training data from file...


In [23]:
x_i = tf.placeholder("float", [None, 136])
x_j = tf.placeholder("float", [None, 136])
y_gold = tf.placeholder("float", [None, 1])

d_in = 136
d_hidden = 500
d_out = 1

W1 = tf.Variable(tf.random_normal([d_in, d_hidden], mean= 0.01, stddev= 0.01))
b1 = tf.Variable(tf.random_normal([d_hidden], mean= 0.01, stddev= 0.01))
W2 = tf.Variable(tf.random_normal([d_hidden, d_out], mean= 0.01, stddev= 0.01))
b2 = tf.Variable(tf.random_normal([d_out], mean= 0.01, stddev= 0.01))

a1_i = tf.matmul(x_i, W1)+ b1
z1_i = tf.sigmoid(a1_i)
o_i = tf.matmul(z1_i, W2)+ b2

a1_j = tf.matmul(x_j, W1)+ b1
z1_j = tf.sigmoid(a1_j)
o_j = tf.matmul(z1_j, W2)+ b2

o_ij = o_i- o_j
P_ij = tf.exp(o_ij)/ (1+ tf.exp(o_ij))

cross_entropy = -tf.reduce_sum(y_gold* tf.log(tf.clip_by_value(P_ij, 1e-10,1.0)))
optimiser = tf.train.GradientDescentOptimizer(0.015).minimize(cross_entropy)

prediction = (tf.round((tf.clip_by_value(P_ij, 1e-10,0.99) * 3)+ 0.5) -1)/ 2 #Round 0.33-> 0, 0.34-> 0.5

mistakes = tf.not_equal(y_gold, prediction)
accuracy = 1- tf.reduce_mean(tf.cast(mistakes, tf.float32))

In [27]:
# Train and test
batch_sz= 30
iter_= int(Xi.shape[0]/ batch_sz)
epoch= 1

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    init_dict= {x_i: Xi[:1000],
                x_j: Xj[:1000],
                y_gold: P_target_r[:1000]}
    print('Init train accuracy:', (sess.run(accuracy, feed_dict= init_dict)))
    
    test_dict= {x_i: val_Xi[1000:2000],
                x_j: val_Xi[1000:2000],
                y_gold: val_P_target_r[1000:2000]} 
    print('Init test accuracy:', (sess.run(accuracy, feed_dict= test_dict)))
    
    for e in range(epoch):
        e_loss= 0
        
        for i in range(iter_):
            i_loss= 0
            sta= i* batch_sz
            end= (i+ 1)* batch_sz
            iter_dict= {x_i: Xi[sta: end],
                        x_j: Xj[sta: end],
                        y_gold: P_target_r[sta: end]}
            sess.run(optimiser, feed_dict= iter_dict)
            e_loss+= sess.run(cross_entropy, feed_dict= iter_dict)
        
        if e% (epoch/ 10)== 0:
            print('Epoch', e, 'loss:', e_loss)
            print('Epoch train/test accur:', (sess.run(accuracy, feed_dict= init_dict)), (sess.run(accuracy, feed_dict= test_dict)))
        
    print('Final train accuracy:', (sess.run(accuracy, feed_dict= init_dict)))
    print('Final test accuracy:', (sess.run(accuracy, feed_dict= test_dict)))
    
    # TensorFlow save model
    if not os.path.exists('./model/'):
        os.mkdir('./model/')
    saver = tf.train.Saver()
    saver.save(sess, './model/model.checkpoint')

Init train accuracy: 0.335
Init test accuracy: 0.307
Epoch 0 loss: 22689.6815788
Epoch train/test accur: 0.346 0.307
Final train accuracy: 0.346
Final test accuracy: 0.307


In [28]:
# TensorFlow restore model
with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, './model/model.checkpoint')
    
    test_dict= {x_i: val_Xi[1000:2000],
                x_j: val_Xi[1000:2000],
                y_gold: val_P_target_r[1000:2000]} 
    test_predicted = sess.run(prediction, feed_dict=test_dict)
    test_accuracy = sess.run(accuracy, feed_dict=test_dict)

test_accuracy

0.30699998