In [None]:
import math
import numpy as np
import math
# import pandas
from optparse import OptionParser
from sklearn.tree import DecisionTreeRegressor
from sklearn import ensemble
from collections import defaultdict
from copy import deepcopy
from multiprocessing import Pool
from itertools import chain
import time
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
import os

In [None]:
# Functions to extract the documents, query and rank information
def extractFeatures(split):
    """Parse the feature values out of one tokenised dataset line.

    Tokens at positions 2..137 of ``split`` are each split on ':' and the
    second field is converted to ``float``.

    Returns:
        A list of 136 floats, in token order.
    """
    return [float(split[pos].split(':')[1]) for pos in range(2, 138)]

def extractQueryData(split):
    """Return the query identifier from one tokenised dataset line.

    ``split[1]`` is split on ':' and its second field is returned unchanged,
    i.e. as a string.
    """
    fields = split[1].split(':')
    return fields[1]

def readDataset(path):
    """Read a ranking dataset file into three parallel lists.

    Every non-blank line is whitespace-tokenised. Token 0 is parsed as the
    integer relevance label; the query id and the feature vector are obtained
    from ``extractQueryData`` and ``extractFeatures`` respectively.

    Args:
        path: location of the text file to read.

    Returns:
        ``(features_list, rank_list, query_list)`` with one entry per parsed
        line, in file order.
    """
    print('Reading training data from file...')
    features_list = []
    rank_list = []
    query_list = []
    with open(path, 'r') as handle:
        for line in handle:
            split = line.split()
            if not split:
                # Blank or whitespace-only lines hold no record; indexing
                # into the empty token list would raise IndexError.
                continue
            features_list.append(extractFeatures(split))
            rank_list.append(int(split[0]))
            query_list.append(extractQueryData(split))
    return features_list, rank_list, query_list


# Normalisation:
def normalize_features(features):
    """Standardise each feature column to zero mean and unit variance.

    Args:
        features: 2-D array-like, one row per sample and one column per
            feature.

    Returns:
        A new float ``numpy.ndarray`` of the same shape. Columns whose
        standard deviation is zero are only centred (they come out as all
        zeros) instead of being divided by zero.
    """
    features = np.array(features, dtype=float)

    # Subtracting the mean:
    centred = features - np.mean(features, axis=0)

    # Dividing by the std. A constant column has std == 0, which would turn
    # every value in that column into NaN; use a divisor of 1 for it instead.
    std_features = np.std(centred, axis=0)
    safe_std = np.where(std_features == 0, 1.0, std_features)
    return centred / safe_std


# We put everything in a dictionary: key = query_id, value = list of (features, rank, doc_id) tuples
def make_dictionary(features,ranks,queries):
    """Group documents by query id.

    The three inputs are iterated in lock-step. Each document is stored
    under its query id as ``(feature_vec, rank, doc_id)`` where ``doc_id``
    is the zero-based position of the document in the input sequences.

    Returns:
        A ``defaultdict(list)`` mapping query id to the list of its documents.
    """
    grouped = defaultdict(list)
    for position, (vec, label, qid) in enumerate(zip(features, ranks, queries)):
        grouped[qid].append((vec, label, position))
    return grouped

# For each query ID, we build document pairs (Xi, Xj, P_true, query_id, (doc_i, doc_j)) where P_true is either 0, 0.5 or 1
def get_pairs_features(dictio_quid_featsRank):
    """Build every within-query document pair together with its target.

    Args:
        dictio_quid_featsRank: mapping of query id to a list of entries whose
            first three elements are ``(features, rank, doc_id)``.

    Returns:
        A list of ``(X1, X2, target, query_id, (doc1_id, doc2_id))`` tuples,
        one per unordered pair ``i < j`` of documents of the same query.
        ``target`` is 0.5 when both ranks are equal, 1 when the first document
        has the higher rank and 0 otherwise.
    """
    data = []
    for key in dictio_quid_featsRank.keys():
        docs = dictio_quid_featsRank[key]
        n_docs = len(docs)

        for i in range(n_docs):
            X1, rank1, doc1_id = docs[i][0], docs[i][1], docs[i][2]

            for j in range(i + 1, n_docs):
                # The second document's id must come from entry j; reading it
                # from entry i made every pair look like (doc_i, doc_i).
                X2, rank2, doc2_id = docs[j][0], docs[j][1], docs[j][2]

                # Exactly one label per pair: a tie must not also fall through
                # to the "lower rank" branch and be added a second time as 0.
                if rank1 == rank2:
                    target = 0.5
                elif rank1 > rank2:
                    target = 1
                else:
                    target = 0
                data.append((X1, X2, target, key, (doc1_id, doc2_id)))
    return data

# Putting in the good format for tensorflow:
def separate(data):
    """Split pair records into five column arrays.

    Each record is indexed at positions 0..4, which correspond to
    ``(Xi, Xj, P_target, quid, doc_id)``.

    Returns:
        A 5-tuple of ``numpy.ndarray`` objects, one per position, each with
        one entry per record.

    NOTE(review): a later cell of this notebook defines another function
    called ``separate`` with a different contract; once that cell has run,
    the name no longer refers to this function.
    """
    columns = [[], [], [], [], []]
    for record in data:
        for slot, column in enumerate(columns):
            column.append(record[slot])
    return tuple(np.array(column) for column in columns)

# Sampling:
def sampling_data(training_data, batch_size):
    """Pick ``batch_size`` random items from ``training_data``.

    Indices are drawn with ``np.random.choice(len(training_data), batch_size)``
    (NumPy's default for that call is sampling with replacement, so an item
    may appear more than once).

    Returns:
        A list with the selected items, in the order they were drawn.
    """
    population = len(training_data)
    drawn = np.random.choice(population, batch_size)
    batch = []
    for idx in drawn:
        batch.append(training_data[idx])
    return batch

# TensorFlow save model 
def save_model(sess):
    """Write a checkpoint of the current default graph's variables.

    The ``./model/`` directory is created when it does not exist yet, then a
    fresh ``tf.train.Saver`` stores the session state under
    ``./model/model.checkpoint``.

    Args:
        sess: the TensorFlow session whose variables are saved.
    """
    print('Saving model...')
    model_dir = './model/'
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    tf.train.Saver().save(sess, './model/model.checkpoint')
    print('Model saved')
    
def load_model(session=None):
    """Restore variables from ``./model/model.checkpoint`` into a session.

    Args:
        session: the TensorFlow session to restore into. When omitted, the
            module-level variable ``sess`` is used, which is what this
            function always did before the parameter existed (a ``NameError``
            is raised if no such global is defined).

    Returns:
        The session the checkpoint was restored into.
    """
    print('Loading model...')
    if session is None:
        # Legacy call style ``load_model()``: fall back to the notebook-level
        # session instead of silently requiring it.
        session = sess
    saver = tf.train.Saver()
    saver.restore(session, './model/model.checkpoint')
    print('Model loaded')
    return session

In [None]:
# Read training data and turn it into sampled document pairs.
# NOTE(review): the file loaded here is Fold1/vali.txt, not a train split --
# confirm that this is intentional.
features, ranks, queries = readDataset('LEMUR/MSLR-WEB10K/Fold1/vali.txt')
# Standardise the feature columns using this file's own mean/std.
features = normalize_features(features)
# query id -> list of (features, rank, doc_id)
dictio_quid = make_dictionary(features, ranks, queries)
# All within-query pairs with their 0 / 0.5 / 1 target.
training_data = get_pairs_features(dictio_quid)
# Keep a random sample of 10000 pairs. No random seed is set in the visible
# cells, so the sample differs between runs.
sampled_data = sampling_data(training_data, 10000)
Xi, Xj, P_target, quid, doc_id  = separate(sampled_data)
# Column vectors of shape (-1, 1), matching the [None, 1] placeholders
# (y_gold, q_id) declared in TF_RankNet.initilise_model.
P_target_r= np.reshape(P_target, (-1, 1))
quid_r= np.reshape(quid, (-1, 1))

In [None]:
# Validation set: same pipeline as the training cell, applied to a second file.
# NOTE(review): the file loaded here is Fold1/test.txt although the variables
# are named val_* -- confirm which split is intended.
val_features, val_ranks, val_queries = readDataset('LEMUR/MSLR-WEB10K/Fold1/test.txt')
# NOTE(review): features are standardised with this file's own mean/std, not
# with the statistics of the training data -- confirm this is wanted.
val_features = normalize_features(val_features)
val_dictio_quid = make_dictionary(val_features, val_ranks, val_queries)
val_training_data = get_pairs_features(val_dictio_quid)
val_sampled_data = sampling_data(val_training_data, 10000)
# separate() returns five arrays (Xi, Xj, P_target, quid, doc_id); unpacking
# only three of them raised "too many values to unpack".
val_Xi, val_Xj, val_P_target, val_quid, val_doc_id = separate(val_sampled_data)
# Column vector, matching the [None, 1] y_gold placeholder of TF_RankNet.
val_P_target_r = np.reshape(val_P_target, (-1, 1))

In [None]:
def reorder_dictio(dictio_eval):
    """Sort every query's entries by their second element, highest first.

    The mapping is modified in place (each value is replaced by a new sorted
    list) and the same mapping object is returned. Entries with an equal
    second element keep their relative order.
    """
    def second_field(entry):
        return entry[1]

    for qid in dictio_eval.keys():
        ranked = sorted(dictio_eval[qid], key=second_field, reverse=True)
        dictio_eval[qid] = ranked
    return dictio_eval

def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Discounted cumulative gain of the ``k`` highest-scored items.

    Args:
        y_true: relevance label of each item.
        y_score: score of each item; items are ranked by descending score.
        k: number of top-ranked items that contribute.
        gains: ``"exponential"`` uses ``2 ** label - 1`` as gain,
            ``"linear"`` uses the label itself.

    Returns:
        The sum over the selected items of ``gain / log2(position + 1)``,
        with positions starting at 1.

    Raises:
        ValueError: if ``gains`` is neither of the two supported options.
    """
    ranking = np.argsort(y_score)[::-1]
    top_labels = np.take(y_true, ranking[:k])

    if gains == "exponential":
        numerators = 2 ** top_labels - 1
    elif gains == "linear":
        numerators = top_labels
    else:
        raise ValueError("Invalid gains option.")

    # Position 1 is discounted by log2(2), hence the offset of 2.
    denominators = np.log2(np.arange(len(top_labels)) + 2)
    return np.sum(numerators / denominators)

def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalised DCG: the DCG of ``y_score`` divided by the ideal DCG.

    The ideal DCG is obtained by ranking the items by their true labels.
    No special handling is done when the ideal DCG is zero; the result of
    the division is returned as is.
    """
    ideal = dcg_score(y_true, y_true, k, gains)
    achieved = dcg_score(y_true, y_score, k, gains)
    ratio = achieved / ideal
    return ratio

def separate(relevance_tuple):
    """Split ``(predicted, true)`` pairs into two lists.

    Returns:
        ``(y_true, y_pred)`` -- note the order: element 1 of every pair goes
        into the first list and element 0 into the second.

    NOTE(review): this definition reuses the name of the pair-splitting
    ``separate(data)`` function defined earlier in the notebook and replaces
    it once this cell has been executed.
    """
    pairs = [(tup[0], tup[1]) for tup in relevance_tuple]
    y_pred = [predicted for predicted, _ in pairs]
    y_true = [actual for _, actual in pairs]
    return y_true,y_pred

def ndcg(dictio_eval):
    """Compute one NDCG value per query.

    Args:
        dictio_eval: mapping of query id to a list of entries whose first two
            elements are ``(predicted_score, true_relevance)``. The mapping is
            passed through ``reorder_dictio`` first, which re-sorts every list
            in place.

    Returns:
        A list with the ``ndcg_score`` of every query, in the mapping's key
        order.
    """
    ordered = reorder_dictio(dictio_eval)
    scores = []
    for qid in ordered.keys():
        entries = ordered[qid]
        # Read the two fields directly; building two pandas DataFrames per
        # query only to iterate over their first two columns was pure overhead.
        r_pred = [entry[0] for entry in entries]
        r_true = [entry[1] for entry in entries]
        scores.append(ndcg_score(r_true, r_pred))
    return scores

In [None]:
# Functions to calculate ERR:
# GAMMA is the default value of the `gamma` argument of ERR(), which forwards
# it to get_ERR(); get_ERR() never reads `gamma`, so the value currently has
# no influence on the result.
GAMMA=0.450

## Different:
def get_proba(list_tuples,bins):
    """Map the first element of every tuple to ``(2 ** value - 1) / 2 ** 4``.

    Args:
        list_tuples: sequence of tuples; only element 0 of each is used.
        bins: accepted but not used by this function.

    Returns:
        A list with one value per input tuple, in input order.

    NOTE(review): the fixed denominator 2 ** 4 presumably corresponds to a
    maximum relevance grade of 4 -- confirm.
    """
    denominator = np.power(2,4)
    scores = [entry[0] for entry in list_tuples]
    return [(np.power(2, score) - 1) / denominator for score in scores]

def get_ERR(list_proba,n=10,gamma=0.5):
    """Expected reciprocal rank of a ranked list of stop probabilities.

    The item at 1-based position ``r`` contributes
    ``p_r * prod_{s < r}(1 - p_s) / r``.

    Args:
        list_proba: per-position probabilities, best-ranked position first.
        n: accepted for compatibility; not used (no cut-off is applied).
        gamma: accepted for compatibility; not used.

    Returns:
        The sum of all contributions, or ``0.0`` for an empty list (which
        previously raised ``IndexError``).
    """
    if len(list_proba) == 0:
        return 0.0

    # Position 1 is always reached, so it contributes its probability as is.
    err = list_proba[0]
    reach_proba = 1
    for rank in range(2, len(list_proba) + 1):
        # Probability that none of the positions before `rank` stopped the user.
        reach_proba = (1 - list_proba[rank - 2]) * reach_proba
        err += list_proba[rank - 1] * reach_proba / rank
    return err

def ERR(dictio_eval,n=10,gamma=GAMMA):
    """Compute the ERR value of every query.

    Args:
        dictio_eval: mapping of query id to the list of tuples handed to
            ``get_proba``.
        n: forwarded to ``get_ERR``.
        gamma: forwarded to ``get_ERR``.

    Returns:
        A list with one ERR value per query, in the mapping's key order.
    """
    list_ERR = []
    for key in dictio_eval.keys():
        # get_proba() ignores its `bins` argument, so None is passed instead
        # of calling get_bins(), which is not defined anywhere in this notebook
        # and made this function fail with NameError.
        list_proba = get_proba(dictio_eval[key], None)
        list_ERR.append(get_ERR(list_proba, n, gamma))
    return list_ERR

In [None]:
def evaluate(model, dictio_val, mean_Xval, std_Xval):
    """Score every document of every query with ``model``.

    Args:
        model: object exposing ``predict(X)`` for a ``(1, n_features)`` array
            and returning something indexable, whose element 0 is kept.
        dictio_val: mapping of query id to a list of entries whose first two
            elements are ``(features, relevance)``. Extra trailing elements
            (e.g. the doc id stored by ``make_dictionary``) are ignored.
        mean_Xval: value(s) subtracted from every feature vector.
        std_Xval: value(s) every centred feature vector is divided by.

    Returns:
        A ``defaultdict(list)`` mapping query id to a list of
        ``(prediction, relevance)`` tuples, in input order.
    """
    dictio_evaluation = defaultdict(list)

    for key in dictio_val.keys():
        for entry in dictio_val[key]:
            # Index instead of unpacking into two names: entries produced by
            # make_dictionary have three elements and would not unpack.
            features_vec, relevance = entry[0], entry[1]

            # Normalise and reshape to a single-row 2-D array for predict().
            features_norm = (np.array(features_vec) - mean_Xval) / std_Xval
            features_norm = features_norm.reshape(1, -1)

            prediction = model.predict(features_norm)
            dictio_evaluation[key].append((prediction[0], relevance))

    return dictio_evaluation

In [None]:
class TF_RankNet():
    """Pairwise ranking network (RankNet-style) on the TensorFlow 1.x graph API.

    A scorer with one hidden layer is applied, with shared weights, to two
    136-dimensional document vectors. The doubled score difference is passed
    through a sigmoid to give ``P_ij``, which is fitted to the pair label with
    a mean-squared-error loss.

    Every method that executes the graph (``train_1``, ``train_full``,
    ``predict``, ``test_pointwise``) reads a module-level variable named
    ``sess``; a session must be bound to that name before they are called.
    """

    def __init__(self):
        """Create an empty wrapper; ``initilise_model`` builds the graph."""
        # The former ``self.self = self`` was never read and only created a
        # reference cycle, so nothing is set here.
        pass

    def initilise_model(self):
        """Reset the default graph and build placeholders, network and ops."""

        tf.reset_default_graph()

        # Inputs: the two documents of a pair, the pair label and the query id.
        self.x_i = tf.placeholder("float", [None, 136])
        self.x_j = tf.placeholder("float", [None, 136])
        self.y_gold = tf.placeholder("float", [None, 1])
        self.q_id = tf.placeholder("string", [None, 1])

        d_in = 136
        d_hidden1 = 500
        d_out = 1

        self.W1 = tf.Variable(tf.random_normal([d_in, d_hidden1], mean= 0, stddev= 0.01))
        self.b1 = tf.Variable(tf.random_normal([d_hidden1], mean= 1, stddev= 0.01))
        self.W2 = tf.Variable(tf.random_normal([d_hidden1, d_out], mean= 0, stddev= 0.01))
        self.b2 = tf.Variable(tf.random_normal([d_out], mean= 1, stddev= 0.01))

        # Score of document i.
        self.a1_i = tf.matmul(self.x_i, self.W1)+ self.b1
        self.z1_i = tf.nn.tanh(self.a1_i)
        self.a2_i = tf.matmul(self.z1_i, self.W2)+ self.b2
        self.o_i = tf.nn.tanh(self.a2_i)

        # Score of document j, computed with the same weights.
        self.a1_j = tf.matmul(self.x_j, self.W1)+ self.b1
        self.z1_j = tf.nn.tanh(self.a1_j)
        self.a2_j = tf.matmul(self.z1_j, self.W2)+ self.b2
        self.o_j = tf.nn.tanh(self.a2_j)

        # Pair output: sigmoid of twice the score difference.
        self.o_ij = tf.multiply(tf.subtract(self.o_i, self.o_j), 2)
        self.P_ij = tf.sigmoid(self.o_ij)

        self.loss = tf.reduce_mean(tf.losses.mean_squared_error(self.y_gold, self.P_ij))

        self.global_step = tf.Variable(0, trainable=False)
        starter_learning_rate = 0.0001
        self.learning_rate = tf.train.exponential_decay(starter_learning_rate, self.global_step, 100000, 0.96, staircase=True)

        # global_step has to be handed to minimize(): that is what increments
        # it on every update. Without it the counter stayed at 0 and the
        # decay schedule above never took effect.
        self.optimiser = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

        # Quantise P_ij onto the three label values 0, 0.5 and 1.
        self.prediction = (tf.round((tf.clip_by_value(self.P_ij, 0.01, 0.99) * 3)+ 0.5) -1)/ 2 

        self.mistakes = tf.not_equal(self.y_gold, self.prediction)
        self.accuracy = 1- tf.reduce_mean(tf.cast(self.mistakes, tf.float32))

        self.saver = tf.train.Saver()

    def train_1(self, x_i_train, x_j_train, y_train, quid, epoch):
        """Run ``epoch`` optimiser steps on the first 10 rows of the inputs.

        NOTE(review): the log line says "all data", but every input is sliced
        to ``[:10]`` -- confirm which of the two is intended.
        """
        print('Train with all data, unbatched')

        self.train_dict= {self.x_i: x_i_train[:10],
                          self.x_j: x_j_train[:10],
                          self.y_gold: y_train[:10], 
                          self.q_id: quid[:10]} 
        print('Initial training acc %s' % (sess.run(self.accuracy, feed_dict= self.train_dict)))
        for i in range(epoch):
            sess.run(self.optimiser, feed_dict= self.train_dict)
        print('Final training acc %s' % (sess.run(self.accuracy, feed_dict= self.train_dict)))

    def train_full(self, x_i_train, x_j_train, y_train, epoch, batch_sz, x_valid= None, y_valid= None):
        """Mini-batch training over ``epoch`` passes.

        Args:
            x_i_train, x_j_train: arrays with the two documents of each pair.
            y_train: pair labels, one row per pair.
            epoch: number of passes over the data.
            batch_sz: rows per batch. Only ``int(rows / batch_sz)`` full
                batches are used per pass; trailing rows are skipped.
            x_valid, y_valid: accepted but not used.

        The value printed per epoch is the mean of the per-batch losses
        (``nan`` when there is no full batch).
        """

        self.iter_= int(x_i_train.shape[0]/ batch_sz)

        self.train_dict= {self.x_i: x_i_train,
                          self.x_j: x_j_train,
                          self.y_gold: y_train} 

        print('Training: Iters: %s. Epoch:  %s' % (self.iter_, epoch))
        print('Initial training acc %s' % (sess.run(self.accuracy, feed_dict= self.train_dict)))

        for e in range(epoch):
            e_loss= 0

            for i in range(self.iter_):
                lo, hi = i* batch_sz, (i+ 1)* batch_sz
                iter_dict= {self.x_i: x_i_train[lo:hi],
                            self.x_j: x_j_train[lo:hi],
                            self.y_gold: y_train[lo:hi]}

                sess.run(self.optimiser, feed_dict= iter_dict)
                e_loss+= sess.run(self.loss, feed_dict= iter_dict)

            # Report the accumulated loss. The previous code re-evaluated the
            # last batch via `iter_dict`, which is undefined (NameError) when
            # there are no batches, and never used e_loss.
            mean_loss = e_loss / self.iter_ if self.iter_ else float('nan')
            print('Epoch %s, loss %s' % (e, mean_loss))
        print('Training accuracy %s' % (sess.run(self.accuracy, feed_dict= self.train_dict)))

    def predict(self, x_i_train, x_j_train, quid):
        """Evaluate the model on document pairs.

        Returns:
            ``(quantised_prediction, P_ij, q_id)`` as produced by the graph
            for the given inputs.
        """
        valid_dict= {self.x_i: x_i_train,
                     self.x_j: x_j_train,
                     self.q_id: quid}
        # One run() with three fetches instead of three separate runs of the
        # same graph on the same feed.
        predict_1, p_ij, q_id_out = sess.run([self.prediction, self.P_ij, self.q_id],
                                             feed_dict= valid_dict)

        return predict_1, p_ij, q_id_out

    def test_pointwise(self, x_i_train, x_j_train, y_train):
        """Print the accuracy of the quantised pair predictions."""
        self.valid_dict= {self.x_i: x_i_train,
                          self.x_j: x_j_train,
                          self.y_gold: y_train} 
        val_accuracy = (sess.run(self.accuracy, feed_dict= self.valid_dict))
        print('Test accuracy: %s' % (val_accuracy))

    def save_model(self, sess, dir_name):
        """Save a checkpoint to ``./<dir_name>/model.checkpoint``.

        The directory is created when it does not exist. Unlike the graph
        methods, the session is passed in explicitly here.
        """
        print('Saving model...')
        if not os.path.exists('./'+ dir_name+ '/'):
            os.mkdir('./'+ dir_name+ '/')
        saver = tf.train.Saver()
        saver.save(sess, './'+ dir_name+ '/model.checkpoint')
        print('Model saved')

    def load_model(self, sess, dir_name):
        """Restore ``./<dir_name>/model.checkpoint`` into ``sess`` and return it."""
        print('Loading model...')
        saver = tf.train.Saver()
        saver.restore(sess, './'+ dir_name+ '/model.checkpoint')
        print('Model loaded')
        return sess

In [None]:
print('---Start---')

model1 = TF_RankNet()
model1.initilise_model()

# TF_RankNet's graph-running methods read the module-level name `sess`, so the
# session has to be bound to exactly that name.
with tf.Session() as sess:

    print('1. Initialise')
    sess.run(tf.global_variables_initializer())
    #model1.load_model(sess, 'TF_RankNet')

    # test_pointwise() takes (x_i, x_j, y). The previous two-argument call
    # with tes_features / tes_ranks_enc referenced names that are not defined
    # in this notebook; use the held-out pair arrays instead.
    model1.test_pointwise(val_Xi, val_Xj, val_P_target_r)

    # TODO: ranking metrics are disabled. evaluate() calls model.predict(X)
    # with a single argument, which does not match TF_RankNet.predict, and
    # tes_queries / mean_Xval / std_Xval are not defined. The print is kept
    # commented out together with its inputs so the cell no longer fails on
    # the undefined nDGC_eval_lst / ERR_eval_lst.
    #pred_dictio_eval = evaluate(model1, tes_queries, mean_Xval, std_Xval)
    #nDGC_eval_lst = ndcg(pred_dictio_eval)
    #ERR_eval_lst = ERR(pred_dictio_eval)
    #print('Pre-train nDGC tes: %s, ERR: %s' % (np.nanmean(nDGC_eval_lst), np.nanmean(ERR_eval_lst)))

    print('2. Train')
    # train_full(x_i, x_j, y, epoch, batch_sz): 10 epochs, batches of 30 pairs.
    model1.train_full(Xi, Xj, P_target_r, 10, 30)

    print('3. Test')
    model1.test_pointwise(val_Xi, val_Xj, val_P_target_r)

    #pred_dictio_eval = evaluate(model1, tes_queries, mean_Xval, std_Xval)
    #nDGC_eval_lst = ndcg(pred_dictio_eval)
    #ERR_eval_lst = ERR(pred_dictio_eval)
    #print('Post-train nDGC tes: %s, ERR: %s' % (np.nanmean(nDGC_eval_lst), np.nanmean(ERR_eval_lst)))

    model1.save_model(sess, 'TF_RankNet')

print('---End---')