In [None]:
import sys,os

import numpy as np
import six
import pickle
import scipy
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import optimizers
from chainer import serializers
from tqdm import tqdm
import scipy.stats as ss
from sklearn.preprocessing import StandardScaler


In [None]:
from itertools import combinations
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np


class Net(nn.Module):
    """Small scoring network: Linear(D, 10) -> sigmoid -> Linear(10, 1).

    Args:
        D: number of input features per row.
    """

    def __init__(self, D):
        super().__init__()
        # Attribute names are part of the state-dict keys; keep them as l1 / l2.
        self.l1 = nn.Linear(D, 10)
        self.l2 = nn.Linear(10, 1)

    def forward(self, x):
        """Return one raw (unbounded) score per input row."""
        hidden = self.l1(x).sigmoid()
        return self.l2(hidden)


def listnet_loss(y_i, z_i):
    """ListNet top-1 cross-entropy between target and predicted score lists.

    Both inputs are flattened to 1-D before the softmax, so ``(n,)`` and
    ``(n, 1)`` tensors can be mixed freely. Without this, an ``(n,)`` target
    multiplied by an ``(n, 1)`` prediction broadcasts to ``(n, n)`` and the
    returned value is no longer the ListNet loss.

    Args:
        y_i: target scores for one list, ``n`` elements in any shape.
        z_i: predicted scores for the same list, ``n`` elements in any shape.

    Returns:
        Scalar tensor ``-sum(softmax(y_i) * log_softmax(z_i))``.

    Raises:
        ValueError: if the two inputs do not hold the same number of elements.
    """
    targets = y_i.reshape(-1)
    scores = z_i.reshape(-1)
    if targets.shape[0] != scores.shape[0]:
        raise ValueError(
            f"listnet_loss: size mismatch ({targets.shape[0]} targets vs "
            f"{scores.shape[0]} predictions)"
        )

    P_y_i = F.softmax(targets, dim=0)
    # log_softmax stays finite where log(softmax(.)) would produce -inf / NaN.
    log_P_z_i = F.log_softmax(scores, dim=0)
    return - torch.sum(P_y_i * log_P_z_i)


def make_dataset(N_train, N_valid, D):
    """Build a synthetic ranking dataset from a random linear scorer.

    A random weight vector scores standard-normal feature rows; the scores
    are then bucketed into 5 relevance grades (0..4) with fixed bin edges.

    Args:
        N_train: number of training rows.
        N_valid: number of validation rows.
        D: number of features.

    Returns:
        ``(X_train, X_valid, ys_train_rel, ys_valid_rel)`` where the feature
        tensors have ``requires_grad=True`` and the relevance tensors are
        float tensors of shape ``(N, 1)``.
    """
    # Draw order (weights, train, valid) is kept so seeded runs are unchanged.
    true_weights = torch.randn(D, 1)
    X_train = torch.randn(N_train, D, requires_grad=True)
    X_valid = torch.randn(N_valid, D, requires_grad=True)

    relevance_edges = [-2, -1, 0, 1]  # 4 edges -> 5 relevance grades

    def to_relevance(features):
        raw_scores = torch.mm(features, true_weights)
        graded = np.digitize(raw_scores.clone().detach().numpy(), bins=relevance_edges)
        return torch.Tensor(graded)

    ys_train_rel = to_relevance(X_train)
    ys_valid_rel = to_relevance(X_valid)
    return X_train, X_valid, ys_train_rel, ys_valid_rel


def swapped_pairs(ys_pred, ys_target):
    """Count item pairs that the predictions order opposite to the targets.

    A pair (i, j) is counted when the targets are strictly ordered one way
    and the predictions are strictly ordered the other way. Pairs tied in
    either the targets or the predictions are not counted.

    Args:
        ys_pred: predicted scores, indexable by item position.
        ys_target: target relevances; ``ys_target.shape[0]`` gives the item count.

    Returns:
        Number of discordant pairs as a Python int.
    """
    n_items = ys_target.shape[0]
    discordant = 0
    for first, second in combinations(range(n_items), 2):
        t_first, t_second = ys_target[first], ys_target[second]
        if t_first < t_second:
            is_swapped = ys_pred[first] > ys_pred[second]
        elif t_first > t_second:
            is_swapped = ys_pred[first] < ys_pred[second]
        else:
            is_swapped = False
        if is_swapped:
            discordant += 1
    return discordant


def ndcg(ys_true, ys_pred):
    """Normalised DCG of the ranking induced by ``ys_pred``.

    Gains are ``2**relevance - 1`` and the discount at 1-based rank ``i`` is
    ``log2(1 + i)``. Inputs are flattened, so ``(N,)`` and ``(N, 1)`` tensors
    are both accepted and the result is always a 0-d tensor.

    Args:
        ys_true: true relevance grades, ``N`` elements.
        ys_pred: predicted scores, ``N`` elements; higher means ranked earlier.

    Returns:
        0-d tensor with DCG(pred order) / DCG(ideal order). When the ideal
        DCG is 0 (all relevances are 0, or the input is empty) the result is
        0 instead of NaN.
    """
    def dcg(ys_true, ys_pred):
        relevance = ys_true.reshape(-1)
        scores = ys_pred.reshape(-1)
        if not torch.is_floating_point(relevance):
            # Integer grades would make the division below integer-typed.
            relevance = relevance.float()
        _, order = torch.sort(scores, descending=True, dim=0)
        gains = 2 ** relevance[order] - 1
        # Ranks are 1-based, so the discounts are log2(2), log2(3), ...
        positions = torch.arange(
            2, gains.shape[0] + 2, dtype=gains.dtype, device=gains.device
        )
        return torch.sum(gains / torch.log2(positions))

    ideal_dcg = dcg(ys_true, ys_true)
    if ideal_dcg == 0:
        # No relevant item at all: define NDCG as 0 rather than 0 / 0.
        return torch.zeros_like(ideal_dcg)
    pred_dcg = dcg(ys_true, ys_pred)
    return pred_dcg / ideal_dcg

In [None]:
from sklearn.datasets import load_svmlight_file
# Load the SVMlight-format training file; query_id=True also returns the
# per-row query ids alongside the features X and labels y.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing it.
X, y, query_ids = load_svmlight_file('/Users/lmac/Documents/inf_search/learning_to_rank/l2r/train.txt', query_id=True)

In [None]:
# Replace the loaded feature matrix with its dense form (it is converted to a
# torch tensor further down).
# NOTE(review): X is overwritten in place, so re-running this cell depends on
# the already-converted object still offering .todense() — confirm.
X = X.todense()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10% of the rows, with a fixed seed for repeatability.
# NOTE(review): the split is done per row and does not use query_ids, so rows
# of one query may land on both sides — confirm this is intended for ranking.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=100)

In [None]:
# Convert the numpy splits to float tensors; the held-out split is referred to
# as "valid" from here on.
X_train = torch.from_numpy(X_train).float()
X_valid = torch.from_numpy(X_test).float()
ys_train = torch.from_numpy(y_train).float()
ys_valid = torch.from_numpy(y_test).float()

In [None]:
import tqdm
# Train the torch ListNet scorer with mini-batches, then report validation
# swapped pairs and NDCG after every epoch.
N_train = X_train.size()[0]
N_valid = X_valid.size()[0]
D = X_train.size()[1]
epochs = 10
batch_size = 10000

net = Net(D)
opt = optim.Adam(net.parameters())

for epoch in range(epochs):
    # New random order every epoch. Batches index through the permutation, so
    # X_train / ys_train themselves are never overwritten.
    idx = torch.randperm(N_train)

    # torch.split also yields the final, shorter chunk, so no rows are dropped
    # and training still happens when N_train < batch_size.
    for batch_idx in tqdm.tqdm(torch.split(idx, batch_size)):
        if batch_idx.numel() == 0:
            continue
        batch_X = X_train[batch_idx]
        batch_ys = ys_train[batch_idx]

        opt.zero_grad()
        batch_pred = net(batch_X)
        batch_loss = listnet_loss(batch_ys, batch_pred)
        # Each batch builds a fresh graph, so it does not need to be retained.
        batch_loss.backward()
        opt.step()

    with torch.no_grad():
        valid_pred = net(X_valid)
        valid_swapped_pairs = swapped_pairs(valid_pred, ys_valid)
        ndcg_score = ndcg(ys_valid, valid_pred).item()
        print(f"epoch: {epoch + 1} valid swapped pairs: {valid_swapped_pairs}/{N_valid * (N_valid - 1) // 2} ndcg: {ndcg_score:.4f}")

In [None]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import ast
import time
from sklearn import preprocessing

In [None]:
from tqdm import tqdm_notebook as tqdm


In [None]:
import pandas as pd
import numpy as np
import os

def get_list_xy(data,query_id='qid',relevancy='relevance degree'):
    """
    Split a ranking dataframe into per-query feature and label lists.

    Queries are emitted in order of first appearance in ``data``. Rows are
    grouped in a single ``groupby`` pass rather than one boolean-mask scan
    of the whole frame per query.

    Args:
        data: DataFrame holding the ``query_id`` column, the ``relevancy``
            column and the feature columns.
        query_id: name of the query-id column.
        relevancy: name of the relevance-label column.

    Returns:
        ``(x_list, y_list)``: two parallel lists with one entry per query.
        ``x_list[k]`` is a DataFrame of that query's rows without the
        ``query_id`` and ``relevancy`` columns; ``y_list[k]`` is the matching
        ``relevancy`` Series. Original row indices are preserved.
        Rows whose query id is NaN are skipped (pandas' groupby default).
    """
    x_list=[]
    y_list=[]
    # sort=False keeps first-appearance order instead of sorting the query ids.
    for _, data_q in data.groupby(query_id, sort=False):
        x_list.append(data_q.drop([query_id,relevancy],axis=1))
        y_list.append(data_q[relevancy])

    return x_list,y_list

def create_folder(directory):
    """
    Create ``directory`` (with any missing parents) unless the path already
    exists, and print which of the two happened.
    """
    if os.path.exists(directory):
        print("The directory",directory," already exists")
        return

    os.makedirs(directory)
    print("The directory",directory," did not exist, and was created")
        
def create_folders(path,new_folders):
    """
    Ensure every folder in ``new_folders`` exists under ``path``.

    Each target is the plain string concatenation ``path+folder``; no path
    separator is inserted, so ``path`` must already end with one if the
    folders are meant to be children of it.
    """
    for target in (path+folder for folder in new_folders):
        create_folder(target)

In [None]:
from keras.models import load_model
from keras.initializers import glorot_uniform
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from keras.layers import Dropout
import tensorflow.keras.backend as K
import random
import numpy as np
import tensorflow as tf
import keras
from keras.initializers import Constant
from tqdm import tqdm_notebook as tqdm

import time


def Pz_keras(y_pred):
    '''
    Top1 probability, described in the original ListNet paper.

    Computes exp(y_pred) / sum(exp(y_pred)) over every element of y_pred.
    The scores are shifted by their maximum first; the ratio is unchanged by
    the shift, but exp() can then no longer overflow for large scores.
    '''
    # The shift is a constant for the ratio, so keep it out of the gradient.
    shifted = y_pred - K.stop_gradient(K.max(y_pred))
    exp_scores = K.exp(shifted)
    return exp_scores/K.sum(exp_scores)


def Loss_query_keras(y,y_pred):
    '''
    ListNet loss for one query: cross-entropy between the top-1 probability
    distributions of the true scores y and the predicted scores y_pred.
    '''
    target_dist = Pz_keras(y)
    log_pred_dist = K.log(Pz_keras(y_pred))
    return -K.sum(target_dist*log_pred_dist)

def create_model(number_of_features,
                 optimizer,
                 initializer=tf.keras.initializers.glorot_uniform(seed=1),
                 neurons_per_layer=[1],
                 activation_function='relu',
                 final_activation='sigmoid',
                 dropout=0,
                 bias=0.1):
    '''
    Build and compile a dense Keras scorer trained with Loss_query_keras.

    number_of_features : input dimension of the first layer.
    optimizer          : optimizer handed to compile().
    initializer        : kernel initializer used by every Dense layer.
    neurons_per_layer  : width of each hidden layer, first entry first.
    activation_function: activation of the layers listed in neurons_per_layer.
    final_activation   : activation of the trailing 1-unit output layer.
    dropout            : rate of the Dropout layers.
    bias               : constant used to initialise every bias.

    The trailing Dense(1, final_activation) layer is only appended when there
    is more than one entry in neurons_per_layer or the last entry is > 1.
    With neurons_per_layer=[1] the single Dense(1, activation_function) layer
    is the whole model and final_activation is not used.
    '''
    def dense(units, activation, **extra):
        # Every layer shares the same kernel / bias initialisation scheme.
        return Dense(units,
                     activation=activation,
                     kernel_initializer=initializer,
                     bias_initializer=Constant(value=bias),
                     **extra)

    first_width = neurons_per_layer[0]
    modelq = Sequential()
    modelq.add(dense(first_width, activation_function, input_dim=number_of_features))
    if first_width > 1:
        modelq.add(Dropout(dropout))

    needs_output_layer = len(neurons_per_layer) > 1 or neurons_per_layer[-1] > 1
    if needs_output_layer:
        for width in neurons_per_layer[1:]:
            modelq.add(dense(width, activation_function))
            modelq.add(Dropout(dropout))

        modelq.add(dense(1, final_activation))

    modelq.compile(loss=Loss_query_keras, optimizer=optimizer)
    return modelq

def generate_predictions_grid(path,
                              data_vali,
                              data_train,
                              data_test,
                              x_list_train,
                              y_list_train,
                              epochs=[50],
                              learning_rate=[0.075],
                              mon=[0.5],
                              act=['relu'],
                              number_neurons=[1],
                              dropout=[0.5],
                              final_activ=['linear'],
                              hidden_layers=[0],
                              name='',
                              index=0.5,
                              decresing_architecture=False,
                              show_summary=False,
                              save_model=False):
    '''
    Grid-search ListNet models and write their predictions to disk.

    One model is trained for every combination of epochs, learning_rate, mon
    (SGD momentum), act, number_neurons, dropout, final_activ and
    hidden_layers. Training calls train_on_batch once per query per epoch.
    When the final activation is 'sigmoid' the targets are divided by 2.

    path                  : prefix for output files; it is concatenated as-is
                            with 'predictions/...' and 'models/...'.
    data_vali/train/test  : DataFrames holding 'relevance degree' and 'qid'
                            columns plus the features; used for prediction.
    x_list_train,
    y_list_train          : per-query features / targets used for training.
    name, index           : each run is named name+str(index); index is
                            incremented by 1 after every run.
    decresing_architecture: halve the width of each successive hidden layer
                            and drop layers whose width falls to 1 or below.
    show_summary          : print the Keras model summary for every run.
    save_model            : also save each model to
                            path+'models/model_<name>.h5'.

    Writes path+'predictions/y_{train,vali,test}_<name>.txt' for every run.
    Returns None.
    '''
    # Imported here because the notebook only imports `product` in a later
    # cell; without this a fresh kernel raises NameError on the first call.
    from itertools import product

    t_ini=time.time()
    total_it=len(epochs)*len(learning_rate)*len(mon)*\
    len(act)*len(number_neurons)*len(dropout)*\
    len(final_activ)*len(hidden_layers)

    print("\n\n\nName of predictions will be of form: "+\
          name+"<index>,\nwith <index> starting at",index,
          "finishing at",index+total_it-1,"\n\n\n_")
    name_orig=name

    def save_predictions(modelq, frame, split, run_name):
        # Predict on the feature columns only and store one score per line.
        features=frame.drop(['relevance degree','qid'],axis=1)
        y_pred=modelq.predict(features).astype('float64').ravel()
        pd.DataFrame(y_pred).to_csv(path+'predictions/y_'+split+'_'+run_name+'.txt',
                                    sep=' ',header=False,index=False)

    grid=product(epochs,learning_rate,mon,act,number_neurons,
                 dropout,final_activ,hidden_layers)
    for counter,my_iter in enumerate(grid,1):
        name=name_orig+str(index)
        (n_iter,n_,mom_,act_,n_neurons_,
         dropout_,final_act,hidden_layers_)=my_iter
        opt = SGD(lr=n_, momentum=mom_)

        if(hidden_layers_>0):
            neurons_per_layer=np.repeat(n_neurons_,hidden_layers_)
            if(decresing_architecture):
                # Layer k gets width n_neurons_ // 2**k; widths <= 1 are dropped.
                neurons_per_layer=np.array([neuron//2**depth for depth,neuron in \
                                            enumerate(neurons_per_layer)])
                neurons_per_layer=neurons_per_layer[neurons_per_layer>1]
        else:
            neurons_per_layer=[n_neurons_]

        modelq=create_model(optimizer=opt,
                        number_of_features=x_list_train[0].shape[1],
                        neurons_per_layer=neurons_per_layer,
                        activation_function=act_,
                        final_activation=final_act,
                        dropout=dropout_)
        if(show_summary):
            # summary() prints by itself and returns None, so nothing is displayed.
            modelq.summary()

        # A sigmoid output is fit against targets divided by 2.
        if final_act=='sigmoid':
            targets=[y_q/2 for y_q in y_list_train]
        else:
            targets=y_list_train

        for j in tqdm(range(n_iter)):
            for x_q,y_q in zip(x_list_train,targets):
                modelq.train_on_batch(x=x_q,y=y_q)

        print("Iteration:",counter,"/",total_it)
        print("Time elapsed so far:")
        print(convert_to_time(time.time()-t_ini))
        print(my_iter,"\n\n\n")
        if(save_model):
            modelq.save(path+'models/model_'+name+'.h5')

        save_predictions(modelq,data_train,'train',name)
        save_predictions(modelq,data_vali,'vali',name)
        save_predictions(modelq,data_test,'test',name)

        index+=1


def convert_to_time(seconds):
    '''
    Format an elapsed duration in seconds as a two-line string:
    "(Hours:Minutes:Seconds)" followed by HH:MM:SS.

    Fractional seconds are truncated. The hour field is not wrapped at 24,
    so a run longer than one day shows e.g. 25:00:00 rather than 01:00:00.
    '''
    hours, remainder = divmod(int(seconds), 3600)
    minutes, secs = divmod(remainder, 60)
    return "(Hours:Minutes:Seconds)\n{:02d}:{:02d}:{:02d}".format(hours, minutes, secs)


In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.optimizers import SGD
import time
from keras.regularizers import l1
# Start of the wall-clock timer read by the *_model_iteration functions.
t_ini=time.time()
from itertools import product,count
# Hyper-parameter grid for the linear model; one value per axis here.
epochs = [100]
learning_rate=[0.001]
mon=[0.5]  # SGD momentum values
act=['linear']
# Shared kernel initializer (fixed seed) and constant bias initial value.
initializer=keras.initializers.glorot_uniform(seed=1)
bias=0.1
# Number of grid combinations; used for progress messages and run numbering.
total_it=len(epochs)*len(learning_rate)*len(mon)*len(act)

In [None]:
# Global switch read by the *_model_iteration functions: when True they also
# save each trained model as an .h5 file.
save_model=False 

In [None]:
def linear_model_iteration(my_iter,name,counter):
    '''
    Train one single-layer ListNet model and write its predictions.

    my_iter : (epochs, learning rate, momentum, activation) for this run.
    name    : prefix of the output files; str(counter) is appended to it.
    counter : 1-based run number, used in messages and in the file names.

    Reads the notebook globals data_train, data_vali, data_test, path,
    initializer, bias, total_it, t_ini and save_model. Writes
    path+'new_predictions/y_{train,vali,test}_<name><counter>.txt'.
    Returns None.
    '''
    print("Iteration:",counter,"/",total_it)
    run_name=name+str(counter)

    x_list_train,y_list_train=get_list_xy(data_train)
    n_iter,n_,mom_,act_=my_iter[:4]

    # One Dense(1) unit on the raw features.
    modelq = Sequential()
    modelq.add(Dense(1,
                     input_dim=x_list_train[0].shape[1],
                     activation=act_,
                     kernel_initializer=initializer,
                     bias_initializer=Constant(value=bias)))
    modelq.compile(loss=Loss_query_keras, optimizer=SGD(lr=n_, momentum=mom_))

    # A sigmoid output is fit against targets divided by 2.
    halve_targets = act_=='sigmoid'
    for j in tqdm(range(n_iter)):
        # One gradient step per query per epoch.
        for x_q,y_q in zip(x_list_train,y_list_train):
            if halve_targets:
                modelq.train_on_batch(x=x_q,y=y_q/2)
            else:
                modelq.train_on_batch(x=x_q,y=y_q)

    print("Time elapsed so far:")
    print(convert_to_time(time.time()-t_ini))
    print(my_iter,"\n\n\n")
    if(save_model):
        modelq.save(path+'new_models/model_'+run_name+'.h5')

    def dump_predictions(frame,split):
        # Score the feature columns only and store one prediction per line.
        features=frame.drop(['relevance degree','qid'],axis=1)
        scores=modelq.predict(features).astype('float64').ravel()
        pd.DataFrame(scores).to_csv(path+'new_predictions/y_'+split+'_'+run_name+'.txt',
                                    sep=' ',header=False,index=False)

    dump_predictions(data_train,'train')
    dump_predictions(data_vali,'vali')
    dump_predictions(data_test,'test')
    print("                             ",counter,"                                     ")
    print("=================================================================\n\n\n\n")

In [None]:
from sklearn.datasets import load_svmlight_file
# Load the training file again for the Keras experiments; this overwrites the
# X / y / query_ids produced by the earlier load cell.
# NOTE(review): absolute, machine-specific path.
X, y, query_ids = load_svmlight_file('/Users/lmac/Documents/inf_search/learning_to_rank/l2r/train.txt', query_id=True)

In [None]:
def form_df(X, y):
    '''
    Wrap a feature matrix and its labels in a DataFrame.

    X : 2-D array whose first column holds the query id.
    y : relevance labels, one per row of X.

    Columns are named 'qid', 'f1', 'f2', ... followed by 'relevance degree'.
    The row index is converted to strings.
    '''
    data = pd.DataFrame(X)
    data.columns = ['f'+str(col) for col in range(X.shape[1])]
    # The first column ("f0") carries the query id.
    data = data.rename(index=str, columns={"f0": "qid"})
    data['relevance degree'] = y
    return data

In [None]:
# Turn the query ids and the feature matrix into plain numpy arrays
# (the features go through a dense matrix first).
query_ids = np.array(query_ids)
X = X.todense()
X = np.array(X)

In [None]:
# Prepend the query id as column 0 so it stays attached to its row through the
# split; form_df later names that column "qid".
tmp_X = np.column_stack((query_ids, X))

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows with a fixed seed. This overwrites the X_train /
# X_test / y_train / y_test names used by the torch cells above.
# NOTE(review): the split is per row, so one query's rows can end up on both
# sides — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(tmp_X, y, test_size=0.1, random_state=100)

In [None]:
# The held-out split serves as the validation frame; both frames are read as
# globals by the *_model_iteration functions.
data_vali=form_df(X_test, y_test)
data_train=form_df(X_train, y_train)

In [None]:
from sklearn.datasets import load_svmlight_file
# Load the SVMlight-format test file together with its per-row query ids.
# NOTE(review): absolute, machine-specific path.
testX, testy, testquery_ids = load_svmlight_file('/Users/lmac/Documents/inf_search/learning_to_rank/l2r/test.txt', query_id=True)

In [None]:
# Same preparation as for the training data: plain numpy arrays, then the
# query id prepended as column 0.
testquery_ids = np.array(testquery_ids)
testX = testX.todense()
testX = np.array(testX)

testX = np.column_stack((testquery_ids, testX))

In [None]:
# Test frame read as a global by the *_model_iteration functions.
data_test=form_df(testX, testy)

In [None]:
# Output prefix used by the model-iteration functions.
# NOTE(review): path has no trailing separator and both create_folders and the
# writers concatenate strings directly, so the directory actually used is
# ".../learning_to_ranknew_predictions" (a sibling of learning_to_rank, not a
# child). The np.loadtxt cells below rely on that exact name.
path = '/Users/lmac/Documents/inf_search/learning_to_rank'

create_folders(path,["new_predictions"])

In [None]:
# Run the linear-model grid; each combination is paired with a 1-based run
# number, so the outputs are named "test1", "test2", ...
# total_it comes from the configuration cell and bounds the run numbers.
epochs = [100]
for(my_iter,counter) in zip(product(epochs,learning_rate,mon,act), range(1,total_it+1)):
    linear_model_iteration(my_iter, "test", counter)

In [None]:
# Read back the test predictions of linear run "test" + counter 1.
# The directory name is path and 'new_predictions/' joined without a separator.
y_pred = np.loadtxt('/Users/lmac/Documents/inf_search/learning_to_ranknew_predictions/y_test_test1.txt')

In [None]:
# Alias of the test query ids, used by the submission cells below.
query_ids_test = testquery_ids

In [None]:
from collections import defaultdict

# Group the 0-based test row positions by query id, in file order.
test_query_groups = defaultdict(list)
for doc_id, query_id in enumerate(query_ids_test):
    test_query_groups[query_id].append(doc_id)
# Write one "query,document" line per test row, best-scored document first
# within each query.
with open("my_subm_test.csv", 'w') as outp:
    print("QueryId,DocumentId", file=outp)
    for query_id in test_query_groups:
        docs_ids = test_query_groups[query_id]
        y_pred_for_query = y_pred[docs_ids]
        # argsort is ascending; reverse it to rank by descending score.
        sorted_ids = np.argsort(y_pred_for_query)[::-1]
        ranked_docs = np.array(docs_ids)[sorted_ids]
        for doc_id in ranked_docs:
            # Document ids in the output are 1-based row numbers.
            print(f"{query_id},{doc_id+1}", file=outp)

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.optimizers import SGD
import time
from keras.regularizers import l1
# Restart the wall-clock timer read by the *_model_iteration functions.
t_ini=time.time()
from itertools import product
# Hyper-parameter grid for the multi-layer model; one value per axis here.
learning_rate=[0.0025]
mon=[0.5]  # SGD momentum values
act=['linear']  # activation of the output layer
inner_act=['relu']  # activation of the hidden layers
number_layers=[2]
number_neurons=[100]
dropout=[0.65]
counter=1
# my_seed is not assigned anywhere in this notebook, so on a fresh kernel the
# next line raised NameError. Keep a value set by an earlier cell if there is
# one; otherwise fall back to 1, the seed used by the first configuration cell.
my_seed=globals().get('my_seed',1)
initializer=keras.initializers.glorot_uniform(seed=my_seed)
bias=0.1
epochs = [100]
# Number of grid combinations; used for progress messages and run numbering.
total_it=len(epochs)*len(learning_rate)*len(mon)*len(act)*len(inner_act)\
*len(number_layers)*len(number_neurons)*len(dropout)

In [None]:
def NN_model_iteration(my_iter,name,counter):
    '''
    Train one multi-layer ListNet model and write its predictions.

    my_iter : (epochs, learning rate, momentum, output activation,
              hidden activation, extra hidden layers, neurons per layer,
              dropout rate) for this run.
    name    : prefix of the output files; str(counter) is appended to it.
    counter : 1-based run number, used in messages and in the file names.

    The network is an input Dense layer plus `extra hidden layers` further
    Dense layers, each followed by Dropout, and a final Dense(1) layer.

    Reads the notebook globals data_train, data_vali, data_test, path,
    initializer, bias, total_it, t_ini and save_model. Writes
    path+'new_predictions/y_{train,vali,test}_<name><counter>.txt'.
    Returns None.
    '''
    run_name=name+str(counter)
    print("Iteration:",counter,"/",total_it)

    x_list_train,y_list_train=get_list_xy(data_train)

    (n_iter,n_,mom_,act_,
     inner_act_,number_layers_,number_neurons_,dropout_)=my_iter[:8]

    def dense(units,activation,**extra):
        # Every layer shares the same kernel / bias initialisation scheme.
        return Dense(units,activation=activation,
                     kernel_initializer=initializer,
                     bias_initializer=Constant(value=bias),
                     **extra)

    modelq = Sequential()
    modelq.add(dense(number_neurons_,inner_act_,input_dim=x_list_train[0].shape[1]))
    modelq.add(Dropout(dropout_))
    # Extra hidden layers, as many as the grid-search value asks for.
    for _ in range(number_layers_):
        modelq.add(dense(number_neurons_,inner_act_))
        modelq.add(Dropout(dropout_))
    modelq.add(dense(1,act_))

    modelq.compile(loss=Loss_query_keras, optimizer=SGD(lr=n_, momentum=mom_))

    # A sigmoid output is fit against targets divided by 2.
    halve_targets = act_=='sigmoid'
    for j in tqdm(range(n_iter)):
        # One gradient step per query per epoch.
        for x_q,y_q in zip(x_list_train,y_list_train):
            if halve_targets:
                modelq.train_on_batch(x=x_q,y=y_q/2)
            else:
                modelq.train_on_batch(x=x_q,y=y_q)

    print("Time elapsed so far:")
    print(convert_to_time(time.time()-t_ini))
    print(my_iter,"\n\n\n")
    if(save_model):
        modelq.save(path+'models/model_'+run_name+'.h5')

    def dump_predictions(frame,split):
        # Score the feature columns only and store one prediction per line.
        features=frame.drop(['relevance degree','qid'],axis=1)
        scores=modelq.predict(features).astype('float64').ravel()
        pd.DataFrame(scores).to_csv(path+'new_predictions/y_'+split+'_'+run_name+'.txt',
                                    sep=' ',header=False,index=False)

    dump_predictions(data_train,'train')
    dump_predictions(data_vali,'vali')
    dump_predictions(data_test,'test')
    print("                             ",counter,"                                     ")
    print("=================================================================\n\n\n\n")

In [None]:
# Short run (10 epochs) of the multi-layer grid; outputs are named
# "test2" + run number.
epochs = [10]
for(my_iter,counter) in zip(product(epochs,learning_rate,mon,act,
                                                                   inner_act,number_layers,number_neurons,dropout),
                                                           range(1,total_it+1)):
    NN_model_iteration(my_iter, "test2", counter)

In [None]:
# Full run (100 epochs) of the same grid; outputs are named
# "test3" + run number.
epochs = [100]
for(my_iter,counter) in zip(product(epochs,learning_rate,mon,act,
                                                                   inner_act,number_layers,number_neurons,dropout),
                                                           range(1,total_it+1)):
    NN_model_iteration(my_iter, "test3", counter)

In [None]:
# Read back test predictions of run "test2" + counter 1 (the 10-epoch run).
# NOTE(review): the next cell writes "my_subm_test3.csv" — confirm whether the
# "test3" (100-epoch) predictions, y_test_test31.txt, were meant here.
y_pred = np.loadtxt('/Users/lmac/Documents/inf_search/learning_to_ranknew_predictions/y_test_test21.txt')

In [None]:
from collections import defaultdict

# Group the 0-based test row positions by query id, in file order.
test_query_groups = defaultdict(list)
for doc_id, query_id in enumerate(query_ids_test):
    test_query_groups[query_id].append(doc_id)
# Write one "query,document" line per test row, best-scored document first
# within each query.
with open("my_subm_test3.csv", 'w') as outp:
    print("QueryId,DocumentId", file=outp)
    for query_id in test_query_groups:
        docs_ids = test_query_groups[query_id]
        y_pred_for_query = y_pred[docs_ids]
        # argsort is ascending; reverse it to rank by descending score.
        sorted_ids = np.argsort(y_pred_for_query)[::-1]
        ranked_docs = np.array(docs_ids)[sorted_ids]
        for doc_id in ranked_docs:
            # Document ids in the output are 1-based row numbers.
            print(f"{query_id},{doc_id+1}", file=outp)