In [0]:
import sys
import os
# Only when running inside Google Colab: mount Google Drive and switch to the
# assignment folder stored there.
if 'google.colab' in sys.modules:
  from google.colab import drive
  drive.mount('/content/drive')
  # Make Python modules located in the Drive folder importable.
  sys.path.append("/content/drive/My Drive/ir1hw3")
  # Relative paths used later in the notebook resolve against this folder.
  os.chdir('/content/drive/My Drive/ir1hw3')

In [0]:
import dataset
import ranking as rnk
import evaluate as evl
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

import math
import itertools
import matplotlib.pyplot as plt

from collections import Counter

# Fix the NumPy and PyTorch random seeds so runs are repeatable.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# that every later `.to(device)` call still works on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load Data

In [0]:
# Take the first fold offered by the project-local `dataset` module.
data = dataset.get_dataset().get_data_folds()[0]
# presumably loads the train/validation/test splits used below - TODO confirm
data.read_data()

# Pointwise LTR

In [0]:
"""
This module implements a LTR (PointwiseLTR) in PyTorch.
You should fill in code into indicated sections.
"""
def init_weights(m):
  """
  Initializes a linear layer in place: normally distributed weights
  (mean 2, std 2) and a zero bias.

  Meant to be passed to nn.Module.apply; any module whose type is not exactly
  nn.Linear is left untouched.

  Args:
    m: module to initialize.
  """
  if type(m) == nn.Linear:
    torch.nn.init.normal_(m.weight, mean=2, std=2)
    # Layers created with bias=False have no bias tensor to reset.
    if m.bias is not None:
      m.bias.data.fill_(0)

class PointwiseLTR(nn.Module):
  """
  Feed-forward scorer with one hidden layer, used for pointwise LTR.
  """

  def __init__(self, n_inputs, n_hidden, n_classes, neg_slope):
    """
    Builds the scoring network: Linear -> ReLU -> Linear.

    Args:
      n_inputs: size of each input feature vector.
      n_hidden: number of units in the hidden layer.
      n_classes: size of the output layer.
      neg_slope: accepted for interface compatibility only; the network uses
                 a plain ReLU, so this value is never read.
    """
    super().__init__()
    hidden_layer = nn.Linear(n_inputs, n_hidden)
    output_layer = nn.Linear(n_hidden, n_classes)
    self.network = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

  def forward(self, x):
    """
    Runs the input through the network.

    Args:
      x: input to the network
    Returns:
      the output of the last linear layer
    """
    return self.network(x)

In [0]:
class earlyStopping():
  """
  Tracks a validation loss and signals when training should stop.
  """

  def __init__(self, patience=7, save_path='models/standard.mdl'):
    """
    Args:
      patience (int): number of consecutive non-improving validation checks
                      tolerated before stopping.
      save_path (str): file the best model weights are written to once the
                       stopping condition is met.
    """
    self.patience = patience
    self.counter = 0
    self.best_score = None
    self.early_stop = False
    # np.Inf was removed in NumPy 2.0; np.inf exists in every version.
    self.val_loss_min = np.inf
    self.save_path = save_path
    # Copy of the weights that produced the best validation loss so far.
    self.best_state = None

  def __call__(self, val_loss, model):
    """
    Registers one validation result.

    Args:
      val_loss: validation loss of the current model (lower is better).
      model: the model being trained; its weights are snapshotted whenever
             the loss does not get worse.
    Returns:
      True when the stopping condition has been met, False otherwise.
    """
    import copy
    import os

    score = -val_loss
    if self.best_score is None or score >= self.best_score:
      # New best (ties count as an improvement): remember it and reset the
      # patience counter.
      self.best_score = score
      self.val_loss_min = val_loss
      self.counter = 0
      self.best_state = copy.deepcopy(model.state_dict())
      return False

    self.counter += 1
    print(f"{self.counter}/{self.patience}")
    if self.counter >= self.patience:
      self.early_stop = True
      # Persist the best weights seen, not the current ones, which are
      # `patience` non-improving checks past the best.
      directory = os.path.dirname(self.save_path)
      if directory:
        os.makedirs(directory, exist_ok=True)
      torch.save(self.best_state, self.save_path)
      return True
    return False

In [0]:
def evaluate(model, data_fold, calc_loss=True):
  """
  Scores every document of a data fold and computes ranking metrics.

  Args:
    model: network mapping a batch of feature vectors to one score each.
    data_fold: fold exposing `feature_matrix` and `label_vector`.
    calc_loss: False to skip the loss, True to compute it with the
               notebook-level `criterion`, or a callable loss function to
               use instead of that global.
  Returns:
    (metrics, validation_loss, all_predicted_labels): the result of
    evl.evaluate for 'ndcg', 'err' and 'arr', the loss as a float (0 when
    skipped) and the predicted scores as a CPU tensor.
  """
  model.eval()
  data_loader = DataLoader(list(zip(data_fold.feature_matrix, data_fold.label_vector)), batch_size=128, shuffle=False, num_workers=0)

  all_predicted_labels = torch.zeros(len(data_fold.label_vector))
  vector_index = 0

  # Inference only: do not build an autograd graph over the whole fold.
  with torch.no_grad():
    for features, _ in data_loader:
      features = features.float().to(device)
      batch_size = features.size(0)

      all_predicted_labels[vector_index: vector_index + batch_size] = model.forward(features).squeeze(1)
      vector_index = vector_index + batch_size

  if calc_loss:
    # Callers may hand over their own loss function; otherwise use the global.
    loss_fn = calc_loss if callable(calc_loss) else criterion
    # Cast the targets to float so they match the dtype of the predictions.
    targets = torch.as_tensor(data_fold.label_vector).float().view(-1)
    validation_loss = loss_fn(all_predicted_labels.view(-1), targets).item()
  else:
    validation_loss = 0

  metrics = evl.evaluate(data_fold, all_predicted_labels.detach().numpy(), ['ndcg', 'err', 'arr'], print_results=False)

  return metrics, validation_loss, all_predicted_labels

In [0]:
def train(model, train_loader, optimizer, criterion, epochs, early_stopping, device, eval_every=200):
  """
  Trains a pointwise model with mini-batches and periodic validation.

  Args:
    model: network to train.
    train_loader: iterable of (features, labels) batches that supports len().
    optimizer: optimizer updating the model parameters.
    criterion: loss function comparing predictions with labels.
    epochs: maximum number of passes over train_loader.
    early_stopping: callable taking (validation_loss, model) and exposing an
                    `early_stop` flag; training ends once the flag is set.
    device: device the batches are moved to.
    eval_every: number of batches between validation runs (values below 1
                are treated as 1).
  Returns:
    (ndcgs, validation_losses, x): the metrics and losses of every
    validation run and the global batch index at which each was taken.
  """
  ndcgs = []
  validation_losses = []
  x = []
  total_batches = len(train_loader)
  # Guard against a modulo-by-zero when callers derive eval_every from a
  # very small loader.
  eval_every = max(1, eval_every)

  for j in range(epochs):
    # Once early stopping has fired, end training, not just the epoch.
    if early_stopping.early_stop:
      break

    for i, (features, labels) in enumerate(train_loader):
      model.train()
      features, labels = features.float().to(device), labels.float().to(device)

      optimizer.zero_grad()

      out = model.forward(features)

      loss = criterion(out.view(-1), labels.view(-1))
      loss.backward()

      optimizer.step()

      if i % eval_every == 0 and i != 0:
        # Validation uses the notebook-level `data` object.
        ndcg, validation_loss, _ = evaluate(model, data.validation, criterion)
        validation_losses.append(validation_loss)
        ndcgs.append(ndcg)
        x.append(total_batches * j + i)

        print(f"epoch {j}, iteration {i}, train_loss {loss.item()}, validation_loss {validation_loss}, epoch_progress {i}/ {total_batches}")

        early_stopping(validation_loss, model)
        if early_stopping.early_stop:
          break

  return ndcgs, validation_losses, x

In [0]:
def train_and_evaluate(lr, batch_size, hidden_layer, model_type = 'pointwise', sped_up=False):
  """
  Trains one model configuration and summarizes its validation results.

  Args:
    lr: learning rate for Adam.
    batch_size: mini-batch size (used by the pointwise model only).
    hidden_layer: number of hidden units.
    model_type: 'pointwise' or 'pairwise'.
    sped_up: for the pairwise model, forwarded to train() to select the
             sped-up loss.
  Returns:
    (best_validation_loss, best_ndcg). best_validation_loss is None for the
    pairwise model, whose training loop reports no validation loss; either
    entry is None when training never reached a validation run.
  Raises:
    ValueError: if model_type is neither 'pointwise' nor 'pairwise'.
  """
  # NOTE(review): this notebook defines `train` (and `evaluate`) twice with
  # different signatures. Each branch below only works while the matching
  # definition is the one currently bound to the name.
  if model_type == 'pointwise':
    model = PointwiseLTR(n_inputs=data.num_features, n_hidden=hidden_layer, n_classes=1, neg_slope=0.1).to(device)
    criterion = nn.MSELoss()
    save_path = f"models/pointwiseLTR_lr_{lr}_batchsize_{batch_size}_hiddenlayer{hidden_layer}.mdl"

    train_loader = DataLoader(list(zip(data.train.feature_matrix, data.train.label_vector)), batch_size=batch_size, shuffle=False, num_workers=0)
    early_stopping = earlyStopping(patience=3, save_path=save_path)
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

    # Validate three times per epoch; never let the interval drop to zero.
    eval_every = max(1, math.floor(len(train_loader)/3))
    ndcg, validation_loss, x = train(model, train_loader, optimizer, criterion, 10, early_stopping, device, eval_every=eval_every)

  elif model_type == 'pairwise':
    model = PairwiseLTR(n_inputs=data.num_features, n_hidden=hidden_layer, n_classes=1, neg_slope=0.1).to(device)
    criterion = pairwise_ltr_loss
    save_path = f"models/PairwiseLTR_lr_{lr}_hiddenlayer{hidden_layer}.mdl"
    early_stopping = earlyStopping(patience=3, save_path=save_path)
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

    # Validate five times per epoch; never let the interval drop to zero.
    eval_every = max(1, math.floor(data.train.num_queries()/5))
    # The pairwise train() returns (ndcgs, arrs, errs, x).
    ndcg, _, _, x = train(model, data.train, 10, optimizer, criterion, device, early_stopping, sped_up=sped_up, eval_every=eval_every)
    validation_loss = []

  else:
    raise ValueError(f"unknown model_type: {model_type!r}")

  best_loss = float(min(validation_loss)) if validation_loss else None
  # The pointwise train() collects the metric dicts returned by evl.evaluate;
  # their mean NDCG is read as entry['ndcg'][0], as the pairwise evaluate does.
  ndcg_values = [entry['ndcg'][0] if isinstance(entry, dict) else entry for entry in ndcg]
  best_ndcg = max(ndcg_values) if ndcg_values else None
  return (best_loss, best_ndcg)

In [0]:
## grid search
# results = {}
# for lr in [0.1,0.01, 0.001, 0.0001,0.00001, 0.000001]:
#   batch_size = 64
#   hidden_layer = 250
#   results[str(lr)] = train_and_evaluate(lr, batch_size, hidden_layer)
# print(results)

# results = {}
# for batch_size in [32,64,128,254, 512, 1024]:
#   lr = 0.0001
#   hidden_layer = 250
#   results[str(batch_size)] = train_and_evaluate(lr, batch_size, hidden_layer)
# print(results)

# results = {}
# for hidden_layer in [10, 50, 100, 250, 500, 750, 1000]:
#   batch_size = 64
#   lr = 0.0001
#   results[str(hidden_layer)] = train_and_evaluate(lr, batch_size, hidden_layer)
# print(results)

In [0]:
# Hyperparameters for the pointwise run.
lr = 0.0001
batch_size = 64
hidden_layer = 750

# Model with a single output score per document.
PLTR = PointwiseLTR(n_inputs=data.num_features, n_hidden=hidden_layer, n_classes=1, neg_slope=0.1).to(device)
# NOTE(review): shuffle=False feeds the batches in dataset order every epoch - confirm this is intended.
train_loader = DataLoader(list(zip(data.train.feature_matrix, data.train.label_vector)), batch_size=batch_size, shuffle=False, num_workers=0)
save_path = f"models/pointwiseLTR_lr_{lr}_batchsize_{batch_size}_hiddenlayer{hidden_layer}.mdl"

# Stop after 3 consecutive validation checks without improvement.
early_stopping = earlyStopping(patience=3, save_path=save_path)

optimizer = optim.Adam(PLTR.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
# Regression on the relevance labels with a mean squared error loss.
criterion = nn.MSELoss()

# Train for at most 10 epochs, validating once per epoch (at the last batch index).
ndcg, validation_loss, x = train(PLTR, train_loader, optimizer, criterion, 10, early_stopping, device, eval_every=len(train_loader)-1)

In [0]:
def plot_distribution(prediction, golden, label):
  """
  Draws a grouped bar chart comparing predicted and true grade counts.

  Args:
    prediction: mapping from predicted grade to document count.
    golden: mapping from true grade to document count.
    label: name of the data split, appended to the plot title.
  """
  bar_width = 0.30
  grade_ticks = [0, 1, 2, 3, 4]
  fig, ax = plt.subplots(figsize=(8, 4))

  # Shift the two series half a bar to either side of each grade tick.
  for counts, shift, legend_name in ((prediction, -0.5, 'predictions'), (golden, 0.5, 'truth')):
    positions = np.array(list(counts.keys())) + shift * bar_width
    ax.bar(positions, counts.values(), bar_width, label=legend_name)

  ax.set_ylabel('document count')
  ax.set_title('score distribution predictions ' + label)
  ax.set_xticks(grade_ticks)
  ax.set_xticklabels(grade_ticks)

  ax.legend()

  plt.show()


def show_predictions(model, datafold, label):
  """
  Plots how the rounded model scores are distributed next to the true labels.

  Relies on the `evaluate` variant that returns the predicted scores as the
  third element of its result.

  Args:
    model: trained model to score the fold with.
    datafold: fold whose documents are scored.
    label: name of the fold, used in the plot title.
  """
  _, _, scores = evaluate(model, datafold)

  # Round to the nearest integer grade before counting.
  rounded_scores = scores.round().int().detach().numpy()

  plot_distribution(Counter(rounded_scores), Counter(datafold.label_vector), label)

# show_predictions(PLTR, data.validation, 'validation')
# show_predictions(PLTR, data.test, 'test')

In [0]:
def plot_ndcg_validation(ndcg, validation_loss, x, loss_kind='validation loss'):
  """
  Plots NDCG and a loss curve against the training iteration.

  Args:
    ndcg: NDCG value per validation run.
    validation_loss: loss value per validation run.
    x: iteration index of each validation run.
    loss_kind: legend label for the loss curve.
  """
  plt.title("pointwise LTR validation loss and ndcg")
  plt.xlabel('iterations')
  plt.ylabel('score/loss')
  for series, legend_name in ((ndcg, 'ndcg'), (validation_loss, loss_kind)):
    plt.plot(x, series, label=legend_name)
  plt.legend()
  plt.show()
# plot_ndcg_validation(ndcg, validation_loss, x)

# Pairwise LTR

In [0]:
"""
This module implements a LTR (PairwiseLTR) in PyTorch.
You should fill in code into indicated sections.
"""
def init_weights(m):
  """
  Applies Xavier-uniform initialization (gain 1) to the weights of a linear
  layer; the bias keeps whatever value it already has.

  Meant to be passed to nn.Module.apply; any module whose type is not exactly
  nn.Linear is left untouched.

  Args:
    m: module to initialize in place.
  """
  if type(m) != nn.Linear:
    return
  nn.init.xavier_uniform_(m.weight, gain=1)

class PairwiseLTR(nn.Module):
  """
  Feed-forward scorer with one hidden layer, used for pairwise LTR.
  """

  def __init__(self, n_inputs, n_hidden, n_classes, neg_slope):
    """
    Builds the scoring network (Linear -> ReLU -> Linear) and initializes it
    with init_weights.

    Args:
      n_inputs: size of each input feature vector.
      n_hidden: number of units in the hidden layer.
      n_classes: size of the output layer.
      neg_slope: accepted for interface compatibility only; the network uses
                 a plain ReLU, so this value is never read.
    """
    super().__init__()
    hidden_layer = nn.Linear(n_inputs, n_hidden)
    output_layer = nn.Linear(n_hidden, n_classes)
    self.network = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)
    self.network.apply(init_weights)

  def forward(self, x):
    """
    Runs the input through the network.

    Args:
      x: input to the network
    Returns:
      the output of the last linear layer
    """
    return self.network(x)

In [0]:
def pairwise_ltr_loss_sped_up(predicted_labels, true_labels):
  """
  RankNet cost averaged over all document pairs of one query.

  The caller scores each document once; this function only combines those
  scores pairwise. For a pair (i, j) with score difference d = s_i - s_j and
  S in {-1, 0, 1} telling which document has the higher label, the cost is

      C = 0.5 * (1 - S) * d + log(1 + exp(-d))

  i.e. the RankNet cross-entropy with the shape parameter sigma set to 1.
  In that formula sigma is a scalar multiplier on the score difference, not
  the logistic sigmoid function, so the difference enters the cost directly.

  Args:
    predicted_labels: model scores, one per document (any shape with one
                      element per document).
    true_labels: relevance labels, one per document.
  Returns:
    Scalar tensor with the mean pairwise cost; a zero that is still
    connected to the graph when there are fewer than two documents.
  """
  scores = predicted_labels.reshape(-1).float()
  targets = true_labels.reshape(-1).to(scores.device)

  n_docs = scores.size(0)
  if n_docs < 2:
    # No pairs to compare; keep the result differentiable for backward().
    return scores.sum() * 0.0

  # All index pairs (i, j) with i < j, in the order of itertools.combinations.
  first, second = torch.triu_indices(n_docs, n_docs, offset=1, device=scores.device)

  score_diff = scores[first] - scores[second]
  # +1 if the first document is more relevant, -1 if the second, 0 on ties.
  S = torch.sign(targets[first] - targets[second]).float()

  # softplus(-d) == log(1 + exp(-d)) without overflowing for large |d|.
  C_T = 0.5 * (1 - S) * score_diff + F.softplus(-score_diff)
  return C_T.mean()

In [0]:
def pairwise_ltr_loss(model, features, labels):
  """
  RankNet cost averaged over all document pairs of one query, scoring each
  side of every pair with its own forward pass.

  For a pair (i, j) with score difference d = s_i - s_j and S in {-1, 0, 1}
  telling which document has the higher label, the cost is

      C = 0.5 * (1 - S) * d + log(1 + exp(-d))

  i.e. the RankNet cross-entropy with the shape parameter sigma set to 1.
  In that formula sigma is a scalar multiplier on the score difference, not
  the logistic sigmoid function, so the difference enters the cost directly.

  Args:
    model: network mapping a batch of feature vectors to one score each.
    features: feature vectors of the documents of one query, one per row.
    labels: relevance labels, one per document.
  Returns:
    Scalar tensor with the mean pairwise cost; a zero that is still
    connected to the graph when there are fewer than two documents.
  """
  labels = labels.reshape(-1).to(features.device)

  n_docs = labels.size(0)
  if n_docs < 2:
    # No pairs to compare; keep the result differentiable for backward().
    return model.forward(features).sum() * 0.0

  # All index pairs (i, j) with i < j, in the order of itertools.combinations.
  index_document_pair1, index_document_pair2 = torch.triu_indices(n_docs, n_docs, offset=1, device=features.device)

  predicted_first = model.forward(features[index_document_pair1]).squeeze(-1).float()
  predicted_second = model.forward(features[index_document_pair2]).squeeze(-1).float()
  score_diff = predicted_first - predicted_second

  # +1 if the first document is more relevant, -1 if the second, 0 on ties.
  S = torch.sign(labels[index_document_pair1] - labels[index_document_pair2]).float()

  # softplus(-d) == log(1 + exp(-d)) without overflowing for large |d|.
  C_T = 0.5 * (1 - S) * score_diff + F.softplus(-score_diff)
  return C_T.mean()


In [0]:
def evaluate(model, data_fold):
  """
  Scores every document of a data fold and returns its ranking metrics.

  Args:
    model: network mapping a batch of feature vectors to one score each.
    data_fold: fold exposing `feature_matrix` and `label_vector`.
  Returns:
    (ndcg, arr, err): the first entry of each metric reported by
    evl.evaluate.
  """
  model.eval()
  data_loader = DataLoader(list(zip(data_fold.feature_matrix, data_fold.label_vector)), batch_size=128, shuffle=False, num_workers=0)

  all_predicted_labels = torch.zeros(len(data_fold.label_vector))
  vector_index = 0

  # Inference only: do not build an autograd graph over the whole fold.
  with torch.no_grad():
    for features, _ in data_loader:
      features = features.float().to(device)
      batch_size = features.size(0)

      all_predicted_labels[vector_index: vector_index + batch_size] = model.forward(features).squeeze(1)
      vector_index = vector_index + batch_size

  evaluations = evl.evaluate(data_fold, all_predicted_labels.detach().numpy(), ['ndcg', 'err', 'arr'], print_results=False)
  err = evaluations['err'][0]
  arr = evaluations['arr'][0]
  ndcg = evaluations['ndcg'][0]
  return ndcg, arr, err

In [0]:
def train(model, train_data, epochs, optimizer, criterion, device, early_stopping, sped_up=False, eval_every = 200):
  """
  Trains a pairwise model one query at a time with periodic validation.

  Args:
    model: network to train.
    train_data: fold exposing num_queries(), query_range(qid),
                feature_matrix and label_vector.
    epochs: maximum number of passes over all queries.
    optimizer: optimizer updating the model parameters.
    criterion: not used; the loss is selected through `sped_up`.
    device: device the per-query tensors are moved to.
    early_stopping: callable taking (loss, model) and exposing an
                    `early_stop` flag; training ends once the flag is set.
    sped_up: if true, score each document once and use
             pairwise_ltr_loss_sped_up; otherwise use pairwise_ltr_loss.
    eval_every: number of queries between validation runs (values below 1
                are treated as 1).
  Returns:
    (ndcgs, arrs, errs, x): the metrics of every validation run and the
    global query index at which each was taken.
  """
  ndcgs = []
  arrs = []
  errs = []
  x = []
  # Guard against a modulo-by-zero when callers derive eval_every from a
  # very small fold.
  eval_every = max(1, eval_every)

  for epoch in range(epochs):
    # Once early stopping has fired, end training, not just the epoch.
    if early_stopping.early_stop:
      break

    print("Starting epoch {}".format(epoch))

    num_queries = train_data.num_queries()

    for qid in range(0, num_queries):
      # Documents of one specific query.
      s_i, e_i = train_data.query_range(qid)

      # A query needs at least two documents to form a pair.
      if (e_i - s_i) < 2:
        continue

      features = torch.tensor(train_data.feature_matrix[s_i:e_i]).float().to(device)
      labels = torch.tensor(train_data.label_vector[s_i:e_i]).float().to(device)

      model.train()
      optimizer.zero_grad()

      if sped_up:
        loss = pairwise_ltr_loss_sped_up(model.forward(features), labels)
      else:
        loss = pairwise_ltr_loss(model, features, labels)

      loss.backward()
      optimizer.step()

      if qid % eval_every == 0 and qid != 0:
        # Validation uses the notebook-level `data` object.
        ndcg, arr, err = evaluate(model, data.validation)
        ndcgs.append(ndcg)
        arrs.append(arr)
        errs.append(err)
        x.append(num_queries * epoch + qid)

        print(f"epoch {epoch}, iteration {qid}, train_loss {loss.item()}, validation_ndcg {ndcg}, epoch_progress {qid}/ {num_queries}")

        # Early stopping minimizes its input, so pass the negated NDCG.
        early_stopping(-ndcg, model)
        if early_stopping.early_stop:
          break

  return ndcgs, arrs, errs, x


In [0]:
import importlib
# Reload the project-local evaluation module (imported as `evl`), presumably so
# that edits made to it on disk are picked up without restarting the kernel.
importlib.reload(evl)

In [0]:
# Configuration shared by both RankNet runs in this cell.
lr = 0.0001
hidden_layer = 1000

# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

save_path = f"models/PairwiseLTR_lr_{lr}_hiddenlayer{hidden_layer}.mdl"

# Run 1: loss computed with a forward pass over both sides of every pair.
PairLTR = PairwiseLTR(n_inputs=data.num_features, n_hidden=hidden_layer, n_classes=1, neg_slope=0.1).to(device)
optimizer = optim.Adam(PairLTR.parameters(), lr=lr)
early_stopping = earlyStopping(patience=25, save_path=save_path)

ndcgs, arrs, errs, x = train(PairLTR, data.train, 10, optimizer, pairwise_ltr_loss, device, early_stopping, eval_every = 200)

# Run 2: a freshly initialized model trained with the sped-up loss.
# NOTE(review): both runs write to the same save_path - confirm that the
# second run overwriting the first checkpoint is intended.
PairLTR = PairwiseLTR(n_inputs=data.num_features, n_hidden=hidden_layer, n_classes=1, neg_slope=0.1).to(device)
optimizer = optim.Adam(PairLTR.parameters(), lr=lr)
early_stopping = earlyStopping(patience=25, save_path=save_path)

# Keep this run's metrics under their own names so that `arrs` and `errs`
# stay aligned with `x` from the first run.
ndcgs_speed, arrs_speed, errs_speed, x_speed = train(PairLTR, data.train, 10, optimizer, pairwise_ltr_loss, device, early_stopping, sped_up=True, eval_every = 200)

# plot_ndcg_validation(ndcg, validation_loss, x, loss_kind='train_loss')



In [0]:
# ndcgs, arrs, errs, x

# Validation NDCG of the pairwise model against the training iteration.
plt.plot(x, ndcgs, label='ndcg')
plt.xlabel('iterations')
plt.ylabel('score/loss')
plt.title("pairwise LTR ndcg")
plt.legend()
plt.show()

# Same plot for the `arr` metric; `arrs` must have one entry per value in `x`.
plt.plot(x, arrs, label='arr')
plt.xlabel('iterations')
plt.ylabel('score/loss')
plt.title("pairwise LTR arr")
plt.legend()
plt.show()


# plt.plot(x, errs, label='err')


In [0]:
# Configuration shared by both RankNet runs in this cell.
lr = 0.0001
hidden_layer = 1000

# Use the first GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

save_path = f"models/PairwiseLTR_lr_{lr}_hiddenlayer{hidden_layer}.mdl"

# Run 1: loss computed with a forward pass over both sides of every pair.
PairLTR = PairwiseLTR(n_inputs=data.num_features, n_hidden=hidden_layer, n_classes=1, neg_slope=0.1).to(device)
optimizer = optim.Adam(PairLTR.parameters(), lr=lr)
early_stopping = earlyStopping(patience=25, save_path=save_path)

ndcgs, arrs, errs, x = train(PairLTR, data.train, 10, optimizer, pairwise_ltr_loss, device, early_stopping, eval_every = 200)

# Run 2: a freshly initialized model trained with the sped-up loss.
# NOTE(review): both runs write to the same save_path - confirm that the
# second run overwriting the first checkpoint is intended.
PairLTR = PairwiseLTR(n_inputs=data.num_features, n_hidden=hidden_layer, n_classes=1, neg_slope=0.1).to(device)
optimizer = optim.Adam(PairLTR.parameters(), lr=lr)
early_stopping = earlyStopping(patience=25, save_path=save_path)

# Keep this run's metrics under their own names so that `arrs` and `errs`
# stay aligned with `x` from the first run.
ndcgs_speed, arrs_speed, errs_speed, x_speed = train(PairLTR, data.train, 10, optimizer, pairwise_ltr_loss, device, early_stopping, sped_up=True, eval_every = 200)

# plot_ndcg_validation(ndcg, validation_loss, x, loss_kind='train_loss')

In [0]:
# Compare the validation NDCG curves of the two pairwise runs: the run with
# sped_up left at its default and the run with sped_up=True.
plt.plot(x, ndcgs, label='ranknet')
plt.plot(x_speed, ndcgs_speed, label='ranknet: speedup')
plt.xlabel('iterations')
plt.ylabel('score')
plt.title("ranknet convergence vs speedup")
plt.legend()
plt.show()

In [0]:
  # Ranking metrics of the model currently bound to PairLTR on the test fold.
  evaluate(PairLTR, data.test)

In [0]:
# grid search
# results = {}
# for lr in [0.1,0.01, 0.001, 0.0001,0.00001, 0.000001]:
#   batch_size = 0
#   hidden_layer = 250
#   results[str(lr)] = train_and_evaluate(lr, batch_size, hidden_layer, sped_up=True, model_type = 'pairwise')
# print(results)

# results = {}
# for hidden_layer in [50, 100, 250, 500, 750, 1000, 1500]:
#   batch_size = 0
#   lr = 0.0001
#   results[str(hidden_layer)] = train_and_evaluate(lr, batch_size, hidden_layer,sped_up=True, model_type = 'pairwise')
# print(results)

NDCG
Deep learning for model


AQ2.2 (10 points) Compute a distribution of the scores (if you’re using a classification
loss, use the argmax) output by your model on the validation and test sets. Compare this
with the distribution of the actual grades. If your distributions don’t match, reflect on
how you can fix this and if your solution is sufficient for LTR.


210.000.000 vectors