## Inference on Pairwise Model (RankNet)

* Input: `candidate-inf2-feats.jsonl`, which contains 780 lines (26 queries × 30 candidate documents per query), each holding `(query, doc_id, *features)`
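* Normalize features using the pickled per-column scalers (the count features land in [0, 1]; the float features appear to be standardized instead, so they can be negative or exceed 1)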
* Generate `n(n-1)/2` pairs per query and run inference using trained model `pairwise_model.pt`
* Convert pairwise inferences to listwise inferences
* Return top 10 list-wise records per query

In [1]:
# Colab-only setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [31]:
import json
import numpy as np
import os
import pandas as pd
import pickle
import torch
import torch.nn.functional as F

In [4]:
# Base directory on the mounted Drive; every file path in this notebook is
# built by joining a file name onto it.
data_dir = "/content/drive/MyDrive/data"

### Read Input

In [5]:
# Parse the JSONL feature file: one JSON object per non-empty line.
rows = []
with open(os.path.join(data_dir, "candidate-inf2-feats.jsonl"), "r", encoding="utf-8") as f:
  for line in f:
    line = line.strip()
    if not line:
      # A blank line (e.g. a stray trailing newline) is not valid JSON;
      # skip it instead of letting json.loads("") raise.
      continue
    rows.append(json.loads(line))

# Sanity check: (number of records, number of fields in the first record).
len(rows), len(rows[0])

(780, 63)

In [7]:
# Build the inference frame from the parsed records (one row per record) and
# preview its first rows.
inf_df = pd.DataFrame(rows)
inf_df.head()

Unnamed: 0,query,doc_id,num_query_tokens,num_title_tokens,num_section_title_tokens,num_bread_crumb_tokens,num_text_tokens,num_title_token_overlap,num_section_title_token_overlap,num_bread_crumb_token_overlap,...,text_mean_tf,text_var_tf,text_min_tfidf,text_max_tfidf,text mean_tfidf,text_var_tfidf,title_vec_score,section_title_vec_score,bread_crumb_vec_score,text_vec_score
0,Influenza tx + pregnant,o9wzvYkBeOphH9NsHySg,4,1,2,5,206,1,0,0,...,1.598425,1.547399,0.173889,31.856707,3.956062,13.892614,0.661382,0.081131,0.073746,0.622498
1,Influenza tx + pregnant,YvMAvYkBIHFsYVIUKIri,4,3,1,2,149,1,0,0,...,1.6,1.345882,0.173889,26.547256,3.905038,17.500854,0.499326,0.528598,0.340981,0.643338
2,Influenza tx + pregnant,K-oCvYkBlUvzL_hactoj,4,3,1,1,315,1,0,0,...,1.728395,3.778083,0.245285,47.78506,5.105488,26.088959,0.5908,0.187122,0.187122,0.659157
3,Influenza tx + pregnant,OuoCvYkBlUvzL_hactoj,4,3,1,2,315,1,0,0,...,1.728395,3.778083,0.245285,47.78506,5.105488,26.088959,0.5908,0.528598,0.340981,0.659157
4,Influenza tx + pregnant,NPL7vIkBIHFsYVIUlfNH,4,1,1,2,194,0,0,0,...,1.537037,1.544925,0.201418,42.475609,4.852688,32.859848,0.173295,-0.001387,-0.005145,0.432175


### Normalize (Scale) Features

In [12]:
# Feature column names grouped by dtype (the grouping was obtained using polars).
# NOTE: several names contain a literal space (e.g. 'title mean_tf'). They are
# used verbatim as DataFrame column keys and as scaler keys, so they must not
# be "corrected" here (the frame preview above shows a 'text mean_tfidf' column).
int_cols = ['num_query_tokens', 'num_title_tokens', 'num_section_title_tokens',
  'num_bread_crumb_tokens', 'num_text_tokens', 'num_title_token_overlap',
  'num_section_title_token_overlap', 'num_bread_crumb_token_overlap',
  'num_text_token_overlap', 'title_concept_overlap', 'section_title_concept_overlap',
  'bread_crumb_concept_overlap', 'text_concept_overlap', 'title_stygroup_overlap',
  'section_title_stygroup_overlap', 'bread_crumb_stygroup_overlap', 'text_stygroup_overlap',
  'title_ttf', 'title_min_tf', 'title_max_tf', 'section_title_ttf', 'section_title_min_tf',
  'section_title_max_tf', 'bread_crumb_ttf', 'bread_crumb_min_tf', 'bread_crumb_max_tf',
  'text_ttf', 'text_min_tf', 'text_max_tf']
float_cols = ['title_bm25_score', 'section_title_bm25_score', 'bread_crumb_bm25_score',
  'text_bm25_score', 'title mean_tf', 'title_var_tf', 'title_min_tfidf',
  'title_max_tfidf', 'title mean_tfidf', 'title_var_tfidf', 'section_title mean_tf',
  'section_title_var_tf', 'section_title_min_tfidf', 'section_title_max_tfidf',
  'section_title mean_tfidf', 'section_title_var_tfidf', 'bread_crumb mean_tf',
  'bread_crumb_var_tf', 'bread_crumb_min_tfidf', 'bread_crumb_max_tfidf',
  'bread_crumb mean_tfidf', 'bread_crumb_var_tfidf', 'text_mean_tf', 'text_var_tf',
  'text_min_tfidf', 'text_max_tfidf', 'text mean_tfidf', 'text_var_tfidf',
  'title_vec_score', 'section_title_vec_score', 'bread_crumb_vec_score', 'text_vec_score']

# Load the pickled feature scalers (looked up by column name when scaling).
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at a scaler file you produced yourself or otherwise trust.
with open(os.path.join(data_dir, "feat_scalers_model2.pickle"), "rb") as f:
  feat_scalers = pickle.load(f)


def scale_features(df, int_cols, float_cols, feat_scalers):
  """Scale the feature columns of `df` in place using their fitted scalers.

  Columns are visited in order: everything in `int_cols` first, then
  everything in `float_cols`. A column that has no entry in `feat_scalers`
  is left untouched.

  Args:
    df: DataFrame holding the feature columns; it is modified in place.
    int_cols: names of the integer-valued feature columns.
    float_cols: names of the float-valued feature columns.
    feat_scalers: mapping from column name to an object exposing
      `transform(array_2d)`.

  Returns:
    None. The scaled values are written back into `df`.
  """
  for column_group in (int_cols, float_cols):
    for name in column_group:
      if name in feat_scalers:
        # The scaler is handed a single-feature 2-D array of shape (n_rows, 1).
        column_2d = df[name].values.reshape(-1, 1)
        df[name] = feat_scalers[name].transform(column_2d)


# NOTE: scale_features writes the scaled values back into inf_df, so this cell
# is not idempotent; re-running it scales already-scaled values. Rebuild
# inf_df from `rows` before re-running.
scale_features(inf_df, int_cols, float_cols, feat_scalers)
inf_df.head()

Unnamed: 0,query,doc_id,num_query_tokens,num_title_tokens,num_section_title_tokens,num_bread_crumb_tokens,num_text_tokens,num_title_token_overlap,num_section_title_token_overlap,num_bread_crumb_token_overlap,...,text_mean_tf,text_var_tf,text_min_tfidf,text_max_tfidf,text mean_tfidf,text_var_tfidf,title_vec_score,section_title_vec_score,bread_crumb_vec_score,text_vec_score
0,Influenza tx + pregnant,o9wzvYkBeOphH9NsHySg,0.285714,0.0,0.076923,0.153846,0.287115,0.5,0.0,0.0,...,-0.259492,-0.326866,-0.028038,-0.18712,-0.807509,-0.450582,1.658468,-0.494535,-0.700343,1.646247
1,Influenza tx + pregnant,YvMAvYkBIHFsYVIUKIri,0.285714,0.076923,0.0,0.038462,0.207283,0.5,0.0,0.0,...,-0.25744,-0.372803,-0.028038,-0.408764,-0.83188,-0.391108,0.985627,2.605272,1.161238,1.759258
2,Influenza tx + pregnant,K-oCvYkBlUvzL_hactoj,0.285714,0.076923,0.0,0.0,0.439776,0.5,0.0,0.0,...,-0.09016,0.181624,0.174252,0.477815,-0.258499,-0.249554,1.365416,0.239716,0.089447,1.845041
3,Influenza tx + pregnant,OuoCvYkBlUvzL_hactoj,0.285714,0.076923,0.0,0.038462,0.439776,0.5,0.0,0.0,...,-0.09016,0.181624,0.174252,0.477815,-0.258499,-0.249554,1.365416,2.605272,1.161238,1.845041
4,Influenza tx + pregnant,NPL7vIkBIHFsYVIUlfNH,0.285714,0.0,0.0,0.038462,0.270308,0.0,0.0,0.0,...,-0.339472,-0.32743,0.049962,0.25617,-0.379246,-0.137952,-0.368026,-1.066171,-1.2499,0.614145


### Generate Pair Inferences

For each query, build a pairwise rank matrix from the model's pair predictions, convert the pairwise ranks into a listwise ordering, and keep the top 10 documents from that list.

In [20]:
class PairwiseModel(torch.nn.Module):
  """Pairwise scorer that runs both inputs through one shared MLP.

  Each element of the input pair goes through the same three linear layers
  (ReLU after the first two, ReLU6 after the last). The network returns the
  sigmoid of the difference between the two 2-dimensional outputs.

  Args:
    num_features: width of each input feature vector. Defaults to 61, the
      width this notebook's checkpoint is loaded with.
  """

  def __init__(self, num_features=61):
    super().__init__()
    # The attribute names fc1/fc2/fc3 are the state_dict keys; renaming them
    # would stop a saved checkpoint from loading.
    self.fc1 = torch.nn.Linear(num_features, 128)
    self.fc2 = torch.nn.Linear(128, 32)
    self.fc3 = torch.nn.Linear(32, 2)

  def _score(self, x):
    """Run one side of the pair through the shared tower."""
    z = F.relu(self.fc1(x))
    z = F.relu(self.fc2(z))
    return F.relu6(self.fc3(z))

  def forward(self, x):
    """Compare two batches of feature vectors.

    Args:
      x: a pair `(x1, x2)` of tensors, each of shape (batch, num_features).

    Returns:
      Tensor of shape (batch, 2): sigmoid(tower(x1) - tower(x2)).
    """
    x1, x2 = x
    # A single code path for both inputs guarantees they are scored with
    # exactly the same layers and activations.
    return torch.sigmoid(self._score(x1) - self._score(x2))


# Instantiate the network and restore the trained weights; map_location maps
# the checkpoint's tensors onto the CPU.
model = PairwiseModel()
ckpt = torch.load(os.path.join(data_dir, "pairwise_model.pt"),
                  map_location=torch.device("cpu"))
# Last expression of the cell, so the key-matching report is displayed.
model.load_state_dict(ckpt)

<All keys matched successfully>

In [37]:
def generate_rank_matrix(query, docs, model, df=None):
  """Build the antisymmetric pairwise rank matrix for one query.

  Every unordered pair (i, j) with i < j of the given documents is scored by
  `model` in a single batch. For each pair, entry [i, j] is -1 when the first
  component of the model output is smaller than the second and +1 otherwise;
  entry [j, i] holds the negated value. The diagonal stays 0.

  Args:
    query: query string selecting the rows of the feature frame.
    docs: sequence of doc_ids belonging to `query`; position in this sequence
      is the row/column index in the returned matrix.
    model: callable taking `[x_i, x_j]` (two float32 tensors of shape
      (num_pairs, num_features)) and returning a (num_pairs, 2) tensor.
    df: feature DataFrame whose first two columns are `query` and `doc_id`
      and whose remaining columns are numeric features. Defaults to the
      notebook-level `inf_df`.

  Returns:
    numpy array of shape (len(docs), len(docs)) with values in {-1, 0, 1}.
    With fewer than two documents there is nothing to compare and an
    all-zero matrix is returned.

  Raises:
    IndexError: if a doc_id has no row for `query` in the frame.
  """
  if df is None:
    df = inf_df  # fall back to the notebook-level frame of scaled features

  num_docs = len(docs)
  rank_matrix = np.zeros((num_docs, num_docs))
  if num_docs < 2:
    # No pairs exist; torch.stack on an empty list would raise.
    return rank_matrix

  # Filter the frame once per query and extract each document's feature
  # vector once, instead of re-filtering the whole frame for every pair.
  query_df = df[df["query"] == query]
  feats_by_doc = {}
  for doc_id in docs:
    if doc_id not in feats_by_doc:
      feats_by_doc[doc_id] = (
          query_df[query_df["doc_id"] == doc_id]
          .values[0, 2:]  # drop the leading query / doc_id columns
          .astype(np.float32))
  feat_matrix = np.stack([feats_by_doc[doc_id] for doc_id in docs])

  # Upper-triangle indices enumerate the pairs (i, j), i < j, in row-major
  # order: (0, 1), (0, 2), ..., (1, 2), ...
  idx_i, idx_j = np.triu_indices(num_docs, k=1)
  with torch.no_grad():
    x_it = torch.from_numpy(feat_matrix[idx_i])
    x_jt = torch.from_numpy(feat_matrix[idx_j])
    outputs = model([x_it, x_jt])
  assert len(outputs) == len(idx_i)

  scores = outputs.detach().cpu().numpy()
  ranks = np.where(scores[:, 0] < scores[:, 1], -1.0, 1.0)
  rank_matrix[idx_i, idx_j] = ranks
  rank_matrix[idx_j, idx_i] = -ranks
  return rank_matrix


# Smoke test: rank the candidates of the first query only and print the full
# ordering, highest row sum first, as positions into `docs`.
queries = inf_df["query"].unique()
for query in queries[:1]:
  docs = inf_df.loc[inf_df["query"] == query, "doc_id"].values
  rank_matrix = generate_rank_matrix(query, docs, model)
  # A document's row sum is its +1 entries minus its -1 entries.
  row_sums = rank_matrix.sum(axis=1)
  topk_idxs = np.argsort(row_sums)[::-1].tolist()
  print(topk_idxs)


[0, 1, 2, 3, 10, 21, 12, 27, 7, 6, 8, 14, 20, 22, 5, 11, 16, 13, 4, 23, 15, 24, 19, 29, 26, 17, 25, 9, 18, 28]


In [None]:
# Rank every query's candidates and write the 10 documents with the highest
# row sums as tab-separated `query<TAB>doc_id` lines.
with open(os.path.join(data_dir, "inf2.tsv"), "w", encoding="utf-8") as fout:
  queries = inf_df["query"].unique()
  for query in queries:
    print("inferring best docs for query: '{:s}'".format(query))
    docs = inf_df.loc[inf_df["query"] == query, "doc_id"].values
    rank_matrix = generate_rank_matrix(query, docs, model)
    row_sums = rank_matrix.sum(axis=1)
    # Descending order of row sum, truncated to the first 10 positions.
    topk_idxs = np.argsort(row_sums)[::-1][:10].tolist()
    for idx in topk_idxs:
      record = [query, docs[idx]]
      fout.write("\t".join(record) + "\n")
