## Inference on Pointwise Model

In [1]:
# Mount Google Drive so the notebook can read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import json
import numpy as np
import operator
import os
import pickle
import pandas as pd
import torch

from torch.utils.data import TensorDataset, DataLoader

In [4]:
# Load the candidate feature records: one JSON object per line of the .jsonl file.
rows = []
data_dir = "/content/drive/MyDrive/data"
# Read explicitly as UTF-8 (the results file written at the end of the notebook uses
# UTF-8 too), so the query text does not depend on the platform's default encoding.
with open(os.path.join(data_dir, "candidate-inf1-feats.jsonl"), "r", encoding="utf-8") as fin:
  for line in fin:
    line = line.strip()
    if not line:
      # Tolerate blank lines (e.g. a trailing newline at end of file) instead of
      # letting json.loads raise on an empty string.
      continue
    rows.append(json.loads(line))

In [5]:
# Sanity check: number of loaded records and number of fields in the first record.
len(rows), len(rows[0])

(1300, 63)

In [6]:
# Build a DataFrame from the loaded JSON records (one row per record) and preview it.
df = pd.DataFrame(rows)
df.head()

Unnamed: 0,query,doc_id,num_query_tokens,num_title_tokens,num_section_title_tokens,num_bread_crumb_tokens,num_text_tokens,num_title_token_overlap,num_section_title_token_overlap,num_bread_crumb_token_overlap,...,text_mean_tf,text_var_tf,text_min_tfidf,text_max_tfidf,text mean_tfidf,text_var_tfidf,title_vec_score,section_title_vec_score,bread_crumb_vec_score,text_vec_score
0,Influenza tx + pregnant,o9wzvYkBeOphH9NsHySg,4,1,2,5,206,1,0,0,...,1.598425,1.547399,0.173889,31.856707,3.956062,13.892614,0.661382,0.081131,0.073746,0.622498
1,Influenza tx + pregnant,YvMAvYkBIHFsYVIUKIri,4,3,1,2,149,1,0,0,...,1.6,1.345882,0.173889,26.547256,3.905038,17.500854,0.499326,0.528598,0.340981,0.643338
2,Influenza tx + pregnant,K-oCvYkBlUvzL_hactoj,4,3,1,1,315,1,0,0,...,1.728395,3.778083,0.245285,47.78506,5.105488,26.088959,0.5908,0.187122,0.187122,0.659157
3,Influenza tx + pregnant,OuoCvYkBlUvzL_hactoj,4,3,1,2,315,1,0,0,...,1.728395,3.778083,0.245285,47.78506,5.105488,26.088959,0.5908,0.528598,0.340981,0.659157
4,Influenza tx + pregnant,NPL7vIkBIHFsYVIUlfNH,4,1,1,2,194,0,0,0,...,1.537037,1.544925,0.201418,42.475609,4.852688,32.859848,0.173295,-0.001387,-0.005145,0.432175


In [7]:
# Feature column names grouped by dtype (the author derived these lists using polars).
# NOTE(review): several names contain a space where an underscore would be expected
# (e.g. 'title mean_tf', 'text mean_tfidf') while 'text_mean_tf' uses an underscore.
# The DataFrame preview shows 'text mean_tfidf' spelled the same way, so these strings
# appear to mirror the feature file -- do not "fix" them without checking the data.
int_cols = ['num_query_tokens', 'num_title_tokens', 'num_section_title_tokens',
  'num_bread_crumb_tokens', 'num_text_tokens', 'num_title_token_overlap',
  'num_section_title_token_overlap', 'num_bread_crumb_token_overlap',
  'num_text_token_overlap', 'title_concept_overlap', 'section_title_concept_overlap',
  'bread_crumb_concept_overlap', 'text_concept_overlap', 'title_stygroup_overlap',
  'section_title_stygroup_overlap', 'bread_crumb_stygroup_overlap', 'text_stygroup_overlap',
  'title_ttf', 'title_min_tf', 'title_max_tf', 'section_title_ttf', 'section_title_min_tf',
  'section_title_max_tf', 'bread_crumb_ttf', 'bread_crumb_min_tf', 'bread_crumb_max_tf',
  'text_ttf', 'text_min_tf', 'text_max_tf']
float_cols = ['title_bm25_score', 'section_title_bm25_score', 'bread_crumb_bm25_score',
  'text_bm25_score', 'title mean_tf', 'title_var_tf', 'title_min_tfidf',
  'title_max_tfidf', 'title mean_tfidf', 'title_var_tfidf', 'section_title mean_tf',
  'section_title_var_tf', 'section_title_min_tfidf', 'section_title_max_tfidf',
  'section_title mean_tfidf', 'section_title_var_tfidf', 'bread_crumb mean_tf',
  'bread_crumb_var_tf', 'bread_crumb_min_tfidf', 'bread_crumb_max_tfidf',
  'bread_crumb mean_tfidf', 'bread_crumb_var_tfidf', 'text_mean_tf', 'text_var_tf',
  'text_min_tfidf', 'text_max_tfidf', 'text mean_tfidf', 'text_var_tfidf',
  'title_vec_score', 'section_title_vec_score', 'bread_crumb_vec_score', 'text_vec_score']

# Load the scalers used by scale_features, which looks them up by column name.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load this file
# from a source you trust.
with open('/content/drive/MyDrive/data/feat_scalers.pickle', 'rb') as f:
  feat_scalers = pickle.load(f)


def scale_features(df, int_cols, float_cols, feat_scalers):
  """Scale feature columns of `df` in place using per-column scalers.

  Args:
    df: DataFrame holding the feature columns; it is modified in place.
    int_cols: names of the integer-valued feature columns.
    float_cols: names of the float-valued feature columns.
    feat_scalers: mapping from column name to an object exposing
      `transform(values)` for a 2-D array of shape (n_rows, 1). Columns that have
      no entry in this mapping are left untouched.

  Returns:
    The same `df` object, for convenience; existing callers that rely only on
    the in-place update can ignore it.

  Raises:
    KeyError: if a column that has a scaler is missing from `df`.
  """
  if len(df) == 0:
    # Nothing to scale. NOTE(review): presumably the pickled scalers are
    # scikit-learn transformers, which reject zero-row input -- confirm.
    return df
  # Int columns first, then float columns (same order as before); both groups
  # receive identical treatment, so one loop covers them.
  for colname in list(int_cols) + list(float_cols):
    if colname not in feat_scalers:
      continue
    column_2d = df[colname].to_numpy().reshape(-1, 1)
    scaled = feat_scalers[colname].transform(column_2d)
    # Flatten the (n_rows, 1) result so a plain 1-D column is assigned.
    df[colname] = np.asarray(scaled).reshape(-1)
  return df


# Scale the feature columns of df in place, then preview the result.
# NOTE: scale_features overwrites the columns of df, so re-running this cell applies
# the scalers a second time to already-scaled values; rebuild df first before re-running.
scale_features(df, int_cols, float_cols, feat_scalers)
df.head()

Unnamed: 0,query,doc_id,num_query_tokens,num_title_tokens,num_section_title_tokens,num_bread_crumb_tokens,num_text_tokens,num_title_token_overlap,num_section_title_token_overlap,num_bread_crumb_token_overlap,...,text_mean_tf,text_var_tf,text_min_tfidf,text_max_tfidf,text mean_tfidf,text_var_tfidf,title_vec_score,section_title_vec_score,bread_crumb_vec_score,text_vec_score
0,Influenza tx + pregnant,o9wzvYkBeOphH9NsHySg,0.285714,0.0,0.1,0.16,0.285112,0.5,0.0,0.0,...,-0.261681,-0.310013,0.001123,-0.216338,-0.798322,-0.390824,1.559604,-0.519785,-0.721573,1.529665
1,Influenza tx + pregnant,YvMAvYkBIHFsYVIUKIri,0.285714,0.076923,0.0,0.04,0.205056,0.5,0.0,0.0,...,-0.25969,-0.350092,0.001123,-0.425372,-0.821252,-0.345477,0.906879,2.526157,1.091056,1.64168
2,Influenza tx + pregnant,K-oCvYkBlUvzL_hactoj,0.285714,0.076923,0.0,0.0,0.438202,0.5,0.0,0.0,...,-0.097346,0.133645,0.271526,0.410765,-0.281785,-0.237545,1.275314,0.201707,0.047449,1.726706
3,Influenza tx + pregnant,OuoCvYkBlUvzL_hactoj,0.285714,0.076923,0.0,0.04,0.438202,0.5,0.0,0.0,...,-0.097346,0.133645,0.271526,0.410765,-0.281785,-0.237545,1.275314,2.526157,1.091056,1.726706
4,Influenza tx + pregnant,NPL7vIkBIHFsYVIUlfNH,0.285714,0.0,0.0,0.04,0.268258,0.0,0.0,0.0,...,-0.3393,-0.310505,0.105386,0.201731,-0.39539,-0.15245,-0.406305,-1.081487,-1.256679,0.506664


In [8]:
class PointwiseRanker(torch.nn.Module):
  """Two-layer feed-forward scorer: Linear -> ReLU -> Linear.

  The submodule attribute names (`fc1`, `fc2`) determine the keys of the module's
  state_dict, so they must stay as they are for saved checkpoints to load.

  Args:
    input_size: number of input features per example.
    output_size: number of output values per example.
    hidden_size: width of the hidden layer. Defaults to 100, the value that was
      previously hard-coded.
  """

  def __init__(self, input_size, output_size, hidden_size=100):
    super().__init__()
    self.fc1 = torch.nn.Linear(input_size, hidden_size)
    self.relu = torch.nn.ReLU()
    self.fc2 = torch.nn.Linear(hidden_size, output_size)

  def forward(self, x):
    """Map a (batch, input_size) tensor to a (batch, output_size) tensor."""
    hidden = self.relu(self.fc1(x))
    return self.fc2(hidden)

# 61 input features (the 29 int columns plus the 32 float columns), one score per document.
model = PointwiseRanker(61, 1)
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a CPU-only
# runtime; the visible cells never move the model or its inputs off the CPU.
ckpt = torch.load("/content/drive/MyDrive/data/pointwise_model", map_location="cpu")
# Inference only: switch to eval mode. It is a no-op for the current Linear/ReLU layers
# but keeps the notebook correct if dropout or normalisation layers are added later.
model.eval()
# Kept as the last expression so the cell still displays the key-matching report.
model.load_state_dict(ckpt)

<All keys matched successfully>

In [None]:
# Distinct query strings present in the feature table; each one is reranked separately.
queries = df["query"].unique().tolist()

In [12]:
def rerank(df, query, top_k, model):
  """Score the candidate rows of one query with `model` and return the best ones.

  Args:
    df: feature table whose first two columns identify a row (query, doc_id) and
      whose remaining columns are all numeric model inputs.
    query: value of the "query" column selecting the rows to score.
    top_k: maximum number of results to return.
    model: callable mapping a float32 feature batch to one score per row.

  Returns:
    A list of at most `top_k` tuples (query, doc_id, score), highest score first.
    Rows with equal scores keep their original relative order.
  """
  matches = df[df["query"] == query].to_numpy()
  # Positional split: identifying columns first, model inputs after them.
  keys = matches[:, 0:2].tolist()
  features = matches[:, 2:].astype(np.float32)

  loader = DataLoader(TensorDataset(torch.from_numpy(features)), batch_size=32)
  scores = []
  with torch.no_grad():
    for (batch,) in loader:
      scores.extend(row.item() for row in model(batch))

  # sorted() is stable, so ties stay in their original row order.
  ranking = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
  return [(keys[pos][0], keys[pos][1], score) for pos, score in ranking[0:top_k]]


# Write the top-10 reranked documents of every query as "query<TAB>doc_id" lines.
with open("/content/drive/MyDrive/data/inf1.tsv", "w", encoding="utf-8") as fout:
  for query in queries:
    reranked_docs = rerank(df, query, 10, model)
    # Use distinct names in the inner loop: the previous version rebound the outer loop
    # variable `query` and assigned to `_`, which IPython uses for the last cell output.
    for ranked_query, doc_id, _score in reranked_docs:
      fout.write("{:s}\t{:s}\n".format(ranked_query, doc_id))