# Information retrieval challenge Task 2
## Done by Ekaterina Timofeeva

In [1]:
import json

# Utility function to load JSON
def load_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Load data
train_queries = load_json("train_queries.json")
gold_mappings = load_json("train_gold_mapping.json")
shuffled_ranking = load_json("shuffled_pre_ranking.json")
query_contents = load_json("queries_content_with_features.json")
document_contents = load_json("documents_content_with_features.json")

# Confirm data structure
print(f"Number of train queries: {len(train_queries)}")
print(f"Sample query ID: {train_queries[0]}")
print(f"Gold docs for first query: {gold_mappings[train_queries[0]]}")
print(f"Documents to rerank for first query: {shuffled_ranking[train_queries[0]]}")


Number of train queries: 20
Sample query ID: 79098180
Gold docs for first query: ['1376881', '68722856', '7345574']
Documents to rerank for first query: ['84117280', '86237859', '44417984', '43094034', '45901317', '42886784', '7345574', '43878837', '1354473', '86301854', '66025864', '86696791', '42365601', '86536566', '18158001', '70371603', '1376881', '81822743', '4082413', '4169775', '69328996', '68722856', '60044376', '84416764', '75998807', '42810307', '78288530', '103929967', '82465446', '87437516']


In [2]:
len(document_contents)

900

In [3]:
def list_to_dict_by_fan(data_list):
    """
    Converts list of patent dicts into a dict keyed by patent ID (from FAN field).
    """
    return {item["FAN"]: item for item in data_list}

# Apply conversion
query_contents_dict = list_to_dict_by_fan(query_contents)
document_contents_dict = list_to_dict_by_fan(document_contents)


# Model

In [None]:
# ! pip install transformers torch

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from tqdm import tqdm


In [None]:
# Load tokenizer and model
model_name = "intfloat/e5-large-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-large-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

# Title + abstract

0.211 on test

In [None]:
def extract_text_pairs_ta(query_ids, shuffled_ranking, query_contents, document_contents):
    """
    Returns query-doc text pairs using Content['title'] + Content['pa01'] as 'TA' (Title + Abstract).
    """
    data_pairs = {}

    for qid in query_ids:
        query_info = query_contents.get(qid, {}).get("Content", {})
        query_title = query_info.get("title", "")
        query_abstract = query_info.get("pa01", "")
        query_text = f"{query_title}. {query_abstract}"

        doc_ids = shuffled_ranking[qid]
        doc_tuples = []

        for doc_id in doc_ids:
            doc_info = document_contents.get(doc_id, {}).get("Content", {})
            doc_title = doc_info.get("title", "")
            doc_abstract = doc_info.get("pa01", "")
            doc_text = f"{doc_title}. {doc_abstract}"

            doc_tuples.append((doc_id, query_text, doc_text))

        data_pairs[qid] = doc_tuples

    return data_pairs


In [None]:
ta_pairs = extract_text_pairs_ta(
    query_ids=train_queries,
    shuffled_ranking=shuffled_ranking,
    query_contents=query_contents_dict,
    document_contents=document_contents_dict
)

sample_qid = train_queries[0]
print("Query Text Sample:\n", ta_pairs[sample_qid][0][1][:300])
print("First Doc Text Sample:\n", ta_pairs[sample_qid][0][2][:300])


Query Text Sample:
 Universal dispenser monitor. A retrofit dispenser monitor is disclosed. The dispenser monitor has a connector allowing it to be connected directly to one of a number of dispensers. The dispenser monitor also comprises a sensor configured to detect the dispensing action of the attached dispenser by l
First Doc Text Sample:
 Dispenser tool, robot system with dispenser tool and method for dispensing viscous material onto wind turbine blade surface. A dispenser tool is provided with multiple cartridges for dispensing viscous material onto the surface of a wind turbine blade . The dispenser tool is advantageously part of a


In [None]:
def score_query_doc_pairs(model, tokenizer, data_pairs, max_length=512, batch_size=8):
    reranked_results = {}

    for query_id, doc_pairs in tqdm(data_pairs.items(), desc="Scoring Queries"):
        scores = []
        texts = [(query, doc) for (_, query, doc) in doc_pairs]
        doc_ids = [doc_id for (doc_id, _, _) in doc_pairs]

        # Batch processing
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(
                [f"{q} [SEP] {d}" for q, d in batch],
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits.squeeze(-1)
                batch_scores = logits.cpu().numpy().tolist()
                scores.extend(batch_scores)

        # Sort doc_ids by score (descending)
        doc_score_pairs = sorted(zip(doc_ids, scores), key=lambda x: x[1], reverse=True)
        reranked_results[query_id] = [doc_id for doc_id, _ in doc_score_pairs]

    return reranked_results


In [None]:
predictions_ta = score_query_doc_pairs(model, tokenizer, ta_pairs)


Scoring Queries: 100%|██████████| 20/20 [00:49<00:00,  2.46s/it]


In [None]:
def compute_map_and_recall(predictions, gold_mapping, k_values=[3, 5, 10, 20]):
    results = {
        "MAP": 0.0,
        "Recall@k": {k: 0.0 for k in k_values},
        "Mean Rank": 0.0,
        "Mean Inverse Rank": 0.0
    }

    total_queries = 0
    map_sum = 0.0
    mean_rank_sum = 0.0
    mean_inv_rank_sum = 0.0
    recall_hits = {k: 0 for k in k_values}

    for qid, ranked_docs in predictions.items():
        if qid not in gold_mapping:
            continue

        gold_docs = set(gold_mapping[qid])
        if not gold_docs:
            continue

        total_queries += 1
        ap_sum = 0.0
        hit_count = 0
        first_hit_rank = None

        for rank, doc_id in enumerate(ranked_docs, 1):
            if doc_id in gold_docs:
                hit_count += 1
                ap_sum += hit_count / rank
                if first_hit_rank is None:
                    first_hit_rank = rank

        # average precision
        map_sum += ap_sum / len(gold_docs)

        if first_hit_rank:
            mean_rank_sum += first_hit_rank
            mean_inv_rank_sum += 1 / first_hit_rank

        # Recall@k logic
        for k in k_values:
            top_k = set(ranked_docs[:k])
            if gold_docs & top_k:
                recall_hits[k] += 1

    if total_queries == 0:
        return results

    results["MAP"] = map_sum / total_queries
    results["Mean Rank"] = mean_rank_sum / total_queries
    results["Mean Inverse Rank"] = mean_inv_rank_sum / total_queries

    for k in k_values:
        results["Recall@k"][k] = recall_hits[k] / total_queries

    return results


In [None]:
eval_results = compute_map_and_recall(
    predictions=predictions_ta,
    gold_mapping=gold_mappings,  # from train_gold_mapping.json
    k_values=[3, 5, 10, 20]
)

# Print metrics
print("\n🔍 Evaluation Results on Training Data:")
for metric, value in eval_results.items():
    if isinstance(value, dict):
        for k, v in value.items():
            print(f"{metric}@{k}: {v:.4f}")
    else:
        print(f"{metric}: {value:.4f}")



🔍 Evaluation Results on Training Data:
MAP: 0.1876
Recall@k@3: 0.3000
Recall@k@5: 0.3500
Recall@k@10: 0.7000
Recall@k@20: 0.9500
Mean Rank: 8.1000
Mean Inverse Rank: 0.2380


# TA + claim 1

0.242 on test

In [None]:
def extract_text_pairs_tac1(query_ids, shuffled_ranking, query_contents, document_contents):
    """
    Returns query-doc text pairs using:
    Title + Abstract + First Claim (tac1)
    """
    def get_first_claim(content_dict):
        # Claims are labeled like 'c-en-0001', 'c-en-0002', etc.
        for key in sorted(content_dict.keys()):
            if key.startswith("c-en-"):
                return content_dict[key]
        return ""

    data_pairs = {}

    for qid in query_ids:
        query_info = query_contents.get(qid, {}).get("Content", {})
        query_title = query_info.get("title", "")
        query_abstract = query_info.get("pa01", "")
        query_first_claim = get_first_claim(query_info)
        query_text = f"{query_title}. {query_abstract}. {query_first_claim}"

        doc_ids = shuffled_ranking[qid]
        doc_tuples = []

        for doc_id in doc_ids:
            doc_info = document_contents.get(doc_id, {}).get("Content", {})
            doc_title = doc_info.get("title", "")
            doc_abstract = doc_info.get("pa01", "")
            doc_first_claim = get_first_claim(doc_info)
            doc_text = f"{doc_title}. {doc_abstract}. {doc_first_claim}"

            doc_tuples.append((doc_id, query_text, doc_text))

        data_pairs[qid] = doc_tuples

    return data_pairs


In [None]:
tac1_pairs = extract_text_pairs_tac1(
    query_ids=train_queries,
    shuffled_ranking=shuffled_ranking,
    query_contents=query_contents_dict,
    document_contents=document_contents_dict
)


In [None]:
predictions_tac1 = score_query_doc_pairs(
    model=model,
    tokenizer=tokenizer,
    data_pairs=tac1_pairs,
    batch_size=4
)

# Evaluate
eval_tac1 = compute_map_and_recall(predictions_tac1, gold_mappings)

# Display metrics
for metric, value in eval_tac1.items():
    if isinstance(value, dict):
        for k, v in value.items():
            print(f"{metric}@{k}: {v:.4f}")
    else:
        print(f"{metric}: {value:.4f}")


Scoring Queries: 100%|██████████| 20/20 [01:03<00:00,  3.18s/it]

MAP: 0.2065
Recall@k@3: 0.3000
Recall@k@5: 0.6000
Recall@k@10: 0.8000
Recall@k@20: 0.9500
Mean Rank: 6.9000
Mean Inverse Rank: 0.2965





# TA + all claims

0.203 on test

In [None]:
def extract_text_pairs_all_claims(query_ids, shuffled_ranking, query_contents, document_contents):
    def get_all_claims(content_dict):
        return " ".join(
            content_dict[k] for k in sorted(content_dict) if k.startswith("c-en-")
        )

    data_pairs = {}

    for qid in query_ids:
        query_info = query_contents.get(qid, {}).get("Content", {})
        query_title = query_info.get("title", "")
        query_abstract = query_info.get("pa01", "")
        query_claims = get_all_claims(query_info)
        query_text = f"{query_title}. {query_abstract}. {query_claims}"

        doc_ids = shuffled_ranking[qid]
        doc_tuples = []

        for doc_id in doc_ids:
            doc_info = document_contents.get(doc_id, {}).get("Content", {})
            doc_title = doc_info.get("title", "")
            doc_abstract = doc_info.get("pa01", "")
            doc_claims = get_all_claims(doc_info)
            doc_text = f"{doc_title}. {doc_abstract}. {doc_claims}"

            doc_tuples.append((doc_id, query_text, doc_text))

        data_pairs[qid] = doc_tuples

    return data_pairs


In [None]:
claims_pairs = extract_text_pairs_all_claims(
    query_ids=train_queries,
    shuffled_ranking=shuffled_ranking,
    query_contents=query_contents_dict,
    document_contents=document_contents_dict
)


In [None]:
predictions_claims = score_query_doc_pairs(
    model=model,
    tokenizer=tokenizer,
    data_pairs=claims_pairs,
    batch_size=4  # reduce if needed for long inputs
)


Scoring Queries: 100%|██████████| 20/20 [01:05<00:00,  3.29s/it]


In [None]:
eval_claims = compute_map_and_recall(predictions_claims, gold_mappings)

for metric, value in eval_claims.items():
    if isinstance(value, dict):
        for k, v in value.items():
            print(f"{metric}@{k}: {v:.4f}")
    else:
        print(f"{metric}: {value:.4f}")


MAP: 0.2208
Recall@k@3: 0.4000
Recall@k@5: 0.6000
Recall@k@10: 0.9000
Recall@k@20: 0.9500
Mean Rank: 6.3500
Mean Inverse Rank: 0.2618


# TAC1 + features

Not submitted to codabench, since poor results on train set

In [None]:
def extract_text_pairs_tac1f(query_ids, shuffled_ranking, query_contents, document_contents):
    def get_first_claim(content_dict):
        for key in sorted(content_dict.keys()):
            if key.startswith("c-en-"):
                return content_dict[key]
        return ""

    data_pairs = {}

    for qid in query_ids:
        query_data = query_contents.get(qid, {})
        content = query_data.get("Content", {})
        features = str(query_data.get("features", ""))
        query_text = f"{content.get('title', '')}. {content.get('pa01', '')}. {get_first_claim(content)}. {features}"

        doc_ids = shuffled_ranking[qid]
        doc_tuples = []

        for doc_id in doc_ids:
            doc_data = document_contents.get(doc_id, {})
            content = doc_data.get("Content", {})
            features = str(doc_data.get("features", ""))
            doc_text = f"{content.get('title', '')}. {content.get('pa01', '')}. {get_first_claim(content)}. {features}"

            doc_tuples.append((doc_id, query_text, doc_text))

        data_pairs[qid] = doc_tuples

    return data_pairs


In [None]:
tac1f_pairs = extract_text_pairs_tac1f(
    query_ids=train_queries,
    shuffled_ranking=shuffled_ranking,
    query_contents=query_contents_dict,
    document_contents=document_contents_dict
)


In [None]:
predictions_tac1f = score_query_doc_pairs(
    model=model,
    tokenizer=tokenizer,
    data_pairs=tac1f_pairs,
    batch_size=4
)


Scoring Queries: 100%|██████████| 20/20 [01:00<00:00,  3.03s/it]


In [None]:
eval_tac1f = compute_map_and_recall(predictions_tac1f, gold_mappings)

for metric, value in eval_tac1f.items():
    if isinstance(value, dict):
        for k, v in value.items():
            print(f"{metric}@{k}: {v:.4f}")
    else:
        print(f"{metric}: {value:.4f}")


MAP: 0.1819
Recall@k@3: 0.2500
Recall@k@5: 0.5500
Recall@k@10: 0.8500
Recall@k@20: 0.9500
Mean Rank: 7.6000
Mean Inverse Rank: 0.2079


# Features

0.202 on test

In [None]:
def extract_text_pairs_features_only(query_ids, shuffled_ranking, query_contents, document_contents):
    data_pairs = {}

    for qid in query_ids:
        query_text = str(query_contents.get(qid, {}).get("features", ""))

        doc_ids = shuffled_ranking[qid]
        doc_tuples = []

        for doc_id in doc_ids:
            doc_text = str(document_contents.get(doc_id, {}).get("features", ""))
            doc_tuples.append((doc_id, query_text, doc_text))

        data_pairs[qid] = doc_tuples

    return data_pairs


In [None]:
features_pairs = extract_text_pairs_features_only(
    query_ids=train_queries,
    shuffled_ranking=shuffled_ranking,
    query_contents=query_contents_dict,
    document_contents=document_contents_dict
)


In [None]:
predictions_features = score_query_doc_pairs(
    model=model,
    tokenizer=tokenizer,
    data_pairs=features_pairs,
    batch_size=4
)


Scoring Queries: 100%|██████████| 20/20 [00:02<00:00,  7.12it/s]


In [None]:
eval_features = compute_map_and_recall(predictions_features, gold_mappings)

for metric, value in eval_features.items():
    if isinstance(value, dict):
        for k, v in value.items():
            print(f"{metric}@{k}: {v:.4f}")
    else:
        print(f"{metric}: {value:.4f}")


MAP: 0.2140
Recall@k@3: 0.4000
Recall@k@5: 0.5500
Recall@k@10: 0.9000
Recall@k@20: 0.9000
Mean Rank: 7.2000
Mean Inverse Rank: 0.2984


# Encoder fine-tuning

Results on test set:

distilroberta (fine-tuned)	0.227

e5-large-v2 (fine-tuned)	0.200

MiniLM v2 (fine-tuned)	0.225

In [None]:
def prepare_training_data_tac1(
    train_queries,
    gold_mappings,
    shuffled_ranking,
    query_contents,
    document_contents,
    max_negatives_per_query=10
):
    def get_first_claim(content_dict):
        for key in sorted(content_dict):
            if key.startswith("c-en-"):
                return content_dict[key]
        return ""

    training_data = []

    for qid in train_queries:
        query_data = query_contents.get(qid, {})
        query_content = query_data.get("Content", {})
        query_text = f"{query_content.get('title', '')}. {query_content.get('pa01', '')}. {get_first_claim(query_content)}"

        gold_docs = set(gold_mappings.get(qid, []))
        candidates = shuffled_ranking.get(qid, [])

        # Add positives
        for doc_id in candidates:
            if doc_id in gold_docs:
                doc_data = document_contents.get(doc_id, {})
                doc_content = doc_data.get("Content", {})
                doc_text = f"{doc_content.get('title', '')}. {doc_content.get('pa01', '')}. {get_first_claim(doc_content)}"
                training_data.append((query_text, doc_text, 1))

        # Add negatives (not in gold)
        negatives_added = 0
        for doc_id in candidates:
            if doc_id not in gold_docs:
                doc_data = document_contents.get(doc_id, {})
                doc_content = doc_data.get("Content", {})
                doc_text = f"{doc_content.get('title', '')}. {doc_content.get('pa01', '')}. {get_first_claim(doc_content)}"
                training_data.append((query_text, doc_text, 0))
                negatives_added += 1
                if negatives_added >= max_negatives_per_query:
                    break

    print(f"Prepared {len(training_data)} training pairs.")
    return training_data


In [None]:
train_data = prepare_training_data_tac1(
    train_queries=train_queries,
    gold_mappings=gold_mappings,
    shuffled_ranking=shuffled_ranking,
    query_contents=query_contents_dict,
    document_contents=document_contents_dict,
    max_negatives_per_query=10
)


Prepared 269 training pairs.


In [None]:
# Print a sample
for i in range(2):
    print(f"\n[Label: {train_data[i][2]}]")
    print("Query:", train_data[i][0][:200], "...")
    print("Doc:  ", train_data[i][1][:200], "...")



[Label: 1]
Query: Universal dispenser monitor. A retrofit dispenser monitor is disclosed. The dispenser monitor has a connector allowing it to be connected directly to one of a number of dispensers. The dispenser monit ...
Doc:   Hygiene compliance module. A hygiene compliance module is configured to be retrofit with a compatible dispenser to enable hygiene compliance monitoring functions. The hygiene compliance module is conf ...

[Label: 1]
Query: Universal dispenser monitor. A retrofit dispenser monitor is disclosed. The dispenser monitor has a connector allowing it to be connected directly to one of a number of dispensers. The dispenser monit ...
Doc:   System for managing multiple dispensing units and method of operation. A system for managing multiple dispensing units by communicating information through a communications network is provided. The sy ...


In [None]:
model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1  # binary relevance score (0 or 1)
)


In [None]:
from torch.utils.data import Dataset

class PatentPairDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        query_text, doc_text, label = self.data[idx]

        encoding = self.tokenizer(
            query_text,
            doc_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt"
        )

        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(label, dtype=torch.float)
        return item


In [None]:
from torch.utils.data import DataLoader

train_dataset = PatentPairDataset(train_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)


In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-1

In [None]:
from torch.optim import AdamW
from torch.nn import BCEWithLogitsLoss

optimizer = AdamW(model.parameters(), lr=0.5e-5)
loss_fn = BCEWithLogitsLoss()


In [None]:
from tqdm import tqdm

def train(model, dataloader, optimizer, loss_fn, device, epochs=3):
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}")

        for batch in progress:
            # Move to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.squeeze(-1)

            loss = loss_fn(logits, labels)

            # Backward + optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(dataloader)
        print(f"✅ Epoch {epoch+1} complete. Avg loss: {avg_loss:.4f}")


In [None]:
train(model, train_loader, optimizer, loss_fn, device, epochs=3)


Epoch 1: 100%|██████████| 68/68 [00:06<00:00, 10.48it/s, loss=0.221]


✅ Epoch 1 complete. Avg loss: 0.7825


Epoch 2: 100%|██████████| 68/68 [00:06<00:00, 10.86it/s, loss=0.484]


✅ Epoch 2 complete. Avg loss: 0.6029


Epoch 3: 100%|██████████| 68/68 [00:06<00:00, 10.55it/s, loss=0.241]

✅ Epoch 3 complete. Avg loss: 0.5647





In [None]:
def rerank_with_model(model, tokenizer, data_pairs, max_length=512, batch_size=4):
    model.eval()
    reranked_results = {}

    for query_id, doc_pairs in tqdm(data_pairs.items(), desc="Reranking"):
        scores = []
        texts = [(query, doc) for (_, query, doc) in doc_pairs]
        doc_ids = [doc_id for (doc_id, _, _) in doc_pairs]

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(
                [f"{q} [SEP] {d}" for q, d in batch],
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)
                logits = outputs.logits.squeeze(-1)
                scores_batch = logits.cpu().numpy().tolist()
                scores.extend(scores_batch)

        # Sort docs by score
        doc_score_pairs = sorted(zip(doc_ids, scores), key=lambda x: x[1], reverse=True)
        reranked_results[query_id] = [doc_id for doc_id, _ in doc_score_pairs]

    return reranked_results


In [None]:
predictions_ft_train = rerank_with_model(
    model=model,
    tokenizer=tokenizer,
    data_pairs=tac1_pairs,
    batch_size=4
)


Reranking: 100%|██████████| 20/20 [00:04<00:00,  4.21it/s]


In [None]:
eval_ft = compute_map_and_recall(predictions_ft_train, gold_mappings)

for metric, value in eval_ft.items():
    if isinstance(value, dict):
        for k, v in value.items():
            print(f"{metric}@{k}: {v:.4f}")
    else:
        print(f"{metric}: {value:.4f}")


MAP: 0.2500
Recall@k@3: 0.5000
Recall@k@5: 0.6500
Recall@k@10: 0.9000
Recall@k@20: 1.0000
Mean Rank: 4.9500
Mean Inverse Rank: 0.3207


# GENERATE TEST PREDICTIONS

In [None]:
# 1. Load test query IDs
test_queries = load_json("test_queries.json")

In [None]:
tac1_pairs_test = extract_text_pairs_tac1(
    query_ids=test_queries,
    shuffled_ranking=shuffled_ranking,
    query_contents=query_contents_dict,
    document_contents=document_contents_dict
)
predictions_ft_test = rerank_with_model(
    model=model,
    tokenizer=tokenizer,
    data_pairs=tac1_pairs_test,
    batch_size=4
)
with open("prediction2.json", "w") as f:
    json.dump(predictions_ft_test, f, indent=2)

print("✅ predictions2.json is ready for test submission.")


Reranking: 100%|██████████| 10/10 [00:02<00:00,  4.02it/s]

✅ predictions2.json is ready for test submission.





In [None]:
features_pairs_test = extract_text_pairs_features_only(
    query_ids=test_queries,
    shuffled_ranking=shuffled_ranking,
    query_contents=query_contents_dict,
    document_contents=document_contents_dict
)

predictions_test_features = score_query_doc_pairs(
    model=model,
    tokenizer=tokenizer,
    data_pairs=features_pairs_test,
    batch_size=4
)

with open("prediction2.json", "w") as f:
    json.dump(predictions_test_features, f, indent=2)

print("✅ Submission file ready for test upload (features only)")


Scoring Queries: 100%|██████████| 10/10 [00:02<00:00,  4.24it/s]

✅ Submission file ready for test upload (features only)





In [None]:
# claims_pairs_test = extract_text_pairs_all_claims(
#     query_ids=test_queries,
#     shuffled_ranking=shuffled_ranking,
#     query_contents=query_contents_dict,
#     document_contents=document_contents_dict
# )

# predictions_test_claims = score_query_doc_pairs(
#     model=model,
#     tokenizer=tokenizer,
#     data_pairs=claims_pairs_test,
#     batch_size=4
# )

# with open("prediction2.json", "w") as f:
#     json.dump(predictions_test_claims, f, indent=2)

# print("✅ Test prediction saved. Ready to upload.")


Scoring Queries: 100%|██████████| 10/10 [00:32<00:00,  3.28s/it]

✅ Test prediction saved. Ready to upload.





In [None]:
# # ta_pairs_test = extract_text_pairs_ta(
# #     query_ids=test_queries,
# #     shuffled_ranking=shuffled_ranking,
# #     query_contents=query_contents_dict,
# #     document_contents=document_contents_dict
# # )
# tac1_pairs_test = extract_text_pairs_tac1(
#     query_ids=test_queries,
#     shuffled_ranking=shuffled_ranking,
#     query_contents=query_contents_dict,
#     document_contents=document_contents_dict
# )
# # 3. Run scoring on test pairs using your model
# predictions_test = score_query_doc_pairs(
#     model=model,
#     tokenizer=tokenizer,
#     data_pairs=tac1_pairs_test,
#     max_length=512,
#     batch_size=8
# )

# # 4. Save predictions to JSON
# with open("prediction2.json", "w", encoding="utf-8") as f:
#     json.dump(predictions_test, f, indent=2)

# print("✅ Saved test predictions to prediction2.json")


Scoring Queries: 100%|██████████| 10/10 [00:28<00:00,  2.84s/it]

✅ Saved test predictions to prediction2.json





## Done by Kseniia Pavlova

In [14]:
import json
from sentence_transformers import SentenceTransformer, util
import torch
from tqdm import tqdm
import numpy as np

# Utility function to load JSON
def load_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

# Load data
train_queries = load_json("train_queries.json")
gold_mappings = load_json("train_gold_mapping.json")
shuffled_ranking = load_json("shuffled_pre_ranking.json")
query_contents = load_json("queries_content_with_features.json")
document_contents = load_json("documents_content_with_features.json")

# Convert to dict format
def list_to_dict_by_fan(data_list):
    return {item["FAN"]: item for item in data_list}

query_contents_dict = list_to_dict_by_fan(query_contents)
document_contents_dict = list_to_dict_by_fan(document_contents)

In [19]:
# Define all text extraction functions
def extract_ta(content_dict):
    """Title + Abstract"""
    content = content_dict.get("Content", {})
    return f"{content.get('title', '')}. {content.get('pa01', '')}"

# Corrected text extraction functions
def extract_tac1(content_dict):
    """Title + Abstract + Claim 1"""
    content = content_dict.get("Content", {})
    first_claim = next((content[k] for k in sorted(content) if k.startswith("c-en-")), "")
    return f"{content.get('title', '')}. {content.get('pa01', '')}. {first_claim}"

def extract_tac1f(content_dict):
    """Title + Abstract + Claim 1 + Features"""
    content = content_dict.get("Content", {})
    features = content_dict.get("features", "")
    first_claim = next((content[k] for k in sorted(content) if k.startswith("c-en-")), "")
    return f"{content.get('title', '')}. {content.get('pa01', '')}. {first_claim}. {features}"

def extract_all_claims(content_dict):
    """Title + Abstract + All Claims"""
    content = content_dict.get("Content", {})
    claims = " ".join(content[k] for k in sorted(content) if k.startswith("c-en-"))
    return f"{content.get('title', '')}. {content.get('pa01', '')}. {claims}"

def extract_features_only(content_dict):
    """Features only"""
    return str(content_dict.get("features", ""))

In [20]:
# Load bi-encoder model
model_name = "BAAI/bge-large-en-v1.5"
model = SentenceTransformer(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(f"Model loaded on {device}")

Model loaded on cuda


In [21]:
# Encode all documents for each representation type
def encode_corpus(docs_dict, text_extractor, batch_size=32):
    doc_ids = list(docs_dict.keys())
    texts = [text_extractor(docs_dict[doc_id]) for doc_id in doc_ids]
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True)
    return {doc_id: emb for doc_id, emb in zip(doc_ids, embeddings)}

print("Encoding documents for all representations...")
doc_embeddings_ta = encode_corpus(document_contents_dict, extract_ta)
doc_embeddings_tac1 = encode_corpus(document_contents_dict, extract_tac1)
doc_embeddings_tac1f = encode_corpus(document_contents_dict, extract_tac1f)
doc_embeddings_all_claims = encode_corpus(document_contents_dict, extract_all_claims)
doc_embeddings_features = encode_corpus(document_contents_dict, extract_features_only)

Encoding documents for all representations...


Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

In [22]:
def rerank_queries(query_ids, shuffled_ranking, query_contents, doc_embeddings, text_extractor):
    reranked_results = {}

    for qid in tqdm(query_ids, desc="Reranking queries"):
        # Get query text and encode
        query_text = text_extractor(query_contents.get(qid, {}))
        query_embedding = model.encode(query_text)

        # Get candidate docs and their embeddings
        candidate_docs = shuffled_ranking[qid]
        doc_embs = torch.stack([torch.tensor(doc_embeddings[doc_id]) for doc_id in candidate_docs])

        # Compute similarities
        query_emb_tensor = torch.tensor(query_embedding).to(device)
        doc_embs = doc_embs.to(device)
        scores = util.cos_sim(query_emb_tensor, doc_embs)[0]

        # Sort by score
        sorted_indices = torch.argsort(scores, descending=True)
        reranked_results[qid] = [candidate_docs[i] for i in sorted_indices]

    return reranked_results

In [23]:
# Evaluation function
def compute_map_and_recall(predictions, gold_mapping, k_values=[3, 5, 10, 20]):
    results = {
        "MAP": 0.0,
        "Recall@k": {k: 0.0 for k in k_values},
        "Mean Rank": 0.0,
        "Mean Inverse Rank": 0.0
    }

    total_queries = 0
    map_sum = 0.0
    mean_rank_sum = 0.0
    mean_inv_rank_sum = 0.0
    recall_hits = {k: 0 for k in k_values}

    for qid, ranked_docs in predictions.items():
        if qid not in gold_mapping:
            continue

        gold_docs = set(gold_mapping[qid])
        if not gold_docs:
            continue

        total_queries += 1
        ap_sum = 0.0
        hit_count = 0
        first_hit_rank = None

        for rank, doc_id in enumerate(ranked_docs, 1):
            if doc_id in gold_docs:
                hit_count += 1
                ap_sum += hit_count / rank
                if first_hit_rank is None:
                    first_hit_rank = rank

        map_sum += ap_sum / len(gold_docs)

        if first_hit_rank:
            mean_rank_sum += first_hit_rank
            mean_inv_rank_sum += 1 / first_hit_rank

        for k in k_values:
            top_k = set(ranked_docs[:k])
            if gold_docs & top_k:
                recall_hits[k] += 1

    if total_queries > 0:
        results["MAP"] = map_sum / total_queries
        results["Mean Rank"] = mean_rank_sum / total_queries
        results["Mean Inverse Rank"] = mean_inv_rank_sum / total_queries
        for k in k_values:
            results["Recall@k"][k] = recall_hits[k] / total_queries

    return results

def print_results(results, name):
    print(f"\n🔍 Evaluation Results ({name}):")
    for metric, value in results.items():
        if isinstance(value, dict):
            for k, v in value.items():
                print(f"{metric}@{k}: {v:.4f}")
        else:
            print(f"{metric}: {value:.4f}")

In [24]:
# Evaluate all representations
print("Evaluating all representations...")

# 1. Title + Abstract (TA)
predictions_ta = rerank_queries(train_queries, shuffled_ranking, query_contents_dict, doc_embeddings_ta, extract_ta)
results_ta = compute_map_and_recall(predictions_ta, gold_mappings)
print_results(results_ta, "Title + Abstract")

# 2. Title + Abstract + Claim 1 (TAC1)
predictions_tac1 = rerank_queries(train_queries, shuffled_ranking, query_contents_dict, doc_embeddings_tac1, extract_tac1)
results_tac1 = compute_map_and_recall(predictions_tac1, gold_mappings)
print_results(results_tac1, "Title + Abstract + Claim 1")

# 3. Title + Abstract + Claim 1 + Features (TAC1F)
predictions_tac1f = rerank_queries(train_queries, shuffled_ranking, query_contents_dict, doc_embeddings_tac1f, extract_tac1f)
results_tac1f = compute_map_and_recall(predictions_tac1f, gold_mappings)
print_results(results_tac1f, "Title + Abstract + Claim 1 + Features")

# 4. All Claims
predictions_all_claims = rerank_queries(train_queries, shuffled_ranking, query_contents_dict, doc_embeddings_all_claims, extract_all_claims)
results_all_claims = compute_map_and_recall(predictions_all_claims, gold_mappings)
print_results(results_all_claims, "All Claims")

# 5. Features Only
predictions_features = rerank_queries(train_queries, shuffled_ranking, query_contents_dict, doc_embeddings_features, extract_features_only)
results_features = compute_map_and_recall(predictions_features, gold_mappings)
print_results(results_features, "Features Only")

Evaluating all representations...


Reranking queries: 100%|██████████| 20/20 [00:01<00:00, 19.89it/s]



🔍 Evaluation Results (Title + Abstract):
MAP: 0.2199
Recall@k@3: 0.3500
Recall@k@5: 0.4500
Recall@k@10: 0.7500
Recall@k@20: 0.8500
Mean Rank: 7.9500
Mean Inverse Rank: 0.2749


Reranking queries: 100%|██████████| 20/20 [00:01<00:00, 10.61it/s]



🔍 Evaluation Results (Title + Abstract + Claim 1):
MAP: 0.2016
Recall@k@3: 0.3000
Recall@k@5: 0.5000
Recall@k@10: 0.7500
Recall@k@20: 0.9000
Mean Rank: 8.0500
Mean Inverse Rank: 0.2472


Reranking queries: 100%|██████████| 20/20 [00:01<00:00, 10.66it/s]



🔍 Evaluation Results (Title + Abstract + Claim 1 + Features):
MAP: 0.2023
Recall@k@3: 0.3000
Recall@k@5: 0.5000
Recall@k@10: 0.7500
Recall@k@20: 0.9000
Mean Rank: 8.1000
Mean Inverse Rank: 0.2470


Reranking queries: 100%|██████████| 20/20 [00:02<00:00,  8.64it/s]



🔍 Evaluation Results (All Claims):
MAP: 0.2023
Recall@k@3: 0.3000
Recall@k@5: 0.4000
Recall@k@10: 0.7500
Recall@k@20: 0.9000
Mean Rank: 8.3500
Mean Inverse Rank: 0.2557


Reranking queries: 100%|██████████| 20/20 [00:00<00:00, 49.40it/s]


🔍 Evaluation Results (Features Only):
MAP: 0.2127
Recall@k@3: 0.4000
Recall@k@5: 0.4000
Recall@k@10: 0.7000
Recall@k@20: 0.9500
Mean Rank: 8.2000
Mean Inverse Rank: 0.2920





In [32]:
# Other vertion of TAC1+Features

CONFIG = {
    "data_dir": ".",
    "output_file": "prediction2.json",
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "bi_encoder": "BAAI/bge-large-en-v1.5",
    "cross_encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    "max_length": 512,
    "batch_size": 32
}

class PatentReranker:
    def __init__(self, config):
        self.config = config
        self.data = self._load_data()
        self.feature_stats = self._get_feature_stats()
        self._init_models()

    def _load_data(self):
        """Load all data files"""
        files = {
            "test_queries": "test_queries.json",
            "queries_content": "queries_content_with_features.json",
            "docs_content": "documents_content_with_features.json",
            "pre_ranking": "shuffled_pre_ranking.json"
        }

        data = {}
        for key, filename in files.items():
            with open(f"{self.config['data_dir']}/{filename}", "r") as f:
                data[key] = json.load(f)

        # Convert content lists to dictionaries
        for content_type in ["queries_content", "docs_content"]:
            if isinstance(data[content_type], list):
                data[content_type] = {item["FAN"]: item for item in data[content_type]}

        return data

    def _get_feature_stats(self):
        """Calculate feature frequencies for weighting"""
        feature_counts = defaultdict(int)
        for doc in self.data["docs_content"].values():
            for feature in doc["Content"]["features"]:
                feature_counts[feature] += 1
        return feature_counts

    def _init_models(self):
        """Initialize the models"""
        self.bi_encoder = SentenceTransformer(
            self.config["bi_encoder"],
            device=self.config["device"]
        )
        self.bi_encoder.max_seq_length = self.config["max_length"]

        self.cross_encoder = CrossEncoder(
            self.config["cross_encoder"],
            device=self.config["device"]
        )

    def _get_text(self, patent_data):
        """Enhanced text extraction"""
        content = patent_data["Content"]
        parts = [
            content.get("title", ""),
            content.get("pa01", ""),  # First paragraph as abstract
            *[v for k,v in content.items() if k.startswith("c-en")],  # English claims
            *[v for k,v in content.items() if k.startswith("p") and len(k.split('-')) == 2]  # Main paragraphs
        ]
        return " ".join(parts).strip()

    def _get_feature_score(self, query_features, doc_features):
        """Weighted feature overlap scoring"""
        if not self.feature_stats:
            return len(set(query_features) & set(doc_features))

        total_docs = len(self.feature_stats)
        weights = {f: np.log(total_docs/(1+self.feature_stats[f])) for f in query_features}
        return sum(weights.get(f, 0) for f in doc_features if f in weights)

    def _rerank_documents(self, query_id, doc_ids):
        """Core re-ranking logic"""
        # Get query data
        query_content = self.data["queries_content"][query_id]
        query_text = self._get_text(query_content)
        query_features = query_content["Content"]["features"]

        # Get document data
        doc_contents = [self.data["docs_content"][did] for did in doc_ids]
        doc_texts = [self._get_text(doc) for doc in doc_contents]
        doc_features = [doc["Content"]["features"] for doc in doc_contents]

        # Stage 1: Bi-encoder
        query_embed = self.bi_encoder.encode([query_text])[0]
        doc_embeds = self.bi_encoder.encode(doc_texts, batch_size=self.config["batch_size"])
        bi_scores = util.cos_sim(query_embed, doc_embeds)[0].cpu().numpy()

        # Stage 2: Cross-encoder on top candidates
        top_k = min(15, len(doc_ids))
        top_indices = np.argsort(bi_scores)[-top_k:][::-1]

        cross_inputs = [(query_text, doc_texts[i]) for i in top_indices]
        cross_scores = self.cross_encoder.predict(cross_inputs)

        # Feature scores
        feature_scores = np.array([
            self._get_feature_score(query_features, features)
            for features in doc_features
        ])

        # Normalize and combine scores
        bi_scores = (bi_scores - bi_scores.min()) / (bi_scores.max() - bi_scores.min() + 1e-8)
        feature_scores = (feature_scores - feature_scores.min()) / (feature_scores.max() - feature_scores.min() + 1e-8)

        # Combined scoring
        final_scores = 0.6*bi_scores + 0.3*feature_scores
        for idx, score in zip(top_indices, cross_scores):
            final_scores[idx] = 0.4*bi_scores[idx] + 0.4*score + 0.2*feature_scores[idx]

        return [doc_ids[i] for i in np.argsort(final_scores)[::-1]]

    def generate_submission(self):
        """Generate the submission file"""
        predictions = {}

        for query_id in tqdm(self.data["test_queries"], desc="Processing queries"):
            original_ranking = self.data["pre_ranking"][query_id]

            # Ensure we work with exactly 30 documents
            working_docs = original_ranking[:30]

            # Re-rank
            reranked = self._rerank_documents(query_id, working_docs)

            # Preserve all original documents
            predictions[query_id] = [did for did in reranked if did in original_ranking]
            predictions[query_id].extend(did for did in original_ranking if did not in predictions[query_id])

            # Validation
            assert len(predictions[query_id]) == len(original_ranking)
            assert set(predictions[query_id]) == set(original_ranking)

        # Save with exact required format
        with open(self.config["output_file"], "w") as f:
            json.dump(predictions, f)

        print(f"Submission file generated: {self.config['output_file']}")
        return predictions

In [33]:
reranker = PatentReranker(CONFIG)

In [34]:
submission = reranker.generate_submission()

Processing queries: 100%|██████████| 10/10 [00:35<00:00,  3.58s/it]

Submission file generated: prediction2.json



