# TASK 1

- Chakir EL ARRAG

NB: The comments and some parts of the code were done with the help of AI. The ideas and approaches used are original.

## 1. Imports

In [None]:
import pandas as pd
import json
import joblib
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import issparse
from scipy.sparse import save_npz
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


## Helper Functions

In [5]:
def load_json_data(file_path):
    with open(file_path, "r") as file:
        contents = json.load(file)
    return contents


def create_tfidf_matrix(citing_dataset, nonciting_dataset, vectorizer=TfidfVectorizer()):
    """
    Creates TF-IDF matrix for the given citing and non-citing datasets based on the specified text column.

    Parameters:
    citing_dataset (json)): DataFrame containing citing patents.
    nonciting_dataset (json): DataFrame containing non-citing patents.
    vectorizer (TfidfVectorizer, optional): TfidfVectorizer object for vectorizing text data.
                                             Defaults to TfidfVectorizer().

    Returns:
    tuple: A tuple containing TF-IDF matrices for citing and non-citing patents respectively.
           (tfidf_matrix_citing, tfidf_matrix_nonciting)
    """
    all_text = [patent['text'] for patent in citing_dataset + nonciting_dataset]

    # Vectorizing descriptions
    print("Vectorizing descriptions...")
    tfidf_matrix = vectorizer.fit_transform(tqdm(all_text, desc="TF-IDF"))

    # Since we're interested in similarities between citing and cited patents,
    # we need to split the TF-IDF matrix back into two parts
    split_index = len(citing_dataset)
    tfidf_matrix_citing = tfidf_matrix[:split_index]
    tfidf_matrix_nonciting = tfidf_matrix[split_index:]

    # Size of vocabulary
    print("Size of vocabulary:", len(vectorizer.vocabulary_))

    return tfidf_matrix_citing, tfidf_matrix_nonciting



def get_mapping_dict(mapping_df):
    """
    Creates dictionary of citing ids to non-citing id based on given dataframe (which is based on providedjson)

    Parameters:
    mapping_df (DataFrame): DataFrame containing mapping between citing and cited patents
    Returns:
    dict: dictionary of unique citing patent ids to list of cited patent ids
    """
    mapping_dict = {}

    for _, row in mapping_df.iterrows():
        key = row[0]  # Value from column 0
        value = row[2]  # Value from column 2
        if key in mapping_dict:
            mapping_dict[key].append(value)
        else:
            mapping_dict[key] = [value]

    return mapping_dict

def create_corpus(corpus, text_type):
    """
    Extracts text data from a corpus based on the specified text type.

    Parameters:
    corpus (list): List of dictionaries representing patent documents.
    text_type (str): Type of text to extract ('title', 'abstract', 'claim1', 'claims', 'description', 'fulltext').

    Returns:
    list: List of dictionaries with 'id' and 'text' keys representing each document in the corpus.
    """

    app_ids = [doc['Application_Number'] + doc['Application_Category'] for doc in corpus]

    cnt = 0 # count the number of documents without text
    texts = []  # list of texts
    ids_to_remove = []  # list of ids of documents without text, to remove them from the corpus

    if text_type == 'title':
        for doc in corpus:
            try:
                texts.append(doc['Content']['title'])
            except: # if the document does not have a title
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without title: {cnt}")

    elif text_type == 'abstract':
        for doc in corpus:
            try:
                texts.append(doc['Content']['pa01'])
            except: # if the document does not have an abstract
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without abstract: {cnt}")

    elif text_type == 'claim1':
        for doc in corpus:
            try:
                texts.append(doc['Content']['c-en-0001'])
            except: # if the document does not have claim 1
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without claim 1: {cnt}")

    elif text_type == 'claims':
        # all the values with the key starting with 'c-en-', each element in the final list is a list of claims
        for doc in corpus:
            doc_claims = []
            for key in doc['Content'].keys():
                if key.startswith('c-en-'):
                    doc_claims.append(doc['Content'][key])
            if len(doc_claims) == 0:    # if the document does not have any claims
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_claims)
                texts.append(doc_text_string)
        print(f"Number of documents without claims: {cnt}")

    elif text_type == 'description':
        # all the values with the key starting with 'p'
        for doc in corpus:
            doc_text = []
            for key in doc['Content'].keys():
                if key.startswith('p'):
                    doc_text.append(doc['Content'][key])
            if len(doc_text) == 0:  # if the document does not have any description
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_text)
                texts.append(doc_text_string)
        print(f"Number of documents without description: {cnt}")

    elif text_type == 'fulltext':
        for doc in corpus:
            doc_text = list(doc['Content'].values())
            doc_text_string = ' '.join(doc_text)
            texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without any text: {cnt}")

    else:
        raise ValueError("Invalid text type")

    if len(ids_to_remove) > 0:
        print(f"Removing {len(ids_to_remove)} documents without required text")
        for id_ in ids_to_remove[::-1]:
            idx = app_ids.index(id_)
            del app_ids[idx]

    # Create a list of dictionaries with app_ids and texts
    corpus_data = [{'id': app_id, 'text': text} for app_id, text in zip(app_ids, texts)]

    return corpus_data


def get_true_and_predicted(citing_to_cited_dict, recommendations_dict):
    """
    Get the true and predicted labels for the metrics calculation.

    Parameters:
    citing_to_cited_dict : dict of str : list of str
        Mapping between citing patents and the list of their cited patents
    recommendations_dict : dict of str : list of str
        Mapping between citing patents and the sorted list of recommended patents

    Returns:
    list of list
        True relevant items for each recommendation list.
    list of list
        Predicted recommended items for each recommendation list.
    int
        Number of patents not in the citation mapping
    """
    # Initialize lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    not_in_citation_mapping = 0

    # Iterate over the items in both dictionaries
    for citing_id in recommendations_dict.keys():
        # Check if the citing_id is present in both dictionaries
        if citing_id in citing_to_cited_dict:
            # If yes, append the recommended items from both dictionaries to the respective lists
            true_labels.append(citing_to_cited_dict[citing_id])
            predicted_labels.append(recommendations_dict[citing_id])
        else:
            not_in_citation_mapping += 1

    return true_labels, predicted_labels, not_in_citation_mapping



def mean_recall_at_k(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Recall@k for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Recall@k value.
    """
    recalls_at_k = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Recall@k for each recommendation list
        true_set = set(true)
        k = min(k, len(pred))
        relevant_count = sum(1 for item in pred[:k] if item in true_set)
        recalls_at_k.append(relevant_count / len(true_set))

    # Calculate the mean Recall@k
    mean_recall = sum(recalls_at_k) / len(recalls_at_k)

    return mean_recall

def mean_inv_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean inverse rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean inverse rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the inverse rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = 1 / (pred.index(item) + 1)
            except ValueError:
                rank = 0  # If item not found, assign 0
            ranks.append(rank)

        # Calculate the mean inverse rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean inverse ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks


def mean_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = pred.index(item) + 1
            except ValueError:
                rank = len(pred)  # If item not found, assign the length of the list
            ranks.append(rank)

        # Calculate the mean rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks



def mean_average_precision(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Average Precision for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Average Precision value.
    """
    average_precisions = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Average Precision for each recommendation list
        true_set = set(true)
        precision_at_k = []
        relevant_count = 0
        for i, item in enumerate(pred[:k]):
            if item in true_set:
                relevant_count += 1
                precision_at_k.append(relevant_count / (i + 1))
        average_precision = sum(precision_at_k) / len(true_set)
        average_precisions.append(average_precision)

    # Calculate the mean Average Precision
    mean_average_precision = sum(average_precisions) / len(average_precisions)

    return mean_average_precision

def top_k_ranks(citing, cited, cosine_similarities, k=10):
    # Create a dictionary to store the top k ranks for each citing patent
    top_k_ranks = {}
    for i, content_id in enumerate(citing):
        top_k_ranks[content_id['id']] = [cited[j]['id'] for j in np.argsort(cosine_similarities[i])[::-1][:k]]
    return top_k_ranks


## Load Datasets

In [6]:
json_citing_train = load_json_data("./Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TRAIN.json")
json_citing_test = load_json_data("./Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TEST.json")

json_nonciting = load_json_data("./Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k/CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json")
json_citing_to_cited = load_json_data("./Citation_JSONs/Citation_Train.json") # Citing ids are unique

In [7]:
citing_dataset_df = pd.DataFrame(json_citing_train)

nonciting_dataset_df = pd.DataFrame(json_nonciting)
mapping_dataset_df = pd.DataFrame(json_citing_to_cited)
citing_dataset_df.shape, nonciting_dataset_df.shape, mapping_dataset_df.shape

((6831, 4), (16837, 4), (8594, 5))

# 2 Retrieval model

To perform the retrieval, we will embed each section of the patent using different embeddings:
- Title: TF-IDF
- Abstract: Dense (BAAI/bge-base-en-v1.5)
- First Claim: TF-IDF
- Claims: Dense (BAAI/bge-base-en-v1.5)
- Description: Dense (BAAI/bge-base-en-v1.5)

After this, the scores will be aggregated in order to produce the final ranking. The weights won't be chosen manually, but they will be optimised to maximise the recall@100 on the training set. K-Fold Cross Validation will be used to prevent overfitting on the test set.

The dense embedding is done by chunking and averaging the resul between the chunks.

In [None]:
# Load the pre-trained BGE embedding model from SentenceTransformers
bge_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Function to generate dense embeddings using a retrieval-specific prefix
def get_dense_embeddings(texts, prefix="Represent this sentence for retrieval: "):
    # Add the prefix to each input text to align with how the model was trained
    processed = [prefix + t for t in texts]
    
    # Encode the text using the BGE model
    return bge_model.encode(
        processed,
        convert_to_numpy=True,       # Return results as NumPy arrays
        batch_size=16,               # Batch size for efficient processing
        show_progress_bar=True       # Display progress bar
    )

def compute_chunked_embeddings(texts, chunk_size=300):
    chunked_vecs = []  # To store final averaged vectors for each input

    for text in tqdm(texts, desc="Chunking & encoding"):
        # Split text into chunks of max length 'chunk_size' characters
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
        
        # Get embeddings for each chunk
        emb_chunks = get_dense_embeddings(chunks)
        
        # Average the chunk embeddings to get a single vector for the full text
        vec = np.mean(emb_chunks, axis=0)
        
        # Store the averaged vector
        chunked_vecs.append(vec)
    
    # Return all document embeddings as a NumPy array
    return np.array(chunked_vecs)

### Title Embedding

In [None]:
title_citing = create_corpus(json_citing_train, 'title')
title_nonciting = create_corpus(json_nonciting, 'title')

tfidf_vectorizer_title = TfidfVectorizer(stop_words='english', max_features=20000)
tfidf_citing_title, tfidf_nonciting_title = create_tfidf_matrix(title_citing, title_nonciting, tfidf_vectorizer_title)

save_npz("embeddings/tfidf_title_nonciting.npz", tfidf_nonciting_title)
joblib.dump(tfidf_vectorizer_title, "embeddings/tfidf_vectorizer_title.pkl")

score_title = cosine_similarity(tfidf_citing_title, tfidf_nonciting_title)

Number of documents without title: 0
Number of documents without title: 0
Vectorizing descriptions...


TF-IDF: 100%|██████████| 23668/23668 [00:00<00:00, 222022.20it/s]

Size of vocabulary: 12168





### Abstract Embedding

In [19]:
abstract_citing = create_corpus(json_citing_train, 'abstract')
abstract_nonciting = create_corpus(json_nonciting, 'abstract')

emb_citing_abstract = get_dense_embeddings([doc['text'] for doc in abstract_citing])
emb_nonciting_abstract = get_dense_embeddings([doc['text'] for doc in abstract_nonciting])

np.save("embeddings/abstract_nonciting.npy", emb_nonciting_abstract)
score_abstract = cosine_similarity(emb_citing_abstract, emb_nonciting_abstract)

Number of documents without abstract: 0
Number of documents without abstract: 7811
Removing 7811 documents without required text


Batches: 100%|██████████| 427/427 [03:03<00:00,  2.33it/s]
Batches: 100%|██████████| 565/565 [04:00<00:00,  2.35it/s]


### First Claim Embedding

In [None]:
claim1_citing = create_corpus(json_citing_train, 'claim1')
claim1_nonciting = create_corpus(json_nonciting, 'claim1')

tfidf_vectorizer_claim1 = TfidfVectorizer(stop_words='english', max_features=50000)
tfidf_citing_claim1, tfidf_nonciting_claim1 = create_tfidf_matrix(claim1_citing, claim1_nonciting, tfidf_vectorizer_claim1)

save_npz("embeddings/tfidf_claim1_nonciting.npz", tfidf_nonciting_claim1)
joblib.dump(tfidf_vectorizer_claim1, "embeddings/tfidf_vectorizer_claim1.pkl")

score_claim1 = cosine_similarity(tfidf_citing_claim1, tfidf_nonciting_claim1)

Number of documents without claim 1: 0
Number of documents without claim 1: 3
Removing 3 documents without required text
Vectorizing descriptions...


TF-IDF: 100%|██████████| 23665/23665 [00:01<00:00, 23644.28it/s]


Size of vocabulary: 41111


### Claims Embedding

In [21]:
claims_citing = create_corpus(json_citing_train, 'claims')
claims_nonciting = create_corpus(json_nonciting, 'claims')

emb_citing_claims = compute_chunked_embeddings([doc['text'] for doc in claims_citing])
emb_nonciting_claims = compute_chunked_embeddings([doc['text'] for doc in claims_nonciting])

np.save("embeddings/claims_nonciting.npy", emb_nonciting_claims)
score_claims = cosine_similarity(emb_citing_claims, emb_nonciting_claims)

Number of documents without claims: 0
Number of documents without claims: 3
Removing 3 documents without required text


Batches: 100%|██████████| 2/2 [00:01<00:00,  1.44it/s] ?it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.72it/s]39:51,  1.40s/it]
Batches: 100%|██████████| 2/2 [00:00<00:00,  4.38it/s]41:59,  1.12it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.45it/s]19:20,  1.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.18it/s]01:11,  1.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.06it/s]:07,  2.32it/s]  
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.12it/s]:12,  2.83it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.12it/s]:12,  3.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.94it/s]:35,  3.39it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  6.17it/s]:13,  3.53it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  5.29it/s]3:44,  3.37it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.35it/s]6:40,  3.10it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  5.57it/s]3:33,  3.39it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  4.94it/s]5:51,  3.17it/s]
Batches: 100%|

### Description Embedding

The description is long and therefore on my machine it would take a very long time to perform the embedding.

Therefore, I had the idea to use TF-IDF to get the most important words in the description and keep the sentences in which they're present.

Then these sentences are embedded like it was done previously.

In [None]:
description_citing = create_corpus(json_citing_train, 'description')
description_nonciting = create_corpus(json_nonciting, 'description')

all_descriptions = [doc['text'] for doc in description_citing + description_nonciting]

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(all_descriptions)
vocab = set(tfidf_vectorizer.get_feature_names_out())

# Check if sentence contains any high-TFIDF term
def contains_special_word(sentence, special_words):
    words = sentence.translate(str.maketrans('', '', string.punctuation)).lower().split()
    return any(word in special_words for word in words)


def clean_text(text):
    return text.replace('\n', ' ').replace('\xa0', ' ').strip()

# Extract sentences with special words
def extract_informative_sentences(description, special_words):
    cleaned = clean_text(description)
    sentences = nltk.sent_tokenize(cleaned)
    informative = [s for s in sentences if contains_special_word(s, special_words)]
    return " ".join(informative) if informative else (sentences[0] if sentences else "")

# Select top N TF-IDF words as "special words"
tfidf_scores = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
top_indices = np.argsort(tfidf_scores)[-10:]  # top 200 words
special_words = set(tfidf_vectorizer.get_feature_names_out()[top_indices])

# Build corpus using informative sentences only
def create_informative_corpus(data):
    return [{
        'id': doc['id'],
        'text': extract_informative_sentences(doc['text'], special_words)} 
        for doc in data]

description_citing_short = create_informative_corpus(description_citing)
description_nonciting_short = create_informative_corpus(description_nonciting)

print("Original length:", len(description_nonciting[0]['text'].split()))
print("Filtered length:", len(description_nonciting_short[0]['text'].split()))

Number of documents without description: 0
Number of documents without description: 0
Original length: 10310
Filtered length: 3555


In [23]:
emb_citing_desc = compute_chunked_embeddings([doc['text'] for doc in description_citing_short])
emb_nonciting_desc = compute_chunked_embeddings([doc['text'] for doc in description_nonciting_short])

np.save("embeddings/description_nonciting.npy", emb_nonciting_desc)
score_description = cosine_similarity(emb_citing_desc, emb_nonciting_desc)

Batches: 100%|██████████| 3/3 [00:06<00:00,  2.04s/it] ?it/s]
Batches: 100%|██████████| 38/38 [00:06<00:00,  5.86it/s]6:09,  6.29s/it]
Batches: 100%|██████████| 7/7 [00:00<00:00,  7.14it/s]:10:10,  6.42s/it]
Batches: 100%|██████████| 2/2 [00:00<00:00,  5.91it/s]28:03,  3.94s/it] 
Batches: 100%|██████████| 5/5 [00:00<00:00,  5.61it/s]46:31,  2.52s/it]
Batches: 100%|██████████| 8/8 [00:01<00:00,  6.88it/s]39:58,  1.93s/it]
Batches: 100%|██████████| 2/2 [00:00<00:00,  7.72it/s]10:19,  1.67s/it]
Batches: 100%|██████████| 2/2 [00:00<00:00,  8.51it/s]17:51,  1.21s/it]
Batches: 100%|██████████| 4/4 [00:00<00:00,  7.24it/s]42:35,  1.11it/s]
Batches: 100%|██████████| 5/5 [00:00<00:00,  7.04it/s]30:17,  1.26it/s]
Batches: 100%|██████████| 5/5 [00:00<00:00,  7.56it/s]:27:31,  1.30it/s]
Batches: 100%|██████████| 5/5 [00:00<00:00,  6.50it/s]:23:58,  1.35it/s]
Batches: 100%|██████████| 2/2 [00:05<00:00,  2.89s/it]:25:11,  1.33it/s]
Batches: 100%|██████████| 6/6 [00:00<00:00,  7.43it/s]:18:50,  2.28s

In [103]:
print("Title shape: ", tfidf_nonciting_title.shape)
print("Abstract shape: ", emb_citing_abstract.shape)
print("Claim1 shape: ", tfidf_nonciting_claim1.shape)
print("Claims shape: ", emb_nonciting_claims.shape)
print("Description shape: ", emb_nonciting_desc.shape)

Title shape:  (16837, 12168)
Abstract shape:  (6831, 768)
Claim1 shape:  (16834, 41111)
Claims shape:  (16834, 768)
Description shape:  (16837, 768)


We notice that the shape of the embeddings is different for the non-citing patents and this is due to the fact that some documents have sections missing.

Therefore, I thought of padding zero vectors for the documents with missing sections. I thought this would be fine for the first claim and the claims, but the abstract section is missing in a lot of documents, and padding zero vectors in this case might not be the best choice. Therefore I thought of replacing the abstract section with a concatenation of the title and the abstract and make a dense embedding of the concatenation. Which at the end improved the recall@100 by 2%.

In [None]:
def create_abstract_missing(corpus, text_type):
    # Generate a list of unique application IDs by concatenating the number and category
    app_ids = [doc['Application_Number'] + doc['Application_Category'] for doc in corpus]

    cnt = 0  # Counter for documents missing the abstract
    texts = []  # Will hold the abstract text or placeholder
    ids_to_remove = []  # (Unused here, could be removed unless reused later)

    # Only handles abstract extraction
    if text_type == 'abstract':
        for doc in corpus:
            try:
                # Attempt to extract the abstract ('pa01') from Content
                texts.append(doc['Content']['pa01'])
            except:
                # If missing, append an empty string and count it
                texts.append(' ')
                cnt += 1
        print(f"Number of documents without abstract: {cnt}")
    else:
        # Raise an error if text_type is not handled
        raise ValueError("Invalid text type")

    # This block is not used currently (ids_to_remove remains empty), safe to remove
    if len(ids_to_remove) > 0:
        print(f"Removing {len(ids_to_remove)} documents without required text")
        for id_ in ids_to_remove[::-1]:
            idx = app_ids.index(id_)
            del app_ids[idx]

    # Combine the app_ids and their corresponding texts into a list of dictionaries
    corpus_data = [{'id': app_id, 'text': text} for app_id, text in zip(app_ids, texts)]

    return corpus_data

In [None]:
# Initialize lists for combined title + abstract fields
title_abs_citing = []
title_abs_nonciting = []

# Combine title and abstract for citing patents
for i in range(len(title_citing)):
    title_abs_citing.append({
        'id': title_citing[i]['id'],  # Keep the patent ID
        'text': title_citing[i]['text'] + ' ' + abstract_citing[i]['text']  # Concatenate title and abstract
    })

# Combine title and abstract for non-citing patents
for i in range(len(title_nonciting)):
    title_abs_nonciting.append({
        'id': title_nonciting[i]['id'],  # Keep the patent ID
        'text': title_nonciting[i]['text'] + ' ' + abstract_nonciting[i]['text']  # Concatenate title and abstract
    })

In [None]:
title_abs_citing_embedding = get_dense_embeddings([doc['text'] for doc in title_abs_citing])
title_abs_nonciting_embedding = get_dense_embeddings([doc['text'] for doc in title_abs_nonciting])

### Saving Embeddings

In [None]:
np.save("embeddings/title_abstract_citing.npy", title_abs_citing_embedding)
np.save("embeddings/title_abstract_nonciting.npy", title_abs_nonciting_embedding)
np.save("embeddings/description_citing.npy", emb_citing_desc)
np.save("embeddings/claims_citing.npy", emb_citing_claims)
np.save("embeddings/abstract_citing.npy", emb_citing_abstract)

The next section is used to load the saved embeddings when needed.

In [None]:
emb_citing_desc = np.load("embeddings/description_citing.npy")
emb_citing_claims = np.load("embeddings/claims_citing.npy")
emb_citing_abstract = np.load("embeddings/abstract_citing.npy")
emb_nonciting_desc = np.load("embeddings/description_nonciting.npy")
emb_nonciting_claims = np.load("embeddings/claims_nonciting.npy")
emb_nonciting_abstract = np.load("embeddings/abstract_nonciting.npy")

In [None]:
def pad_missing_embeddings(nonciting_df, embedded_docs, embedded_vectors, embedding_dim=768):
    
    # Check if input embeddings are sparse (e.g., from TF-IDF)
    is_sparse = issparse(embedded_vectors)

    # Map each document ID to its corresponding vector
    id_to_vec = {}
    for doc, vec in zip(embedded_docs, embedded_vectors):
        if is_sparse:
            # Convert sparse vector to dense array
            id_to_vec[doc['id']] = vec.toarray().flatten()
        else:
            id_to_vec[doc['id']] = vec

    # Create an empty matrix of zeros for all documents
    padded_embeddings = np.zeros((len(nonciting_df), embedding_dim), dtype=np.float32)

    # Fill in the actual embeddings where available
    for i, row in enumerate(nonciting_df.itertuples(index=False)):
        doc_id = str(row.Application_Number) + row.Application_Category
        if doc_id in id_to_vec:
            padded_embeddings[i] = id_to_vec[doc_id]
        # If missing, the row stays as a zero vector

    return padded_embeddings

Padding with zero vectors is performed on the first claim and claims embeddings.

In [None]:
emb_nonciting_claims2 = pad_missing_embeddings(
    nonciting_df=nonciting_dataset_df,
    embedded_docs=claims_nonciting,
    embedded_vectors=emb_nonciting_claims,
)

tfidf_nonciting_claim12 = pad_missing_embeddings(
    nonciting_df=nonciting_dataset_df,
    embedded_docs=claim1_nonciting,
    embedded_vectors=tfidf_nonciting_claim1,
    embedding_dim=41111
)

Number of documents without abstract: 7811
Removing 7811 documents without required text
Number of documents without claim 1: 3
Removing 3 documents without required text
Number of documents without claims: 3
Removing 3 documents without required text


### Similarity Scores

In [None]:
score_title = cosine_similarity(tfidf_citing_title, tfidf_nonciting_title)
score_abstract = cosine_similarity(title_abs_citing_embedding, title_abs_nonciting_embedding)
score_claim1 = cosine_similarity(tfidf_citing_claim1, tfidf_nonciting_claim12)
score_claims = cosine_similarity(emb_citing_claims, emb_nonciting_claims2)
score_description = cosine_similarity(emb_citing_desc, emb_nonciting_desc)

### Weight Optimisation

In this section, the scores between sections will be aggregated using a weighted average.

In order to obtain the appropriate weights, we perform K-Fold CV on the training set to get the optimal weights on the training set maximising the Recall@100.

In [None]:
import joblib

# Dictionary containing scores
scores_dict = {
    "title": score_title,
    "abstract": score_abstract,
    "claim1": score_claim1,
    "claims": score_claims,
    "description": score_description
}

joblib.dump(scores_dict, "embeddings/scores_dict.pkl")

['embeddings/scores_dict.pkl']

Loading the dictionary with the scores.

In [None]:
import joblib
scores_dict = joblib.load("embeddings/scores_dict.pkl")

In [None]:
from sklearn.model_selection import KFold
all_citing_docs = title_citing
mapping_dict = get_mapping_dict(mapping_dataset_df)

We make a function that evaluates the recall@100 value for specific weights.

In [None]:
def evaluate_weights(weights, scores_dict, mapping_dict, citing_docs, cited_docs, citing_ids, k=100):

    # Unpack the weights for the 5 components
    w_title, w_abstract, w_claim1, w_claims, w_description = weights

    # Compute a linear combination of similarity scores
    combined_score = (
        w_title * scores_dict["title"] +
        w_abstract * scores_dict["abstract"] +
        w_claim1 * scores_dict["claim1"] +
        w_claims * scores_dict["claims"] +
        w_description * scores_dict["description"]
    )

    # Map from citing patent ID to its index in the score matrix
    id_to_index = {doc['id']: i for i, doc in enumerate(citing_docs)}
    citing_indices = [id_to_index[pid] for pid in citing_ids]

    # Get the relevant citing documents and their corresponding similarity scores
    citing_subset = [citing_docs[i] for i in citing_indices]
    score_subset = combined_score[citing_indices]

    # Get the top-k ranked results for each query based on combined scores
    top_k = top_k_ranks(citing_subset, cited_docs, score_subset, k=k)

    # Retrieve true and predicted results for evaluation
    true, pred, _ = get_true_and_predicted(mapping_dict, top_k)

    # Compute recall@k (mean recall over all queries)
    recall = mean_recall_at_k(true, pred, k=k)

    return recall

Next, we define a function that performs a randomized search to find the best combination of weights (that sum to 1) for combining the scores, using k-fold cross-validation to maximize average Recall@k.

In [None]:
def optimize_weights_cv(
    citing_docs, cited_docs, scores_dict, mapping_dict, k=100,
    n_trials=50, n_splits=5, seed=42
):
    # Set seed for reproducibility
    np.random.seed(seed)

    # Get all citing document IDs
    all_ids = [doc['id'] for doc in citing_docs]

    # Initialize K-Fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # Variables to store best weights and best average recall
    best_weights = None
    best_avg_recall = -1

    print(f"Running Random Search with {n_trials} trials and {n_splits}-fold CV...\n")

    # Run random search for the specified number of trials
    for trial in tqdm(range(n_trials), desc="Trials"):
        # Randomly generate a set of weights that sum to 1 using Dirichlet distribution
        weights = np.random.dirichlet(np.ones(5), size=1).flatten()
        recalls = []

        # Cross-validation loop
        for train_idx, val_idx in kf.split(all_ids):
            val_ids = [all_ids[i] for i in val_idx]

            # Evaluate recall on validation fold using current weights
            recall_val = evaluate_weights(
                weights, scores_dict, mapping_dict,
                citing_docs, cited_docs, val_ids, k
            )
            recalls.append(recall_val)

        # Average recall over all folds
        avg_recall = np.mean(recalls)

        # Store weights if they're the best so far
        if avg_recall > best_avg_recall:
            best_avg_recall = avg_recall
            best_weights = weights

    print("\nRandom Search CV Complete")
    print(f"Best Avg Recall@{k}: {best_avg_recall:.4f}")
    print("Best Weights:", np.round(best_weights, 3))

    return best_weights, best_avg_recall

Next, we perform the optimisation.

In [None]:
best_weights, best_recall = optimize_weights_cv(
    citing_docs=title_citing,
    cited_docs=title_nonciting,
    scores_dict=scores_dict,
    mapping_dict=mapping_dict,
    k=100
)

Running Random Search with 50 trials and 5-fold CV...



Trials: 100%|██████████| 50/50 [14:06<00:00, 16.92s/it]


Random Search CV Complete
Best Avg Recall@100: 0.9218
Best Weights: [0.007 0.224 0.084 0.157 0.528]





We get the best weights combining the scores of our embeddings achieving an optimal Recall@100 value of 92.18% on the training set.

In [64]:
final_score = (
    best_weights[0] * scores_dict["title"] +
    best_weights[1] * scores_dict["abstract"] +
    best_weights[2] * scores_dict["claim1"] +
    best_weights[3] * scores_dict["claims"] +
    best_weights[4] * scores_dict["description"]
)
final_score

array([[0.70868831, 0.64876335, 0.71829742, ..., 0.7312901 , 0.62097047,
        0.68761842],
       [0.62255261, 0.60406506, 0.67559978, ..., 0.60358749, 0.69006861,
        0.64031566],
       [0.65863634, 0.64548044, 0.70306693, ..., 0.68020214, 0.62002365,
        0.67026708],
       ...,
       [0.68364639, 0.65594998, 0.7473656 , ..., 0.69459041, 0.68761728,
        0.7108391 ],
       [0.62632647, 0.6601324 , 0.68750676, ..., 0.60968168, 0.64810056,
        0.69808483],
       [0.59714058, 0.60111651, 0.62548158, ..., 0.57566317, 0.58058042,
        0.64461073]])

In [65]:
k = 100
top_k_rank = top_k_ranks(title_citing, title_nonciting, final_score, k=k)

# Evaluate
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted(mapping_dict, top_k_rank)
mean_rank = mean_ranking(true_labels, predicted_labels)
mean_average_precision_val = mean_average_precision(true_labels, predicted_labels)

# Recalls at various thresholds
recall_at_10 = mean_recall_at_k(true_labels, predicted_labels, k=10)
recall_at_20 = mean_recall_at_k(true_labels, predicted_labels, k=20)
recall_at_50 = mean_recall_at_k(true_labels, predicted_labels, k=50)
recall_at_100 = mean_recall_at_k(true_labels, predicted_labels, k=100)

# Print the results
print("Recall at 10:", round(recall_at_10, 4))
print("Recall at 20:", round(recall_at_20, 4))
print("Recall at 50:", round(recall_at_50, 4))
print("Recall at 100:", round(recall_at_100, 4))
print("Mean ranking:", round(mean_rank, 4))
print("Mean average precision:", round(mean_average_precision_val, 4))
print("Number of patents measured:", len(predicted_labels))
print("Number of patents not in the citation:", not_in_citation_mapping)

Recall at 10: 0.6925
Recall at 20: 0.7791
Recall at 50: 0.8697
Recall at 100: 0.921
Mean ranking: 17.5801
Mean average precision: 0.4641
Number of patents measured: 6831
Number of patents not in the citation: 0


## 3. Test Set Ranking

In this section, we will embed the queries in the test set and load the pre-embedded non-citing patents to perform the retrieval using the weights we computed previously.

In [None]:
from scipy.sparse import load_npz

tfidf_vectorizer_title = joblib.load("embeddings/tfidf_vectorizer_title.pkl")
tfidf_nonciting_title = load_npz("embeddings/tfidf_title_nonciting.npz")
tfidf_vectorizer_claim1 = joblib.load("embeddings/tfidf_vectorizer_claim1.pkl")
tfidf_nonciting_claim1 = load_npz("embeddings/tfidf_claim1_nonciting.npz")
emb_nonciting_claims = np.load("embeddings/claims_nonciting.npy")
emb_nonciting_desc = np.load("embeddings/description_nonciting.npy")

#### Title and First Claim Embedding

In [None]:
title_citing_test = create_corpus(json_citing_test, 'title')
abstract_citing_test = create_corpus(json_citing_test, 'abstract')
claim1_citing_test = create_corpus(json_citing_test, 'claim1')
claims_citing_test = create_corpus(json_citing_test, 'claims')
desc_citing_test = create_corpus(json_citing_test, 'description')

tfidf_citing_title_test = tfidf_vectorizer_title.transform([doc['text'] for doc in title_citing_test])
tfidf_citing_claim1_test = tfidf_vectorizer_claim1.transform([doc['text'] for doc in claim1_citing_test])

Number of documents without title: 0
Number of documents without abstract: 0
Number of documents without claim 1: 0
Number of documents without claims: 0
Number of documents without description: 0


#### Title + Abstract Dense Embedding

The same operations performed on the training set are performed on the test set for compatibility.

In [None]:
title_abs_citing_test = []

# Combine title and abstract for citing patents
for i in range(len(title_citing_test)):
    title_abs_citing_test.append({'id': title_citing_test[i]['id'],
                             'text': title_citing_test[i]['text'] + ' ' + abstract_citing_test[i]['text'] # Concatenate title and abstract
                             })

title_abs_citing_test_embedding = get_dense_embeddings([doc['text'] for doc in title_abs_citing_test])
np.save("embeddings/title_abstract_citing_test.npy", title_abs_citing_test_embedding)

In [None]:
emb_citing_abstract_test = get_dense_embeddings([doc['text'] for doc in abstract_citing_test])
np.save("embeddings/abstract_citing_test.npy", emb_citing_abstract_test)

Batches: 100%|██████████| 63/63 [00:30<00:00,  2.06it/s]


#### Claims Embedding

In [63]:
emb_citing_claims_test = compute_chunked_embeddings([doc['text'] for doc in claims_citing_test])
np.save("embeddings/claims_citing_test.npy", emb_citing_claims_test)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.52it/s] ?it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  2.63it/s]:17,  1.48it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.53it/s]:09,  1.37it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  4.85it/s]:18,  2.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.01it/s]:44,  2.14it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.36it/s]:09,  2.69it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.97it/s]:22,  3.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.18it/s]:21,  2.60it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  5.46it/s]:36,  2.95it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  4.61it/s]:48,  3.43it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  3.56it/s]5:32,  2.97it/s]
Batches: 100%|██████████| 3/3 [00:00<00:00,  5.35it/s]5:16,  3.12it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  5.08it/s]6:29,  2.53it/s]
Batches: 100%|██████████| 2/2 [00:00<00:00,  3.99it/s]6:30,  2.53it/s]
Batches: 100%|██████████

#### Description Embedding

In [66]:
emb_citing_desc_test = compute_chunked_embeddings([doc['text'] for doc in desc_citing_test])
np.save("embeddings/description_citing_test.npy", emb_citing_desc_test)

Batches: 100%|██████████| 7/7 [00:07<00:00,  1.02s/it] ?it/s]
Batches: 100%|██████████| 7/7 [00:06<00:00,  1.13it/s]00:03,  7.21s/it]
Batches: 100%|██████████| 25/25 [00:04<00:00,  6.14it/s]:56,  6.61s/it]
Batches: 100%|██████████| 11/11 [00:01<00:00,  6.15it/s]:47,  5.46s/it]
Batches: 100%|██████████| 5/5 [00:00<00:00,  6.48it/s]06:40,  4.02s/it]
Batches: 100%|██████████| 8/8 [00:01<00:00,  4.91it/s]:14,  2.85s/it]  
Batches: 100%|██████████| 11/11 [00:01<00:00,  6.10it/s]3,  2.44s/it]
Batches: 100%|██████████| 10/10 [00:01<00:00,  7.45it/s]8,  2.23s/it]
Batches: 100%|██████████| 11/11 [00:01<00:00,  6.76it/s]7,  1.95s/it]
Batches: 100%|██████████| 5/5 [00:00<00:00,  7.28it/s]:36,  1.85s/it]
Batches: 100%|██████████| 13/13 [00:01<00:00,  6.74it/s]40,  1.50s/it]
Batches: 100%|██████████| 13/13 [00:01<00:00,  7.31it/s]53,  1.63s/it]
Batches: 100%|██████████| 15/15 [00:02<00:00,  6.14it/s]38,  1.68s/it]
Batches: 100%|██████████| 35/35 [00:05<00:00,  6.79it/s]26,  1.91s/it]
Batches: 100%|

The following section is for loading.

In [None]:
emb_citing_desc_test = np.load("embeddings/description_citing_test.npy")
emb_citing_claims_test = np.load("embeddings/claims_citing_test.npy")
title_abs_nonciting_embedding = np.load("embeddings/title_abstract_nonciting.npy")
title_abs_citing_test_embedding = np.load("embeddings/title_abstract_citing_test.npy")

### Scores on the Test Set

In [None]:
score_title = cosine_similarity(tfidf_citing_title_test, tfidf_nonciting_title)
score_abstract = cosine_similarity(title_abs_citing_test_embedding, title_abs_nonciting_embedding)
score_claim1 = cosine_similarity(tfidf_citing_claim1_test, tfidf_nonciting_claim12)
score_claims = cosine_similarity(emb_citing_claims_test, emb_nonciting_claims2)
score_desc = cosine_similarity(emb_citing_desc_test, emb_nonciting_desc)

In [None]:
final_score = (
    best_weights[0] * score_title +
    best_weights[1] * score_abstract +
    best_weights[2] * score_claim1 +
    best_weights[3] * score_claims +
    best_weights[4] * score_desc
)

### Retrieval

In [None]:
top_k = top_k_ranks(title_citing_test, title_nonciting, final_score, k=100)

with open("prediction1.json", "w") as f:
    json.dump(top_k, f)