In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

elizavetasirotina_ir_challenge_path = kagglehub.dataset_download('elizavetasirotina/ir-challenge')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/elizavetasirotina/ir-challenge?dataset_version_number=1...


100%|██████████| 303M/303M [00:05<00:00, 61.9MB/s]

Extracting files...





Data source import complete.


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import json

from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt

import numpy as np
from tqdm.auto import tqdm
import itertools

# 0.1 Helper functions

## load_json_data

In [None]:
def load_json_data(file_path):
    with open(file_path, "r") as file:
        contents = json.load(file)
    return contents

## get_mapping_dict

In [None]:
def get_mapping_dict(mapping_df):
    """
    Creates dictionary of citing ids to non-citing id based on given dataframe (which is based on providedjson)

    Parameters:
    mapping_df (DataFrame): DataFrame containing mapping between citing and cited patents
    Returns:
    dict: dictionary of unique citing patent ids to list of cited patent ids
    """
    mapping_dict = {}

    for _, row in mapping_df.iterrows():
        key = row[0]  # Value from column 0
        value = row[2]  # Value from column 2
        if key in mapping_dict:
            mapping_dict[key].append(value)
        else:
            mapping_dict[key] = [value]

    return mapping_dict

## create_corpus

In [None]:
def create_corpus(corpus, text_type):
    """
    Extracts text data from a corpus based on the specified text type.

    Parameters:
    corpus (list): List of dictionaries representing patent documents.
    text_type (str): Type of text to extract ('title', 'abstract', 'claim1', 'claims', 'description', 'fulltext').

    Returns:
    list: List of dictionaries with 'id' and 'text' keys representing each document in the corpus.
    """

    app_ids = [doc['Application_Number'] + doc['Application_Category'] for doc in corpus]

    cnt = 0 # count the number of documents without text
    texts = []  # list of texts
    ids_to_remove = []  # list of ids of documents without text, to remove them from the corpus

    if text_type == 'title':
        for doc in corpus:
            try:
                texts.append(doc['Content']['title'])
            except: # if the document does not have a title
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without title: {cnt}")

    elif text_type == 'abstract':
        for doc in corpus:
            try:
                texts.append(doc['Content']['pa01'])
            except: # if the document does not have an abstract
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without abstract: {cnt}")

    elif text_type == 'claim1':
        for doc in corpus:
            try:
                texts.append(doc['Content']['c-en-0001'])
            except: # if the document does not have claim 1
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
        print(f"Number of documents without claim 1: {cnt}")

    elif text_type == 'claims':
        # all the values with the key starting with 'c-en-', each element in the final list is a list of claims
        for doc in corpus:
            doc_claims = []
            for key in doc['Content'].keys():
                if key.startswith('c-en-'):
                    doc_claims.append(doc['Content'][key])
            if len(doc_claims) == 0:    # if the document does not have any claims
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_text)
                texts.append(doc_text_string)
        print(f"Number of documents without claims: {cnt}")

    elif text_type == 'description':
        # all the values with the key starting with 'p'
        for doc in corpus:
            doc_text = []
            for key in doc['Content'].keys():
                if key.startswith('p'):
                    doc_text.append(doc['Content'][key])
            if len(doc_text) == 0:  # if the document does not have any description
                ids_to_remove.append(doc['Application_Number']+doc['Application_Category'])
                cnt += 1
            else:
                doc_text_string = ' '.join(doc_text)
                texts.append(doc_text_string)
        print(f"Number of documents without description: {cnt}")

    elif text_type == 'fulltext':
        for doc in corpus:
            doc_text = list(doc['Content'].values())
            doc_text_string = ' '.join(doc_text)
            texts.append(doc_text_string)
        if cnt > 0:
            print(f"Number of documents without any text: {cnt}")

    else:
        raise ValueError("Invalid text type")

    if len(ids_to_remove) > 0:
        print(f"Removing {len(ids_to_remove)} documents without required text")
        for id_ in ids_to_remove[::-1]:
            idx = app_ids.index(id_)
            del app_ids[idx]

    # Create a list of dictionaries with app_ids and texts
    corpus_data = [{'id': app_id, 'text': text} for app_id, text in zip(app_ids, texts)]

    return corpus_data


## get_true_and_predicted

In [None]:
def get_true_and_predicted(citing_to_cited_dict, recommendations_dict):
    """
    Get the true and predicted labels for the metrics calculation.

    Parameters:
    citing_to_cited_dict : dict of str : list of str
        Mapping between citing patents and the list of their cited patents
    recommendations_dict : dict of str : list of str
        Mapping between citing patents and the sorted list of recommended patents

    Returns:
    list of list
        True relevant items for each recommendation list.
    list of list
        Predicted recommended items for each recommendation list.
    int
        Number of patents not in the citation mapping
    """
    # Initialize lists to store true labels and predicted labels
    true_labels = []
    predicted_labels = []
    not_in_citation_mapping = 0

    # Iterate over the items in both dictionaries
    for citing_id in recommendations_dict.keys():
        # Check if the citing_id is present in both dictionaries
        if citing_id in citing_to_cited_dict:
            # If yes, append the recommended items from both dictionaries to the respective lists
            true_labels.append(citing_to_cited_dict[citing_id])
            predicted_labels.append(recommendations_dict[citing_id])
        else:
            not_in_citation_mapping += 1

    return true_labels, predicted_labels, not_in_citation_mapping


## mean_recall_at_k

In [None]:
def mean_recall_at_k(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Recall@k for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Recall@k value.
    """
    recalls_at_k = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Recall@k for each recommendation list
        true_set = set(true)
        k = min(k, len(pred))
        relevant_count = sum(1 for item in pred[:k] if item in true_set)
        recalls_at_k.append(relevant_count / len(true_set))

    # Calculate the mean Recall@k
    mean_recall = sum(recalls_at_k) / len(recalls_at_k)

    return mean_recall

## mean_inv_ranking

In [None]:
def mean_inv_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean inverse rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean inverse rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the inverse rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = 1 / (pred.index(item) + 1)
            except ValueError:
                rank = 0  # If item not found, assign 0
            ranks.append(rank)

        # Calculate the mean inverse rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean inverse ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks

## mean_ranking

In [None]:
def mean_ranking(true_labels, predicted_labels):
    """
    Calculate the mean of lists of the mean rank of true relevant items
    in the lists of sorted recommended items.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.

    Returns:
    float
        Mean of lists of the mean rank of true relevant items.
    """
    mean_ranks = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate the rank of true relevant items
        # in the recommendation list
        ranks = []
        for item in true:
            try:
                rank = pred.index(item) + 1
            except ValueError:
                rank = len(pred)  # If item not found, assign the length of the list
            ranks.append(rank)

        # Calculate the mean rank of true relevant items
        # in the recommendation list
        mean_rank = sum(ranks) / len(ranks)
        mean_ranks.append(mean_rank)

    # Calculate the mean of the mean ranks across all recommendation lists
    mean_of_mean_ranks = sum(mean_ranks) / len(mean_ranks)

    return mean_of_mean_ranks

## mean_average_precision

In [None]:

def mean_average_precision(true_labels, predicted_labels, k=10):
    """
    Calculate the mean Average Precision for a list of recommendations.

    Parameters:
    true_labels : list of list
        True relevant items for each recommendation list.
    predicted_labels : list of list
        Predicted recommended items for each recommendation list.
    k : int
        Number of recommendations to consider.

    Returns:
    float
        Mean Average Precision value.
    """
    average_precisions = []

    for true, pred in zip(true_labels, predicted_labels):
        # Calculate Average Precision for each recommendation list
        true_set = set(true)
        precision_at_k = []
        relevant_count = 0
        for i, item in enumerate(pred[:k]):
            if item in true_set:
                relevant_count += 1
                precision_at_k.append(relevant_count / (i + 1))
        average_precision = sum(precision_at_k) / len(true_set)
        average_precisions.append(average_precision)

    # Calculate the mean Average Precision
    mean_average_precision = sum(average_precisions) / len(average_precisions)

    return mean_average_precision


## top_k_ranks

In [None]:
def top_k_ranks(citing, cited, cosine_similarities, k=10):
    # Create a dictionary to store the top k ranks for each citing patent
    top_k_ranks = {}
    for i, content_id in enumerate(citing):
        top_k_ranks[content_id['id']] = [cited[j]['id'] for j in np.argsort(cosine_similarities[i])[::-1][:k]]
    return top_k_ranks

# 1.0 Load Datasets

In [None]:
json_citing_train = load_json_data(elizavetasirotina_ir_challenge_path+"/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TRAIN.json")
json_citing_test = load_json_data(elizavetasirotina_ir_challenge_path+"/Content_JSONs/Citing_2020_Cleaned_Content_12k/Citing_Train_Test/citing_TEST.json")

json_nonciting = load_json_data(elizavetasirotina_ir_challenge_path+"/Content_JSONs/Cited_2020_Uncited_2010-2019_Cleaned_Content_22k/CLEANED_CONTENT_DATASET_cited_patents_by_2020_uncited_2010-2019.json")
json_citing_to_cited = load_json_data(elizavetasirotina_ir_challenge_path+"/Citation_JSONs/Citation_Train.json") # Citing ids are unique

In [None]:
citing_dataset_df = pd.DataFrame(json_citing_train)

nonciting_dataset_df = pd.DataFrame(json_nonciting)
mapping_dataset_df = pd.DataFrame(json_citing_to_cited)

In [None]:
citing_dataset_df.head()

Unnamed: 0,Application_Number,Application_Date,Application_Category,Content
0,3650293,2020-05-13,A1,{'title': 'DEVICE FOR CONTROLLING THE BRAKING ...
1,3694265,2020-08-12,A1,{'title': 'TIME DIVISION MULTIPLEXING OF SYNCH...
2,3623522,2020-03-18,A1,{'title': 'LAUNDRY MACHINE AND CONTROL METHOD ...
3,3611147,2020-02-19,A1,{'title': 'CHEMICAL AND TOPOLOGICAL SURFACE MO...
4,3640454,2020-04-22,A1,"{'title': 'FUEL METERING SYSTEM', 'pa01': 'The..."


In [None]:
nonciting_dataset_df.head()

Unnamed: 0,Application_Number,Application_Date,Application_Category,Content
0,2019772,2013-08-28,B1,"{'title': 'VEHICLE WITH LOCKABLE TILT SYSTEM',..."
1,2136094,2013-05-22,B1,"{'title': 'UNIVERSAL JOINT', 'c-en-0001': 'A u..."
2,2340701,2011-07-06,A2,"{'title': 'Apparatus, method and computer prog..."
3,2338661,2011-06-29,A1,{'title': 'Method and apparatus for forming a ...
4,2339144,2011-06-29,A2,{'title': 'Heat exchanger mounting assembly in...


In [None]:
mapping_dataset_df.head()

Unnamed: 0,0,1,2,3,4
0,3712070A1,[c-en-0004],3354576A1,"[p0024, p0027, c-en-0012, c-en-0013]",A
1,3675165A1,"[c-en-0001, c-en-0002, c-en-0003, c-en-0004, c...",3336831A2,"[p0045, p0046, p0047, p0048, p0049, p0050, p00...",A
2,3599626A1,"[c-en-0002, c-en-0003, c-en-0004, c-en-0005, c...",2453448A1,"[p0029, p0030]",A
3,3705201A1,"[c-en-0001, c-en-0002, c-en-0004, c-en-0006, c...",2468433A2,"[p0011, p0012, p0013, p0014, p0015, p0016, p00...",X
4,3628210A1,"[c-en-0001, c-en-0002, c-en-0003, c-en-0004, c...",3369366A1,[pa01],A


In [None]:
mapping_dict = get_mapping_dict(mapping_dataset_df)
# mapping_dict

In [None]:
# Saving mapping dict in the file
with open('mapping_dict.json', 'w') as f:
    json.dump(mapping_dict, f)


In [None]:
#import ast
#nonciting_dataset_df['all_content'] = nonciting_dataset_df['Content'].apply(ast.literal_eval)

## 1.1 Take a look at the content

## 1.2 Create a column with the Title

Note for lateR: Try all text content by uncommenting the below

In [None]:
def extract_title(dictionary):
    return dictionary.get('title', None)  # Return None if 'title' key is not present

nonciting_dataset_df['Title'] = nonciting_dataset_df['Content'].apply(extract_title)
citing_dataset_df['Title'] = citing_dataset_df['Content'].apply(extract_title)

nonciting_dataset_df.head()

Unnamed: 0,Application_Number,Application_Date,Application_Category,Content,Title
0,2019772,2013-08-28,B1,"{'title': 'VEHICLE WITH LOCKABLE TILT SYSTEM',...",VEHICLE WITH LOCKABLE TILT SYSTEM
1,2136094,2013-05-22,B1,"{'title': 'UNIVERSAL JOINT', 'c-en-0001': 'A u...",UNIVERSAL JOINT
2,2340701,2011-07-06,A2,"{'title': 'Apparatus, method and computer prog...","Apparatus, method and computer program product..."
3,2338661,2011-06-29,A1,{'title': 'Method and apparatus for forming a ...,Method and apparatus for forming a rubber article
4,2339144,2011-06-29,A2,{'title': 'Heat exchanger mounting assembly in...,Heat exchanger mounting assembly in a gas turbine



# 2.0 sBert

In [None]:
import torch

# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available, using CPU.")

Using CUDA device: Tesla T4


In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## 2.1 'Title'

In [None]:
# # Load the title of the citing and cited patents, create_corpus creates a dictionary of 'id' : 'title' for each patent.
title_citing_train = create_corpus(json_citing_train , 'title')
title_citing_test = create_corpus(json_citing_test , 'title')

title_nonciting = create_corpus(json_nonciting, 'title')
title_citing_train[:5]

Number of documents without title: 0
Number of documents without title: 0
Number of documents without title: 0


[{'id': '3650293A1',
  'text': 'DEVICE FOR CONTROLLING THE BRAKING OF A TRAILER'},
 {'id': '3694265A1',
  'text': 'TIME DIVISION MULTIPLEXING OF SYNCHRONIZATION CHANNELS'},
 {'id': '3623522A1', 'text': 'LAUNDRY MACHINE AND CONTROL METHOD OF THE SAME'},
 {'id': '3611147A1',
  'text': 'CHEMICAL AND TOPOLOGICAL SURFACE MODIFICATION TO ENHANCE COATING ADHESION AND COMPATIBILITY'},
 {'id': '3640454A1', 'text': 'FUEL METERING SYSTEM'}]

### 2.1.1 Title - embeddings

In [None]:
title_citing_train_titles = [doc['text'] for doc in title_citing_train]
citing_embeddings = model.encode(title_citing_train_titles, convert_to_tensor=True, device=device)

In [None]:
title_nonciting_titles = [doc['text'] for doc in title_nonciting]
nonciting_embeddings = model.encode(title_nonciting_titles, convert_to_tensor=True, device=device)

### 2.1.2 Get Cosine similarities

In [None]:
cosine_similarities = linear_kernel(citing_embeddings.cpu(), nonciting_embeddings.cpu())
cosine_similarities

array([[ 0.36911705,  0.13136497,  0.31678134, ...,  0.23598102,
         0.10692442,  0.03131937],
       [ 0.02560851,  0.05637474,  0.06779245, ..., -0.04658467,
         0.1480076 , -0.0103356 ],
       [ 0.12454381,  0.15283757,  0.35839814, ...,  0.1846452 ,
         0.00318378,  0.2415548 ],
       ...,
       [ 0.21950434,  0.03933778,  0.21481168, ...,  0.15680458,
         0.11638679,  0.21677847],
       [ 0.11442984,  0.14409205,  0.26019055, ...,  0.05855274,
         0.15397629,  0.12991686],
       [ 0.02073644,  0.02390994,  0.3105056 , ...,  0.16654235,
         0.12852314,  0.29862452]], dtype=float32)

### 2.1.3 Get Metrics

In [None]:
# Get the top k ranks for each citing patent
k = 100
top_k_rank = top_k_ranks(title_citing_train, title_nonciting, cosine_similarities, k=k)

# Calculate the metrics
true_labels, predicted_labels, not_in_citation_mapping = \
    get_true_and_predicted(mapping_dict, top_k_rank)
mean_rank = mean_ranking(true_labels, predicted_labels)
mean_average_precision_val = mean_average_precision(true_labels, predicted_labels)

recall_at_10_titles_word2vec = mean_recall_at_k(true_labels, predicted_labels, k=10)
recall_at_20_titles_word2vec = mean_recall_at_k(true_labels, predicted_labels, k=20)
recall_at_50_titles_word2vec = mean_recall_at_k(true_labels, predicted_labels, k=50)
recall_at_100_titles_word2vec= mean_recall_at_k(true_labels, predicted_labels, k=100)

print("Recall at 10:", round(recall_at_10_titles_word2vec, 4))
print("Recall at 20:", round(recall_at_20_titles_word2vec, 4))
print("Recall at 50:", round(recall_at_50_titles_word2vec, 4))
print("Recall at 100:", round(recall_at_100_titles_word2vec, 4))
print("Mean ranking:", round(mean_rank, 4))
print("Mean average precision:", round(mean_average_precision_val, 4))
print("Number of patents measured:", len(predicted_labels))
print("Number of patents not in the citation:", not_in_citation_mapping)

Recall at 10: 0.3326
Recall at 20: 0.4067
Recall at 50: 0.511
Recall at 100: 0.5937
Mean ranking: 52.2816
Mean average precision: 0.198
Number of patents measured: 6831
Number of patents not in the citation: 0


## Full Text Word2Vec

In [None]:
# # prompt: how to run sbert on gpu?

# import torch

# # Check if CUDA is available
# if torch.cuda.is_available():
#     device = torch.device("cuda")
#     print("Using CUDA device:", torch.cuda.get_device_name(0))
# else:
#     device = torch.device("cpu")
#     print("CUDA is not available, using CPU.")

# # Move the model to the GPU
# model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# # Example usage with GPU:
# title_citing_train_titles = [doc['text'] for doc in title_citing_train]
# citing_embeddings = model.encode(title_citing_train_titles, convert_to_tensor=True, device=device)

# title_nonciting_titles = [doc['text'] for doc in title_nonciting]
# nonciting_embeddings = model.encode(title_nonciting_titles, convert_to_tensor=True, device=device)

# # ... rest of your code ...
# cosine_similarities = linear_kernel(citing_embeddings.cpu(), nonciting_embeddings.cpu()) #compute on CPU


## Embeddings

In [None]:
full_text_citing_text = [doc['text'] for doc in full_text_citing]
fulltext_citing_embeddings = model.encode(full_text_citing_text, convert_to_tensor=True, device=device)

In [None]:
np.save('fulltext_citing_embeddings.npy', fulltext_citing_embeddings.cpu().numpy())

In [None]:
full_text_nonciting_text = [doc['text'] for doc in full_text_nonciting]
fulltext_nonciting_embeddings = model.encode(full_text_nonciting_text, convert_to_tensor=True, device=device)

In [None]:
np.save('fulltext_nonciting_embeddings.npy', fulltext_nonciting_embeddings.cpu().numpy())

In [None]:
cosine_similarities_fulltext = linear_kernel(fulltext_citing_embeddings.cpu(), fulltext_nonciting_embeddings.cpu()) # Remove cpu() if using data from the file

AttributeError: 'numpy.ndarray' object has no attribute 'cpu'

## Evaluation

In [None]:
# Get top k ranks
k = 100
top_k_rank_fulltext = top_k_ranks(full_text_citing, full_text_nonciting, cosine_similarities_fulltext, k=k)

# Calculate metrics
true_labels, predicted_labels, not_in_citation_mapping = get_true_and_predicted(mapping_dict, top_k_rank_fulltext)

mean_rank_fulltext = mean_ranking(true_labels, predicted_labels)
mean_average_precision_fulltext = mean_average_precision(true_labels, predicted_labels)

recall_at_10_fulltext = mean_recall_at_k(true_labels, predicted_labels, k=10)
recall_at_20_fulltext = mean_recall_at_k(true_labels, predicted_labels, k=20)
recall_at_50_fulltext = mean_recall_at_k(true_labels, predicted_labels, k=50)
recall_at_100_fulltext = mean_recall_at_k(true_labels, predicted_labels, k=100)


print("Recall at 10:", round(recall_at_10_fulltext, 4))
print("Recall at 20:", round(recall_at_20_fulltext, 4))
print("Recall at 50:", round(recall_at_50_fulltext, 4))
print("Recall at 100:", round(recall_at_100_fulltext, 4))

print("Mean ranking:", round(mean_rank_fulltext, 4))
print("Mean average precision:", round(mean_average_precision_fulltext, 4))
print("Number of patents measured:", len(predicted_labels))
print("Number of patents not in the citation:", not_in_citation_mapping)


Recall at 10: 0.5642
Recall at 20: 0.6555
Recall at 50: 0.7647
Recall at 100: 0.8314
Mean ranking: 28.2307
Mean average precision: 0.3566
Number of patents measured: 6831
Number of patents not in the citation: 0


# Re-ranking

In [None]:
cosine_similarities = linear_kernel(fulltext_citing_embeddings, fulltext_nonciting_embeddings)

In [None]:
cosine_similarities.shape

(6831, 16837)

In [1]:
fulltext_citing_embeddings.shape


NameError: name 'fulltext_citing_embeddings' is not defined

In [None]:
fulltext_nonciting_embeddings.shape

(16837, 384)

In [None]:
fulltext_citing_embeddings = np.load('fulltext_citing_embeddings.npy')
fulltext_nonciting_embeddings = np.load('fulltext_nonciting_embeddings.npy')

cosine_similarities = linear_kernel(fulltext_citing_embeddings, fulltext_nonciting_embeddings)

In [None]:
def prepare_training_data(citing_ids, sim_matrix, gold_mapping, top_k_candidates=100):
    nonciting_ids = [doc['id'] for doc in full_text_nonciting]

    X = []
    y = []
    for idx, pid in enumerate(citing_ids):
        if pid not in gold_mapping:
            continue
        gold_ids = set(gold_mapping[pid])
        candidate_indices = np.argsort(sim_matrix[idx])[::-1][:top_k_candidates]
        for cand_idx in candidate_indices:
            feature = [sim_matrix[idx, cand_idx]]
            # Use nonciting_ids for comparison
            label = 1 if nonciting_ids[cand_idx] in gold_ids else 0
            X.append(feature)
            y.append(label)
    return np.array(X), np.array(y)

In [2]:
# full_text_citing = create_corpus(json_citing_train , 'fulltext')
citing_ids = [doc['id'] for doc in full_text_citing]  # Get citing patent IDs

X_train, y_train = prepare_training_data(
    citing_ids, cosine_similarities,
    mapping_dict,  # Your gold standard mapping
)

NameError: name 'full_text_citing' is not defined

In [None]:
X_train.shape, y_train.shape

((683100, 1), (683100,))

In [None]:
positive_rate = np.mean(y_train)
positive_rate # 0.010481627872932222

In [None]:
np.save('X_train.npy', X_train)
np.save('y_train.npy', y_train)


In [None]:
X_train = np.load('X_train.npy')
y_train = np.load('y_train.npy')

# Training

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
re_rank_model = LogisticRegression(class_weight='balanced')
re_rank_model.fit(X_train, y_train)

In [None]:
def evaluate_recommendations(gold_mapping, recommendations, k=100):
    true_labels = []
    predicted_labels = []
    for pid, gold in gold_mapping.items():
        if pid in recommendations:
            true_labels.append(gold)
            predicted_labels.append(recommendations[pid])
    recall = mean_recall_at_k(true_labels, predicted_labels, k=k)
    mAP = mean_average_precision(true_labels, predicted_labels, k=k)
    return recall, mAP

def re_rank_candidates(ids, baseline_sim_matrix, re_rank_model, top_k_candidates=100):
    nonciting_ids = [doc['id'] for doc in full_text_nonciting] # Get non-citing patent IDs

    re_ranked = {}
    for idx, pid in enumerate(ids):
        candidate_indices = np.argsort(baseline_sim_matrix[idx])[::-1][:top_k_candidates]
        features = np.array([[baseline_sim_matrix[idx, cand_idx]] for cand_idx in candidate_indices])
        probas = re_rank_model.predict_proba(features)[:, 1]
        sorted_order = np.argsort(probas)[::-1]
        re_ranked_ids = [nonciting_ids[candidate_indices[i]] for i in sorted_order]
        re_ranked[pid] = re_ranked_ids
    return re_ranked

In [None]:
re_ranked_recommendations = re_rank_candidates(citing_ids, cosine_similarities, re_rank_model)

In [None]:
recall_after, mAP_after = evaluate_recommendations(mapping_dict, re_ranked_recommendations, k=100)
print("After re-ranking: Recall@100 =", recall_after, "mAP@100 =", mAP_after) # Recall@100 = 0.8314253647586985 mAP@100 = 0.36926215097770604

После re-ranking: Recall@100 = 0.8314253647586985 mAP@100 = 0.36926215097770604


# Grid search models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import itertools
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [None]:
lr_params = {
    'C': [0.1, 1.0, 10.0]
}
rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10]
}
xgb_params = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [50, 100],
    'max_depth': [3, 5]
}

models = [
    ('lr', LogisticRegression(class_weight='balanced', random_state=42), lr_params),
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42), rf_params),
    ('xgb', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), xgb_params)
]

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_score = -np.inf
best_model = None
best_model_name = None
best_params = None

scoring = 'average_precision'

In [None]:
for model_name, model, param_grid in models:
    keys, values = zip(*param_grid.items())
    for combination in itertools.product(*values):
        params = dict(zip(keys, combination))
        model.set_params(**params)
        scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
        mean_score = np.mean(scores)
        print(f"Model {model_name} with params {params}: average precision = {mean_score:.4f}")
        if mean_score > best_score:
            best_score = mean_score
            best_model = model
            best_model_name = model_name
            best_params = params

Model lr with params {'C': 0.1}: average precision = 0.1083
Model lr with params {'C': 1.0}: average precision = 0.1083
Model lr with params {'C': 10.0}: average precision = 0.1083
Model rf with params {'n_estimators': 50, 'max_depth': 5}: average precision = 0.1051
Model rf with params {'n_estimators': 50, 'max_depth': 10}: average precision = 0.0957
Model rf with params {'n_estimators': 100, 'max_depth': 5}: average precision = 0.1062
Model rf with params {'n_estimators': 100, 'max_depth': 10}: average precision = 0.0960


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model xgb with params {'learning_rate': 0.01, 'n_estimators': 50, 'max_depth': 3}: average precision = 0.0935


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model xgb with params {'learning_rate': 0.01, 'n_estimators': 50, 'max_depth': 5}: average precision = 0.0947


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model xgb with params {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 3}: average precision = 0.0944


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model xgb with params {'learning_rate': 0.01, 'n_estimators': 100, 'max_depth': 5}: average precision = 0.0950


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model xgb with params {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3}: average precision = 0.0956


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model xgb with params {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 5}: average precision = 0.0964


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model xgb with params {'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 3}: average precision = 0.0961


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Model xgb with params {'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 5}: average precision = 0.0965


In [None]:
print(best_model_name) # lr
print(best_params) # {'C': 0.1}
print( best_score) # 0.10834978155896706

## Getting Test Enbeddings and predicitions

In [None]:
full_text_citing = create_corpus(json_citing_test , 'fulltext')
full_text_nonciting = create_corpus(json_nonciting, 'fulltext')

In [None]:
full_text_citing_text = [doc['text'] for doc in full_text_citing]
fulltext_citing_embeddings = model.encode(full_text_citing_text, convert_to_tensor=True, device=device)

In [None]:
full_text_nonciting_text = [doc['text'] for doc in full_text_nonciting]
fulltext_nonciting_embeddings = model.encode(full_text_nonciting_text, convert_to_tensor=True, device=device)

In [None]:
cosine_similarities_fulltext = linear_kernel(fulltext_citing_embeddings.cpu(), fulltext_nonciting_embeddings.cpu())

In [None]:
top_k_rank_fulltext = top_k_ranks(full_text_citing, full_text_nonciting, cosine_similarities_fulltext, k=100)

In [None]:
np.save('full_text_citing_text_embeddings.npy', fulltext_citing_embeddings.cpu().numpy())

In [None]:
np.save('full_text_nonciting_text_embeddings.npy', fulltext_nonciting_embeddings.cpu().numpy())

# Save results

In [None]:
with open('prediction1.json', 'w') as f:
    json.dump(top_k_rank_fulltext, f)