In [ ]:
# Create data directory if it doesn't exist
!mkdir -p /mnt/local/ii/retriever/patents/data

# Download patent files to data directory
!wget -P /mnt/local/ii/retriever/patents/data https://s3.amazonaws.com/data.patentsview.org/download/g_patent.tsv.zip
!wget -P /mnt/local/ii/retriever/patents/data https://s3.amazonaws.com/data.patentsview.org/download/g_patent_abstract.tsv.zip

# Decompress the downloaded files
!unzip -o /mnt/local/ii/retriever/patents/data/g_patent.tsv.zip -d /mnt/local/ii/retriever/patents/data
!unzip -o /mnt/local/ii/retriever/patents/data/g_patent_abstract.tsv.zip -d /mnt/local/ii/retriever/patents/data

In [ ]:
import pandas as pd

# Load the tab-separated tables into pandas dataframes.
# patent_id is an identifier rather than a quantity, so read it as text in both
# tables instead of letting pandas infer a numeric or mixed-type column for it.
patents = pd.read_csv('/mnt/local/ii/retriever/patents/data/g_patent.tsv', sep='\t', dtype={'patent_id': str})
abstracts = pd.read_csv('/mnt/local/ii/retriever/patents/data/g_patent_abstract.tsv', sep='\t', dtype={'patent_id': str})

In [3]:
from transformers import AutoTokenizer

# Load the fast tokenizer for the SPECTER2 base checkpoint.
# trust_remote_code is deliberately not enabled: it lets code shipped in the Hub
# repository run locally, and this checkpoint is expected to load with the stock
# tokenizer classes (NOTE(review): re-enable only if loading fails without it).
tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base", use_fast=True)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Attach each abstract to its patent by patent_id. Assigning the abstract column
# directly would pair rows by dataframe index, which is only correct if both files
# list exactly the same patents in exactly the same order.
abstract_by_id = (
    abstracts[['patent_id', 'patent_abstract']]
    .dropna(subset=['patent_id', 'patent_abstract'])
    # Cast the key to str on both sides so the join never compares int with str.
    .assign(patent_id=lambda df: df.patent_id.astype(str))
    # One abstract per patent keeps the join from multiplying patent rows.
    .drop_duplicates(subset='patent_id')
)

# The inner join keeps only patents that have a non-null abstract, matching the
# previous notna() filter. Any patent_abstract column already on `patents` is
# dropped first so the join neither produces _x/_y suffixes nor breaks when the
# cell is re-run.
patents = (
    patents
    .drop(columns=['patent_abstract'], errors='ignore')
    .assign(patent_id=lambda df: df.patent_id.astype(str))
    .merge(abstract_by_id, on='patent_id', how='inner', validate='many_to_one')
)

# Model input text: title and abstract joined by the tokenizer's separator token.
# Rows with a missing title end up with a null `content`.
patents['content'] = patents.patent_title + tokenizer.sep_token + patents.patent_abstract

In [7]:
save_path = '/mnt/local/ii/retriever/patents'

# Write a 10% sample for quick experiments plus the full table.
# The fixed random_state makes the sample identical on every run, so results
# produced from patents_sample.parquet stay comparable across re-executions.
patents.sample(frac=0.1, random_state=42).to_parquet(f'{save_path}/patents_sample.parquet')
patents.to_parquet(f'{save_path}/patents_all.parquet')

In [ ]:
import datasets

# Read the sampled parquet file back through the generic parquet builder and show
# the resulting `train` split as the cell output.
datasets.load_dataset(
    path='parquet',
    split='train',
    data_files='/mnt/local/ii/retriever/patents/patents_sample.parquet',
    num_proc=4,
)