In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import os
import requests
from requests.auth import HTTPBasicAuth
import time
from google.colab import files

# 1. Configure API to EPO OPS

Registro en OPS para la API: [OPS](https://www.epo.org/en/searching-for-patents/data/web-services/ops)

Documentacion de la API: [OPS v1.3.19](https://link.epo.org/web/ops_v3.2_documentation_-_version_1.3.19_en.pdf)

In [2]:
client_id = 'i6AxJ1MYJ43N3SkC5CeRpujtzndUNHzRk72kulUA04n4wh8M'
client_secret = 'SFmlAli4KOlr1xPNygxRfgXNcfUmDzrtjpAkfv10Hkf6R1cEQsmXCBZjXV5zyhaQ'

In [3]:
# Authentication URL for the EPO API
auth_url = 'https://ops.epo.org/3.2/auth/accesstoken'

# Send a POST request to get the access token
auth_response = requests.post(auth_url,
                              auth=HTTPBasicAuth(client_id, client_secret),
                              headers={'Content-Type': 'application/x-www-form-urlencoded'},
                              data={'grant_type': 'client_credentials'})

# Parse the JSON response
auth_data = auth_response.json()

# Extract the access token from the response
access_token = auth_data['access_token']

# URL for the EPO API search endpoint
search_url = 'https://ops.epo.org/3.2/rest-services/published-data/search/biblio'

def get_patent_data(query_cql, start, end):
    """
    Function to retrieve patent data from the EPO API.

    :param query_cql: The CQL query string for searching patents
    :param start: The start index of the range of results to retrieve
    :param end: The end index of the range of results to retrieve
    :return: JSON response with patent data or None if there's an error
    """
    # Set up the headers for the API request
    headers = {
        'Authorization': f'Bearer {access_token}',
        'Accept': 'application/json',
        'Range': f'{start}-{end}'  # Specify the range of results to retrieve
    }

    # Set up the query parameters
    params = {
        'q': query_cql
    }

    # Send a GET request to the EPO API
    response = requests.get(search_url, headers=headers, params=params)

    # Check if the request was successful
    if response.status_code == 200:
        return response.json()
    else:
        print(f'Error: {response.status_code} - {response.text}')
        return None

# Example CQL query to search for patents in the A61K IPC class
query_cql = 'ipc=/low A61K'

# Maximum number of results allowed per request by the API
batch_size = 100

data = get_patent_data(query_cql, start=1, end=3)

data

{'ops:world-patent-data': {'@xmlns': {'ops': 'http://ops.epo.org',
   '$': 'http://www.epo.org/exchange',
   'xlink': 'http://www.w3.org/1999/xlink'},
  'ops:biblio-search': {'@total-result-count': '10000',
   'ops:query': {'$': 'ipc =/low A61K', '@syntax': 'CQL'},
   'ops:range': {'@begin': '1', '@end': '3'},
   'ops:search-result': {'exchange-documents': [{'exchange-document': {'@system': 'ops.epo.org',
       '@family-id': '89897625',
       '@country': 'JP',
       '@doc-number': '2024120161',
       '@kind': 'A',
       'bibliographic-data': {'publication-reference': {'document-id': [{'@document-id-type': 'docdb',
           'country': {'$': 'JP'},
           'doc-number': {'$': '2024120161'},
           'kind': {'$': 'A'},
           'date': {'$': '20240904'}},
          {'@document-id-type': 'epodoc',
           'doc-number': {'$': 'JP2024120161'},
           'date': {'$': '20240904'}}]},
        'classifications-ipcr': {'classification-ipcr': [{'@sequence': '1',
           'tex

In [4]:
docs = data['ops:world-patent-data']['ops:biblio-search'][ 'ops:search-result']['exchange-documents']
[
    {
        'family' : doc['exchange-document']['@family-id'],
        'publication_number' : doc['exchange-document']['bibliographic-data']['publication-reference']['document-id'][0]['doc-number']['$'],
        'application_number' : doc['exchange-document']['bibliographic-data']['application-reference']['document-id'][0]['doc-number']['$']
    } for doc in docs
]

[{'family': '89897625',
  'publication_number': '2024120161',
  'application_number': '2024021608'},
 {'family': '85201060',
  'publication_number': '2024532015',
  'application_number': '2024508325'},
 {'family': '83004729',
  'publication_number': '2024532010',
  'application_number': '2024504967'}]

In [5]:
docs[0]

{'exchange-document': {'@system': 'ops.epo.org',
  '@family-id': '89897625',
  '@country': 'JP',
  '@doc-number': '2024120161',
  '@kind': 'A',
  'bibliographic-data': {'publication-reference': {'document-id': [{'@document-id-type': 'docdb',
      'country': {'$': 'JP'},
      'doc-number': {'$': '2024120161'},
      'kind': {'$': 'A'},
      'date': {'$': '20240904'}},
     {'@document-id-type': 'epodoc',
      'doc-number': {'$': 'JP2024120161'},
      'date': {'$': '20240904'}}]},
   'classifications-ipcr': {'classification-ipcr': [{'@sequence': '1',
      'text': {'$': 'A61K  31/   565            A I'}},
     {'@sequence': '2', 'text': {'$': 'A61K  31/   585            A I'}},
     {'@sequence': '3', 'text': {'$': 'A61P   5/    30            A I'}},
     {'@sequence': '4', 'text': {'$': 'A61P   5/    34            A I'}},
     {'@sequence': '5', 'text': {'$': 'A61P  43/    00            A I'}}]},
   'patent-classifications': {'patent-classification': [{'@sequence': '1',
      'clas

#### Loop to retrieve all files

In [6]:
start = 1
doc_list = []
while True:

    end = start + batch_size - 1
    data = get_patent_data(query_cql, start, end)
    if not data:
        break
    docs = data['ops:world-patent-data']['ops:biblio-search'][ 'ops:search-result']['exchange-documents']
    doc_list = doc_list + [
            {
            'family' : doc['exchange-document']['@family-id'],
            'publication_number' : doc['exchange-document']['bibliographic-data']['publication-reference']['document-id'][0]['doc-number']['$'],
            'application_number' : doc['exchange-document']['bibliographic-data']['application-reference']['document-id'][0]['doc-number']['$']
            } for doc in docs
    ]
    time.sleep(3) # wait few seconds not to get blocked
    start = end

Error: 400 - <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<fault xmlns="http://ops.epo.org">
    <code>CLIENT.InvalidQuery</code>
    <message>The request was invalid</message>
</fault>



In [7]:
len(doc_list)

2000

In [8]:
end

2080

In [9]:
pd.DataFrame(doc_list).drop_duplicates(ignore_index=True).shape

(1981, 3)

In [10]:
df = pd.DataFrame(doc_list).drop_duplicates(ignore_index=True)
df.to_csv('Patents_publication_numbers.csv', index=False)

files.download('Patents_publication_numbers.csv') #Download from Google Colab

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Para sacar los datos hicimos este paso para todas las IPC que se nos dieron en un principio:<br>
C07D, A61P, A61K, C07B, C07C, C07K, C07F, C07H, C12N, G01N, B12N (biosimilars), for industry comparison: telecommunication (Apple, Samsung): G06F, H04W, H01L <br>

Depues exportamos cada uno y los juntamos todos a mano.