In [1]:
import epo_ops
import os
import json
from pathlib import Path
import math
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm, trange
import datetime

middlewares = [
    #epo_ops.middlewares.Dogpile(), #No dogpile support on windows
    epo_ops.middlewares.Throttler(),
]

In [2]:
# The api keys should be in a json-file in the project root, with two attributes: "key" and "secret" for example:
# { "key": "YOUR_APPLICATION_KEY", "secret": "YOUR_APPLICATION_SECRET"}
api_keys_path = Path('..') / 'api_keys.json'

In [3]:
with open(api_keys_path, 'r') as fp:
    api_keys = json.load(fp)

In [4]:
from importlib import reload
reload(epo_ops)
client = epo_ops.Client(
    key=api_keys['key'],
    secret=api_keys['secret'],
    middlewares=middlewares,
    accept_type='json'
)

In [40]:
a = datetime.datetime(year=2014, month=3, day=1)
b = datetime.datetime(year=2015, month=1, day=1)
dt = b-a 
print(a+dt/2)
print(b-dt/2)
a.strftime('%Y%m%d')

2014-08-01 00:00:00
2014-08-01 00:00:00


'20140301'

In [17]:
req = client.published_data_search('pn="EP" and pd="20160106 20160107"', range_begin=1, range_end=2)

In [14]:
results = json.loads(req.content)
results

{'ops:world-patent-data': {'@xmlns': {'ops': 'http://ops.epo.org',
   '$': 'http://www.epo.org/exchange',
   'xlink': 'http://www.w3.org/1999/xlink'},
  'ops:biblio-search': {'@total-result-count': '2747',
   'ops:query': {'$': 'pn = EP and pd = "20160106 20160107"',
    '@syntax': 'CQL'},
   'ops:range': {'@begin': '1', '@end': '2'},
   'ops:search-result': {'ops:publication-reference': [{'@system': 'ops.epo.org',
      '@family-id': '54323459',
      'document-id': {'@document-id-type': 'docdb',
       'country': {'$': 'EP'},
       'doc-number': {'$': '2963856'},
       'kind': {'$': 'A1'}}},
     {'@system': 'ops.epo.org',
      '@family-id': '51040005',
      'document-id': {'@document-id-type': 'docdb',
       'country': {'$': 'EP'},
       'doc-number': {'$': '2963546'},
       'kind': {'$': 'A1'}}}]}}}}

In [6]:
results = json.loads(req.content)
docs = results['ops:world-patent-data']['ops:biblio-search']['ops:search-result']['ops:publication-reference']
document_ids = []
for doc in docs:
    document_id = doc['document-id']
    country = document_id['country']['$']
    doc_number = document_id['doc-number']['$']
    kind_code = document_id['kind']['$']
    doc_str = f'{country}{doc_number}.{kind_code}'
    document_ids.append(doc_str)
document_ids

['EP3903806.A1', 'EP3903807.A1']

In [65]:
def determine_yearly_range(client, cql, year_range):
    '''return a sequence of year ranges where each range will return less than 2000 patents'''
    begin_year, end_year = year_range
    year_instantiated_cql = cql.format(begin_year=begin_year, end_year=end_year)
    print(year_instantiated_cql)
    req = client.published_data_search(year_instantiated_cql, range_begin=1, range_end=2)  # We limit the range to limit how much date we request
    query_response = json.loads(req.content)
    total_count = int(query_response['ops:world-patent-data']['ops:biblio-search']['@total-result-count'])
    if total_count < 2000:
        return (year_range,)
    else:
        year_range = int((end_year - begin_year)/2)
        first_range = (begin_year, begin_year+year_range)
        second_range = (begin_year+year_range + 1, end_year)   # +1 since the ranges are inclusive (I think?)
        return determine_yearly_range(client, cql, first_range) + determine_yearly_range(client, cql, second_range)


def extract_patents(query_response):
    docs = query_response['ops:world-patent-data']['ops:biblio-search']['ops:search-result']['ops:publication-reference']
    document_ids = []
    for doc in docs:
        document_id = doc['document-id']
        country = document_id['country']['$']
        doc_number = document_id['doc-number']['$']
        kind_code = document_id['kind']['$']
        doc_str = f'{country}{doc_number}.{kind_code}'
        document_ids.append(doc_str)
    return document_ids


def get_class_patents(client, cql, year_range):
    patents = []
    
    begin_year, end_year = year_range
    year_instantiated_cql = cql.format(begin_year=begin_year, end_year=end_year)
    req = client.published_data_search(year_instantiated_cql, range_begin=1, range_end=100)  # We limit the range to limit how much date we request
    query_response = json.loads(req.content)
    patents.extend(extract_patents(query_response))

    total_count = int(query_response['ops:world-patent-data']['ops:biblio-search']['@total-result-count'])
    n_requests = int(math.ceil(total_count / 100))
    for i in trange(1, n_requests, desc="Retriving documents"):
        start_range = i*100+1
        end_range = (i+1)*100
        req = client.published_data_search(year_instantiated_cql, range_begin=start_range, range_end=end_range)  # We limit the range to limit how much date we request
        query_response = json.loads(req.content)
        patents.extend(extract_patents(query_response))
    return patents


def search_patents_in_classes(ipc_classes, client, output_dir: Path, year_range=(1950,2021), overwrite=False):
    '''Search for patents belonging to the given classes'''
    for ipc_class in ipc_classes:
        output_path = output_dir / f'{ipc_class}_{year_range[0]}-{year_range[1]}.csv'
        if not output_path.exists() or overwrite:
            # We're going to do a gradual divide and conquer until the hit results are below 2000 since that is the maximum
            cql = f'ipc={ipc_class}' + ' and pn=EP and pd="{begin_year} {end_year}"'
            valid_year_ranges = determine_yearly_range(client, cql, year_range)
            documents = []
            for valid_year_range in valid_year_ranges:
                documents.extend(get_class_patents(client, cql, valid_year_range))
            with open(output_path, 'w') as fp:
                fp.write('\n'.join(documents))


In [67]:
search_patents_in_classes(['C08L69'], client, output_dir=Path(), year_range=(2015, 2020), overwrite=True)

ipc=C08L69 and pn=EP and pd="2015 2020"


Retriving documents:   0%|          | 0/7 [00:00<?, ?it/s]

In [30]:
cql = 'ipc={patent_class} and pn=EP and pd="{begin_year} {end_year}"'
determine_yearly_range(client, cql, (1950, 2021))

KeyError: 'patent_class'

In [26]:
cql.format(begin_year=0, end_year=1)

'ipc=C08L69 and pn=EP and pd="0 1"'