In [1]:
import re
import os
import epo_ops
import json
import xmltodict
import pandas as pd
from pathlib import Path
from importlib import reload

# Middleware instances passed to the OPS client when it is constructed.
# NOTE(review): judging by their names these provide response caching (Dogpile)
# and request throttling (Throttler) -- confirm against the epo_ops documentation.
middlewares = [
    middleware_cls()
    for middleware_cls in (
        epo_ops.middlewares.Dogpile,
        epo_ops.middlewares.Throttler,
    )
]

In [2]:
# Initialisation
# Credentials are read from ../api_keys.json; the entries 'key' and 'secret'
# are forwarded to the OPS client together with the middleware list.
api_keys_path = Path('..') / 'api_keys.json'
with api_keys_path.open('r') as fp:
    api_keys = json.load(fp)

client = epo_ops.Client(key=api_keys['key'], secret=api_keys['secret'], middlewares=middlewares)

# Reload the models module first, then bind the input classes from it.
reload(epo_ops.models)
from epo_ops.models import Epodoc, Docdb, BaseInput

In [3]:
# Load list of applications
# The text file is read without a header row into a single column named "pat".
path = "../../../data"
file_name = "bisfenol_a_ep_applications.txt"
applications_file = Path(path) / file_name
data = pd.read_csv(applications_file, names=["pat"], header=None)
num_patents = len(data.index)  # number of rows, used for the progress display later on
data

Unnamed: 0,pat
0,EP-0000022-A1
1,EP-0000060-A1
2,EP-0000082-A1
3,EP-0000086-A1
4,EP-0000090-A1
...,...
39236,EP-3757183-A1
39237,EP-3757249-A1
39238,EP-3757258-A1
39239,EP-3757625-A1


In [4]:
# Class code extraction
def get_codes(d):
    """Return the "bibliographic-data" node of a parsed biblio response.

    Walks the fixed key path
    "ops:world-patent-data" -> "exchange-documents" -> "exchange-document"
    -> "bibliographic-data" through the nested mapping ``d`` and returns
    whatever is stored there. A missing key raises ``KeyError``.
    """
    node = d
    for key in (
        "ops:world-patent-data",
        "exchange-documents",
        "exchange-document",
        "bibliographic-data",
    ):
        node = node[key]
    return node

def get_ipc(d):
    """Return the IPC texts of a parsed biblio response as a list.

    Reads ``["classification-ipc"]["text"]`` from the bibliographic data.
    Returns an empty list when there is no "classification-ipc" entry; a
    single (non-list) value is wrapped in a one-element list so the result
    is always a list.
    """
    biblio = get_codes(d)
    if "classification-ipc" in biblio:
        texts = biblio["classification-ipc"]["text"]
        if type(texts) is list:
            return texts
        return [texts]
    return []

def get_ipcr(d):
    """Return the IPCR texts of a parsed biblio response as a list of strings.

    Reads ``["classifications-ipcr"]["classification-ipcr"]`` from the
    bibliographic data (a single entry is treated as a one-element list) and
    takes each entry's "text" value with every run of spaces collapsed to a
    single space. Returns an empty list when there is no
    "classifications-ipcr" entry.
    """
    biblio = get_codes(d)
    if "classifications-ipcr" not in biblio:
        return []
    entries = biblio["classifications-ipcr"]["classification-ipcr"]
    if type(entries) is not list:
        entries = [entries]
    collapsed = []
    for entry in entries:
        collapsed.append(re.sub(' +', ' ', entry["text"]))
    return collapsed

def parse_cpci(v):
    """Format one patent-classification entry as a space-separated code.

    Returns ``None`` unless ``v["generating-office"]`` equals "EP". Otherwise
    the values stored under "section", "class", "subclass", "main-group",
    "subgroup" and "classification-value" are joined, in that order, with
    single spaces.
    """
    if v["generating-office"] == "EP":
        fields = ("section", "class", "subclass", "main-group", "subgroup", "classification-value")
        values = [v[field] for field in fields]
        return " ".join("{}".format(value) for value in values)
    return None

def get_cpci(d):
    """Return the formatted classification codes of a parsed biblio response.

    Reads ``["patent-classifications"]["patent-classification"]`` from the
    bibliographic data (a single entry is treated as a one-element list),
    formats every entry with ``parse_cpci`` and drops the entries for which
    it returns ``None``. Returns an empty list when there is no
    "patent-classifications" entry.
    """
    biblio = get_codes(d)
    if "patent-classifications" not in biblio:
        return []
    entries = biblio["patent-classifications"]["patent-classification"]
    if type(entries) is not list:
        entries = [entries]
    parsed = [parse_cpci(entry) for entry in entries]
    codes = []
    for code in parsed:
        if code is not None:
            codes.append(code)
    return codes

In [None]:
# Collect classes and store biblio files
# Every biblio response is kept as ../xml/<publication>.biblio.xml. A non-empty
# file that is already on disk is parsed instead of being requested again, so a
# run that was interrupted part-way resumes without repeating earlier requests.
xml_dir = Path("../xml")
xml_dir.mkdir(parents=True, exist_ok=True)
ipcs, ipcrs, cpcis = [], [], []

for counter, pat in enumerate(data["pat"], start=1):
    # Pad the status line so that, after '\r', a shorter line fully overwrites
    # the previous one instead of leaving stale characters behind.
    print(f"{counter/num_patents} {counter} {pat}".ljust(80), end='\r')
    d = pat.replace('-', '.')
    xml_path = xml_dir / f"{d}.biblio.xml"
    if xml_path.exists() and xml_path.stat().st_size > 0:
        content = xml_path.read_bytes()
    else:
        req = client.published_data("publication", Epodoc(d), endpoint="biblio")
        content = req.content
        # Write to a temporary name and rename afterwards, so an interruption
        # cannot leave a truncated file that a later run would take for a
        # complete cached response.
        tmp_path = xml_path.with_name(xml_path.name + ".part")
        tmp_path.write_bytes(content)
        tmp_path.replace(xml_path)
    c = xmltodict.parse(content)
    ipcs.append(get_ipc(c))
    ipcrs.append(get_ipcr(c))
    cpcis.append(get_cpci(c))

# One list of codes per publication, in the same row order as data["pat"].
data["ipc"] = ipcs
data["ipcr"] = ipcrs
data["cpci"] = cpcis

data

0.02617160622817971 1027 EP-0071061-A14 113 EP-0004850-A2

In [None]:
# Convert to JSON and write to disk
# The frame is serialised as a list of records (one object per row), indented
# by four spaces.
path = ".."
file_name = "bisfenol_a_ep_application_codes.json"
output_path = Path(path) / file_name
with output_path.open('w') as out_file:
    records_json = data.to_json(orient="records", indent=4)
    out_file.write(records_json)