In [1]:
import re
import epo_ops
import json
import xmltodict
import pandas as pd
from pathlib import Path
from importlib import reload

# Middleware stack handed to epo_ops.Client: Dogpile first, then Throttler.
middlewares = [epo_ops.middlewares.Dogpile(), epo_ops.middlewares.Throttler()]

In [2]:
# Initialisation
# Read the credentials (the "key" and "secret" entries are used below)
# from api_keys.json, located one directory above the notebook.
with open(Path('..') / 'api_keys.json', 'r') as fp:
    api_keys = json.load(fp)

# Build the client with those credentials and the `middlewares` list.
client = epo_ops.Client(key=api_keys['key'], secret=api_keys['secret'], middlewares=middlewares)

# Reload epo_ops.models before importing from it, so that the names bound
# on the next line are taken from the freshly reloaded module.
reload(epo_ops.models)
from epo_ops.models import Epodoc, Docdb, BaseInput

In [3]:
# Load list of applications
# The file has no header row and one identifier per line
# (e.g. "EP-0000022-A1"); the single column is named "app".
path = "../../../data/"
file_name = "bisfenol_a_ep_applications_test.txt"

data = pd.read_csv(f"{path}{file_name}", names=["app"], header=None)
data

Unnamed: 0,app
0,EP-0000022-A1
1,EP-0000060-A1
2,EP-0000082-A1
3,EP-0000086-A1
4,EP-0000090-A1
5,EP-0000097-A1
6,EP-0000132-A1
7,EP-0000146-A1
8,EP-0000171-A1
9,EP-0000186-A1


In [4]:
# Class code extraction
def get_codes(d):
    """Return the ``bibliographic-data`` node of a parsed response.

    Args:
        d: Nested mapping (for example the result of ``xmltodict.parse``)
            containing the path ``ops:world-patent-data`` ->
            ``exchange-documents`` -> ``exchange-document`` ->
            ``bibliographic-data``.

    Returns:
        The object stored under ``bibliographic-data``.

    Raises:
        KeyError: If any key along the path is missing.
    """
    node = d
    # Walk down the fixed path one level at a time.
    for key in ("ops:world-patent-data",
                "exchange-documents",
                "exchange-document",
                "bibliographic-data"):
        node = node[key]
    return node

def get_ipc(d):
    """Return the IPC codes of a parsed biblio response as a list.

    Args:
        d: Parsed response (nested mapping) accepted by ``get_codes``.

    Returns:
        The content of ``classification-ipc`` -> ``text`` as a list; a single
        value is wrapped in a one-element list. An empty list is returned
        when the document has no (or an empty) ``classification-ipc`` element.

    Raises:
        KeyError: If the response lacks the path read by ``get_codes``, or if
            ``classification-ipc`` is present but has no ``text`` entry.
    """
    ipc_node = get_codes(d).get("classification-ipc")
    if not ipc_node:
        # Missing or empty element: report "no codes" instead of raising and
        # aborting a whole batch of lookups.
        return []
    texts = ipc_node["text"]
    return texts if isinstance(texts, list) else [texts]

def get_ipcr(d):
    """Return the IPCR codes of a parsed biblio response as a list of strings.

    Args:
        d: Parsed response (nested mapping) accepted by ``get_codes``.

    Returns:
        One string per ``classification-ipcr`` entry, taken from its ``text``
        field with every run of spaces collapsed to a single space. An empty
        list is returned when the document has no (or an empty)
        ``classifications-ipcr`` element.

    Raises:
        KeyError: If the response lacks the path read by ``get_codes``, or if
            an entry has no ``text`` field.
    """
    ipcr_node = get_codes(d).get("classifications-ipcr")
    if not ipcr_node:
        # Missing or empty element: report "no codes" instead of raising and
        # aborting a whole batch of lookups.
        return []
    entries = ipcr_node["classification-ipcr"]
    if not isinstance(entries, list):
        # A single entry is not wrapped in a list; normalise it.
        entries = [entries]
    return [re.sub(' +', ' ', entry["text"]) for entry in entries]

def parse_cpci(v):
    """Format one patent-classification entry as a space-separated string.

    Args:
        v: Mapping with the keys ``generating-office``, ``section``,
            ``class``, ``subclass``, ``main-group``, ``subgroup`` and
            ``classification-value``.

    Returns:
        ``"<section> <class> <subclass> <main-group> <subgroup> <value>"``
        when ``generating-office`` equals ``"EP"``; otherwise ``None``.
    """
    # Only entries generated by the "EP" office are formatted.
    if v["generating-office"] == "EP":
        fields = ("section", "class", "subclass",
                  "main-group", "subgroup", "classification-value")
        return " ".join(str(v[name]) for name in fields)
    return None

def get_cpci(d):
    """Return the formatted patent-classification codes of a biblio response.

    Args:
        d: Parsed response (nested mapping) accepted by ``get_codes``.

    Returns:
        One string per ``patent-classification`` entry, as produced by
        ``parse_cpci``; entries for which ``parse_cpci`` returns ``None``
        are dropped. An empty list is returned when the document has no
        (or an empty) ``patent-classifications`` element.

    Raises:
        KeyError: If the response lacks the path read by ``get_codes``, or if
            an entry lacks a field read by ``parse_cpci``.
    """
    cpc_node = get_codes(d).get("patent-classifications")
    if not cpc_node:
        # Missing or empty element: report "no codes" instead of raising and
        # aborting a whole batch of lookups.
        return []
    entries = cpc_node["patent-classification"]
    if not isinstance(entries, list):
        # A single entry is not wrapped in a list; normalise it.
        entries = [entries]
    parsed = (parse_cpci(entry) for entry in entries)
    return [code for code in parsed if code is not None]

In [5]:
# Collect classes
# One list of codes per row of `data`, appended in row order.
ipcs = []
ipcrs = []
cpcis = []

for d in data["app"]:
    # Identifiers are stored with dashes ("EP-0000022-A1"); the request is
    # made with dots in their place.
    d = d.replace('-', '.')
    req = client.published_data('publication', Epodoc(d), endpoint='biblio')
    # Parse the XML response body into nested mappings.
    c = xmltodict.parse(req.content)
    ipcs.append(get_ipc(c))
    ipcrs.append(get_ipcr(c))
    cpcis.append(get_cpci(c))

# Attach the collected lists as new columns (this modifies `data` in place).
data["ipc"], data["ipcr"], data["cpci"] = ipcs, ipcrs, cpcis

data

Unnamed: 0,app,ipc,ipcr,cpci
0,EP-0000022-A1,"[B29D9/00, B32B27/36, C08J7/04]","[B32B 27/ 30 A I, B32B 27/ 36 A I, C08J 7/ 04 ...","[B 32 B 27 36 I, C 08 J 7 042 I, C 08 J 7 043 ..."
1,EP-0000060-A1,"[C07C69/96, C08G63/66, C08G65/32]","[C08G 63/ 00 A I, B01J 31/ 00 A I, C07B 61/ 00...",[C 08 G 64 183 I]
2,EP-0000082-A1,"[G03C1/68, G03F7/10]","[G03F 7/ 26 A I, C08F 20/ 00 A I, C08F 20/ 34 ...",[G 03 F 7 0388 I]
3,EP-0000086-A1,"[C25D13/06, C08G59/14]","[C08G 59/ 00 A I, C08G 59/ 14 A I, C09D 5/ 44 ...","[C 08 G 59 145 I, C 09 D 5 4446 I]"
4,EP-0000090-A1,[C08G14/06],"[C08G 14/ 00 A I, B01D 15/ 04 A I, B01J 45/ 00...","[B 01 J 45 00 I, C 01 G 3 003 I, C 01 G 13 003..."
5,EP-0000097-A1,"[C08L67/02, C08G63/18]","[C08L 67/ 00 A I, C08G 63/ 19 A I, C08K 3/ 00 ...","[C 08 G 63 19 I, C 08 K 3 013 I, C 08 K 3 40 I]"
6,EP-0000132-A1,"[C07F9/09, C08K5/52]","[C08K 5/ 00 A I, C07F 9/ 09 A I, C07F 9/ 38 A ...","[C 07 F 9 091 I, C 07 F 9 093 I, C 08 K 5 521 ..."
7,EP-0000146-A1,"[C08L69/00, C08L51/04, C08L55/02]","[C08L 7/ 00 A I, C08L 21/ 00 A I, C08L 23/ 00 ...",[C 08 L 69 00 I]
8,EP-0000171-A1,"[C08G18/42, C08G18/34]","[C08G 18/ 08 A I, C08G 18/ 10 A I, C08G 18/ 34...","[C 08 G 18 0823 I, C 08 G 18 10 I, C 08 G 18 3..."
9,EP-0000186-A1,[C08L69/00],[C08L 69/ 00 A I],[C 08 L 69 00 I]


In [6]:
# Serialise the table as a JSON array with one object per row (4-space
# indentation) and print the resulting text.
codes = data.to_json(indent=4, orient="records")
print(codes)

[
    {
        "app":"EP-0000022-A1",
        "ipc":[
            "B29D9\/00",
            "B32B27\/36",
            "C08J7\/04"
        ],
        "ipcr":[
            "B32B 27\/ 30 A I",
            "B32B 27\/ 36 A I",
            "C08J 7\/ 04 A I"
        ],
        "cpci":[
            "B 32 B 27 36 I",
            "C 08 J 7 042 I",
            "C 08 J 7 043 I",
            "C 08 J 7 046 I",
            "C 08 J 2369 00 A",
            "C 08 J 2427 12 A",
            "C 08 J 2433 06 A",
            "Y 10 T 428 31507 A",
            "Y 10 T 428 3154 A"
        ]
    },
    {
        "app":"EP-0000060-A1",
        "ipc":[
            "C07C69\/96",
            "C08G63\/66",
            "C08G65\/32"
        ],
        "ipcr":[
            "C08G 63\/ 00 A I",
            "B01J 31\/ 00 A I",
            "C07B 61\/ 00 A I",
            "C07C 67\/ 00 A I",
            "C07C 69\/ 96 A I",
            "C07C 313\/ 00 A I",
            "C07C 315\/ 04 A I",
            "C07C 317\/ 22 A I",
    