In [41]:
import json
from pathlib import Path
from tqdm.notebook import tqdm, trange
import imageio
from collections import defaultdict

In [6]:
!conda install -c conda-forge imageio

^C


In [2]:
patents_path = Path('../patents')
list(patents_path.iterdir())

[WindowsPath('../patents/netto_list')]

In [3]:
patents = list(patents_path.glob('**/EP*'))
len(patents)

7798

In [13]:
successfully_downloaded_patents = []
for patent_path in tqdm(patents, desc='Checking download status'):
    status_file = patent_path / 'status.txt'
    if status_file.exists():
        with open(status_file) as fp:
            status = fp.read()
            if status == 'Done processing':
                successfully_downloaded_patents.append(patent_path)
len(successfully_downloaded_patents)

Checking download status:   0%|          | 0/7798 [00:00<?, ?it/s]

4433

In [53]:
def load_patent(patent_path):
    endpoints = ['biblio', 'claims', 'description', 'fulltext']
    endpoint_data = {}
    for endpoint in endpoints:
        with open(patent_path / f'{endpoint}.json') as fp:
            try:
                endpoint_data[endpoint] = json.load(fp)
            except BaseException as e:
                raise RuntimeError(f'When parsing endpoint {endpoint}') from e
    drawings = {}
    drawings_folder_path = patent_path / 'Drawings'
    if drawings_folder_path.exists():
        for drawing_path in drawings_folder_path.iterdir():
            drawing_name = drawing_path.with_suffix('').name
            drawings[drawing_name] = imageio.imread(drawing_path)
        endpoint_data['drawings'] = drawings
    return endpoint_data

In [54]:
def get_fulltext_languages(patent_path):
    try:
        patent_data = load_patent(patent_path)
    except BaseException as e:
        raise ValueError(f"When loading patent {patent_path}") from e
    fulltext_instances = patent_data['fulltext']['ops:world-patent-data']['ops:fulltext-inquiry']['ops:inquiry-result']['ops:fulltext-instance']
    instance_languages = defaultdict(list)
    for instance in fulltext_instances:
        instance_languages[instance['@desc']].append(instance['@lang'])
    return instance_languages


In [55]:
def check_language(patent_path, permissible_language_codes=('EN',), required_instances=('description', 'claims')):
    instance_languages = get_fulltext_languages(patent_path)
    for instance in required_instances:
        if instance in instance_languages.keys():
            language_in_instance = False
            for lang in instance_languages[instance]:
                if lang in permissible_language_codes:
                    language_in_instance = True
                    break
            if not language_in_instance:
                return False
    # We only get to this point if all required instances has the permissible languages
    return True
        


In [56]:
language_filtered = tqdm([patent for patent in successfully_downloaded_patents if check_language(patent)])

ValueError: When loading patent ..\patents\netto_list\EP0000082.A1

In [82]:
[ line['$'] for line in patent_data['description']['ops:world-patent-data']['ftxt:fulltext-documents']['ftxt:fulltext-document']['description']['p']]}

['Technical Field',
 '[0001]    The present invention relates to a bonding joining structure, and more particularly relates to a bonding joining structure that is preferably used as a die bonding joining structure between a die of a semiconductor device and a metal support.',
 'Background Art',
 '[0002]    With regard to bonding of a semiconductor device, Patent Document 1 discloses a semiconductor apparatus having: a semiconductor device that has a collector electrode on one surface and an emitter electrode on the other surface; and an insulating substrate that has a first electrode interconnect on one surface. The first electrode interconnect of the insulating substrate and the collector electrode of the semiconductor device are connected to each other via a first bonding layer. This first bonding layer is a sintered layer obtained by sintering a bonding material and a reducing agent. The bonding material contains a metal particle precursor that is composed of silver carbonate and th

In [90]:
[line['$'] for line in patent_data['claims']['ops:world-patent-data']['ftxt:fulltext-documents']['ftxt:fulltext-document']['claims']['claim']['claim-text']]

['1. A bonding joining structure (1) in which a heat generating body and a support (20) comprising a metal are joined to each other via a joint portion (30) composed of a sintered body (32) of copper powder,\nwherein the support (20) contains copper, the copper being present in at least an outermost surface of the support (20),\nan interdiffusion portion (41) in which copper contained in the support (20) and copper contained in the sintered body (32) are diffused to each other is formed so as to straddle a bonding interface (40) between the support (20) and the sintered body (32), and\na copper crystal structure having the same crystal orientation is formed in the interdiffusion portion (41) so as to straddle the bonding interface (40).',
 '2. A bonding joining structure (1) in which a heat generating body and a support (20) comprising a metal are joined to each other via a joint portion (30) composed of a sintered body (32) of copper powder,\nwherein the bonding joining structure (1) 

In [64]:
weird_patent = Path('../patents/netto_list/EP0000082.A1')

#patent_data = load_patent(weird_patent)
description_path = weird_patent / 'description.json'
with open(description_path, 'rb') as fp:
    description = fp.read()
description[14190:14210]


b'orated to 1\xe2\x81\x844 of i'

In [102]:
patent_path = successfully_downloaded_patents[-1]


In [103]:
patent = load_patent(patent_path)
patent['fulltext']

{'ops:world-patent-data': {'@xmlns': {'ops': 'http://ops.epo.org',
   '$': 'http://www.epo.org/exchange',
   'xlink': 'http://www.w3.org/1999/xlink'},
  'ops:fulltext-inquiry': {'ops:publication-reference': {'document-id': {'@document-id-type': 'epodoc',
     'doc-number': {'$': 'EP3758048'},
     'kind': {'$': 'A1'}}},
   'ops:inquiry-result': {'publication-reference': {'document-id': {'@document-id-type': 'docdb',
      'country': {'$': 'EP'},
      'doc-number': {'$': '3758048'},
      'kind': {'$': 'A1'}}},
    'ops:fulltext-instance': [{'@system': 'ops.epo.org',
      '@lang': 'EN',
      '@desc': 'description',
      'ops:fulltext-format-options': {'ops:fulltext-format': {'$': 'text-only'}}},
     {'@system': 'ops.epo.org',
      '@lang': 'EN',
      '@desc': 'claims',
      'ops:fulltext-format-options': {'ops:fulltext-format': {'$': 'text-only'}}}]}}}}

In [105]:
check_language(patent_path)

True

In [115]:
with (open('netto_list_descriptions_2015-2020.txt', 'w') as descriptions_fp,
      open('netto_list_claims_2015-2020.txt', 'w') as claims_fp, 
      open('netto_list_claims_descriptions_2015-2020.txt', 'w') as claims_descriptions_fp,):
    for patent in successfully_downloaded_patents[-100:]:
        try:
            if check_language(patent):
                patent_data = load_patent(patent)
                description = get_description(patent_data)
                claims = get_claims(patent_data)
                desc_str = '\n'.join(description)
                claims_str = '\n'.join(claims)
                descriptions_fp.write(desc_str + '\n\n')
                claims_fp.write(claims_str + '\n\n')
                claims_descriptions_fp.write(desc_str + '\n' + claims_str + '\n\n')
            else:
                print('check_language failed')
        except BaseException as e:
            print(f"Exception loading {patent}, with exception {e}")
            continue

Exception loading ..\patents\netto_list\EP3744778.A1, with exception When loading patent ..\patents\netto_list\EP3744778.A1
Exception loading ..\patents\netto_list\EP3745105.A1, with exception When loading patent ..\patents\netto_list\EP3745105.A1
Exception loading ..\patents\netto_list\EP3745106.A1, with exception When loading patent ..\patents\netto_list\EP3745106.A1
Exception loading ..\patents\netto_list\EP3747266.A1, with exception When loading patent ..\patents\netto_list\EP3747266.A1
Exception loading ..\patents\netto_list\EP3747422.A1, with exception When loading patent ..\patents\netto_list\EP3747422.A1
check_language failed
Exception loading ..\patents\netto_list\EP3757150.A1, with exception When loading patent ..\patents\netto_list\EP3757150.A1
Exception loading ..\patents\netto_list\EP3757183.A1, with exception When loading patent ..\patents\netto_list\EP3757183.A1


In [83]:
def get_description(patent_data):
    return [ line['$'] for line in patent_data['description']['ops:world-patent-data']['ftxt:fulltext-documents']['ftxt:fulltext-document']['description']['p']]

In [110]:
def get_claims(patent_data):
    return [line['$'] for line in patent_data['claims']['ops:world-patent-data']['ftxt:fulltext-documents']['ftxt:fulltext-document']['claims']['claim']['claim-text']]
    

In [111]:
get_claims(patent_data)

['1. A bonding joining structure (1) in which a heat generating body and a support (20) comprising a metal are joined to each other via a joint portion (30) composed of a sintered body (32) of copper powder,\nwherein the support (20) contains copper, the copper being present in at least an outermost surface of the support (20),\nan interdiffusion portion (41) in which copper contained in the support (20) and copper contained in the sintered body (32) are diffused to each other is formed so as to straddle a bonding interface (40) between the support (20) and the sintered body (32), and\na copper crystal structure having the same crystal orientation is formed in the interdiffusion portion (41) so as to straddle the bonding interface (40).',
 '2. A bonding joining structure (1) in which a heat generating body and a support (20) comprising a metal are joined to each other via a joint portion (30) composed of a sintered body (32) of copper powder,\nwherein the bonding joining structure (1) 