In [1]:
import gzip
import pickle
import os
os.chdir("../")

In [4]:
# Define all folder paths
root_folder = './'
bulk_data_folder = os.path.join(root_folder, 'bulk_data/')
extracted_dict_folder = os.path.join(root_folder, 'extracted_data/')

In [13]:
extracted_paths = [path for path in os.listdir(extracted_dict_folder) if path[-3:] == '.gz']
print(extracted_paths)

['pftaps19950502_wk18.pkl.gz']


In [14]:
extracted_paths_errors = [path for path in os.listdir(extracted_dict_folder) if path[-4:] == '.txt']
print(extracted_paths_errors)

[]


In [7]:
archive_path_name = extracted_paths[0]
with gzip.open(os.path.join(extracted_dict_folder, archive_path_name), 'rb') as fp:
    patents = pickle.load(fp)

In [16]:
len(patents)

1061

In [12]:
patents[0]

{'uid': 'D0357791',
 'kind': 'Design',
 'app_date': '19940504',
 'title': 'Seat belt protector',
 'grant_date': '19950502',
 'inventors': ['Granda; Nelson F.'],
 'US_main_class': 'D 2639',
 'citations': ['D287881',
  'D308287',
  'D331835',
  'D332864',
  'D338552',
  'D343284',
  '3291528',
  '3482569',
  '4693495',
  '4741574',
  '4786080'],
 'abstract': '',
 'language_title': '',
 'IPC_main_class': '',
 'IPC_edition': '',
 'US_edition': '',
 'IPC_further_classes': [],
 'US_further_classes': [],
 'keywords': [],
 'year': 1995}

In [15]:
patents[1000]

{'uid': '5411509',
 'kind': 'Utility',
 'app_date': '19920930',
 'title': 'Embolectomy catheter',
 'grant_date': '19950502',
 'inventors': ['Hilal; Said'],
 'US_main_class': '606159',
 'US_further_classes': '604267',
 'US_edition': '6',
 'IPC_main_class': 'A61B 1000',
 'IPC_further_classes': [],
 'citations': ['2490168',
  '3640282',
  '3799173',
  '3877464',
  '3896815',
  '3978863',
  '3996938',
  '4735214',
  '4877037',
  '4886493',
  '4890612',
  '4968298',
  '5011488',
  '5085633',
  '5192290',
  '3519626',
  '0395075'],
 'abstract': 'An improved embolectomy catheter, includes a catheter body, an elastomeric foam catheter tip attached at one end of the catheter body, and actuator means for elastically deforming the catheter tip. The catheter tip deforms longitudinally and radially in response to activation of the actuation means. The catheter tip may be formed of any type of sterile elastomeric foam, which may be easily compressed and restored to its original shape upon decompress

# Solving problems...
## pftaps19871103_wk44.txt and pftaps19871110_wk45.txt

In [2]:
import os
from datetime import datetime
import argparse
import zipfile
import pickle
import gzip

In [3]:
# Change directory to the root of the folder (this script was launched from the subfolder python_scripts)
# All utils presuppose that we are working from the root directory of the github folder
# os.chdir("../")
import sys
# Add utils directory in the list of directories to look for packages to import
sys.path.insert(0, os.path.join(os.getcwd(),'utils'))
sys.path.insert(0, os.path.join(os.getcwd(),'utils', 'extraction'))

# local utils
from parsing import *


def import_file(file_path = None, file_object = None, use_default_folder = True, use_as_name_archive = True, year = None, bulk_data_folder='./bulk_data/'):
    if file_path is None:
        print('No file_path provided, try again.')
        return None
    if file_object is None:
        if use_as_name_archive == False:
            name_archive = file_path.split('/')[-1]
        else:
            name_archive = file_path
        if '.' in file_path:
            name_archive = name_archive.split('.')[-2]
        if use_as_name_archive == True:
            file_path = os.path.join(bulk_data_folder,f'{name_archive}.zip')
        archive = zipfile.ZipFile(file_path, 'r')
        for filename in archive.namelist():
            file_object = archive.open(f'{filename}', mode='r')
    else:
        if use_as_name_archive == False:
            name_archive = file_path.split('/')[-1]
        else:
            name_archive = file_path
        if '.' in file_path:
            name_archive = name_archive.split('.')[-2]
    print('importing file '+str(name_archive))

    data = parse_file(f=file_path,file_object=file_object, year=year)

    return data

In [6]:
########################### MAIN ###########################

start_time = datetime.now()
print(f"Started script at {start_time}", flush = True)


index_archive = 2


# Define all folder paths
root_folder = './'
bulk_data_folder = os.path.join(root_folder, 'bulk_data2/') # TODO
extracted_data_folder = os.path.join(root_folder, 'extracted_data/')
os.makedirs(extracted_data_folder, exist_ok = True)


# NB: archives are all .zip
archive_paths = sorted([path for path in os.listdir(bulk_data_folder) if path[-4:] == ".zip"])
path = archive_paths[index_archive - 1] # So that the argument index_archive from shell go from 1 to len(archives)
print('Requested path n. %d out of %d, that is\n\t%s'%(index_archive,len(archive_paths), os.path.join(bulk_data_folder,path)), flush=True)

Started script at 2022-09-20 20:26:33.137670
Requested path n. 2 out of 2, that is
	./bulk_data2/pftaps19871110_wk45.zip


In [7]:
name_archive = path[:-4] # cutting the '.zip' string
archive = zipfile.ZipFile(os.path.join(bulk_data_folder,f'{name_archive}.zip'), 'r')
if len(archive.namelist()) > 1:
    print('There are more than 1 element inside the archive, which one should I choose? Here they are.', flush=True)
    print(archive.namelist(), flush=True)
    with open(os.path.join(extracted_data_folder, f'ERROR_{name_archive}_more_than_1_element.txt'), 'w') as fp:
        fp.writelines(archive.namelist())
    print(f'Exiting at {datetime.now()}...', flush=True)
    exit()

for unzipped_file_path in archive.namelist():
    print('Opening',unzipped_file_path)
    file_name = unzipped_file_path.split('/')[-1]
    file_object = archive.open(f'{unzipped_file_path}', mode='r')
    patents = import_file(
        file_path=unzipped_file_path,
        file_object=file_object, 
        year=None,
        use_as_name_archive = False,
        bulk_data_folder = bulk_data_folder
    )

print(f"FOUND {len(patents)} PATENTS.", flush=True)

with gzip.open(os.path.join(extracted_data_folder, f'{name_archive}.pkl.gz'), 'wb') as fp:
    joblib.dump(patents, fp)

print('All dumped', flush=True)
print(f'Script time: {datetime.now() - start_time}', flush=True)
print(f'Exiting script at {datetime.now()}', flush=True)


Opening pftaps19871110_wk45.txt
importing file pftaps19871110_wk45
FOUND ERROR IN PATENT
{'uid': '4705030', 'kind': 'Utility', 'app_date': '19870217', 'title': 'Hand augmenting spinal manipulator encircling the hand', 'grant_date': '19871110', 'inventors': ['Tepperberg; Phillip S.'], 'US_main_class': '128 69', 'US_further_classes': ['128 61', '128 67', 'D24 36'], 'US_edition': '4', 'IPC_main_class': 'A61H  700', 'IPC_further_classes': ['A61F  500', 'A61F  501'], 'citations': ['D275607', '726054', '1342597', '2026332', '2063818', '2159654', '2450935', '2854971', '4126129', '4230099', '4350152', '4454868', '4483328', '4498464', '4590926', '4611581', '4029973', '4039862', '344528', '7504998', '361599', '118326', '150955', '122222', '216327']}
ABSTRACT
[{'PAL': "A device for treatment of anterior displacement of the vertebrae fits over the hand of the chiropractor. A groovein a first surface fits over the spinous processes and ridges parallel to the groove engage the transverse processes o

NameError: name 'joblib' is not defined