1. <strike>criar diretórios para as classes IPC</strike>
2. <strike>listar e salvar links com arquivos TAR</strike>
3. <strike>baixar arquivos TAR</strike>
4. <strike>listar e descompactar arquivos TAR</strike>
5. <strike>listar subarquivos ZIP</strike>
6. <strike>descompactar subarquivos ZIP para um diretório comum</strike>
7. ler full text, determinar classe IPC da patente, transformar e enviar imagem para o respectivo diretório específico
8. apagar tudo que não for mais necessário para economizar espaço

---
---
---
### 0. configurações

In [1]:
#mount Google Drive at /content/drive
#the project paths defined in the "important paths" cell are located under this mount point
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#install uninstalled modules
!pip install wget

In [3]:
#import libraries and modules
import os
import pickle
import wget

from bs4 import BeautifulSoup
import re
import requests

import tarfile
from zipfile import ZipFile

In [4]:
#important paths
#every project folder lives under the same root inside the mounted Drive
PROJECT_DIR = '/content/drive/My Drive/octimine/'
IMAGES_DIR = PROJECT_DIR + 'imgs/'

#auxiliary data (e.g. the pickled list of download links)
DATA_DIR = PROJECT_DIR + 'data/'
#common destination of the unzipped ZIP subfiles
IM_LAKE = IMAGES_DIR + 'lake/'
#one subfolder per IPC class
IPC_IM = IMAGES_DIR + 'ipc/'
#downloaded TAR files and their extracted contents
RAW_IM = IMAGES_DIR + 'raw/'

---
---
---
### 1. create IPC classes folders

In [6]:
#IPC classes for which a folder will be created
#https://ipcpub.wipo.int/

#each entry pairs an IPC section letter with the two-digit numbers of its classes
ipc_numbers_by_section = [
  ('A', '01 21 22 23 24 41 42 43 44 45 46 47 61 62 63 99'),
  ('B', '01 02 03 04 05 06 07 08 09 21 22 23 24 25 26 27 28 29 30 31 32 33 '
        '41 42 43 44 60 61 62 63 64 65 66 67 68 81 82 99'),
  ('C', '01 02 03 04 05 06 07 08 09 10 11 12 13 14 21 22 23 25 30 40 99'),
  ('D', '01 02 03 04 05 06 07 21 99'),
  ('E', '01 02 03 04 05 06 21 99'),
  ('F', '01 02 03 04 15 16 17 21 22 23 24 25 26 27 28 41 42 99'),
  ('G', '01 02 03 04 05 06 07 08 09 10 11 12 16 21 99'),
  ('H', '01 02 03 04 05 99'),
]

#flatten into class codes such as 'A01', keeping section and number order
ipc_classes = [
  section + number
  for section, numbers in ipc_numbers_by_section
  for number in numbers.split()
]

In [None]:
#create one folder per IPC class inside IPC_IM
for ipc_class in ipc_classes:
  ipc_path = os.path.join(IPC_IM, ipc_class)
  ipc_exists = os.path.isdir(ipc_path)

  if ipc_exists:
    print(f'Folder {ipc_class} exists already! Moving on...')
  else:
    #makedirs also creates IPC_IM itself when it is missing,
    #whereas os.mkdir would raise FileNotFoundError in that case
    os.makedirs(ipc_path, exist_ok=True)
    print(f'Folder {ipc_path} successfully created.')

---
---
---
### 2. list paths of TAR files to be downloaded

In [7]:
#inputs and output of the link-listing step
#https://www.crummy.com/software/BeautifulSoup/bs4/doc.ptbr/

#years whose files will be listed
#from 2001 to 2010 the files are ZIP
#from 2011 onwards the files are TAR
#full list of years, kept for reference:
#years = [
#  '2021',
#  '2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012', '2011',
#  '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002', '2001'
#]
years = ['2012']

#download links collected for the selected years
tar_files = list()

In [None]:
#collect the download links published on the listing page of every selected year
for year in years:
  url = f'https://bulkdata.uspto.gov/data/patent/grant/redbook/{year}/'

  #a timeout avoids hanging forever; raise_for_status stops on HTTP errors
  #instead of silently parsing an error page and finding no links
  response = requests.get(url, timeout=60)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, 'html.parser')

  #https://www.crummy.com/software/BeautifulSoup/bs4/doc.ptbr/#os-argumentos-palavras-chave
  #keyword, common to all files to be downloaded
  #escaped so that it is always matched literally by the regular expression
  keyword = re.escape(year)

  for x in soup.find_all(href=re.compile(keyword)):
    link = url + x.get('href')
    #skip links collected before, so re-running this cell does not duplicate entries
    if link not in tar_files:
      tar_files.append(link)

print(len(tar_files))

52


In [None]:
#save TAR files list for later use
#the with-block guarantees the file is flushed and closed even if pickling fails
with open(DATA_DIR + 'TAR-files.pickle', 'wb') as pickle_file:
  pickle.dump(tar_files, pickle_file)

---
---
---
### 3. download TAR files

In [None]:
#open TAR files list
#reload the list saved in step 2, so this step also works on a fresh kernel
#the pickle is produced by this notebook; do not load pickles from untrusted sources
tar_list_path = DATA_DIR + 'TAR-files.pickle'

if os.path.isfile(tar_list_path):
  with open(tar_list_path, 'rb') as pickle_file:
    tar_files = pickle.load(pickle_file)
else:
  #no saved list yet: the download cell relies on tar_files built in step 2
  print(f'File {tar_list_path} not found. Run step 2 first.')


In [None]:
#download TAR files
#make sure the destination folder exists before the first download
os.makedirs(RAW_IM, exist_ok=True)

for tar_file in tar_files:
  #the local file name is the last 12 characters of the link
  file_name = tar_file[-12:]
  file_path = RAW_IM + file_name
  file_exists = os.path.isfile(file_path)

  if file_exists:
    print(f'File {file_name} has been downloaded already! Moving on...')
  else:
    #announce the download before starting it, since wget.download blocks until done
    print(f'Downloading file {file_name}. Please wait...')
    wget.download(tar_file, file_path)
    print(f'File {file_name} successfully downloaded.')

---
---
---
### 4. list, untar and delete downloaded TAR source files

In [None]:
#list the .tar files located directly inside RAW_IM
#only the first tuple yielded by os.walk is used, so subfolders are not visited
#https://stackoverflow.com/a/20868760/3499881
top_folder, subfolders, top_level_names = next(os.walk(RAW_IM), (RAW_IM, [], []))

untar_files = [
  os.path.join(top_folder, name)
  for name in top_level_names
  if name.endswith('.tar')
]

len(untar_files)

108

In [None]:
#untar files
for untar_file in untar_files:
  #expected extraction folder: 'I' followed by the 8 characters that precede '.tar'
  folder_name = 'I' + untar_file[-12:-4]
  folder_path = RAW_IM + folder_name
  folder_exists = os.path.isdir(folder_path)

  if folder_exists:
    print(f'Folder {folder_name} exists already! Moving on...')

  else:
    print(f'Untaring file {untar_file}. Please wait...')
    #the with-block closes the archive even when extraction fails midway
    with tarfile.open(untar_file) as untar:
      untar.extractall(RAW_IM)
    print(f'File {untar_file} successfully untared.')

    #NOTE: the original tar file is kept; deleting it to save space is not implemented
    #print(f'Original file {untar_file} deleted.')

---
---
---
### 5. list and unzip ZIP (sub)files


In [None]:
#list ZIP subfiles
#walks RAW_IM recursively and keeps every file whose name ends with '.ZIP'
zip_files = [
  os.path.join(folder, name)
  for folder, subfolders, names in os.walk(RAW_IM)
  for name in names
  if name.endswith('.ZIP')
]

len(zip_files)

411767

In [None]:
#unzip ZIP subfiles
#TODO: check whether tarfile could unpack zip files and vice versa
for zip_file in zip_files:
  #expected extraction folder: the 19 characters that precede the '.ZIP' extension
  folder_name = zip_file[-23:-4]
  folder_path = IM_LAKE + folder_name
  folder_exists = os.path.isdir(folder_path)

  if folder_exists:
    print(f'Folder {folder_name} exists already! Moving on...')

  else:
    print(f'Unzipping file {zip_file}. Please wait...')
    #the with-block closes each archive; without it one file handle per ZIP stays open
    with ZipFile(zip_file, 'r') as archive:
      archive.extractall(path=IM_LAKE)
    print(f'File {zip_file} successfully unzipped.')