In [None]:
from lxml import html
import requests
# Directory listing for the 2007 Official Gazette archives on the USPTO bulk-data site.
url = 'https://bulkdata.uspto.gov/data/patent/officialgazette/2007/'
# A timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() stops an HTTP error page from being parsed as the listing.
page = requests.get(url, timeout=30)
page.raise_for_status()
content = html.fromstring(page.content)
# Every <tr> element on the page; later cells read the file names out of these rows.
tr_elements = content.xpath('//tr')

In [None]:
# Create the download and extraction directories.
# -p makes the cell safe to re-run when the directories already exist.
!mkdir -p gazettes
!mkdir -p gazette_zips

In [None]:
import urllib.request
import zipfile
import os

# How many archives from the listing to download and unpack.
max_archives = 1

# The first row of the listing is the header; in every later row the first
# cell holds the file name. Reading that cell directly avoids building a
# per-column table keyed by the exact header text.
archive_names = []
for table_tr in tr_elements[1:]:
  cells = list(table_tr.iterchildren())
  if not cells:
    continue
  name = cells[0].text_content().strip()
  # Each entry is opened with zipfile below, so anything else is skipped.
  if name.lower().endswith(".zip"):
    archive_names.append(name)

for file_name in archive_names[:max_archives]:
  full_url = url + file_name
  print(file_name)
  zip_path = os.path.join("gazette_zips", file_name)
  urllib.request.urlretrieve(full_url, zip_path)
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("gazettes")

In [None]:
import glob 
# Collect the per-patent HTML pages (file names starting with "USD") from the
# extracted gazettes, in sorted order so later processing is deterministic.
design_page_pattern = 'gazettes/**/OG/html/**/USD*.html'
html_files = glob.glob(design_page_pattern, recursive=True)
html_files.sort()
print(len(html_files))
print(html_files[:5])

375
['gazettes/1314-1/OG/html/1314-1/USD0534331-20070102.html', 'gazettes/1314-1/OG/html/1314-1/USD0534332-20070102.html', 'gazettes/1314-1/OG/html/1314-1/USD0534333-20070102.html', 'gazettes/1314-1/OG/html/1314-1/USD0534334-20070102.html', 'gazettes/1314-1/OG/html/1314-1/USD0534335-20070102.html']


In [None]:
# Output directory for the processed images.
# -p makes the cell safe to re-run when the directory already exists.
!mkdir -p design_patent_images_2007

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cv2
import numpy as np

target_size = 512 # 224
# Images whose mean pixel value is below this are skipped.
# NOTE(review): the threshold assumes 8-bit pixel values. matplotlib's imread
# returns floats in [0, 1] for PNG input, which would put every image below
# it -- confirm the gazette images are not PNG.
min_mean_brightness = 200


def extract_between(text, head, tail):
  """Return the text between the first `head` and the next `tail` after it.

  Returns None when either marker is missing so the caller can skip the page.
  """
  start = text.find(head)
  if start == -1:
    return None
  start += len(head)
  end = text.find(tail, start)
  if end == -1:
    return None
  return text[start:end]


def fit_on_white_square(img, size):
  """Scale a 2-D image so its longer side equals `size`, centred on a white square.

  Returns a (size, size) uint8 array filled with 255 outside the scaled image.
  """
  height, width = img.shape
  # max(1, ...) stops an extreme aspect ratio from asking cv2.resize for a 0-pixel side.
  if width > height:
    new_width, new_height = size, max(1, size * height // width)
  elif height > width:
    new_width, new_height = max(1, size * width // height), size
  else:
    new_width, new_height = size, size
  # cv2.resize takes the target size as (width, height).
  resized = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
  canvas = np.full((size, size), 255, dtype=np.uint8)
  x_offset = (size - new_width) // 2
  y_offset = (size - new_height) // 2
  canvas[y_offset:y_offset + new_height, x_offset:x_offset + new_width] = resized
  return canvas


# For each patent page: pull out the title, image file name and U.S. class,
# write a square grayscale PNG, and append one tab-separated metadata line
# (patent name, class, title). Pages missing any of these fields are skipped.
# The with-block closes the metadata file even if a page raises part-way through.
with open("metadata_2007.txt", 'w') as metadata_file:
  for file_name in html_files:
    with open(file_name, "r") as f:
      web_page = f.read()

    title = extract_between(web_page, 'uppercase"><b>', '</b>')
    if title is None:
      continue
    img_file = extract_between(web_page, '<center><img src="', '"')
    if img_file is None:
      continue

    # The class is checked before the image is decoded so pages that cannot
    # produce a metadata line do not pay for image loading and resizing.
    class_text = extract_between(web_page, "U.S. Cl. </b><b>", "</b>")
    if class_text is None:
      continue
    # The class and subclass are separated by an em dash, written as either
    # the hexadecimal or the decimal character reference.
    class_parts = class_text.split("&#x2014;")
    if len(class_parts) == 1:
      class_parts = class_text.split("&#8212;")
    if len(class_parts) != 2:
      continue
    class_text = class_parts[0].replace(" ", "") + "-" + class_parts[1]

    # The part of the image file name before the first "-" names the patent.
    patent_name = img_file.split("-")[0]

    # The image path in the page is relative to the page's own directory.
    last_slash = file_name.rfind("/")
    img_path = file_name[:last_slash] + "/" + img_file
    img = mpimg.imread(img_path)

    if np.mean(img) < min_mean_brightness:
      continue

    # Multi-channel images are reduced to their first channel.
    if len(img.shape) == 3:
      height, width, chan = img.shape
      img = img[:,:,0:1].reshape(height, width)

    result = fit_on_white_square(img, target_size)

    out_path = "design_patent_images_2007/" + patent_name + ".png"
    # imwrite reports failure through its return value; without an image the
    # metadata line would point at a file that does not exist.
    if not cv2.imwrite(out_path, result):
      print("could not write " + out_path)
      continue

    metadata_text = patent_name + "\t" + class_text + "\t" + title
    print(metadata_text)
    metadata_file.write(metadata_text + "\n")

USD0534331	D1-121	Pet chew
USD0534332	D2-95	Outsole for a shoe
USD0534333	D2-606	Bow tie with biblical phrases and musical notes
USD0534334	D2-743	Hospital garment
USD0534335	D2-836	Portion of a garment
USD0534336	D2-840	Shirt
USD0534338	D2-969	Footwear upper
USD0534339	D2-969	Footwear upper
USD0534340	D2-969	Footwear upper
USD0534341	D2-971	Shoe
USD0534342	D2-972	Portion of a shoe upper
USD0534343	D2-972	Portion of a shoe upper
USD0534344	D2-977	Blind shoe seam design
USD0534345	D2-977	Portion of a shoe midsole
USD0534346	D2-977	Portion of a shoe midsole
USD0534347	D3-238	Bag
USD0534348	D3-243	Handbag
USD0534349	D3-243	Handbag
USD0534350	D3-243	Handbag
USD0534352	D3-246	Handbag
USD0534353	D3-279	Luggage
USD0534354	D3-319	Modular tool storage drawer
USD0534355	D3-320	Golf bag club divider
USD0534356	D3-321	Portion of a tote bag
USD0534357	D5-47	Fabric construction
USD0534359	D6-309	Oval mirror
USD0534360	D6-312	Picture frame
USD0534361	D6-323	Hanger with air freshener therein
USD053436

In [None]:
# Bundle the processed images directory (recursively) into a single zip archive.
!zip -r design_patent_images_2007.zip design_patent_images_2007