# USPTO

In [1]:
# One-time environment setup for running this notebook with Voila and ipywidgets.
# The two commented-out commands install Voila and the JupyterLab widget manager;
# uncomment them only if those packages are not installed yet.
#!pip install --user voila
#!jupyter labextension install @jupyter-widgets/jupyterlab-manager
# Enable the Voila server extension and the classic-notebook widgets extension.
!jupyter serverextension enable voila --sys-prefix
!jupyter nbextension enable --py widgetsnbextension

Enabling: voila
- Writing config: /Users/tblanke/opt/anaconda3/etc/jupyter
    - Validating...
      voila 0.3.6 [32mOK[0m
Building jupyterlab assets (production, minimized)


In [31]:
import pandas as pd
from bs4 import BeautifulSoup

import urllib3
import requests

from bs4 import BeautifulSoup
import json

import re
import time
import random

def get_abstract(s):
    """Return the abstract text of a parsed patent page.

    Looks up the text node that is exactly 'Abstract', takes the text of the
    element that follows it, removes newlines, trims the ends and collapses
    runs of spaces into a single space.

    Args:
        s: Parsed page exposing ``find(string=...)``; in this notebook a
            BeautifulSoup object.

    Returns:
        The cleaned abstract, or "" when it cannot be extracted (for example
        when the page has no 'Abstract' heading).
    """
    try:
        abst = s.find(string='Abstract').find_next().text.replace('\n', '').strip()
        return re.sub(' +', ' ', abst)
    except Exception:
        # Best-effort extraction: a page without the expected layout yields "".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be stopped.
        return ""
    
def get_claims(s):
    """Return the claims section of a parsed patent page as one string.

    Collects every text node after the first string matching 'Claims', keeps
    those that come before the text node equal to 'Description', strips
    newlines and surrounding whitespace, drops empty pieces and joins the
    rest with single spaces.

    Args:
        s: Parsed page exposing ``find(string=...)``; in this notebook a
            BeautifulSoup object.

    Returns:
        The joined claims text, or "" when the section cannot be extracted
        (no 'Claims' match, or no 'Description' marker after it).
    """
    try:
        following = s.find(string=re.compile('Claims')).find_all_next(string=True)
        # Everything from the 'Description' marker onwards is not part of the claims.
        following = following[:following.index('Description')]
        # Clean each piece once, then drop the ones that end up empty.
        cleaned = (piece.replace('\n', '').strip() for piece in following)
        return ' '.join(piece for piece in cleaned if piece)
    except Exception:
        # Best-effort extraction: unexpected page layouts yield "".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be stopped.
        return ""

def get_description(s):
    """Return the description section of a parsed patent page as one string.

    Collects every text node after the first string matching 'Description',
    strips newlines and surrounding whitespace, drops empty pieces and the
    '* * * * *' separator, and joins the rest with single spaces.

    Args:
        s: Parsed page exposing ``find(string=...)``; in this notebook a
            BeautifulSoup object.

    Returns:
        The joined description text, or "" when it cannot be extracted.
    """
    skipped = ('', '* * * * *')
    try:
        following = s.find(string=re.compile('Description')).find_all_next(string=True)
        # Clean each piece once, then drop blanks and the separator line.
        cleaned = (piece.replace('\n', '').strip() for piece in following)
        return ' '.join(piece for piece in cleaned if piece not in skipped)
    except Exception:
        # Best-effort extraction: unexpected page layouts yield "".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be stopped.
        return ""
    
# Scraping code adapted from https://github.com/daneads/pypatent
# It queries the old USPTO database, which serves traditional HTML pages, e.g.:
#https://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=11,163,036.PN.&OS=PN/11,163,036&RS=PN/11,163,036

from selenium import webdriver

class WebConnection:
    """Fetch the HTML of a URL, either through Selenium or through requests.

    Args:
        use_selenium: When True, pages are loaded with ``selenium_driver``;
            otherwise they are fetched with ``requests.get``.
        selenium_driver: Selenium driver instance used when ``use_selenium``
            is True. The annotation is a string so it is not evaluated when
            the class is defined.
        user_agent: User-agent string; a desktop Chrome string is used when
            omitted.
        request_header: Headers sent with ``requests.get``; defaults to a
            header carrying only the user agent.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the caller forever. Pass None to
            wait indefinitely (the previous behaviour).
    """

    def __init__(self,
                 use_selenium: bool = False,
                 selenium_driver: "webdriver.Remote" = None,
                 user_agent: str = None,
                 request_header: dict = None,
                 timeout: float = 30):
        self.use_selenium = use_selenium
        self.selenium_driver = selenium_driver
        self.timeout = timeout

        if user_agent is None:
            self.user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        else:
            self.user_agent = user_agent
        if request_header is None:
            self.request_header = {'user-agent': self.user_agent}
        else:
            self.request_header = request_header

    def get(self, url: str):
        """Return the page source for ``url``.

        Raises:
            ValueError: If Selenium is requested but no driver was supplied.
        """
        if self.use_selenium:
            if self.selenium_driver is None:
                raise ValueError('WebConnection.selenium_driver must point to a valid Selenium webdriver')
            self.selenium_driver.get(url)
            return self.selenium_driver.page_source
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        return requests.get(url, headers=self.request_header, timeout=self.timeout).text

In [8]:
import ipywidgets as widgets
    
# Upload widget for the CSV exported from Google Patents: one .csv file at a time.
google_in = widgets.FileUpload(
    description='Upload Patent',
    multiple=False,
    accept='.csv',
)

display(google_in)

FileUpload(value={}, accept='.csv', description='Upload Patent CSV')

- Go to Google Patent Search advanced search
- Specify the query, for instance, with assignee and inventor name: https://patents.google.com/?inventor=Yufei+Blankenship&assignee=Google+Llc
- Make sure to include only granted US patents in English: https://patents.google.com/?country=US&status=GRANT&language=ENGLISH&type=PATENT
- Full search https://patents.google.com/?q=data&assignee=Thales&country=US&before=priority:20221231&after=priority:20180101&status=GRANT&language=ENGLISH&type=PATENT
- Download the CSV file (use the first download link, not the one "with concepts").

In [29]:
# io was used here without ever being imported, which fails on a fresh kernel.
import io

# ipywidgets 7 exposes uploads as {filename: {...}}; ipywidgets 8 as a tuple of dicts.
uploads = google_in.value
if isinstance(uploads, dict):
    uploads = list(uploads.values())
if not uploads:
    raise ValueError('No file uploaded yet: use the upload button above before running this cell')
input_file = uploads[0]
# bytes() accepts both the bytes (ipywidgets 7) and memoryview (ipywidgets 8) payloads.
content = bytes(input_file['content'])
content = io.StringIO(content.decode('utf-8'))
# The first line of the file is skipped; presumably it is a non-tabular header
# line in the Google Patents export (TODO confirm against a fresh download).
patents_df = pd.read_csv(content, skiprows=[0])
#Only US patents
patents_df = patents_df[patents_df['id'].str.startswith('US', na=False)]
patents_df.head(1)

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link
0,US-10914955-B2,Peripheral vision in a human-machine interface,Thales,"Stéphanie Lafon, Alexiane Bailly, Sébastien Dotte",2018-02-12,2019-02-11,2021-02-09,2021-02-09,https://patents.google.com/patent/US10914955B2/en,https://patentimages.storage.googleapis.com/f6...


In [35]:
# Scrape results as (id, abstract, claims, description) tuples. Kept at module
# level because a later cell builds a DataFrame from `data`; when the list was
# local to the callback the results were lost as soon as the click finished.
data = []


def on_click_scrape(change):
    """Button callback: scrape abstract, claims and description for each patent.

    Iterates over ``patents_df['id']``, builds the USPTO full-text query URL
    for each patent number, parses the returned HTML and stores one
    ``(id, abstract, claims, description)`` tuple per patent in the
    module-level ``data`` list. The list is emptied first, so clicking the
    button again does not duplicate rows.

    Args:
        change: Argument supplied by the widget's click event; not used.
    """
    data.clear()
    patent_ids = list(patents_df['id'])
    # One connection object is enough; it holds only headers and settings.
    connection = WebConnection()
    for i, id_ in enumerate(patent_ids, start=1):
        #Format is 1,100,100 without the US and extension
        p_ = '{:,}'.format(int(re.findall(r'\d+', id_)[0]))
        url_ = 'https://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=' + p_ + '.PN.&OS=PN/' + p_ + '&RS=PN/' + p_
        soup = BeautifulSoup(connection.get(url_), 'html.parser')
        abs_ = get_abstract(soup)
        desc_ = get_description(soup)
        claim_ = get_claims(soup)
        # Every 10th patent: pause for a random 3-7 seconds and show how many remain.
        if i % 10 == 0:
            time.sleep(random.randint(3, 7))
            display(len(patent_ids) - i)
        data.append((id_, abs_, claim_, desc_))

# Button that starts the scrape: each click invokes on_click_scrape.
btn_run = widgets.Button(description='Scrape USPTO')
btn_run.on_click(on_click_scrape)
display(btn_run)

Button(description='Scrape USPTO', style=ButtonStyle())

74

64

54

44

34

24

14

4

In [36]:
# Columns produced by the scrape, keyed by patent id.
text_columns = ['abstract', 'claims', 'description']
df_ = pd.DataFrame(data, columns=['id'] + text_columns)
# Drop text columns left by an earlier run of this cell before merging, so that
# re-running it does not create suffixed duplicates (abstract_x, abstract_y, ...).
patents_df = pd.merge(
    patents_df.drop(columns=text_columns, errors='ignore'),
    df_,
    on='id',
    how='left',
)

In [13]:
#patents_df.to_csv('uspto-patents_df.tsv', index = False, sep="\t")

In [37]:
# Save patents_df as a pickle file; the relative path resolves against the current working directory.
patents_df.to_pickle("uspto-patents_df.pkl")  

In [38]:
import base64
import hashlib
from typing import Callable

import ipywidgets
from IPython.display import HTML, display


class DownloadButton(ipywidgets.Button):
    """Download button with dynamic content.

    The content is generated by calling ``contents`` each time the button is
    clicked, then embedded in a data URI on a hidden link that a small script
    clicks immediately.

    Args:
        filename: Name suggested to the browser for the downloaded file.
        contents: Zero-argument callable returning the file content as ``str``
            (encoded as UTF-8) or as ``bytes`` (used as-is, e.g. a pickle).
        mime_type: MIME type placed in the data URI; defaults to 'text/csv'.
        **kwargs: Forwarded to ``ipywidgets.Button`` (e.g. ``description``).
    """

    def __init__(self, filename: str, contents: "Callable[[], str | bytes]",
                 mime_type: str = 'text/csv', **kwargs):
        super(DownloadButton, self).__init__(**kwargs)
        self.filename = filename
        self.contents = contents
        self.mime_type = mime_type
        self.on_click(self.__on_click)

    def __on_click(self, b):
        """Generate the content and trigger the browser download."""
        import html  # local import: only needed to escape attribute values

        raw = self.contents()
        # Accept text or binary content; text is encoded as UTF-8.
        contents: bytes = raw.encode('utf-8') if isinstance(raw, str) else bytes(raw)
        payload = base64.b64encode(contents).decode()
        digest = hashlib.md5(contents).hexdigest()  # bypass browser cache
        element_id = f'dl_{digest}'
        # Escape values interpolated into HTML attributes so quotes or angle
        # brackets in the filename cannot break the markup.
        filename = html.escape(self.filename, quote=True)
        mime_type = html.escape(self.mime_type, quote=True)

        display(HTML(f"""
<html>
<body>
<a id="{element_id}" download="{filename}" href="data:{mime_type};base64,{payload}">
</a>

<script>
(function download() {{
document.getElementById('{element_id}').click();
}})()
</script>

</body>
</html>
"""))


In [39]:
# Offer the patents table as a CSV download. The content is generated at click
# time, so it reflects patents_df as it is then. The previous callback returned
# the placeholder text 'hello <timestamp>' under a .pkl file name.
DownloadButton(filename='uspto-patents_df.csv', contents=lambda: patents_df.to_csv(index=False), description='download')

DownloadButton(description='download', style=ButtonStyle())