# USPTO

In [1]:
!pip install --user voila
!jupyter serverextension enable voila --sys-prefix

Collecting voila
  Downloading voila-0.3.6-py3-none-any.whl (1.7 MB)
[K     |################################| 1.7 MB 9.8 MB/s eta 0:00:01
[?25hCollecting websockets>=9.0
  Downloading websockets-10.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (111 kB)
[K     |################################| 111 kB 119.6 MB/s eta 0:00:01
Collecting jupyter-core>=4.11.0
  Downloading jupyter_core-4.11.1-py3-none-any.whl (88 kB)
[K     |################################| 88 kB 9.9 MB/s s eta 0:00:01
Collecting nbconvert<7,>=6.4.5
  Downloading nbconvert-6.5.3-py3-none-any.whl (563 kB)
[K     |################################| 563 kB 122.3 MB/s eta 0:00:01
[?25hCollecting jupyterlab-server<3,>=2.3.0
  Downloading jupyterlab_server-2.15.0-py3-none-any.whl (54 kB)
[K     |################################| 54 kB 3.2 MB/s s eta 0:00:01
Collecting jupyter-server<2.0.0,>=1.18
  Downloading jupyter_server-1.18.1-py3-none-any.whl (344 kB)
[K     |#####

In [1]:
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
import urllib3
import requests

from bs4 import BeautifulSoup
import json

import re
import time

In [3]:
#Scraping taken from https://github.com/daneads/pypatent
#Goes to the old DB with traditional HTML 
#https://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=11,163,036.PN.&OS=PN/11,163,036&RS=PN/11,163,036

from selenium import webdriver

class WebConnection:
    """Fetch the HTML source of a URL through ``requests`` or a Selenium driver.

    Parameters
    ----------
    use_selenium : bool
        When true, ``get`` loads pages through ``selenium_driver``; otherwise
        it uses ``requests.get``.
    selenium_driver : Selenium webdriver instance, optional
        Driver used when ``use_selenium`` is true. It is only checked when
        ``get`` is called, not at construction time.
    user_agent : str, optional
        User-agent string; a desktop Chrome string is used when omitted.
    request_header : dict, optional
        Headers sent by ``requests.get``. When omitted, a header containing
        only the user agent is built. When given, it is used as-is and
        ``user_agent`` is not merged into it.
    timeout : float, optional
        Seconds ``requests.get`` waits for the server before raising.
        Pass ``None`` to wait indefinitely (the behaviour before this
        parameter existed).
    """

    DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'

    def __init__(self,
                 use_selenium: bool = False,
                 selenium_driver: "webdriver.Remote" = None,
                 user_agent: str = None,
                 request_header: dict = None,
                 timeout: float = 30):
        self.use_selenium = use_selenium
        self.selenium_driver = selenium_driver
        self.timeout = timeout

        self.user_agent = self.DEFAULT_USER_AGENT if user_agent is None else user_agent
        if request_header is None:
            self.request_header = {'user-agent': self.user_agent}
        else:
            self.request_header = request_header

    def get(self, url: str) -> str:
        """Return the page source of ``url`` as text.

        Raises
        ------
        ValueError
            If ``use_selenium`` is true but no driver was supplied.

        On the ``requests`` path, network errors (including a timeout after
        ``self.timeout`` seconds) propagate from ``requests.get``.
        """
        if self.use_selenium:
            if self.selenium_driver is None:
                raise ValueError('WebConnection.selenium_driver must point to a valid Selenium webdriver')
            self.selenium_driver.get(url)
            return self.selenium_driver.page_source

        # Without a timeout a stalled server would block the notebook forever.
        response = requests.get(url, headers=self.request_header, timeout=self.timeout)
        return response.text

In [4]:
def get_abstract(s):
    """Return the abstract text of a parsed patent page, or '' if there is none.

    ``s`` is searched for the string 'Abstract'; the text of the element that
    follows it is returned with newlines removed, surrounding whitespace
    stripped and runs of spaces collapsed to a single space.
    """
    try:
        abst = s.find(string='Abstract').find_next().text.replace('\n', '').strip()
    except AttributeError:
        # A lookup returned None (no 'Abstract' heading, or nothing after it).
        # Only this expected failure is caught, so KeyboardInterrupt and
        # genuine bugs are no longer silently turned into an empty string.
        return ""
    return re.sub(' +', ' ', abst)
    
def get_claims(s):
    """Return the claims text of a parsed patent page, or '' if it cannot be found.

    Collects every string that follows the first string matching 'Claims',
    up to (not including) the first string equal to 'Description'. Newlines
    are removed from each string, it is stripped, empty results are dropped
    and the rest are joined with single spaces.
    """
    try:
        following = s.find(string=re.compile('Claims')).find_all_next(string=True)
        following = following[:following.index('Description')]
    except (AttributeError, ValueError):
        # AttributeError: no string matching 'Claims' was found (find() gave None).
        # ValueError: no string equal to 'Description' follows the claims.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ""
    # Normalise each string once, then drop the ones that end up empty.
    cleaned = (text.replace('\n', '').strip() for text in following)
    return ' '.join(text for text in cleaned if text != '')

def get_description(s):
    """Return the description text of a parsed patent page, or '' if it cannot be found.

    Collects every string that follows the first string matching
    'Description'. Newlines are removed from each string, it is stripped,
    empty results and the '* * * * *' end marker are dropped, and the rest
    are joined with single spaces.
    """
    try:
        following = s.find(string=re.compile('Description')).find_all_next(string=True)
    except AttributeError:
        # No string matching 'Description' was found (find() gave None).
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return ""
    # Normalise each string once, then drop blanks and the end-of-text marker.
    cleaned = (text.replace('\n', '').strip() for text in following)
    return ' '.join(text for text in cleaned if text not in ('', '* * * * *'))

- Go to Google Patent Search advanced search
- Specify the query, for instance, with assignee and inventor name: https://patents.google.com/?inventor=Yufei+Blankenship&assignee=Google+Llc
- Restrict the results to granted US patents in English: https://patents.google.com/?country=US&status=GRANT&language=ENGLISH&type=PATENT
- Example of a full search: https://patents.google.com/?q=data&assignee=Thales&country=US&before=priority:20221231&after=priority:20180101&status=GRANT&language=ENGLISH&type=PATENT
- Download the results as a CSV file (use the first download link, not the one that includes concepts).

In [5]:
# Load the Google Patents CSV export. The first line of the file is skipped,
# so the second line is read as the header row.
patents_df = pd.read_csv(
    filepath_or_buffer='gp-search-20220721-232926.csv',
    skiprows=[0],
)

patents_df.shape

(84, 10)

In [6]:
# Keep only the rows whose id starts with 'US'; rows with a missing id are
# treated as non-matching (na=False) and dropped.
patents_df = patents_df.loc[patents_df['id'].str.startswith('US', na=False)]
patents_df.shape

(84, 10)

In [7]:
# Preview the first two rows of the filtered search results.
patents_df.head(2)

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link
0,US-10914955-B2,Peripheral vision in a human-machine interface,Thales,"Stéphanie Lafon, Alexiane Bailly, Sébastien Dotte",2018-02-12,2019-02-11,2021-02-09,2021-02-09,https://patents.google.com/patent/US10914955B2/en,https://patentimages.storage.googleapis.com/f6...
1,US-11102014-B2,Method for handling data in a secure container,"THALES DIS CPL CANADA, Inc.","Dmitry RIYUMKIN, Darren Johnson",2019-01-30,2019-01-30,2021-08-24,2021-08-24,https://patents.google.com/patent/US11102014B2/en,https://patentimages.storage.googleapis.com/45...


In [8]:
from tqdm import tqdm

# One (id, abstract, claims, description) tuple per scraped patent.
data = []

# The connection object only holds request settings, so build it once
# instead of once per patent.
connection = WebConnection()

# Count patents from 1 so that a pause can be inserted after every 10th one.
for i, id_ in enumerate(tqdm(list(patents_df['id'])), start=1):
    # The search URL takes the first run of digits in the id, formatted with
    # thousands separators (e.g. 11,163,036), without the US prefix or suffix.
    p_ = '{:,}'.format(int(re.findall(r'\d+', id_)[0]))
    url_ = 'https://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=' + p_ + '.PN.&OS=PN/' + p_ + '&RS=PN/' + p_
    r = connection.get(url_)
    soup = BeautifulSoup(r, 'html.parser')
    abs_ = get_abstract(soup)
    desc_ = get_description(soup)
    claim_ = get_claims(soup)
    # Throttle: pause for 5 seconds after every 10th request.
    if i % 10 == 0:
        time.sleep(5)
    data.append((id_, abs_, claim_, desc_))


100%|██████████| 84/84 [03:08<00:00,  2.24s/it]


In [9]:
# Turn the scraped tuples into a DataFrame, one row per scraped patent.
df_ = pd.DataFrame(data, columns=['id', 'abstract', 'claims', 'description'])

In [10]:
# Sanity check: print the (rows, columns) shape, then show the last two scraped rows.
print(df_.shape)
df_.tail(2)

(84, 4)


Unnamed: 0,id,abstract,claims,description
82,US-11056011-B2,A method for managing the display of a vertica...,The invention claimed is: 1. A method for man...,TECHNICAL FIELD The present invention relates ...
83,US-10938470-B2,A station placed on a high-altitude stationary...,The invention claimed is: 1. A station placed...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...


In [11]:
# Attach the scraped text to the search results; the left join keeps every
# search-result row even when nothing was scraped for it.
# Columns left over from an earlier run of this cell are dropped first.
# Without that, re-running the cell would merge onto the already-merged
# frame and produce suffixed duplicates (abstract_x / abstract_y, ...).
patents_df = pd.merge(
    patents_df.drop(columns=df_.columns.difference(['id']), errors='ignore'),
    df_,
    on='id',
    how='left',
)

In [12]:
# Preview the first two rows after merging in the scraped text columns.
patents_df.head(2)

Unnamed: 0,id,title,assignee,inventor/author,priority date,filing/creation date,publication date,grant date,result link,representative figure link,abstract,claims,description
0,US-10914955-B2,Peripheral vision in a human-machine interface,Thales,"Stéphanie Lafon, Alexiane Bailly, Sébastien Dotte",2018-02-12,2019-02-11,2021-02-09,2021-02-09,https://patents.google.com/patent/US10914955B2/en,https://patentimages.storage.googleapis.com/f6...,A computer-implemented method for managing a g...,The invention claimed is: 1. A computer-imple...,CROSS-REFERENCE TO RELATED APPLICATIONS This a...
1,US-11102014-B2,Method for handling data in a secure container,"THALES DIS CPL CANADA, Inc.","Dmitry RIYUMKIN, Darren Johnson",2019-01-30,2019-01-30,2021-08-24,2021-08-24,https://patents.google.com/patent/US11102014B2/en,https://patentimages.storage.googleapis.com/45...,The invention is a method for handling data in...,The invention claimed is: 1. A system includi...,FIELD OF THE INVENTION The present invention r...


In [13]:
# Save the merged table as a tab-separated file, leaving out the DataFrame index.
patents_df.to_csv('uspto-patents_df.tsv', sep='\t', index=False)

In [14]:
# Also save the merged table as a pickle file.
# NOTE: only load pickle files from sources you trust; unpickling can execute arbitrary code.
patents_df.to_pickle("uspto-patents_df.pkl")  