# Notebook that scrapes Google Patents and pre-processes the data for analysis

Requires that the patent numbers are known and listed in a CSV file. Go to Google BigQuery, download the patent information of interest, and collect the patent numbers that you want to include. Then scrape all the data to analyze.

In [1]:
import os
import pandas as pd
import re

from requests import get
from requests.exceptions import RequestException
from contextlib import closing

from bs4 import BeautifulSoup
import pickle
import json

import string
import datetime

import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import en_core_web_sm

In [2]:
# `display` and `HTML` are part of the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Move the working directory one level up, presumably so that the relative
# `data/...` paths used below resolve from the repository root.
# NOTE: this is not idempotent -- every re-run of this cell moves up one more level.
os.chdir('..')

## Code to scrape Google Patents for documents of interest

In [None]:
# Patent numbers to scrape, read from a CSV file with a `patent_id` column.
data = pd.read_csv('data/processed/electrochemistry_patent_numbers.csv')

# Because the 12,000 patents take hours to scrape, only the data of the
# first 20 patents is included in this repo.
data_pat_id_list = data['patent_id'].tolist()[:20]

In [None]:
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    `timeout` (seconds) is forwarded to the request so that a stalled
    connection cannot block a long scraping run forever; a timeout surfaces
    as a `RequestException` and is handled like any other request error.

    Returns the raw response body (`resp.content`, i.e. bytes) when
    `is_good_response` accepts the response. Returns None when the response
    is rejected or when the request raises a `RequestException`; in the
    latter case the error is reported through `log_error`.
    """
    try:
        # `closing` releases the streamed connection when the block exits.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as HTML when its status code is 200 and its
    `Content-Type` header contains 'html' (case-insensitive). A response
    without a `Content-Type` header is rejected instead of raising KeyError.
    """
    content_type = resp.headers.get('Content-Type')
    if resp.status_code != 200 or content_type is None:
        return False
    return 'html' in content_type.lower()

def log_error(e):
    """
    Report an error message.

    The message is simply printed to standard output; this is the single
    place to change if errors should go somewhere else.
    """
    print(e)
    
def scrape_google_patent_list(patent_list):
    """
    Download the Google Patents page of every patent number in `patent_list`.

    Each number is appended to 'https://patents.google.com/patent/US' to
    build the page URL, and the page is fetched with `simple_get`.

    Returns a dictionary keyed by patent number; each value is a dictionary
    with the page 'url' and its raw 'html' (None when `simple_get` returned
    None for that page).
    """
    pat_dict_scraped = {}
    # Iterate over the argument itself rather than a notebook-level global,
    # so the function scrapes exactly what the caller asked for.
    for patent_id in patent_list:
        print(patent_id)  # progress indicator
        # `format` also accepts non-string ids (e.g. integers read from a CSV).
        url_str = 'https://patents.google.com/patent/US{0}'.format(patent_id)
        raw_html = simple_get(url_str)
        pat_dict_scraped[patent_id] = {'url': url_str, 'html': raw_html}
    return pat_dict_scraped

In [None]:
def _first_text(html, tag, itemprop):
    """Return the text of the first `tag` element with the given `itemprop`, or None if there is none."""
    node = html.find(tag, attrs={'itemprop': itemprop})
    return node.text if node is not None else None


def _div_text(html, css_class, default):
    """Return the text of the first <div> with class `css_class`, or `default` if there is none."""
    node = html.find("div", attrs={"class": css_class})
    return node.text if node is not None else default


def from_scraped_google_patent_into_dict(patent_id, single_html_scraped):
    """
    Extract the information most relevant for the analysis from the raw
    HTML of one Google Patents page and return it as a dictionary.

    `patent_id` is stored unchanged under 'patent_id';
    `single_html_scraped` is the raw HTML that is parsed with BeautifulSoup.

    Keys of the returned dictionary, in insertion order: 'patent_id',
    'us_reference_list', 'inventor_list', 'assignee', 'keywords_list',
    'status', 'priorArtData', 'title', 'abstract', 'background', 'claims',
    'image_list', 'class_list', 'class_description_list'.

    'status', 'priorArtData' and 'assignee' are None when the corresponding
    element is missing from the page. 'abstract', 'background' and 'claims'
    fall back to the page title when their section is missing.
    """
    html = BeautifulSoup(single_html_scraped, 'html.parser')
    # The page title includes the Google Patents 'logo' text; it is removed
    # later in pre-processing.
    page_title = html.title.text

    single_dict = {}
    single_dict['patent_id'] = patent_id

    # Referenced patents: every non-digit character of the whole <meta> tag
    # string is stripped, so any digit appearing anywhere in the tag is kept.
    single_dict['us_reference_list'] = [
        re.sub("[^0-9]", "", str(tag))
        for tag in html.find_all("meta", scheme="references")
    ]

    # Inventors.
    single_dict['inventor_list'] = [
        tag.get('content') for tag in html.find_all("meta", scheme="inventor")
    ]

    # Assignee: prefer the current assignee, otherwise the assignee <meta> tag.
    assignee = _first_text(html, 'dd', 'assigneeCurrent')
    if assignee is None:
        assignee_meta = html.find("meta", scheme="assignee")
        assignee = assignee_meta.get('content') if assignee_meta is not None else None
    single_dict['assignee'] = assignee

    # Keywords.
    single_dict['keywords_list'] = [
        tag.text for tag in html.find_all('dd', attrs={'itemprop': 'priorArtKeywords'})
    ]

    # Status, prior art date and title. The key is spelled 'priorArtData'
    # (not 'Date') because later cells read it under that name.
    single_dict['status'] = _first_text(html, 'span', 'status')
    single_dict['priorArtData'] = _first_text(html, 'time', 'priorArtDate')
    single_dict['title'] = page_title

    # Text sections.
    single_dict['abstract'] = _div_text(html, 'abstract', page_title)
    single_dict['background'] = _div_text(html, 'description', page_title)
    single_dict['claims'] = _div_text(html, 'claims', page_title)

    # Image links.
    single_dict['image_list'] = [
        tag.get('content') for tag in html.find_all("meta", attrs={'itemprop': 'full'})
    ]

    # Classification codes and their descriptions.
    single_dict['class_list'] = [
        tag.text for tag in html.find_all("span", attrs={'itemprop': "Code"})
    ]
    single_dict['class_description_list'] = [
        tag.text for tag in html.find_all("span", attrs={'itemprop': "Description"})
    ]

    return single_dict

def patent_dict_from_list(patent_list_to_scrape, pat_dict_scraped):
    """
    Convert the raw HTML of every patent in `patent_list_to_scrape` into a
    dictionary of extracted fields.

    `pat_dict_scraped` maps a patent number to a dictionary that holds the
    raw page under the key 'html'.

    Returns a dictionary mapping each patent number to the result of
    `from_scraped_google_patent_into_dict`. The extra key 'HTMLnotFound'
    holds the list of patent numbers that have no HTML: the stored html is
    None, or the patent is absent from `pat_dict_scraped` altogether.
    """
    patent_dataset = {}
    patent_none = []
    for patent_id in patent_list_to_scrape:
        # A patent that was never scraped is reported like one whose
        # download failed, instead of aborting the whole run with KeyError.
        raw_html = pat_dict_scraped.get(patent_id, {}).get('html')
        if raw_html is None:
            patent_none.append(patent_id)
            continue
        patent_dataset[patent_id] = from_scraped_google_patent_into_dict(patent_id, raw_html)
    patent_dataset['HTMLnotFound'] = patent_none
    return patent_dataset

In [None]:
# The patent numbers are scraped in reverse order of the list.
patent_list_to_scrape = data_pat_id_list[::-1]
pat_to_scrape = patent_list_to_scrape

# Run the scraping function above, which downloads the raw HTML from
# Google Patents for every patent number in the list.
scraped_patents = scrape_google_patent_list(pat_to_scrape)

# Save the raw HTML; the `with` block closes the file even if pickling fails.
with open('data/raw/scraped_patents_html', 'wb') as outfile:
    pickle.dump(scraped_patents, outfile)

# If the raw HTML is already on file, it can be loaded instead of scraping again:
#     with open('data/raw/scraped_patents_html', 'rb') as infile:
#         scraped_patents = pickle.load(infile)
# NOTE: only unpickle files that you created yourself.

In [None]:
# Extract the fields of interest from the raw download with the function
# defined above (this could take a while), then store the result as JSON.
patent_dataset = patent_dict_from_list(pat_to_scrape, scraped_patents)

with open('data/processed/scraped_patents_dictionary.json', mode='w') as json_out:
    json.dump(patent_dataset, json_out)


## If the scraped HTML file is already downloaded, proceed to clean up with the following code

In [3]:
# Read the dictionary of extracted patent fields back from the JSON file.
with open('data/processed/scraped_patents_dictionary.json') as json_in:
    my_dict = json.load(json_in)

In [4]:
print('Patents that not found during scraping:')
# Remove the bookkeeping entry so that only patents remain in `my_dict`.
# As the last expression of the cell, the removed list is also displayed.
# The default keeps a re-run of this cell from raising KeyError once the
# entry has already been removed.
my_dict.pop('HTMLnotFound', [])

Patents that not found during scraping:


[]

In [5]:
# One row per patent, indexed by the dictionary keys.
df = pd.DataFrame.from_dict(my_dict, orient='index')

# `infer_datetime_format` is deprecated in pandas 2.x (format inference is
# the default there), so it is no longer passed.
df['priorArtData'] = pd.to_datetime(df['priorArtData'])
df['year'] = df['priorArtData'].dt.year
df['assignee'] = df['assignee'].str.replace('\n', '', regex=False)

# Drop the first two and the last three whitespace-separated tokens of the
# page title -- presumably the patent-number prefix and the Google Patents
# suffix (TODO confirm against a raw title).
df['title'] = df['title'].apply(lambda text: ' '.join(text.split()[2:-3]))

# Collapse every run of whitespace in the long text fields into one space.
for column in ['abstract', 'background', 'claims']:
    df[column] = df[column].apply(lambda text: ' '.join(text.split()))

In [6]:
df.head(3)

Unnamed: 0,patent_id,us_reference_list,inventor_list,assignee,keywords_list,status,priorArtData,title,abstract,background,claims,image_list,class_list,class_description_list,year
3930883,3930883,"[3272653, 3427203, 3607409, 3516862, 3671319, ...",[Ludwig Kandler],Rheinisch Westfalisches Elektrizitatswerk ...,"[zinc, calcium, ion, electrode, hydroxide]",Expired - Lifetime,1969-08-16,Zinc-containing electrode,A zinc negative electrode for alkaline accumul...,This is a continuation of application Ser. No....,I claim: 1. A method of making a negative elec...,[https://patentimages.storage.googleapis.com/5...,"[H, H01, H01M, H01M4/00, H01M4/02, H01M4/24, H...","[ELECTRICITY, BASIC ELECTRIC ELEMENTS, PROCESS...",1969
3930885,3930885,"[2624767, 2783291, 2852592, 3294589, 3468715, ...",[Arabinda N. Dey],Duracell Inc,"[cell, electrolyte, plunger, casing, lid]",Expired - Lifetime,1973-01-11,Organic electrolyte reserve cell,A reserve cell is described for providing elec...,FIELD OF THE INVENTION This invention relates ...,What is claimed is: 1. An organic electrolyte ...,[https://patentimages.storage.googleapis.com/8...,"[H, H01, H01M, H01M6/00, H01M6/30, H01M6/36, H...","[ELECTRICITY, BASIC ELECTRIC ELEMENTS, PROCESS...",1973
3930976,3930976,"[2256733, 667471, 2756203, 2844532, 3234117, 3...",[John Cadwaladr Owen],Elster Metering Holdings Ltd,"[tube, plug, electrolyte, glass, electrical]",Expired - Lifetime,1971-07-20,Glass electrode assembly,An electrochemical glass electrode comprising ...,This application is a Continuation-in-Part of ...,I claim: 1. An electrochemical electrode assem...,[https://patentimages.storage.googleapis.com/2...,"[G, G01, G01N, G01N27/00, G01N27/26, G01N27/28...","[PHYSICS, MEASURING; TESTING, INVESTIGATING OR...",1971


In [7]:
# Shared objects used by the tokenising functions below.
punctuations = string.punctuation
nlp = spacy.load("en_core_web_sm")
stop_words = STOP_WORDS
parser = English()

# Extra stop words: terms that occur in most of these patents, plus a few
# punctuation / encoding artefacts.
nlp.Defaults.stop_words |= {
    "electrochemical", "electrochemistry", "comprise", "invention", "composition",
    "form", "include", "active", "method", "compound", "group", "present", "provide",
    "produce", "contain", "+", "'", "\ufeff1", "little", "',",
    "/sub", "\\ufeff1", "e.g.", "b", ", ", "' ", "wt%", "novel", 'claim',
}


def convert_str_to_tokens(input_string):
    """
    Reduce `input_string` to a string listing its informative tokens.

    Steps:
    1. Tokenise with `parser` and take the lower-cased, stripped lemma of
       every token (the lower-cased token text where the lemma is '-PRON-').
    2. Drop the words found in `stop_words` or in `punctuations`.
    3. Run `nlp` over the string form of the remaining word list and keep
       the tokens whose part of speech is none of VERB, PUNCT, CCONJ, ADV,
       NUM or X.
    4. Keep at most the first 1000 tokens.

    Returns the `str()` of the resulting token list with every " '," removed.
    """
    lemmas = [
        token.lemma_.lower().strip() if token.lemma_ != '-PRON-' else token.lower_
        for token in parser(input_string)
    ]
    kept_words = [
        word for word in lemmas
        if word not in stop_words and word not in punctuations
    ]

    excluded_pos = ('VERB', 'PUNCT', 'CCONJ', 'ADV', 'NUM', 'X')
    selected = [token for token in nlp(str(kept_words)) if token.pos_ not in excluded_pos]

    return str(selected[0:1000]).replace(" ',", "")

def tokenize_series(df, columns_to_tokenize):
    """
    Add a '<column>_tokens' column to `df` for every name in
    `columns_to_tokenize`.

    Each cell of the new column is the result of `convert_str_to_tokens`
    for the cell of the source column in the same row. `df` is modified in
    place and is also returned.
    """
    for column in columns_to_tokenize:
        token_column = column + '_tokens'
        for row_label in df.index:
            df.loc[row_label, token_column] = convert_str_to_tokens(df.loc[row_label, column])
    return df


In [8]:
# Tokenise the text fields (this adds the `*_tokens` columns to `df`), then
# drop the full background and claims texts; their token columns are kept.
df = tokenize_series(df, ['title', 'abstract', 'background', 'claims']).drop(
    columns=['background', 'claims']
)
display(df.head(3))

Unnamed: 0,patent_id,us_reference_list,inventor_list,assignee,keywords_list,status,priorArtData,title,abstract,image_list,class_list,class_description_list,year,title_tokens,abstract_tokens,background_tokens,claims_tokens
3930883,3930883,"[3272653, 3427203, 3607409, 3516862, 3671319, ...",[Ludwig Kandler],Rheinisch Westfalisches Elektrizitatswerk ...,"[zinc, calcium, ion, electrode, hydroxide]",Expired - Lifetime,1969-08-16,Zinc-containing electrode,A zinc negative electrode for alkaline accumul...,[https://patentimages.storage.googleapis.com/5...,"[H, H01, H01M, H01M4/00, H01M4/02, H01M4/24, H...","[ELECTRICITY, BASIC ELECTRIC ELEMENTS, PROCESS...",1969,"[zinc, electrode]","[zinc, negative, electrode, alkaline, accumula...","[continuation, application, file, october, tur...","[negative, electrode, alkaline, accumulator, s..."
3930885,3930885,"[2624767, 2783291, 2852592, 3294589, 3468715, ...",[Arabinda N. Dey],Duracell Inc,"[cell, electrolyte, plunger, casing, lid]",Expired - Lifetime,1973-01-11,Organic electrolyte reserve cell,A reserve cell is described for providing elec...,[https://patentimages.storage.googleapis.com/8...,"[H, H01, H01M, H01M6/00, H01M6/30, H01M6/36, H...","[ELECTRICITY, BASIC ELECTRIC ELEMENTS, PROCESS...",1973,"[organic, electrolyte, reserve, cell]","[reserve, cell, energy, capable, shelf, life, ...","[field, reserve, energy, source, reserve, cell...","[organic, electrolyte, reserve, cell, cell, ca..."
3930976,3930976,"[2256733, 667471, 2756203, 2844532, 3234117, 3...",[John Cadwaladr Owen],Elster Metering Holdings Ltd,"[tube, plug, electrolyte, glass, electrical]",Expired - Lifetime,1971-07-20,Glass electrode assembly,An electrochemical glass electrode comprising ...,[https://patentimages.storage.googleapis.com/2...,"[G, G01, G01N, G01N27/00, G01N27/26, G01N27/28...","[PHYSICS, MEASURING; TESTING, INVESTIGATING OR...",1971,"[glass, electrode, assembly]","[glass, electrode, glass, membrane, seal, end,...","[application, continuation, application, file,...","[electrode, assembly, tube, glass, terminate, ..."


In [10]:
# Save the tokenised patent table for the analysis.
tokenized_csv_path = 'data/processed/tokenized_scraped_patents.csv'
df.to_csv(tokenized_csv_path)