## USPTO data ingress

Data found at https://bulkdata.uspto.gov/. According to the terms and conditions, there is no restriction on use. In this notebook we want to evaluate the suitability of the data and, if it is suitable, download and clean it.

There are a number of different versions of the dataset. 

- Patent Grant Bibliographic (Front Page) Text Data (JAN 1976 - PRESENT): sample file is approximately 18MB. We downloaded this in the hope that it would contain a substantial amount of text, but this is not the case. It contains titles and metadata about the patents, but nothing comparable to an abstract.

- Patent Grant Full Text Data (No Images) (JAN 1976 - PRESENT): sample file is approximately 800MB. Too large to investigate in Notepad++, so we investigate it here instead using ElementTree.


In [1]:
import xml.etree.ElementTree as ET
import re
import io
import os
import json
import time
from xml.dom import minidom
from collections import defaultdict

In [2]:
# First attempt: parse the sample file directly as a single XML document.
# NOTE: this fails with the ParseError shown below.
tree = ET.parse('../../Data/uspto_sample/ipg210105.xml')
root = tree.getroot()

ParseError: junk after document element: line 899, column 0 (<string>)

Okay, there is an issue with the format of the XML file.

From investigation, there is no root node. It seems that many XML documents have been concatenated into one file. Each of them starts with
```<!DOCTYPE us-patent-grant SYSTEM "us-patent-grant-v45-2014-04-03.dtd" [ ]>``` (or similar)

```
<!DOCTYPE...>
    <us-patent-grant ... date-produced="20201221">

    <description-of-drawings>

        <p id="p-0001" num="0001"><figref idref="DRAWINGS">FIG. 1</figref> is a top, front perspective view of a molded tortilla pocket according to our new design;</p>
        
    <claim-text>The ornamental design for a molded tortilla pocket, as shown and described.</claim-text>
```

In [2]:
# Read the whole sample file into memory; the file is opened with the
# 'utf-8-sig' codec, exactly as before.
sample_path = "../../Data/uspto_sample/ipg210309.xml"
with io.open(sample_path, 'r', encoding='utf-8-sig') as f:
    xml = f.read()

# The file is many XML documents back to back. Strip each document's DOCTYPE
# line and XML declaration, in this order, so that the remainder can be
# wrapped in a single synthetic <root> element.
for header_pattern in (
    r'<!DOCTYPE .+>\n',
    r'<!DOCTYPE .+>',
    r'<\?xml version="1.0" encoding="UTF-8"\?>\n',
):
    xml = re.sub(header_pattern, '', xml)

xml = '<?xml version="1.0" encoding="UTF-8"?><root>' + xml + '</root>'

# Each child of the synthetic root is one record from the file.
tree = ET.fromstring(xml)
print(len(tree))

6438


In [3]:
# Take one record (index 900) from the wrapped tree and print its direct
# children to see which top-level sections it contains.
root = tree[900]
for child in root:
    print(child)

<Element 'us-bibliographic-data-grant' at 0x00000288526E4278>
<Element 'abstract' at 0x00000288527AC728>
<Element 'drawings' at 0x00000288527AC7C8>
<Element 'description' at 0x00000288527B11D8>
<Element 'us-claim-statement' at 0x00000288527E82C8>
<Element 'claims' at 0x00000288527E8318>


In [3]:
def _flatten_text(element):
    """Return all text nested anywhere under ``element`` as one string.

    Text fragments are stripped and joined with single spaces, so text that
    follows inline markup (e.g. ``<figref>``, ``<b>``) or sits in nested
    ``<claim-text>`` elements is kept. A missing element (``None``) yields ''.
    """
    if element is None:
        return ''
    fragments = (fragment.strip() for fragment in element.itertext())
    return ' '.join(fragment for fragment in fragments if fragment)


def _classification_record(clf, version_tag):
    """Build the classification dict for one IPCR/CPC classification element.

    ``version_tag`` names the child holding the version date
    ('ipc-version-indicator' or 'cpc-version-indicator'). Sub-elements that
    are absent give ``None`` instead of raising.
    """
    return {
        'section': clf.findtext('section'),
        'class': clf.findtext('class'),
        'subclass': clf.findtext('subclass'),
        'main_group': clf.findtext('main-group'),
        'subgroup': clf.findtext('subgroup'),
        'date': clf.findtext(version_tag + '/date'),
    }


def make_document(root, sections=('G', 'H', 'Y')):
    """Turn one patent grant element into a JSON-serialisable document.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        A single grant record (one child of the synthetic ``<root>``).
    sections : iterable of str, optional
        Classification sections to keep. A record is accepted when at least
        one of its IPCR or CPC classifications has one of these sections.
        Defaults to ('G', 'H', 'Y'), the sections previously hard-coded.

    Returns
    -------
    dict or int
        The document dict on success, otherwise an integer rejection code:
        0 - no 'us-bibliographic-data-grant' element,
        1 - no classification in ``sections``,
        2 - no filing date,
        3 - no title,
        4 - abstract + description + claims shorter than 50 characters.
    """
    biblio = root.find('us-bibliographic-data-grant')
    if biblio is None:
        # unsuitable for lack of data!
        return 0

    ipcr_parent = biblio.find('classifications-ipcr')
    ipcr_classifications = []
    if ipcr_parent is not None:
        ipcr_classifications = [
            _classification_record(clf, 'ipc-version-indicator')
            for clf in ipcr_parent
        ]

    cpc_classifications = []
    cpc_parent = biblio.find('classifications-cpc')
    if cpc_parent is not None:
        # 'main-cpc' first, then 'further-cpc'; either group may be absent.
        for group_tag in ('main-cpc', 'further-cpc'):
            group = cpc_parent.find(group_tag)
            if group is not None:
                cpc_classifications.extend(
                    _classification_record(clf, 'cpc-version-indicator')
                    for clf in group.findall('classification-cpc')
                )

    found_sections = {
        clf['section'] for clf in ipcr_classifications + cpc_classifications
    }
    if found_sections.isdisjoint(sections):
        # Unsuitable because it has no classification in the wanted sections
        return 1

    document_id = 'application-reference/document-id'
    doc_number = biblio.findtext(document_id + '/doc-number', default='')
    date_filed = biblio.findtext(document_id + '/date', default='')
    date = date_filed[0:6]  # first six characters of the filing date
    if date == '':
        # unsuitable for lacking date!
        return 2

    date_produced = root.attrib.get('date-produced', '')
    # The publication date has its own attribute; previously 'date-produced'
    # was read twice. Fall back to it only when 'date-publ' is absent.
    date_published = root.attrib.get('date-publ', date_produced)

    title = _flatten_text(biblio.find('invention-title'))
    if title == '':
        # unsuitable for lacking title!
        return 3

    abstract = _flatten_text(root.find('abstract'))
    description = _flatten_text(root.find('description'))
    claims = _flatten_text(root.find('claims'))

    if len(abstract + description + claims) < 50:
        # unsuitable for lack of text!
        return 4

    return {
        'year_month': date,
        'doc_number': doc_number,
        'date_filed': date_filed,
        'date_produced': date_produced,
        'date_published': date_published,
        'title': title,
        'abstract': abstract,
        'description': description,
        'claims': claims,
        'ipcr_classifications': ipcr_classifications,
        'cpc_classifications': cpc_classifications,
    }

In [85]:
def _legacy_flatten_text(element):
    """Return all text nested anywhere under ``element`` as one string.

    Text fragments are stripped and joined with single spaces, so text that
    follows inline markup or sits in nested ``<claim-text>`` elements is
    kept. A missing element (``None``) yields ''.
    """
    if element is None:
        return ''
    fragments = (fragment.strip() for fragment in element.itertext())
    return ' '.join(fragment for fragment in fragments if fragment)


def make_document_legacy(root):
    '''
    This concerns documents from 2005 and prior years.

    Builds a JSON-serialisable document from one grant element that carries
    a US national classification ('classification-national') rather than
    IPCR/CPC classifications. No section filtering is applied here.

    Returns the document dict on success, otherwise an integer rejection code:
        0 - no 'us-bibliographic-data-grant' element,
        1 - no 'classification-national' element,
        2 - no filing date,
        3 - no title,
        4 - abstract + description + claims shorter than 50 characters.

    'ipcr_classifications' and 'cpc_classifications' are always empty lists;
    the keys are kept so the output has the same shape as make_document's.
    '''
    biblio = root.find('us-bibliographic-data-grant')
    if biblio is None:
        # unsuitable for lack of data!
        return 0

    national = biblio.find('classification-national')
    if national is None:
        # Unsuitable because it has no US national classification
        return 1

    us_classifications = []
    main_classification = national.find('main-classification')
    if main_classification is not None:
        us_classifications.append(main_classification.text)
    us_classifications.extend(
        clf.text for clf in national.findall('further-classification')
    )

    document_id = 'application-reference/document-id'
    doc_number = biblio.findtext(document_id + '/doc-number', default='')
    date_filed = biblio.findtext(document_id + '/date', default='')
    date = date_filed[0:6]  # first six characters of the filing date
    if date == '':
        # unsuitable for lacking date!
        return 2

    date_produced = root.attrib.get('date-produced', '')
    # The publication date has its own attribute; previously 'date-produced'
    # was read twice. Fall back to it only when 'date-publ' is absent.
    date_published = root.attrib.get('date-publ', date_produced)

    title = _legacy_flatten_text(biblio.find('invention-title'))
    if title == '':
        # unsuitable for lacking title!
        return 3

    abstract = _legacy_flatten_text(root.find('abstract'))
    description = _legacy_flatten_text(root.find('description'))
    claims = _legacy_flatten_text(root.find('claims'))

    if len(abstract + description + claims) < 50:
        # unsuitable for lack of text!
        return 4

    return {
        'year_month': date,
        'doc_number': doc_number,
        'date_filed': date_filed,
        'date_produced': date_produced,
        'date_published': date_published,
        'title': title,
        'abstract': abstract,
        'description': description,
        'claims': claims,
        'ipcr_classifications': [],
        'cpc_classifications': [],
        'us_classifications': us_classifications,
    }


In [4]:
# Ingest every yearly folder of bulk XML files: parse each file, convert each
# record with make_document, and append accepted documents as JSON lines to a
# file named after the document's 'year_month' value.
#
# `outcome` counts results by the integer code make_document returns (0-4),
# uses key 5 for successes, and stores the current year under 'year'.
# The counters are cumulative over all years processed so far.

# Compiled once, outside the loops; the patterns strip the per-document headers
# so the concatenated documents can be wrapped in a single <root>.
doctype_line = re.compile(r'<!DOCTYPE .+>\n')
doctype_bare = re.compile(r'<!DOCTYPE .+>')
xml_declaration = re.compile(r'<\?xml version="1.0" encoding="UTF-8"\?>\n')

outcome = defaultdict(int)
# Built-in range: numpy is not imported in this notebook and has no `range`.
for year in range(2006, 2021):
    year_dir = 'D:/uspto/' + str(year)
    outcome['year'] = year
    # Sorted so files are processed (and documents appended) in a stable order.
    for file in sorted(os.listdir(year_dir)):
        if not file.endswith('xml'):
            continue

        time0 = time.time()

        with io.open(year_dir + '/' + file, 'r', encoding='utf-8-sig') as f:
            xml = f.read()

        xml = doctype_line.sub('', xml)
        xml = doctype_bare.sub('', xml)
        xml = xml_declaration.sub('', xml)
        xml = '<?xml version="1.0" encoding="UTF-8"?><root>' + xml + '</root>'

        tree = ET.fromstring(xml)
        del xml  # release the raw text before building the documents

        documents = defaultdict(list)
        for root in tree:
            document = make_document(root)
            if isinstance(document, int):
                # integer return value is a rejection code
                outcome[document] += 1
            else:
                documents[document['year_month']].append(document)
                outcome[5] += 1

        del tree

        for year_filed, filed_documents in documents.items():
            with open('../../Data/uspto_filtered/' + year_filed + '.txt', "a",
                      encoding='utf-8') as f:
                f.writelines(json.dumps(document) + '\n'
                             for document in filed_documents)

        del documents

        print(file, time.time() - time0)

    print(year)
    print('Irregular format: ', outcome[0])
    print('Wrong classification: ', outcome[1])
    print('No date: ', outcome[2])
    print('No title: ', outcome[3])
    print('Short text: ', outcome[4])
    print('Success: ', outcome[5])

with open('../../Data/uspto_filtered/outcomes.txt', "a", encoding='utf-8') as f:
    f.write(json.dumps(outcome) + '\n')

ipg210112.xml 32.59398865699768
ipg210119.xml 20.598675966262817
ipg210202.xml 45.26545000076294
ipg210209.xml 28.660041093826294
ipg210216.xml 44.08662819862366
ipg210223.xml 40.08888030052185
ipg210302.xml 37.50640106201172
ipg210309.xml 33.001054763793945
ipg210316.xml 41.39901828765869
ipg210323.xml 48.35028338432312
Irregular format:  1558
Wrong classification:  26417
No date:  0
No title:  0
Short text:  9
Success:  43715


In [45]:
# Display the outcome counters accumulated by the ingestion loop.
outcome


defaultdict(int,
            {'year': 2010, 1: 108664, 5: 135935, 0: 5951, 2: 0, 3: 0, 4: 0})