In [11]:
# Patent data files are uploaded with Thursday's date every week. This function gives the urls for all year
# Saved lot of time instead of manually getting all urls from USPTO website
from datetime import date, timedelta

def getthursdays(year):
    d = date(year, 1, 1)                    # January 1st
    #print(d)
    #d += timedelta(days = 3 - d.weekday())  # First Thursday
    d += timedelta(days = (3 - d.weekday() if d.weekday() <= 3 else 7 + 3 - d.weekday()))
    while d.year == year and (date.today() - d).days > 0:
        url = 'https://bulkdata.uspto.gov/data/patent/application/redbook/fulltext/'
        yield url + str(year) + '/ipa' + str(d).replace('-','')[2:] + '.zip'
        d += timedelta(days = 7)

def get_urls_for_year(year):
    urls = []
    for d in getthursdays(year):
        urls.append(d)
    return urls
    #return urls[:2]

#print(get_urls_for_year(2017))
# for d in allthursdays(2016):
#         print(d)        
Ref: #https://stackoverflow.com/questions/2003870/how-can-i-select-all-of-the-sundays-for-a-year-using-python

In [12]:
# function to clean and stem the claim text from XML
import nltk
import re
sb_stemmer = nltk.stem.SnowballStemmer('english')

def tokenize_and_stem(text):
    text_stem = ''
    for word in text.lower().split():
        if text_stem:
            text_stem =  text_stem + ' ' + sb_stemmer.stem(word)
        else:
            text_stem = sb_stemmer.stem(word)
    return text_stem


def clean_stem(string):
    stopwords = {'back', 'thru', 'eg', 'hereafter', 'too', 'part', 'which', 'will', 'be', 'thereupon', 'about', 'nevertheless', 'therein', 'through', 'we', 'among', 'in', 'then', 'former', 'via', 'below', 'whereafter', 'due', 'you', 'bill', 'forty', 'few', 'not', 'with', 'rather', 'next', 'nine', 'me', 'its', 'sometime', 'yours', 'who', 'whoever', 'down', 'some', 'such', 'thereafter', 'hasnt', 'fifteen', 'both', 'as', 'ever', 'could', 'find', 'hence', 'something', 'a', 'there', 'mostly', 'whereas', 'many', 'serious', 'can', 'indeed', 'afterwards', 'whenever', 'by', 'becomes', 'may', 'after', 'couldnt', 'seemed', 'anyhow', 'etc', 'might', 'already', 'no', 'please', 'them', 'myself', 'therefore', 'from', 'along', 'ltd', 'against', 'everywhere', 'amoungst', 'because', 'where', 'sixty', 'ie', 'although', 'sincere', 'move', 'seeming', 'or', 'wherever', 'inc', 'whatever', 'into', 'anywhere', 'around', 'nor', 'see', 'several', 'sometimes', 'for', 'interest', 'beyond', 'whether', 'detail', 'describe', 'moreover', 'nobody', 'whereupon', 're', 'without', 'an', 'ours', 'perhaps', 'only', 'five', 'towards', 'keep', 'eleven', 'one', 'other', 'any', 'otherwise', 'except', 'that', 'cannot', 'behind', 'ourselves', 'under', 'within', 'fifty', 'across', 'if', 'thus', 'per', 'wherein', 'here', 'empty', 'co', 'still', 'whole', 'how', 'off', 'to', 'yourself', 'call', 'cry', 'four', 'so', 'she', 'take', 'their', 'been', 'now', 'even', 'mill', 'what', 'another', 'namely', 'always', 'themselves', 'almost', 'six', 'formerly', 'ten', 'found', 'onto', 'yet', 'between', 'give', 'hers', 'herein', 'eight', 'above', 'anyway', 'third', 'himself', 'front', 'over', 'two', 'much', 'latter', 'itself', 'besides', 'those', 'on', 'twenty', 'up', 'us', 'amongst', 'beforehand', 'but', 'most', 'same', 'mine', 'should', 'this', 'full', 'herself', 'her', 'thick', 'con', 'everything', 'is', 'am', 'three', 'throughout', 'again', 'enough', 'your', 'once', 'hereupon', 'become', 'yourselves', 'everyone', 'before', 'i', 'whereby', 'others', 'must', 'seems', 'elsewhere', 'were', 'either', 'would', 'became', 'hundred', 'toward', 'very', 'latterly', 'top', 'often', 'beside', 'cant', 'else', 'the', 'however', 'and', 'somehow', 'him', 'noone', 'somewhere', 'our', 'nothing', 'de', 'fill', 'well', 'it', 'all', 'last', 'do', 'these', 'has', 'upon', 'every', 'side', 'system', 'put', 'thence', 'twelve', 'becoming', 'show', 'un', 'least', 'of', 'have', 'own', 'since', 'though', 'whither', 'out', 'hereby', 'meanwhile', 'none', 'while', 'whom', 'further', 'why', 'made', 'whose', 'my', 'someone', 'they', 'during', 'anyone', 'first', 'go', 'less', 'his', 'anything', 'thereby', 'amount', 'together', 'never', 'was', 'thin', 'also', 'each', 'fire', 'are', 'when', 'alone', 'had', 'until', 'done', 'more', 'at', 'than', 'nowhere', 'seem', 'whence', 'name', 'neither', 'he', 'get', 'being', 'bottom'}
    #strip and change to lower case and replace commas and semi colons with spaces
    stem = string.strip().lower().replace(';', ' ').replace(',', ' ').replace(':', ' ').replace('(',' ').replace(')',' ').replace('#', ' ').replace('.', ' ').strip()
         
    # remove words that only have numbers( second one removes special characters also)
    stem = re.sub(r'\b\d+\b', ' ',stem).strip()
    
    #remove special characters at the end of words
    stem = re.sub(r'([^\w\s]|_)+(?=\s|$)', ' ',stem).strip()
    
    #remove any words that have a number in it ( even if in the middle )
    stem = re.sub(r'\w*\d\w*', ' ',stem).strip()
    
    # remove any words with only one alphabet
    stem = re.sub(r'\b[a-zA-Z]\b', ' ',stem).strip()
    
    # remove stop words and change to lower case
    stem = ' '.join([item for item in (stem.strip().split()) if item not in stopwords])
    
    # get words greater than length 4 and less than 25
    stem = ' '.join([item for item in (re.findall('\w{4,25}', stem))])
    #re.findall('\w{4,25}', stem).join(' ').strip() 
    
     #apply stemming for all words
    stem = tokenize_and_stem(stem)
    return stem

In [13]:
# function to parse an xml file and write data to csv file
from datetime import datetime

def xmltocsv(inputfilename, filename):
    
    fName = '../data/output/patent_data_' + filename[:-4] + '.csv'
    fw = open(fName, 'w')
    w = csv.writer(fw, delimiter = ',')

    dfilepath = inputfilename
    dfilepathtmp = dfilepath + 'tmp'
    !grep -v '<?xml version="1.0" encoding="UTF-8"?>' $dfilepath > $dfilepathtmp; mv $dfilepathtmp $dfilepath
    !sed -e '/^<!DOCTYPE/d' $dfilepath > $dfilepathtmp; mv $dfilepathtmp $dfilepath
    !sed -i '1 i\<items>' $dfilepath
    !sleep 5
    #!sed -i -e '$a</items>' $dfilepath
    !sed -i '$ a \</items>' $dfilepath
    
    pub_doc_number = ''
    pub_date = ''
    appl_doc_number = ''
    appl_country = ''
    appl_date = ''
    invention_title = ''
    abstract = ''
    claim_text = ''
    claim_text_stemmed = ''
    number_of_days = ''
    count = 0
    for event, element in etree.iterparse(inputfilename, tag="us-patent-application"):
        for child in element:
            
            if child.tag == 'us-bibliographic-data-application':
                for child1 in child:
                    if child1.tag == 'publication-reference':
                        for child11 in child1:
                            if child11.tag == 'document-id':
                                for child111 in child11:
                                    if child111.tag == 'doc-number':
                                        pub_doc_number = child111.text
                                    if child111.tag == 'date':
                                        pub_date = child111.text

                    if child1.tag == 'application-reference':
                        for child11 in child1:
                            for child11 in child1:
                                if child11.tag == 'document-id':
                                    for child111 in child11:
                                        if child111.tag == 'country':
                                            appl_country = child111.text
                                        if child111.tag == 'doc-number':
                                            appl_doc_number = child111.text
                                        if child111.tag == 'date':
                                            appl_date = child111.text

                    if child1.tag == 'invention-title':
                        for x in child1.itertext():
                            if x and not x.strip().isdigit(): # to remove the break numbers like 102 etc
                                invention_title = invention_title + x
                        invention_title = invention_title.strip()
                        
            if child.tag == 'abstract':
                for x in child[0].itertext():
                    if x and not x.strip().isdigit(): # to remove the break numbers like 102 etc
                        abstract = abstract + x
                abstract = abstract.strip()        
                #print(abstract)
            
            if child.tag == 'claims':
                #print(child.tag, etree.tostring(child))
                claim_text = etree.tostring(child).decode("utf-8")   
                claim_text = re.sub('<[^>]*>', ' ', claim_text)
                claim_text = claim_text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip()
                #print(claim_text)


            if child.tag == 'claims':
                try:
                    number_of_days = abs((datetime.strptime(pub_date, "%Y%m%d") - datetime.strptime(appl_date, "%Y%m%d")).days)
                except ValueError:
                    number_of_days = 0
                claim_text_stemmed = clean_stem(claim_text)
                w.writerow([appl_doc_number, appl_country, appl_date, pub_doc_number, pub_date, number_of_days,invention_title, abstract,claim_text,claim_text_stemmed])
                pub_doc_number = ''
                pub_date = ''
                appl_doc_number = ''
                appl_country = ''
                appl_date = ''
                invention_title = ''
                abstract = ''
                claim_text = ''
                claim_text_stemmed = ''
                number_of_days = ''
                break
#         count += 1
#         if count > 3:
#             break
        element.clear()
    fw.close() 

In [14]:
# function to load data for a year
import csv
import time
import os
import zipfile
import lxml.etree as etree, lxml.html
import string

def load_for_year(year_urls):
    indir = '../data'
    loaddir = '../data/fulltextloadzone'
    loaddelete = '../data/fulltextloadzone/*'

    #for i in range(0,1):    
    for i in range(0,len(year_urls)):
        print('Downloading file:' + str(i))
        #down load the file and 
        #dataurl = dataurlspath + dataurls[i]
        dataurl = year_urls[i]
        !curl -L $dataurl -o '../data/datafile.zip'

        for path, subdirs, files in os.walk(indir):

            for f in files:
                filepath = os.path.join(path, f)
                #print(filepath)
                !rm -r $loaddelete
                zip_ref = zipfile.ZipFile(filepath, 'r')
                zip_ref.extractall('../data/fulltextloadzone')
                zip_ref.close()

                datadir = '../data/fulltextloadzone'
                for dpath, dsubdirs, dfiles in os.walk(datadir):
                    for df in dfiles:
                        if df.endswith('xml'):
                            dfilepath = os.path.join(dpath, df)
                            print('Starting file: ',dfilepath )
                            #xmltocsv(dfilepath,dataurls[i])
                            url = year_urls[i]
                            flname = url[url.rfind("/")+1:]
                            xmltocsv(dfilepath,flname)
                            print('Loaded file: ', dfilepath )
            break  


In [15]:
# call the function for every year from 2017 back to 2001
# We only loaded data until 2005

for year in reversed(range(2001,2018)):
#for year in reversed(range(2001,2007)):
    
    #delete output folder before starting a new loop
    !rm /home/ubuntu/data/output/*
    
    load_for_year(get_urls_for_year(year))
    
    #concatenate all week files into a year file
    year_filename = '/home/ubuntu/data/output/patent_claim_data_' + str(year) + '.csv'
    !cat /home/ubuntu/data/output/*.csv > $year_filename
    
    #copy to s3
    !aws s3 cp /home/ubuntu/data/output/ s3://ncw210data/FullTextData/ --recursive   
        
    #!sleep 30
    

Downloading file:0
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:30:00 --:--:--     0
curl: (56) SSL read: error:00000000:lib(0):func(0):reason(0), errno 104
Starting file:  ../data/fulltextloadzone/ipa060615.xml
Loaded file:  ../data/fulltextloadzone/ipa060615.xml
Downloading file:1
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 75.0M  100 75.0M    0     0  12.4M      0  0:00:06  0:00:06 --:--:-- 12.7M
Starting file:  ../data/fulltextloadzone/ipa060112.xml
Loaded file:  ../data/fulltextloadzone/ipa060112.xml
Downloading file:2
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 79.8M  100 79.8M   

BadZipFile: File is not a zip file