Permalink
Browse files

Merge branch 'master' of github.com:gtfierro/patentprocessor

  • Loading branch information...
2 parents 75f5179 + b65e38b commit 4ff94af31c5774b9a39cba3cb7aae2759a14a0ec @gtfierro gtfierro committed Sep 12, 2012
Showing with 41 additions and 28 deletions.
  1. +21 −20 parse.py
  2. +20 −8 test/test_parse_config.py
View
@@ -18,38 +18,39 @@
from fwork import *
# setup argparse
-parser = argparse.ArgumentParser(description='Specify source directory/directories for xml files to be parsed')
-parser.add_argument('--directory','-d', type=str, nargs='+', help='comma separated list of directories relative to $PATENTROOT that parse.py will search for .xml files')
-parser.add_argument('--patentroot','-p', type=str, nargs='?', help='root directory of all patent files/directories')
-parser.add_argument('--xmlregex','-x', type=str, nargs='?', default=r"ipg\d{6}.xml", help='regex used to match xml files in each directory')
+parser = argparse.ArgumentParser(description=\
+ 'Specify source directory/directories for xml files to be parsed')
+parser.add_argument('--directory','-d', type=str, nargs='+', default='',\
+ help='comma separated list of directories relative to $PATENTROOT that \
+ parse.py will search for .xml files')
+parser.add_argument('--patentroot','-p', type=str, nargs='?',\
+ default=os.environ['PATENTROOT'] \
+ if os.environ.has_key('PATENTROOT') else '/',\
+ help='root directory of all patent files/directories')
+parser.add_argument('--xmlregex','-x', type=str, \
+ nargs='?', default=r"ipg\d{6}.xml",\
+ help='regex used to match xml files in each directory')
# parse arguments and assign values
args = parser.parse_args()
DIRECTORIES = args.directory
XMLREGEX = args.xmlregex
-if args.patentroot:
- PATENTROOT = args.patentroot
-elif os.environ.has_key('PATENTROOT'):
- PATENTROOT = os.environ['PATENTROOT']
-else:
- PATENTROOT = '/data/patentdata/patents/2012'
+PATENTROOT = args.patentroot
-flder = PATENTROOT
-
-#flder = '/var/share/patentdata/patents/2007'
-#logfile = flder + "/" + 'xml-parsing.log'
logfile = "./" + 'xml-parsing.log'
logging.basicConfig(filename=logfile, level=logging.DEBUG)
t1 = datetime.datetime.now()
#get a listing of all files within the directory that follow the naming pattern
-files = [x for x in os.listdir(flder)
- if re.match(XMLREGEX, x, re.I)!=None]
+files = [directory+'/'+fi for directory in DIRECTORIES for fi in \
+ os.listdir(PATENTROOT+'/'+directory) \
+ if re.match(XMLREGEX, fi, re.I) != None]
print "Total files: %d" % (len(files))
logging.info("Total files: %d" % (len(files)))
-tables = ["assignee", "citation", "class", "inventor", "patent", "patdesc", "lawyer", "sciref", "usreldoc"]
+tables = ["assignee", "citation", "class", "inventor", "patent",\
+ "patdesc", "lawyer", "sciref", "usreldoc"]
total_count = 0
total_patents = 0
for filenum, filename in enumerate(files):
@@ -60,7 +61,7 @@
.*?
[<][/]us[-]patent[-]grant[>]) #and here is the end tag
""",
- open(flder+"/"+files[filenum]).read(), re.I + re.S + re.X)
+ open(PATENTROOT+"/"+files[filenum]).read(), re.I + re.S + re.X)
print " - Total Patents: %d" % (len(XMLs))
logging.info(" - Total Patents: %d" % (len(XMLs)))
@@ -92,7 +93,7 @@
# Cut the chaining here to better parameterize the call, allowing
# the databases to be built in place
# (/var/share/patentdata/patents/<year>)
- # outdb = flder + "/" + table
+ # outdb = PATENTROOT + "/" + table
q = SQLPatent().tblBuild(xmllist, tbl=table)
SQLPatent().dbBuild(q, tbl=table, week=filename)
#SQLPatent().dbBuild(q=SQLPatent().tblBuild(xmllist, tbl=table), tbl=table, week=filename)
@@ -107,5 +108,5 @@
#for table in tables:
# filename = table + ".sqlite3"
-# shutil.move(filename,flder)
+# shutil.move(filename,PATENTROOT)
@@ -4,6 +4,7 @@
import os
import logging
import sys
+import subprocess
# Setup test files and logs
dir = os.path.dirname(__file__)
@@ -21,6 +22,7 @@ class TestPatentConfig(unittest.TestCase):
def setUp(self):
# make sure we can call parse.py using the os module
# for the purpose of testing command line arguments
+ self.null_out = open('/dev/null','wb')
current_directory = os.getcwd()
if not current_directory.endswith('test'):
logging.error('Please run from the patentprocessor/test directory')
@@ -30,33 +32,43 @@ def setUp(self):
def test_argparse_patentroot(self):
# test that argparse is setting the variables correctly for patentroot
- exit_status = os.system('python parse.py --patentroot %s' % (os.getcwd() + '/test/unittest/fixtures'))
+ exit_status = subprocess.call('python parse.py --patentroot %s' % \
+ (os.getcwd() + '/test/unittest/fixtures'), \
+ stdout=self.null_out, shell=True)
# valid directory, but no xml files
self.assertTrue(exit_status == 0)
- exit_status = os.system('python parse.py --patentroot /dev/null')
- # specify invalid directory, should fail
- self.assertTrue(exit_status != 0)
+ exit_status = subprocess.call('python parse.py --patentroot /asdf', \
+ stdout=self.null_out, shell=True)
+ # specify invalid directory, should not have any files, but still pass
+ self.assertTrue(exit_status == 0)
# test a working, valid directory
- exit_status = os.system('python parse.py --patentroot %s' % (os.environ['PATENTROOT']))
+ exit_status = subprocess.call('python parse.py --patentroot %s' % \
+ (os.environ['PATENTROOT']), stdout=self.null_out, shell=True)
# this should pass
self.assertTrue(exit_status == 0)
def test_argparse_regex(self):
# test that argparse is setting the regular expression correctly
# test valid regex on unittest/fixtures folder
- exit_status = os.system("python parse.py --patentroot %s --xmlregex '201\d_\d.xml'" % (os.getcwd() + '/test/unittest/fixtures'))
+ exit_status = subprocess.call("python parse.py \
+ --patentroot %s --xmlregex '2012_\d.xml'" % \
+ (os.getcwd() + '/test/unittest/fixtures'), \
+ stdout=self.null_out, shell=True)
self.assertTrue(exit_status == 0)
def test_argparse_directory(self):
# test that argparse is setting the variables correctly for directories
# parse.py should not find any .xml files, but this should still pass
- exit_status = os.system('python parse.py --patentroot %s' % (os.getcwd() + '/test/unittest'))
+ exit_status = subprocess.call('python parse.py --patentroot %s' % \
+ (os.getcwd() + '/test/unittest'), stdout=self.null_out, shell=True)
self.assertTrue(exit_status == 0)
# parse.py should concatenate the correct directory and find xml files
- exit_status = os.system("python parse.py --patentroot %s --directory fixtures --xmlregex '201\d_\d.xml'" % (os.getcwd() + '/test/unittest'))
+ exit_status = subprocess.call("python parse.py --patentroot %s \
+ --directory fixtures --xmlregex '2012_\d.xml'" % \
+ (os.getcwd() + '/test/unittest'), stdout=self.null_out, shell=True)
self.assertTrue(exit_status == 0)
# TODO: make test for iterating through multiple directories

0 comments on commit 4ff94af

Please sign in to comment.