Updates

dagwieers · Mar 14, 2006 · bbc2a04 · bbc2a04
1 parent e43d659
commit bbc2a04
Show file tree

Hide file tree

Showing 7 changed files with 860 additions and 32 deletions.
diff --git a/README b/README
@@ -7,26 +7,27 @@ Tim Rupp. Available from:
 
 	svn co http://caphrim.net/svn/fermi/aerrate/trunk aerrate
 
-And issuing the following script:
+Then apply the latest patch from patches/:
+
+	patch -p0 < patches/aerrate-changes-dag-TIMESTAMP.diff
+
+And issue the following command to scrape the RHN website for the most up-to-date
+advisories:
 
 	cd aerrate
-	for release in rh{el4,el3,21}{as,es,ws} rh-desktop-{4,3} rh21aw rhel{4,3}-extras rhel3{cluster,devsuite} rhshas; do
-		echo "== $release =="
-		python ./aerrate.py -r --source=site --type=all --release=$release
-	done
+	./aerrate.py -r --source=site --type=all --release=enterprise
 	cd -
 
 This will copy all errata as XML files into ./aerrata/advisories/
 
-
 USING SARAH
 ^^^^^^^^^^^
 sarah currently expects the advisories to be available from ./advisories/. So
 making a symlink from aerrate/advisories to ./advisories is probably easiest.
 
 	ln -sf aerrate/advisories .
 
-Then to create an sqlite database out of these XML files, run:
+Then create an sqlite database out of these XML files, by doing:
 
 	./sarahdb.py
 
@@ -37,7 +38,8 @@ To create some statistics from this database, use:
 
 The sarahinfo utility currently shows how to query the database. Not all
 information is currently available in the XML files. Red Hat will be releasing
-these XML files in the future with much more info.
+these XML files in the future with more information we can get out of the 
+RHN website.
 
 I also added sarahsql to allow to query the database on the commandline, you
 can do queries in bash, like:

diff --git a/TODO b/TODO
@@ -1,7 +1,10 @@
 aerrate.py: scrapes rhn advisory information
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-+ Allow to list a number of releases (eg. --release=rhel4as,rhel3as,rhel2.1as)
-+ Make aerrate smart about advisories already scraped (recently)
++ Fix file-element syntax to match new RHN xml output
++ Fix reference-element syntax to match new RHN xml output
++ Add bugzilla references and bugzilla reference synopsis information
++ Add references to RHBA and RHEA as well
++ Fix the problem with RHBA-2004-470.xml (aerrate freezes)
 + Use timestamp and HTTP HEAD requests to check for updates
 + Parallelize the download process (use HTTP Pipelining and use
   eg. 4 connections)

diff --git a/patches/aerrate-changes-dag-20060202.diff b/patches/aerrate-changes-dag-20060202.diff
@@ -0,0 +1,261 @@
+Index: parsers/security.py
+===================================================================
+--- parsers/security.py	(revision 16)
++++ parsers/security.py	(working copy)
+@@ -24,6 +24,7 @@
+ 		errata_p.__init__(self)
+ 		self.odir		= "advisories/"
+ 		self.ofile 		= "security.xml"
++		self.synopsis		= []
+ 		self.detail		= []
+ 		self.cve_counter	= 0
+ 		self.errata_counter	= 0
+@@ -41,7 +42,7 @@
+ 			if os.environ['http_proxy'] == '':
+ 				self.proxy = None
+ 			else:
+-				self.proxy		= { 'http' : os.environ['http_proxy'] }
++				self.proxy		= { 'http' : os.environ['http_proxy'], 'https' : os.environ['http_proxy'] }
+ 		except:
+ 			self.proxy = None
+
+@@ -93,6 +94,13 @@
+
+ 		self.detail = result
+
++		# Get RHSA title (contains severity and synopsis)
++		result = re.search('<h1>(.+?)</h1>', page,  re.I | re.M)
++		if result:
++			self.synopsis = result.group(1)
++		else:
++			self.synopsis = 'Unknown: Unknown'
++
+ 	def update_database(self, link):
+ 		"""
+ 		Updates the output XML file to reflect the new contents
+@@ -111,15 +119,20 @@
+ 		# 6 - RPMs Required
+ 		# 7 - References
+ 		# 8 - Requirements (not used)
+-		self.get_rhsa_detail(link)
++		try:
++			self.get_rhsa_detail(link)
++		except urlgrabber.grabber.URLGrabError:
++			print 'Error grabbing link', link
++			return
+
+ 		# Hack to get past RPMs that outdate other rpms.
+ 		# FIXME: Check to see what RPM is outdated and update XML file
+ 		# as necessary
+-		for detail in self.detail:
+-			if detail.find("File outdated") > 0:
+-				#print "Encountered outdated RPM"
+-				return
++#		if not __main__.all_advisories:
++#			for detail in self.detail:
++#				if detail.find("File outdated") > 0:
++#					#print "Encountered outdated RPM"
++#					return
+
+ 		self.advisory_url = link
+
+@@ -162,7 +175,7 @@
+ 		self.parse_rights(w)
+ 		self.parse_type(w)
+
+-		self.parse_synopsis(w, self.detail[1])
++		self.parse_synopsis(w, self.synopsis)
+ 		self.parse_issue_date(w, self.strip_html(self.detail[0]))
+ 		self.parse_updated_on(w, self.strip_html(self.detail[0]))
+
+@@ -268,30 +281,16 @@
+ 		"""
+ 		data = self.tags_to_space(data)
+ 		data = self.strip_html(data)
+-		block = data.split("\n")
++		line = data.strip().split(':')
+
+-		# Red hat doesnt separate out the severity information
+-		# in their webpages like they do in their email archives.
+-		# Therefore I'm looking for the severity string to
+-		# determine the severity level.
+-		for line in block:
+-			if line.find("moderate security") > -1:
+-				w.element("severity", "moderate")
+-				break
+-			elif line.find("important security") > -1:
+-				w.element("severity", "important")
+-				break
+-			elif line.find("critical security") > -1:
+-				w.element("severity", "critical")
+-				break
+-			elif line.find("low security") > -1:
+-				w.element("severity", "low")
+-				break
++		severity = line[0].strip().lower()
++		synopsis = ':'.join(line[1:]).strip()
++
++		if severity in ('low', 'moderate', 'important', 'critical'):
++			w.element('severity', severity)
+ 		else:
+-			w.element("severity", "unknown")
++			w.element('severity', 'unknown')
+
+-		synopsis = block[0].strip()
+-
+ 		w.element("synopsis", synopsis, lang="en_US")
+
+ 	def tags_to_space(self, item):
+@@ -464,7 +463,7 @@
+ 		# in the script to differentiate between releases
+ 		for i in block:
+ 			item = i.strip()
+-			if item == "":
++			if not item:
+ 				continue
+ 			else:
+ 				if self.at_arch(item[0:-1]):
+@@ -508,9 +507,8 @@
+ 					self.parse_srpm_arch(w, block, item)
+ 				elif self.at_arch(item):
+ 					self.parse_arch(w, block, item)
+-				elif item == "":
+-					item = block.pop().replace("&#160;", '').strip()
+-					continue
++				elif not item:
++					pass
+ 				else:
+ 					block.append(item)
+ 					break
+@@ -561,11 +559,11 @@
+ 					error = "new_arch"
+ 					break
+
+-				if filename == "":
++				elif not filename:
+ 					error = "new_arch"
+ 					break
+
+-				if not self.linked:
++				elif not self.linked:
+ 					junk = block.pop()
+
+ 				checksum	= block.pop().replace("&#160;", '').strip()
+@@ -594,18 +592,21 @@
+ 				error = "header"
+ 				break
+
+-			if self.at_arch(filename):
++			elif self.at_arch(filename):
+ 				error = "new_arch"
+ 				break
+
+-			if filename == "":
++			elif not filename:
+ 				error = "new_arch"
+ 				break
+
+-			if not self.linked:
++			elif not self.linked:
+ 				junk = block.pop()
+
+-			checksum 	= block.pop().replace("&#160;", '').strip()
++			try:
++				checksum = block.pop().replace("&#160;", '').strip()
++			except IndexError:
++				break
+
+ 			# Hack to make RH webpages that list IA-32 stuff
+ 			# equal the data that is actually listed in the
+@@ -618,7 +619,7 @@
+ 			w.element("sum", checksum, type="md5")
+ 			w.end("file")
+
+-		if error == "header" or error == "new_arch":
++		if error in ('header', 'new_arch'):
+ 			block.append(filename)
+ 		return
+
+Index: scrapers/site.py
+===================================================================
+--- scrapers/site.py	(revision 16)
++++ scrapers/site.py	(working copy)
+@@ -63,7 +63,11 @@
+ 		self.releases 	= {}
+ 		p 		= re.compile('\<ul\>')
+
+-		page = urlgrabber.urlread(self.release_list+"/errata", proxies=self.proxy)
++		try:
++			page = urlgrabber.urlread(self.release_list+"/errata", proxies=self.proxy)
++		except urlgrabber.grabber.URLGrabError:
++			print 'Error grabbing list of all releases from', self.release_list+"/errata"
++			return
+
+ 		result = p.split(page)
+
+Index: aerrate.py
+===================================================================
+--- aerrate.py	(revision 16)
++++ aerrate.py	(working copy)
+@@ -30,8 +30,11 @@
+ 	enhancements
+
+ Argument list
+-  --source		Source to read from
++  --all                 Download al advisories (do not skip outdated advisories)
+
++  --source		Source to read from (either archive, feed or site)
++			*Defaults to site*
++
+   --type		The type of errata list to read from
+
+   --release		Parse errata for specified release
+@@ -42,11 +45,13 @@
+ 	source		= 0
+ 	type 		= 0
+ 	release		= 'none'
++	all_advisories  = 0
+ 	print_releases	= 0
+
+ 	# Try to parse any command line arguments
+ 	try:
+ 		opts, args = getopt.getopt(sys.argv[1:], "hstru", [	"help",
++									"all",
+ 									"source=",
+ 									"type=",
+ 									"printreleases",
+@@ -63,24 +68,27 @@
+ 		if o in ("-h", "--help"):
+ 			usage()
+ 			sys.exit()
+-		if o in ("-s", "--source"):
++		elif o in ("-s", "--source"):
+ 			source = a
+-		if o in ("-t", "--type"):
++		elif o in ("-t", "--type"):
+ 			type = a
+-		if o in ("-p", "--printreleases"):
++		elif o in ("-a", "--all"):
++			all_advisories = 1
++		elif o in ("-p", "--printreleases"):
+ 			print_releases = 1
+-		if o in ("-r", "--release"):
++		elif o in ("-r", "--release"):
+ 			release = a
++		else:
++			print 'Option %s not understood.' %o
++			sys.exit(1)
+
+ 	# Print releases is one argument in particular
+ 	# that doesnt require the source argument.
+ 	# So if that argument has been sent, skip
+ 	# over this small argument checking block
+ 	if not print_releases:
+-		# The source argument is required always
+ 		if not source:
+-			usage()
+-			sys.exit(0)
++			source = "site"
+ 		# The site argument requires a type
+ 		# argument and a release arg be sent too
+ 		if source == "site":