From d25bd1e5df9ac9372036ab51e54568b3c763c09c Mon Sep 17 00:00:00 2001 From: Robert Picard Date: Mon, 11 Jun 2012 18:00:11 -0400 Subject: [PATCH] pypi/parse.rb: Formatting of the abstract and switch to general output format This now uses the format: "Package description: the package description goes here." The first letter of the abstract is lowercased unless the second letter was originally capitalized too, signaling an acronym. It's using the general output format instead of the programming format now too. Thanks to ezgraphs for including both formats in the code! --- pypi/parse.rb | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pypi/parse.rb b/pypi/parse.rb index ae9a0c21dc..0ea0df7f25 100644 --- a/pypi/parse.rb +++ b/pypi/parse.rb @@ -35,8 +35,19 @@ def get_item(doc,label='Author:', child_tag="span") page = a[0] next if abstract == "UNKNOWN" || abstract == "" + + # Test if the first word is an acronym + isAcronym = abstract =~ /^.[A-Z]/ + - abstract = "Package description - #{abstract}" unless a[1].nil? + # Lowercase the first letter for formatting "Package description: abstract goes here" + unless isAcronym + firstChar = abstract.split(//).first.downcase + abstract.slice!(0) + abstract = firstChar + abstract + end + + abstract = "Package description: #{abstract}" unless a[1].nil? # Get the License and Home Page of the project from the detail page if available # 06.07.2012 - Was having problems opening the URL so I've commented this out for now @@ -53,10 +64,11 @@ def get_item(doc,label='Author:', child_tag="span") abstract.gsub!("\n", ' ') abstract.gsub!("\r", ' ') end - -# puts "#{page}\tA\t\t\t#{categories}\t\t#{internal_links}\t\t#{external_links}\t\t#{images}\t#{abstract}\t#{source_url}\n" + + # Use general format + puts "#{page}\tA\t\t\t#{categories}\t\t#{internal_links}\t\t#{external_links}\t\t#{images}\t#{abstract}\t#{source_url}\n" # Use programming format. 
- puts "#{page}\t\t#{source_url}\t#{abstract}\t\t\t\t\n" +# puts "#{page}\t\t#{source_url}\t#{abstract}\t\t\t\t\n" end