This repository has been archived by the owner on Oct 15, 2022. It is now read-only.
/
parse.rb
75 lines (55 loc) · 2.21 KB
/
parse.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/ruby
# -*- coding: utf-8 -*-
require 'rubygems'
require 'hpricot'
require 'open-uri'
# Look up the text of the element that follows a bold label inside the
# "nodot" list of a parsed page.
#
# doc       - object that answers `/` with an XPath-like string and whose
#             result answers `inner_text` (an Hpricot document in this script).
# label     - exact text of the <strong> label to search for.
# child_tag - tag name of the element holding the value.
#
# Returns whatever `inner_text` yields for the match. Any StandardError raised
# during the lookup (for example when doc does not support `/`) is swallowed
# and an empty string is returned instead.
def get_item(doc, label = 'Author:', child_tag = "span")
  xpath = "//ul[@class='nodot']/li/strong[text()='#{label}']../#{child_tag}"
  (doc / xpath).inner_text
rescue
  # Best-effort lookup: a failed search is reported as "no value".
  ''
end
# Convert the saved package index page (download/pypy.html) into
# tab-separated rows printed on stdout, one row per package that has a
# usable summary.

# Parse inside the block form of File.open so the file handle is closed as
# soon as the document has been built.
doc = File.open('download/pypy.html') { |f| Hpricot(f) }
apps = doc / "//table[@class='list']/tr"

apps.each do |l|
  # Only rows with exactly two cells carry package data (it's not the table header).
  next unless (l / '/td').size == 2

  categories = ''
  internal_links = ''
  external_links = 'http://pypi.python.org' + (l / "/td[1]/a").attr('href')
  images = ''
  source_url = external_links

  # Strip surrounding whitespace so a blank-looking cell is recognised as
  # empty and the first-character logic below sees the real first letter.
  abstract = (l / '/td[2]').inner_text.strip

  # The first cell's text is split on whitespace: the first token is used as
  # the page name; a second token (presumably the version) may follow.
  name_parts = (l / '/td[1]').inner_text.split(' ')
  page = name_parts[0]

  next if abstract == "UNKNOWN" || abstract.empty?

  # Test if the first word is an acronym (second character is an uppercase
  # ASCII letter). \A anchors to the start of the whole string; ^ would also
  # match at the start of any later line.
  acronym = abstract =~ /\A.[A-Z]/

  # Lowercase the first letter for formatting "Package description: abstract goes here"
  unless acronym
    chars = abstract.split(//)
    abstract = chars.first.downcase + chars[1..-1].join
  end
  abstract = "Package description: #{abstract}" unless name_parts[1].nil?

  # Get the License and Home Page of the project from the detail page if available
  # 06.07.2012 - Was having problems opening the URL so I've commented this out for now
  # With the empty-string placeholder, get_item returns '' and nothing is appended.
  detail_doc = '' # Hpricot(open(source_url))
  license = get_item(detail_doc, 'License:')
  abstract += " License: #{license}." unless license.nil? || license.strip == ''
  official_site = get_item(detail_doc, 'Home Page:', 'a')
  abstract += " <a href='#{official_site}'>Official Site</a>" unless official_site.nil? || official_site.strip == ''

  # Tabs and line breaks would corrupt the tab-separated row, so each one is
  # replaced by a single space.
  abstract = abstract.tr("\t\n\r", ' ')

  # Use general format
  puts "#{page}\tA\t\t\t#{categories}\t\t#{internal_links}\t\t#{external_links}\t\t#{images}\t#{abstract}\t#{source_url}\n"
  # Use programming format.
  # puts "#{page}\t\t#{source_url}\t#{abstract}\t\t\t\t\n"
end