Updates to fetch and parse (multiprocess)

commit ca183eccebc5416af4670e7a5f5089f86e586b3c (1 parent: be4c883)
ezgraphs authored
gamespot_com/fetch.sh (15 changes)
@@ -1,2 +1,17 @@
#!/bin/bash
+rm -rf download
+rm -rf download2
+rm -rf download3
+rm -rf download4
+
+mkdir -p download
+mkdir -p download2
+mkdir -p download3
+mkdir -p download4
+
ruby fetch.rb
+
+mv download/S* download2
+mv download2/Sp* download4
+mv download/R* download3
+mv download/A* download3
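
The fetch step now wipes and recreates four download directories, runs the crawler, and then shuffles the fetched pages between them by filename prefix (S* to download2, Sp* on to download4, R* and A* to download3, everything else stays in download) so each parse process can own one directory. A quick sanity check of that split, illustrative only and not part of the commit, could be:

    # Count the pages that landed in each bucket after fetch.sh has run.
    for d in download download2 download3 download4; do
      echo "$d: $(find "$d" -name '*.html' | wc -l) pages"
    done
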
gamespot_com/parse.rb (16 changes)
@@ -2,7 +2,7 @@
['rubygems', 'hpricot', 'open-uri'].each{|r|require r}
DOWNLOAD_DIR='download'
-PARSE_DOWNLOAD_DIR='parse_download'
+
# Have logic related to previous record (to aggregate game systems for a game)
@game_system = []
@external_links = []
@@ -12,26 +12,26 @@
@page = ''
@source_url = ''
-def get_details(url,name)
+def get_details(url,name, output_file)
img=''
abstract_text=''
begin
- output = "#{PARSE_DOWNLOAD_DIR}/" + name.gsub(/(\.|\-|\:|\'| )/,'_')+".html"
- system "curl -L #{url} -o #{output}"
- doc2=Hpricot(open(output))
+ tmp="tmp_"+output_file
+ system "curl -L #{url} -o #{tmp}"
+ doc2=Hpricot(open(tmp))
abstract_text = (doc2/"//span[@class='mainDeck']").inner_text
img=(doc2/"//div[@class='boxshot']/a/img").first['src']
rescue
puts "*** WARNING: Could not retrieve abstract / image for #{url}"
end
-
+
+ system "rm #{tmp}" rescue puts "WARNING: Could not remove #{tmp} (#{$!})"
return [img,abstract_text]
end
def parse_file(file, output_file)
- system "mkdir -p #{PARSE_DOWNLOAD_DIR}"
doc=Hpricot(open(file))
apps = doc/"//a"
@@ -47,7 +47,7 @@ def parse_file(file, output_file)
@external_links.each_with_index{|link,i|external_links+="[#{link} #{@game_system[i]}]\\n"}
# Go get the detail information once we are down to one version of the game...
- @images,@text_abstract=get_details(@source_url, @page)
+ @images,@text_abstract=get_details(@source_url, @page, output_file)
str="#{@page}\tA\t\t\t#{categories}\t\t#{internal_links}\t\t#{external_links}\t\t#{@images}\t#{@text_abstract}\t#{@source_url}\n"
File.open(output_file, 'a'){|f|f.puts str}
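
In parse.rb, get_details no longer saves every detail page into a shared parse_download directory; each call now downloads to a scratch file named after the worker's output file and removes it when done, so the four parallel parse processes cannot clobber one another on disk. A rough shell equivalent of that flow, with hypothetical names and for illustration only:

    out='output1.txt'                # one worker's output file (hypothetical)
    url='http://example.com/game'    # placeholder detail-page URL
    tmp="tmp_${out}"                 # worker-private scratch file, mirroring get_details
    curl -L "$url" -o "$tmp"         # fetch the detail page
    # ...pull the mainDeck abstract and boxshot image out of $tmp...
    rm "$tmp"                        # delete the scratch file so nothing accumulates
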
gamespot_com/parse.sh (34 changes)
@@ -1,8 +1,34 @@
# Externalized the looping through files.
# This limits memory/resource usage/garbage collection issues in ruby and
# makes the crawling more fault tolerant
-for i in `find . -name '*.html' | grep -v 'index.html'`;
-do
+
+function parse {
+
+ input_dir=$1;
+ output_file=$2;
+
+ for i in `find $input_dir -name '*.html' | grep -v 'index.html'`;
+ do
echo $i;
- ruby parse.rb $i output.txt
-done
+ ruby parse.rb $i $output_file;
+ done
+
+}
+
+# Multiprocessing
+echo "Staring: `date`";
+echo "Starting output1";
+(parse 'download' 'output1.txt') &
+echo "Starting output2";
+(parse 'download2' 'output2.txt') &
+echo "Starting output3";
+(parse 'download3' 'output3.txt') &
+echo "Starting output4";
+(parse 'download4' 'output4.txt') &
+
+wait
+echo "Concatenating..."
+export LC_ALL='C'
+cat output1.txt output2.txt output3.txt output4.txt | sort > output.txt
+
+echo "Done: `date`";