Permalink
Browse files

Parse tumblr video uploads and separate assets for our future AssetDumper.
  • Loading branch information...
1 parent 9f36c51 commit 60e235687858aa9129d4bfe1cb57380eae48cdbf @dodecaphonic committed Jul 5, 2011
Showing with 177 additions and 116 deletions.
  1. +4 −2 lib/tumble_out/contentizer.rb
  2. +59 −36 lib/tumble_out/post.rb
  3. +82 −67 test/assets/sample.xml
  4. +8 −7 test/unit/test_contentizer.rb
  5. +24 −4 test/unit/test_post.rb
@@ -38,7 +38,7 @@ def posts
end
def each_post(&blk)
- all_posts.each &blk
+ posts.each &blk
end
def dump(directory)
@@ -54,7 +54,9 @@ def dump(directory)
private
def raw_posts(offset=0)
uri = "http://#{@url}/api/read?start=#{offset}"
- doc = Nokogiri::XML(Net::HTTP.get(URI.parse(uri)))
+
+ raw = Net::HTTP.get(URI.parse(uri))
+ doc = Nokogiri::XML(raw)
if @total_posts.nil?
@total_posts = doc.search("posts").first.
View
@@ -2,7 +2,7 @@ module TumbleOut
class Post
attr_reader :type, :date, :format, :title, :body,
:slug, :permalink, :use_permalink,
- :topics
+ :topics, :assets
def initialize(raw_post, use_permalink=false)
@raw_post = raw_post
@@ -16,6 +16,7 @@ def initialize(raw_post, use_permalink=false)
@slug = nil
@topics = nil
@private = false
+ @assets = []
@coder = HTMLEntities.new
parse raw_post
@@ -33,54 +34,69 @@ def dump(directory)
private
def parse(raw_post)
- @type = raw_post["type"]
+ @type = raw_post["type"].strip.to_sym
@date = Time.at(raw_post["unix-timestamp"].to_i)
@slug = raw_post["slug"]
@format = raw_post["format"]
@permalink = raw_post["url-with-slug"].scan(/(\/post\/\d+\/\S+)$/).flatten.shift
@topics = raw_post.search("tag").map { |t| t.text }
- @title, @body = case @type
- when "audio"
- parse_audio raw_post
- when "video"
- parse_video raw_post
- when "regular"
- rt = raw_post.search("regular-title")
- rb = raw_post.search("regular-body")
-
- [rt ? rt.text : nil, rb.text]
- when "conversation"
- parse_conversation raw_post
- when "quote"
- qt = raw_post.search("quote-text")
- qs = raw_post.search("quote-source")
-
- [nil, "#{qt.text}<br/>#{qs}"]
- when "photo"
- parse_photo raw_post
- when "answer"
- parse_answer raw_post
- end
+ @title, @body, @assets = case @type
+ when :audio
+ parse_audio raw_post
+ when :video
+ parse_video raw_post
+ when :regular
+ rt = raw_post.search("regular-title")
+ rb = raw_post.search("regular-body")
+
+ [rt ? rt.text : nil, rb.text, []]
+ when :conversation
+ parse_conversation raw_post
+ when :quote
+ qt = raw_post.search("quote-text")
+ qs = raw_post.search("quote-source")
+
+ [nil, "#{qt.text}<br/>#{qs}", []]
+ when :photo
+ parse_photo raw_post
+ when :answer
+ parse_answer raw_post
+ end
end
+ # TODO: rewrite to use HTML5 audio + flash fallback
def parse_audio(post)
- title = post.search("id3-title").text
- body = @coder.decode(post.search("audio-player").
- inner_html +
+ title = post.search("id3-title").text
+ caption = post.search("audio-player")
+ assets = caption.search("embed").map { |n|
+ n.attr("src").scan(/audio_file=(.*?)=/).flatten.shift
+ }
+
+ body = @coder.decode(caption.inner_html +
post.search("audio-caption").
inner_html)
- [title, body]
+ [title, body, assets]
end
+ # TODO: rewrite to use HTML5 video + flash fallback
def parse_video(post)
- body = @coder.decode(post.search("video-player").first.
- inner_html +
+ contents = post.search("video-player").first
+ scripts = contents.search("script")
+
+ assets = if scripts.empty?
+ nil
+ else
+ URI.extract(scripts.first.text).shift.scan(%r{(http://.*?)\'}).
+ flatten
+ end
+
+ body = @coder.decode(contents.inner_html +
post.search("video-caption").
inner_html)
- [nil, body]
+ [nil, body, assets]
end
def parse_conversation(post)
@@ -93,7 +109,7 @@ def parse_conversation(post)
body += "</ul>"
- [title, body]
+ [title, body, nil]
end
def parse_answer(post)
@@ -104,26 +120,33 @@ def parse_answer(post)
body = "<div class=\"question\">#{question}</div>\n"
body += "<div class=\"answer\">#{answer}</div>"
- [nil, body]
+ [nil, body, nil]
end
def parse_photo(post)
+ assets = []
+
body = if !(photoset = post.search("photoset photo")).empty?
photoset.map do |p|
- src = p.search("photo-url").first.text
+ src = p.search("photo-url").first.text.strip
caption = p.attr("caption")
+
+ assets << src
+
"<p><img src=\"#{src}\"><br/>#{caption}</p>"
end.join
else
caption = @coder.decode(post.
search("photo-caption").
inner_html)
+ src = post.search("photo-url").first.text.strip
+
+ assets << src
- src = post.search("photo-url").first.text
"<img src=\"#{src}\"><br/>#{caption}"
end
- [nil, body]
+ [nil, body, assets]
end
def create_front_matter
View
@@ -1,7 +1,88 @@
<tumblr version="1.0">
<tumblog name="sample" timezone="US/Eastern" cname="sample.tumblr.com" title="Break this parser">
</tumblog>
- <posts start="0" total="8">
+ <posts start="0" total="9">
+ <post id="9" url="http://sample.tumblr.com/post/7271539734" url-with-slug="http://sample.tumblr.com/post/7271539734" type="video" date-gmt="2011-07-05 19:01:19 GMT" date="Tue, 05 Jul 2011 16:01:19" unix-timestamp="1309892479" format="markdown" reblog-key="mN7TnMra" slug="">
+ <video-source>
+ a:2:{i:0;a:0:{}s:2:"o1";a:6:{s:12:"content_type";s:15:"video/quicktime";s:9:"extension";s:3:"mov";s:5:"width";i:640;s:6:"height";i:480;s:8:"duration";d:13.96000000000000085265128291212022304534912109375;s:8:"revision";i:1;}}
+ </video-source>
+ <video-player>
+ <span id="video_player_7271539734">[<a href="http://www.adobe.com/shockwave/download/download.cgi?P1_Prod_Version=ShockwaveFlash" target="_blank">Flash 10</a> is required to watch video.]</span><script type="text/javascript">renderVideo("video_player_7271539734",'http://sample.tumblr.com/video_file/7271539734/tumblr_lnvji6Kfru1qzn59d',400,300,'poster=http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame1.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame2.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame3.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame4.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame5.jpg')</script>
+ </video-player>
+ <video-player max-width="500">
+ <span id="video_player_7271539734">[<a href="http://www.adobe.com/shockwave/download/download.cgi?P1_Prod_Version=ShockwaveFlash" target="_blank">Flash 10</a> is required to watch video.]</span><script type="text/javascript">renderVideo("video_player_7271539734",'http://sample.tumblr.com/video_file/7271539734/tumblr_lnvji6Kfru1qzn59d',500,375,'poster=http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame1.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame2.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame3.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame4.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame5.jpg')</script>
+ </video-player>
+ <video-player max-width="250">
+ <span id="video_player_7271539734">[<a href="http://www.adobe.com/shockwave/download/download.cgi?P1_Prod_Version=ShockwaveFlash" target="_blank">Flash 10</a> is required to watch video.]</span><script type="text/javascript">renderVideo("video_player_7271539734",'http://sample.tumblr.com/video_file/7271539734/tumblr_lnvji6Kfru1qzn59d',250,188,'poster=http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame1.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame2.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame3.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame4.jpg,http%3A%2F%2Fmedia.tumblr.com%2Ftumblr_lnvji6Kfru1qzn59d_r1_frame5.jpg')</script>
+ </video-player>
+ </post>
+
+ <post id="8" url="http://sample.tumblr.com/post/8" url-with-slug="http://sample.tumblr.com/post/4956577203/are-facebook-and-flickr-the-new-homepage-photos" type="photo" date-gmt="2011-04-27 06:17:10 GMT" date="Wed, 27 Apr 2011 08:17:10" unix-timestamp="1303885030" format="html" reblog-key="jt03LZMQ" slug="are-facebook-and-flickr-the-new-homepage-photos" width="1170" height="1177">
+ <photo-caption>
+ <p>Are <a href="http://www.facebook.com/4010telekomshop#!/4010telekomshop?sk=app_141843815862034" target="_blank">Facebook</a> and <a href="http://www.flickr.com/photos/4010_telekom_shop/sets/72157626265084090/detail/" target="_blank">Flickr</a> the new homepage?</p> <p>Photos by me for Telekom (<a href="http://mareenfischinger.de/commissioned/4010-store-architecture/" target="_blank">see closer here</a>).</p> <p>via <a href="http://4010.de/home?loc=K%C3%B6ln" target="_blank">4010&#160;Köln</a></p>
+ </photo-caption>
+ <photo-url max-width="1280">
+ http://blog.mareenfischinger.com/photo/1280/4956577203/1/tumblr_lk9jzhrEYC1qz5f4r
+ </photo-url>
+ <photo-url max-width="500">
+ http://29.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_500.png
+ </photo-url>
+ <photo-url max-width="400">
+ http://28.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_400.png
+ </photo-url>
+ <photo-url max-width="250">
+ http://27.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_250.png
+ </photo-url>
+ <photo-url max-width="100">
+ http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_100.png
+ </photo-url>
+ <photo-url max-width="75">
+ http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_75sq.png
+ </photo-url>
+ <photoset>
+ <photo offset="o1" caption="" width="1170" height="1177">
+ <photo-url max-width="1280">
+ http://blog.mareenfischinger.com/photo/1280/4956577203/1/tumblr_lk9jzhrEYC1qz5f4r
+ </photo-url>
+ <photo-url max-width="500">
+ http://29.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_500.png
+ </photo-url>
+ <photo-url max-width="400">
+ http://28.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_400.png
+ </photo-url>
+ <photo-url max-width="250">
+ http://27.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_250.png
+ </photo-url>
+ <photo-url max-width="100">
+ http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_100.png
+ </photo-url>
+ <photo-url max-width="75">
+ http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_75sq.png
+ </photo-url>
+ </photo>
+ <photo offset="o2" caption="" width="1170" height="1179">
+ <photo-url max-width="1280">
+ http://blog.mareenfischinger.com/photo/1280/4956577203/2/tumblr_lk9jzhrEYC1qz5f4r
+ </photo-url>
+ <photo-url max-width="500">
+ http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_500.png
+ </photo-url>
+ <photo-url max-width="400">
+ http://26.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_400.png
+ </photo-url>
+ <photo-url max-width="250">
+ http://24.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_250.png
+ </photo-url>
+ <photo-url max-width="100">
+ http://26.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_100.png
+ </photo-url>
+ <photo-url max-width="75">
+ http://24.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_75sq.png
+ </photo-url>
+ </photo>
+ </photoset>
+ </post>
+
<post id="7" url="http://sample.tumblr.com/post/7" url-with-slug="http://52livros.com/post/7/sluggy" type="photo" date-gmt="2011-05-08 15:45:00 GMT" date="Sun, 08 May 2011 11:45:00" unix-timestamp="1304869500" format="markdown" reblog-key="76zsD8ZR" slug="sluggy" width="2592" height="1936">
<photo-caption>A really big photo.</photo-caption>
<photo-url max-width="1280">
@@ -78,71 +159,5 @@
<line name="Her" label="Her:">I have not.</line>
</conversation>
</post>
-
- <post id="8" url="http://sample.tumblr.com/post/8" url-with-slug="http://sample.tumblr.com/post/4956577203/are-facebook-and-flickr-the-new-homepage-photos" type="photo" date-gmt="2011-04-27 06:17:10 GMT" date="Wed, 27 Apr 2011 08:17:10" unix-timestamp="1303885030" format="html" reblog-key="jt03LZMQ" slug="are-facebook-and-flickr-the-new-homepage-photos" width="1170" height="1177">
- <photo-caption>
- <p>Are <a href="http://www.facebook.com/4010telekomshop#!/4010telekomshop?sk=app_141843815862034" target="_blank">Facebook</a> and <a href="http://www.flickr.com/photos/4010_telekom_shop/sets/72157626265084090/detail/" target="_blank">Flickr</a> the new homepage?</p> <p>Photos by me for Telekom (<a href="http://mareenfischinger.de/commissioned/4010-store-architecture/" target="_blank">see closer here</a>).</p> <p>via <a href="http://4010.de/home?loc=K%C3%B6ln" target="_blank">4010&#160;Köln</a></p>
- </photo-caption>
- <photo-url max-width="1280">
- http://blog.mareenfischinger.com/photo/1280/4956577203/1/tumblr_lk9jzhrEYC1qz5f4r
- </photo-url>
- <photo-url max-width="500">
- http://29.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_500.png
- </photo-url>
- <photo-url max-width="400">
- http://28.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_400.png
- </photo-url>
- <photo-url max-width="250">
- http://27.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_250.png
- </photo-url>
- <photo-url max-width="100">
- http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_100.png
- </photo-url>
- <photo-url max-width="75">
- http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_75sq.png
- </photo-url>
- <photoset>
- <photo offset="o1" caption="" width="1170" height="1177">
- <photo-url max-width="1280">
- http://blog.mareenfischinger.com/photo/1280/4956577203/1/tumblr_lk9jzhrEYC1qz5f4r
- </photo-url>
- <photo-url max-width="500">
- http://29.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_500.png
- </photo-url>
- <photo-url max-width="400">
- http://28.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_400.png
- </photo-url>
- <photo-url max-width="250">
- http://27.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_250.png
- </photo-url>
- <photo-url max-width="100">
- http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_100.png
- </photo-url>
- <photo-url max-width="75">
- http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro1_r1_75sq.png
- </photo-url>
- </photo>
- <photo offset="o2" caption="" width="1170" height="1179">
- <photo-url max-width="1280">
- http://blog.mareenfischinger.com/photo/1280/4956577203/2/tumblr_lk9jzhrEYC1qz5f4r
- </photo-url>
- <photo-url max-width="500">
- http://30.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_500.png
- </photo-url>
- <photo-url max-width="400">
- http://26.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_400.png
- </photo-url>
- <photo-url max-width="250">
- http://24.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_250.png
- </photo-url>
- <photo-url max-width="100">
- http://26.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_100.png
- </photo-url>
- <photo-url max-width="75">
- http://24.media.tumblr.com/tumblr_lk9jzhrEYC1qz5f4ro2_r1_75sq.png
- </photo-url>
- </photo>
- </photoset>
- </post>
</posts>
</tumblr>
@@ -6,17 +6,17 @@ def setup
raw_data = open(
File.join(File.dirname(__FILE__), "..",
"assets", "sample.xml")
- )
+ )
+
+ Net::HTTP.expects(:get).returns raw_data
- Net::HTTP.expects(:get).
- with(URI.parse("http://sample.tumblr.com/api/read?start=0")).returns raw_data
@contentizer = TumbleOut::Contentizer.new("sample.tumblr.com")
end
def test_if_number_of_posts_is_correct
posts = @contentizer.posts
- assert_equal 8, posts.size
+ assert_equal 9, posts.size
end
def test_that_post_types_are_of_a_given_count
@@ -27,8 +27,9 @@ def test_that_post_types_are_of_a_given_count
end
def test_whether_posts_are_of_specific_types
- valid_types = %w(audio regular video quote photo
- answer conversation).sort
+ valid_types = [:audio, :regular, :video, :quote, :photo,
+ :answer, :conversation].sort
+
post_types = @contentizer.posts.map { |p|
p.type
}.uniq.sort
@@ -45,7 +46,7 @@ def test_that_dump_creates_a_file_for_each_post
files = Dir.glob(full_path)
- assert_equal 8, files.size
+ assert_equal 9, files.size
files.each { |f| File.delete f }
end
Oops, something went wrong.

0 comments on commit 60e2356

Please sign in to comment.