Permalink
Browse files

Add the parser

  • Loading branch information...
0 parents commit 60216cf79845d3f022821a17e5876148adcc78ca Daniel Abrahamsson committed May 9, 2011
Showing with 685 additions and 0 deletions.
  1. +13 −0 README
  2. +244 −0 lib/multipart_parser.rb
  3. +150 −0 lib/multipart_reader.rb
  4. +111 −0 test/fixtures/multipart.rb
  5. +95 −0 test/multipart_parser_test.rb
  6. +72 −0 test/multipart_reader_test.rb
13 README
@@ -0,0 +1,13 @@
+multipart-parser is a simple parser for multipart MIME messages, written in Ruby, based on felixge/node-formidable's parser.
+
+Some things to note:
+
+* Pure Ruby
+
+* Event-driven API
+
+* Only supports one level of multipart parsing. Invoke another parser if you need to handle nested messages.
+
+* Does not perform I/O.
+
+* Does not depend on any other library.
244 lib/multipart_parser.rb
@@ -0,0 +1,244 @@
# A low level, event-driven parser for multipart messages,
# based on the node-formidable parser.
#
# The parser performs no I/O: data is pushed in with #write (in chunks of
# any size) and the parser reports what it finds through the callbacks
# registered with #on.
class MultipartParser

  def initialize
    @boundary = nil
    @boundary_chars = nil
    @lookbehind = nil
    @state = :parser_uninitialized
    @index = 0 # Index into boundary or header
    @flags = {}
    @marks = {} # Start positions of the data segments currently being read
    @callbacks = {}
  end

  # Initializes the parser, using the given boundary
  # (the bare boundary value, without the leading "--").
  # May be called again to reuse the parser for a new message;
  # registered callbacks are kept, all parsing state is reset.
  def init_with_boundary(boundary)
    @boundary = "\r\n--" + boundary
    @lookbehind = "\0" * (@boundary.length + 8)
    @state = :start
    @index = 0
    @flags = {}
    @marks = {}

    @boundary_chars = {}
    @boundary.each_byte do |b|
      @boundary_chars[b] = true
    end
  end

  # Registers a callback to be called when the
  # given event occurs. Each callback is expected to
  # take three parameters: buffer, start_index, and end_index
  # (end_index is exclusive).
  # All of these parameters may be nil, depending on the callback.
  # The buffer handed to a callback may be reused by the parser, so
  # callbacks should copy the slice they need before returning.
  # Valid callbacks are:
  # :end
  # :header_field
  # :header_value
  # :header_end
  # :headers_end
  # :part_begin
  # :part_data
  # :part_end
  def on(event, &callback)
    @callbacks[event] = callback
  end

  # Writes data to the parser.
  # Returns the number of characters parsed.
  # In practice, this means that if the return value
  # is less than the buffer length, a parse error occurred
  # (the returned value is the offset of the offending character).
  # Writing to a parser that has not been initialized with a
  # boundary consumes nothing and returns 0.
  def write(buffer)
    # Without a boundary there is nothing to match against.
    return 0 if @state == :parser_uninitialized

    i = 0
    buffer_length = buffer.length
    index = @index
    flags = @flags.dup
    state = @state
    lookbehind = @lookbehind
    boundary = @boundary
    boundary_chars = @boundary_chars
    boundary_length = boundary.length
    boundary_end = boundary_length - 1

    while i < buffer_length
      c = buffer[i, 1]
      case state
      when :start
        # Does not consume the character; it is re-read as :start_boundary.
        index = 0
        state = :start_boundary
      when :start_boundary # Differs in that it has no preceding \r\n
        if index == boundary_length - 2
          return i unless c == "\r"
          index += 1
        elsif index - 1 == boundary_length - 2
          return i unless c == "\n"
          # Boundary read successfully, begin next part
          callback(:part_begin)
          state = :header_field_start
        else
          # +2 skips the "\r\n" the stored boundary starts with
          return i unless c == boundary[index + 2, 1] # Unexpected character
          index += 1
        end
        i += 1
      when :header_field_start
        # Does not consume the character; only marks where the field starts.
        state = :header_field
        @marks[:header_field] = i
        index = 0
      when :header_field
        if c == "\r"
          @marks.delete :header_field
          state = :headers_almost_done
        else
          index += 1
          unless c == "-" # Skip hyphens
            if c == ":"
              return i if index == 1 # Empty header field
              data_callback(:header_field, buffer, i, :clear => true)
              state = :header_value_start
            else
              cl = c.downcase
              return i if cl < "a" || cl > "z"
            end
          end
        end
        i += 1
      when :header_value_start
        if c == " " # Skip spaces
          i += 1
        else
          @marks[:header_value] = i
          state = :header_value
        end
      when :header_value
        if c == "\r"
          data_callback(:header_value, buffer, i, :clear => true)
          callback(:header_end)
          state = :header_value_almost_done
        end
        i += 1
      when :header_value_almost_done
        return i unless c == "\n"
        state = :header_field_start
        i += 1
      when :headers_almost_done
        return i unless c == "\n"
        callback(:headers_end)
        state = :part_data_start
        i += 1
      when :part_data_start
        # Does not consume the character; only marks where the data starts.
        state = :part_data
        @marks[:part_data] = i
      when :part_data
        prev_index = index

        if index == 0
          # Boyer-Moore derived algorithm to safely skip non-boundary data
          # See http://debuggable.com/posts/parsing-file-uploads-at-500-
          # mb-s-with-node-js:4c03862e-351c-4faa-bb67-4365cbdd56cb
          while i + boundary_length <= buffer_length
            # @boundary_chars is keyed by byte value, so look up a byte.
            # (String#[] with a single index returns a String on Ruby >= 1.9,
            # which would never be found in the table.)
            probe = buffer[i + boundary_end, 1]
            break if boundary_chars.has_key? probe.getbyte(0)
            i += boundary_length
          end
          c = buffer[i, 1]
        end

        if index < boundary_length
          if boundary[index, 1] == c
            if index == 0
              # Everything before the possible boundary is part data
              data_callback(:part_data, buffer, i, :clear => true)
            end
            index += 1
          else # It was not the boundary we found, after all
            index = 0
          end
        elsif index == boundary_length
          index += 1
          if c == "\r"
            flags[:part_boundary] = true
          elsif c == "-"
            flags[:last_boundary] = true
          else # We did not find a boundary after all
            index = 0
          end
        elsif index - 1 == boundary_length
          if flags[:part_boundary]
            index = 0
            if c == "\n"
              flags.delete :part_boundary
              callback(:part_end)
              callback(:part_begin)
              state = :header_field_start
              i += 1
              next # Ugly way to break out of the case statement
            end
          elsif flags[:last_boundary]
            if c == "-"
              callback(:part_end)
              callback(:end)
              state = :end
            else
              index = 0 # False alarm
            end
          else
            index = 0
          end
        end

        if index > 0
          # When matching a possible boundary, keep a lookbehind
          # reference in case it turns out to be a false lead
          lookbehind[index - 1] = c
        else
          # No boundary match is in progress, so no boundary flag may
          # survive; a stale flag would misclassify the next candidate.
          flags.clear
          if prev_index > 0
            # If our boundary turns out to be rubbish,
            # the captured lookbehind belongs to part_data
            callback(:part_data, lookbehind, 0, prev_index)
            @marks[:part_data] = i

            # Reconsider the current character as it might be the
            # beginning of a new sequence.
            i -= 1
          end
        end

        i += 1
      when :end
        # Anything after the closing boundary is ignored
        i += 1
      else
        return i
      end
    end

    # Flush whatever segment is still open; its mark is reset to 0
    # so that it continues at the start of the next buffer.
    data_callback(:header_field, buffer, buffer_length)
    data_callback(:header_value, buffer, buffer_length)
    data_callback(:part_data, buffer, buffer_length)

    @index = index
    @state = state
    @flags = flags

    return buffer_length
  end

  private

  # Issues a callback, if one is registered for the event.
  # Empty data segments (start == the_end) are not reported.
  def callback(event, buffer = nil, start = nil, the_end = nil)
    return if !start.nil? && start == the_end
    if @callbacks.has_key? event
      @callbacks[event].call(buffer, start, the_end)
    end
  end

  # Issues a data callback for the segment running from the
  # stored mark of data_type up to (but excluding) the_end.
  # Does nothing if no mark is set for data_type.
  # The only valid option is :clear,
  # which, if true, removes the mark (the segment is complete).
  # If not specified, the mark is reset to 0, so that the
  # segment continues at the start of the next buffer.
  def data_callback(data_type, buffer, the_end, options = {})
    return unless @marks.has_key? data_type
    callback(data_type, buffer, @marks[data_type], the_end)
    if options[:clear]
      @marks.delete data_type
    else
      @marks[data_type] = 0
    end
  end
end
150 lib/multipart_reader.rb
@@ -0,0 +1,150 @@
+require File.dirname(__FILE__) + '/multipart_parser'
+
# Raised when a Content-Type value does not describe
# a multipart message, or lacks a boundary parameter.
class NotMultipartError < StandardError; end

# A more high level interface to MultipartParser.
# The reader turns the low level parser events into Part objects,
# which are handed to the block given to #on_part once the headers
# of a part have been read.
class MultipartReader

  # Initializes a MultipartReader, that will
  # read a request with the given boundary value.
  def initialize(boundary)
    @parser = MultipartParser.new
    @parser.init_with_boundary(boundary)
    @header_field = ''.dup
    @header_value = ''.dup
    @part = nil
    @ended = false
    @on_error = nil
    @on_part = nil

    init_parser_callbacks
  end

  # Returns true if the parser has finished parsing
  # (that is, the closing boundary has been seen).
  def ended?
    @ended
  end

  # Sets a code block to call
  # when part headers have been parsed.
  # The block receives the Part; register the part's
  # data and end callbacks from within it.
  def on_part(&callback)
    @on_part = callback
  end

  # Sets a code block to call when
  # a parser error occurs. The block receives
  # an error message (String).
  def on_error(&callback)
    @on_error = callback
  end

  # Write data from the given buffer (String)
  # into the reader. If the parser could not consume the
  # whole buffer, the error callback (if any) is called.
  def write(buffer)
    bytes_parsed = @parser.write(buffer)
    if bytes_parsed != buffer.size
      msg = "Parser error, #{bytes_parsed} of #{buffer.length} bytes parsed"
      @on_error.call(msg) unless @on_error.nil?
    end
  end

  # Extracts a boundary value from a Content-Type header.
  # Note that it is the header value you provide here.
  # Both quoted (boundary="abc") and bare (boundary=abc) forms
  # are understood.
  # Raises NotMultipartError if content_type is invalid.
  def self.extract_boundary_value(content_type)
    unless content_type =~ /multipart/i
      raise NotMultipartError.new("Not a multipart content type!")
    end

    if content_type =~ /boundary=(?:"([^"]+)"|([^;]+))/i
      $1 || $2
    else
      raise NotMultipartError.new("No multipart boundary")
    end
  end

  # A single part of a multipart message. The headers are
  # available when the part is handed out; the data is streamed
  # through the block registered with #on_data.
  class Part
    # filename and name come from the part's content-disposition header,
    # mime from its content-type header; headers is keyed by the
    # downcased header field.
    attr_accessor :filename, :headers, :name, :mime

    def initialize
      @headers = {}
      @data_callback = nil
      @end_callback = nil
    end

    # Calls the data callback with the given data
    def emit_data(data)
      @data_callback.call(data) unless @data_callback.nil?
    end

    # Calls the end callback
    def emit_end
      @end_callback.call unless @end_callback.nil?
    end

    # Sets a block to be called when part data
    # is read. The block should take one parameter,
    # namely the read data.
    def on_data(&callback)
      @data_callback = callback
    end

    # Sets a block to be called when all data
    # for the part has been read.
    def on_end(&callback)
      @end_callback = callback
    end
  end

  private

  # Wires the low level parser events to the reader:
  # headers are accumulated into the current Part, which is
  # handed to the on_part block once all its headers are read.
  def init_parser_callbacks
    @parser.on(:part_begin) do
      @part = Part.new
      @header_field = ''.dup
      @header_value = ''.dup
    end

    # Header fields and values may arrive in several pieces
    # (one per written buffer), hence the appending.
    @parser.on(:header_field) do |b, start, the_end|
      @header_field << b[start...the_end]
    end

    @parser.on(:header_value) do |b, start, the_end|
      @header_value << b[start...the_end]
    end

    @parser.on(:header_end) do
      @header_field.downcase!
      @part.headers[@header_field] = @header_value
      if @header_field == 'content-disposition'
        # \b keeps this from matching the tail of filename="..."
        if @header_value =~ /\bname="([^"]+)"/i
          @part.name = $1
        end
        if @header_value =~ /filename="([^;]+)"/i
          match = $1
          # Some clients send a full Windows path; keep the last segment
          start = (match.rindex("\\") || -1) + 1
          @part.filename = match[start...(match.length)]
        end
      elsif @header_field == 'content-type'
        @part.mime = @header_value
      end
      @header_field = ''.dup
      @header_value = ''.dup
    end

    @parser.on(:headers_end) do
      @on_part.call(@part) unless @on_part.nil?
    end

    @parser.on(:part_data) do |b, start, the_end|
      @part.emit_data b[start...the_end]
    end

    @parser.on(:part_end) do
      @part.emit_end
    end

    @parser.on(:end) do
      @ended = true
    end
  end
end
111 test/fixtures/multipart.rb
@@ -0,0 +1,111 @@
# Contains fixtures (raw multipart messages together with the
# parts they are expected to yield) for the parser and reader tests.
#
# Every fixture responds to:
#   boundary     - the boundary value to initialize the parser with
#   expect_error - true if parsing raw is expected to fail
#   parts        - the expected parts, as hashes with :headers and :data
#   raw          - the raw message
module MultipartFixtures
  # Returns all fixtures in the module
  def fixtures
    [Rfc1867.new, NoTrailingCRLF.new, EmptyHeader.new]
  end
  extend self

  # A well-formed message with two parts, terminated by CRLF.
  class Rfc1867
    def boundary
      'AaB03x'
    end

    def expect_error
      false
    end

    def parts
      part1, part2 = {}, {}
      part1[:headers] = {'content-disposition' => 'form-data; name="field1"'}
      part1[:data] = "Joe Blow\r\nalmost tricked you!"
      part2[:headers] = {}
      part2[:headers]['content-disposition'] = 'form-data; name="pics"; ' +
        'filename="file1.txt"'
      part2[:headers]['Content-Type'] = 'text/plain'
      part2[:data] = "... contents of file1.txt ...\r"
      [part1, part2]
    end

    def raw
      # The trailing empty line yields the final CRLF
      (message_lines + ['']).join("\r\n")
    end

    private

    # The lines of the message, up to and including the closing boundary.
    def message_lines
      ['--AaB03x',
       'content-disposition: form-data; name="field1"',
       '',
       "Joe Blow\r\nalmost tricked you!",
       '--AaB03x',
       'content-disposition: form-data; name="pics"; filename="file1.txt"',
       'Content-Type: text/plain',
       '',
       "... contents of file1.txt ...\r",
       '--AaB03x--'
      ]
    end
  end

  # Same message as Rfc1867, but ending directly after the
  # closing boundary, without a trailing CRLF.
  class NoTrailingCRLF < Rfc1867
    def raw
      message_lines.join("\r\n")
    end
  end

  # A message whose first part contains a header line
  # with an empty field name (": foo"), which is a parse error.
  class EmptyHeader
    def boundary
      'AaB03x'
    end

    def expect_error
      true
    end

    def parts
      [] # Should never be called
    end

    def raw
      ['--AaB03x',
       'content-disposition: form-data; name="field1"',
       ': foo',
       '',
       "Joe Blow\r\nalmost tricked you!",
       '--AaB03x',
       'content-disposition: form-data; name="pics"; filename="file1.txt"',
       'Content-Type: text/plain',
       '',
       "... contents of file1.txt ...\r",
       '--AaB03x--',
       ''
      ].join("\r\n")
    end
  end
end
95 test/multipart_parser_test.rb
@@ -0,0 +1,95 @@
+require 'test/unit'
+require File.dirname(__FILE__) + "/../lib/multipart_parser"
+require File.dirname(__FILE__) + "/fixtures/multipart"
+
# Tests for the low level MultipartParser.
class MultipartParserTest < Test::Unit::TestCase
  # The stored boundary is prefixed with "\r\n--", and the lookup
  # table holds the byte value of every character in it.
  def test_init_with_boundary
    parser = MultipartParser.new
    parser.init_with_boundary("abc")

    assert_equal "\r\n--abc", parser.instance_variable_get(:@boundary)
    expected_bc = {13 => true, 10 => true, 45 => true, 97 => true,
                   98 => true, 99 => true}
    assert_equal expected_bc, parser.instance_variable_get(:@boundary_chars)
  end

  # write returns the offset of the first character it could not parse.
  def test_parser_error
    parser = MultipartParser.new
    parser.init_with_boundary("abc")
    assert_equal 3, parser.write("--ad")
  end

  # Feeds every fixture to the parser in small chunks and compares
  # the reported parts with the ones the fixture expects.
  def test_fixtures
    chunk_length = 10
    MultipartFixtures.fixtures.each do |fixture|
      name = fixture.class.name
      # A fresh parser per fixture, so no state leaks between fixtures
      parser = MultipartParser.new
      buffer = fixture.raw
      parts = []
      part, header_field, header_value = nil, nil, nil
      end_called = false
      got_error = false

      parser.init_with_boundary(fixture.boundary)

      parser.on(:part_begin) do
        part = {:headers => {}, :data => ''}
        parts.push(part)
        header_field = ''
        header_value = ''
      end

      parser.on(:header_field) do |b, start, the_end|
        header_field += b[start...the_end]
      end

      parser.on(:header_value) do |b, start, the_end|
        header_value += b[start...the_end]
      end

      parser.on(:header_end) do
        part[:headers][header_field] = header_value
        header_field = ''
        header_value = ''
      end

      parser.on(:part_data) do |b, start, the_end|
        part[:data] += b[start...the_end]
      end

      parser.on(:end) do
        end_called = true
      end

      offset = 0
      while offset < buffer.length
        # String#[] with a length already stops at the end of the string
        chunk = buffer[offset, chunk_length]
        offset += chunk_length

        nparsed = parser.write(chunk)
        next if nparsed == chunk.length

        unless fixture.expect_error
          flunk "#{name}: #{chunk.length} bytes written, " +
            "but only #{nparsed} bytes parsed! Chunk: #{chunk.inspect}"
        end
        got_error = true
        break # The parser state is undefined after an error
      end

      assert_equal fixture.expect_error, got_error,
        "#{name}: Expected parse error did not happen"
      next if got_error

      assert end_called, "#{name}: :end callback was not called"
      assert_equal fixture.parts, parts
    end
  end
end
72 test/multipart_reader_test.rb
@@ -0,0 +1,72 @@
+require 'test/unit'
+require File.dirname(__FILE__) + "/../lib/multipart_reader"
+require File.dirname(__FILE__) + "/fixtures/multipart"
+
# Tests for the high level MultipartReader.
class MultipartReaderTest < Test::Unit::TestCase
  # Non-multipart content types and multipart content types
  # without a boundary are rejected; bare and quoted boundary
  # values are extracted.
  def test_extract_boundary_value
    assert_raise(NotMultipartError) do
      not_multipart = "text/plain"
      MultipartReader.extract_boundary_value(not_multipart)
    end

    assert_raise(NotMultipartError) do
      no_boundary = "multipart/form-data"
      MultipartReader.extract_boundary_value(no_boundary)
    end

    valid_content_type = "multipart/form-data; boundary=9asdadsdfv"
    boundary = MultipartReader.extract_boundary_value(valid_content_type)
    assert_equal "9asdadsdfv", boundary

    quoted_content_type = 'multipart/form-data; boundary="9asd adsdfv"'
    boundary = MultipartReader.extract_boundary_value(quoted_content_type)
    assert_equal "9asd adsdfv", boundary
  end

  # Data that does not start with the boundary triggers the error callback.
  def test_error_callback
    on_error_called = false
    reader = MultipartReader.new("boundary")
    reader.on_error do |err|
      on_error_called = true
    end
    reader.write("not boundary atleast")
    assert on_error_called
  end

  # A well-formed message yields one Part per part in the message,
  # with name/filename taken from the headers and the data streamed
  # through the part's data callback.
  def test_success_scenario
    fixture = MultipartFixtures::Rfc1867.new
    reader = MultipartReader.new(fixture.boundary)
    on_error_called = false
    parts = {}

    reader.on_error do |err|
      on_error_called = true
    end

    reader.on_part do |part|
      part_entry = {:part => part, :data => '', :ended => false}
      parts[part.name] = part_entry
      part.on_data do |data|
        part_entry[:data] << data
      end
      part.on_end do
        part_entry[:ended] = true
      end
    end

    reader.write(fixture.raw)

    assert !on_error_called
    assert reader.ended?

    # assert_equal takes the expected value first
    assert_equal fixture.parts.size, parts.size
    assert parts.all? {|k, v| v[:ended]}

    field = parts['field1']
    assert_not_nil field
    assert_equal 'field1', field[:part].name
    assert_equal fixture.parts.first[:data], field[:data]

    file = parts['pics']
    assert_not_nil file
    assert_equal 'pics', file[:part].name
    assert_equal 'file1.txt', file[:part].filename
    assert_equal fixture.parts.last[:data], file[:data]
  end
end

0 comments on commit 60216cf

Please sign in to comment.