Move code to parse the PersonIdent structure from a tag or commit object into PersonIdent module. (#36)
elixir-git · Jul 20, 2019 · ee82363 · ee82363
1 parent f2adcf2
commit ee82363
Show file tree

Hide file tree

Showing 4 changed files with 229 additions and 215 deletions.
diff --git a/lib/xgit/core/person_ident.ex b/lib/xgit/core/person_ident.ex
@@ -51,6 +51,11 @@ defmodule Xgit.Core.PersonIdent do
   A combination of a person identity and time in git.
   """
 
+  alias Xgit.Util.RawParseUtils
+
+  @typedoc "Time zone offset in minutes +/- from GMT."
+  @type tz_offset :: -720..840
+
   @typedoc ~S"""
   The tuple of name, email, time, and time zone that specifies who wrote or
   committed something.
@@ -66,12 +71,96 @@ defmodule Xgit.Core.PersonIdent do
           name: String.t(),
           email: String.t(),
           when: integer,
-          tz_offset: Xgit.Lib.Constants.tz_offset()
+          tz_offset: tz_offset()
         }
 
   @enforce_keys [:name, :email, :when, :tz_offset]
   defstruct [:name, :email, :when, :tz_offset]
 
+  @doc ~S"""
+  Builds a `PersonIdent` struct from an identity line (author, committer, or tagger).
+
+  ## Parameters
+
+  `b` is a charlist that begins immediately after the header name and the space
+  that follows it in an "author" or "committer" line.
+
+  Suitable charlists are produced by `Xgit.Util.RawParseUtils.author/1` and
+  `Xgit.Util.RawParseUtils.committer/1`.
+
+  ## Return Value
+
+  A `PersonIdent` struct, or `nil` when the charlist does not contain a
+  properly-formatted identity.
+  """
+  @spec from_byte_list(b :: [byte]) :: t() | nil
+  def from_byte_list(b) when is_list(b) do
+    with [?< | after_open] <- RawParseUtils.next_lf(b, ?<),
+         true <- has_closing_angle_bracket?(after_open),
+         raw_email = RawParseUtils.until_next_lf(after_open, ?>),
+         raw_name = parse_name(b),
+         {timestamp, offset} <- parse_tz(after_open) do
+      %__MODULE__{
+        name: RawParseUtils.decode(raw_name),
+        email: RawParseUtils.decode(raw_email),
+        when: timestamp,
+        tz_offset: offset
+      }
+    else
+      _ -> nil
+    end
+  end
+
+  defp has_closing_angle_bracket?(b), do: Enum.any?(b, fn char -> char == ?> end)
+
+  # Takes what until_next_lf/2 yields for `<` and strips at most one trailing space.
+  defp parse_name(b) do
+    reversed = b |> RawParseUtils.until_next_lf(?<) |> Enum.reverse()
+    reversed |> drop_first_if_space() |> Enum.reverse()
+  end
+
+  # Called on the reversed name, so the "first" element is the name's last character.
+  defp drop_first_if_space(reversed) do
+    if match?([?\s | _], reversed), do: tl(reversed), else: reversed
+  end
+
+  defp parse_tz(first_email_start) do
+    # Time and zone are taken from the END of the line. Whatever sits between
+    # the first email's closing bracket and those last two words (for example
+    # a second name-email pair) is treated as junk and ignored.
+
+    # from_byte_list/1 has already checked that a `>` is present; this match
+    # relies on next_lf/2 stopping at it and then steps past the bracket.
+    [?> | after_email] = RawParseUtils.next_lf(first_email_start, ?>)
+
+    # Reverse the tail so that the line's last word is the first one found,
+    # which lets trim_word_and_rev/1 peel words off from the end.
+    {tz_word, remainder} = after_email |> Enum.reverse() |> trim_word_and_rev()
+    {time_word, _unused} = trim_word_and_rev(remainder)
+
+    # Both words are required; a lone word is not parsed at all.
+    if time_word != [] and tz_word != [] do
+      {time, _rest} = RawParseUtils.parse_base_10(time_word)
+      {tz, _rest} = RawParseUtils.parse_timezone_offset(tz_word)
+
+      {time, tz}
+    else
+      # Fewer than two words after the email: report zero for both values.
+      {0, 0}
+    end
+  end
+
+  # Pops the last word off a reversed charlist, skipping spaces before it.
+  # Returns `{word, rest}`: `word` in reading order, `rest` still reversed.
+  defp trim_word_and_rev(rev) do
+    {reversed_word, rest} =
+      rev
+      |> Enum.drop_while(&(&1 == ?\s))
+      |> Enum.split_while(&(&1 != ?\s))
+
+    {Enum.reverse(reversed_word), rest}
+  end
+
   @doc ~S"""
   Sanitize the given string for use in an identity and append to output.
 
@@ -88,7 +177,7 @@ defmodule Xgit.Core.PersonIdent do
   @doc ~S"""
   Formats a timezone offset.
   """
-  @spec format_timezone(offset :: Xgit.Lib.Constants.tz_offset()) :: String.t()
+  @spec format_timezone(offset :: tz_offset()) :: String.t()
   def format_timezone(offset) when is_integer(offset) do
     sign =
       if offset < 0,

diff --git a/lib/xgit/util/raw_parse_utils.ex b/lib/xgit/util/raw_parse_utils.ex
@@ -50,8 +50,6 @@ defmodule Xgit.Util.RawParseUtils do
   Handy utility functions to parse raw object contents.
   """
 
-  alias Xgit.Core.PersonIdent
-
   @doc ~S"""
   Does the charlist `b` start with the same characters as `str`?
 
@@ -183,7 +181,7 @@ defmodule Xgit.Util.RawParseUtils do
   that was found (or 0 if no number found there) and `new_buffer` is the charlist
   following the number that was parsed.
   """
-  @spec parse_timezone_offset(b :: charlist) :: {Xgit.Lib.Constants.tz_offset(), charlist}
+  @spec parse_timezone_offset(b :: charlist) :: {Xgit.Core.PersonIdent.tz_offset(), charlist}
   def parse_timezone_offset(b) when is_list(b) do
     {v, b} = parse_base_10(b)
 
@@ -359,78 +357,6 @@ defmodule Xgit.Util.RawParseUtils do
   defp trim_if_string(s) when is_binary(s), do: String.trim(s)
   defp trim_if_string(s), do: s
 
-  @doc ~S"""
-  Parse a name line (e.g. author, committer, tagger) into a `PersonIdent` struct.
-
-  When passing in a charlist for `b` callers should use the return value of
-  `author/1` or `committer/1`, as these functions provide the proper subset of
-  the buffer.
-
-  Returns `%PersonIdent{}` or `nil` in case the identity could not be parsed.
-  """
-  @spec parse_person_ident(b :: charlist) :: Xgit.Core.PersonIdent.t()
-  def parse_person_ident(b) when is_list(b) do
-    with [?< | email_start] <- next_lf(b, ?<),
-         true <- has_closing_angle_bracket?(email_start),
-         email <- until_next_lf(email_start, ?>),
-         name <- parse_name(b),
-         {time, tz} <- parse_tz(email_start) do
-      %PersonIdent{name: decode(name), email: decode(email), when: time, tz_offset: tz}
-    else
-      # Could not parse the line as a PersonIdent.
-      _ -> nil
-    end
-  end
-
-  defp has_closing_angle_bracket?(b), do: Enum.any?(b, &(&1 == ?>))
-
-  defp parse_name(b) do
-    b
-    |> until_next_lf(?<)
-    |> Enum.reverse()
-    |> drop_first_if_space()
-    |> Enum.reverse()
-  end
-
-  defp drop_first_if_space([?\s | b]), do: b
-  defp drop_first_if_space(b), do: b
-
-  defp parse_tz(first_email_start) do
-    # Start searching from end of line, as after first name-email pair,
-    # another name-email pair may occur. We will ignore all kinds of
-    # "junk" following the first email.
-
-    # We've to use (emailE - 1) for the case that raw[email] is LF,
-    # otherwise we would run too far. "-2" is necessary to position
-    # before the LF in case of LF termination resp. the penultimate
-    # character if there is no trailing LF.
-
-    [?> | first_email_end] = next_lf(first_email_start, ?>)
-    rev = Enum.reverse(first_email_end)
-
-    {tz, rev} = trim_word_and_rev(rev)
-    {time, _rev} = trim_word_and_rev(rev)
-
-    case {time, tz} do
-      {[_ | _], [_ | _]} ->
-        {time |> parse_base_10() |> elem(0), tz |> parse_timezone_offset() |> elem(0)}
-
-      _ ->
-        {0, 0}
-    end
-  end
-
-  defp trim_word_and_rev(rev) do
-    rev = Enum.drop_while(rev, &(&1 == ?\s))
-
-    word =
-      rev
-      |> Enum.take_while(&(&1 != ?\s))
-      |> Enum.reverse()
-
-    {word, Enum.drop(rev, Enum.count(word))}
-  end
-
   @doc ~S"""
   Convert a list of bytes to an Elixir (UTF-8) string when the encoding is not
   definitively known. Try parsing as a UTF-8 byte array first, then try ISO-8859-1.

diff --git a/test/xgit/core/person_ident_test.exs b/test/xgit/core/person_ident_test.exs
@@ -49,6 +49,143 @@ defmodule Xgit.Core.PersonIdentTest do
 
   alias Xgit.Core.PersonIdent
 
+  describe "from_byte_list/1" do
+    # Parses `line` and compares the outcome with `expected` (a struct or nil).
+    defp assert_person_ident(line, expected) do
+      assert expected == PersonIdent.from_byte_list(line)
+    end
+
+    test "legal cases" do
+      # Expected values for the "1234567890 -0700" that ends every line below.
+      {time, tz} = {1_234_567_890, -420}
+
+      assert_person_ident(
+        'Me <me@example.com> 1234567890 -0700',
+        %PersonIdent{name: "Me", email: "me@example.com", when: time, tz_offset: tz}
+      )
+
+      assert_person_ident(
+        ' Me <me@example.com> 1234567890 -0700',
+        %PersonIdent{name: " Me", email: "me@example.com", when: time, tz_offset: tz}
+      )
+
+      assert_person_ident(
+        'A U Thor <author@example.com> 1234567890 -0700',
+        %PersonIdent{name: "A U Thor", email: "author@example.com", when: time, tz_offset: tz}
+      )
+
+      assert_person_ident(
+        'A U Thor<author@example.com> 1234567890 -0700',
+        %PersonIdent{name: "A U Thor", email: "author@example.com", when: time, tz_offset: tz}
+      )
+
+      assert_person_ident(
+        'A U Thor<author@example.com>1234567890 -0700',
+        %PersonIdent{name: "A U Thor", email: "author@example.com", when: time, tz_offset: tz}
+      )
+
+      assert_person_ident(
+        ' A U Thor   < author@example.com > 1234567890 -0700',
+        %PersonIdent{
+          name: " A U Thor  ",
+          email: " author@example.com ",
+          when: time,
+          tz_offset: tz
+        }
+      )
+
+      assert_person_ident(
+        'A U Thor<author@example.com>1234567890 -0700',
+        %PersonIdent{name: "A U Thor", email: "author@example.com", when: time, tz_offset: tz}
+      )
+    end
+
+    test "fuzzy cases" do
+      # Junk after the first email is ignored; time and zone come from the line's end.
+      thor = %PersonIdent{
+        name: "A U Thor",
+        email: "author@example.com",
+        when: 1_234_567_890,
+        tz_offset: -420
+      }
+      [
+        'A U Thor <author@example.com>,  C O. Miter <comiter@example.com> 1234567890 -0700',
+        'A U Thor <author@example.com> and others 1234567890 -0700'
+      ]
+      |> Enum.each(&assert_person_ident(&1, thor))
+    end
+
+    test "incomplete cases" do
+      # Expected values wherever a line below ends in "1234567890 -0700".
+      {time, tz} = {1_234_567_890, -420}
+
+      assert_person_ident('Me <> 1234567890 -0700', %PersonIdent{
+        name: "Me",
+        email: "",
+        when: time,
+        tz_offset: tz
+      })
+
+      assert_person_ident(
+        ' <me@example.com> 1234567890 -0700',
+        %PersonIdent{name: "", email: "me@example.com", when: time, tz_offset: tz}
+      )
+
+      assert_person_ident(' <> 1234567890 -0700', %PersonIdent{
+        name: "",
+        email: "",
+        when: time,
+        tz_offset: tz
+      })
+
+      assert_person_ident('<>', %PersonIdent{name: "", email: "", when: 0, tz_offset: 0})
+
+      assert_person_ident(' <>', %PersonIdent{name: "", email: "", when: 0, tz_offset: 0})
+
+      assert_person_ident('<me@example.com>', %PersonIdent{
+        name: "",
+        email: "me@example.com",
+        when: 0,
+        tz_offset: 0
+      })
+
+      assert_person_ident(' <me@example.com>', %PersonIdent{
+        name: "",
+        email: "me@example.com",
+        when: 0,
+        tz_offset: 0
+      })
+
+      assert_person_ident('Me <>', %PersonIdent{name: "Me", email: "", when: 0, tz_offset: 0})
+
+      assert_person_ident('Me <me@example.com>', %PersonIdent{
+        name: "Me",
+        email: "me@example.com",
+        when: 0,
+        tz_offset: 0
+      })
+
+      assert_person_ident('Me <me@example.com> 1234567890', %PersonIdent{
+        name: "Me",
+        email: "me@example.com",
+        when: 0,
+        tz_offset: 0
+      })
+
+      assert_person_ident('Me <me@example.com> 1234567890 ', %PersonIdent{
+        name: "Me",
+        email: "me@example.com",
+        when: 0,
+        tz_offset: 0
+      })
+    end
+
+    test "malformed cases" do
+      assert nil == PersonIdent.from_byte_list('Me me@example.com> 1234567890 -0700')
+      assert nil == PersonIdent.from_byte_list('Me <me@example.com 1234567890 -0700')
+    end
+  end
+
   describe "sanitized/1" do
     test "strips whitespace and non-parseable characters from raw string" do
       assert PersonIdent.sanitized(" Baz>\n\u1234<Quux ") == "Baz\u1234Quux"