Skip to content

Commit

Permalink
Simplify tests
Browse files Browse the repository at this point in the history
  • Loading branch information
josevalim committed Jul 2, 2024
1 parent c83334e commit fcfa87f
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 112 deletions.
112 changes: 112 additions & 0 deletions lib/elixir/test/elixir/kernel/string_tokenizer_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,116 @@ defmodule Kernel.StringTokenizerTest do
assert {:error, _} = Code.string_to_quoted("Ola?")
assert {:error, _} = Code.string_to_quoted("Ola!")
end

describe "script mixing" do
test "prevents Restricted codepoints in identifiers" do
  # The identifier ends with code point U+3164, which sits at column 6 right
  # after "_shib"; parsing it must raise a SyntaxError that points at it.
  source = "_shibㅤ = 1"

  message =
    SyntaxError
    |> assert_raise(fn -> Code.string_to_quoted!(source) end)
    |> Exception.message()

  assert message =~ "unexpected token: \"\" (column 6, code point U+3164)"
end

test "prevents unsafe mixing in identifiers" do
  # The first letter of "аdmin_" is Cyrillic while the remaining letters are
  # Latin, so the parser must reject the identifier.
  error =
    assert_raise SyntaxError, fn ->
      Code.string_to_quoted!("if аdmin_, do: :ok, else: :err")
    end

  message = Exception.message(error)

  assert message =~ "nofile:1:9:"
  assert message =~ "invalid mixed-script identifier found: аdmin"

  # the message lists every code point of the identifier, with its script
  breakdown = [
    "\\u0430 а {Cyrillic}",
    "\\u0064 d {Latin}",
    "\\u006D m {Latin}",
    "\\u0069 i {Latin}",
    "\\u006E n {Latin}",
    "\\u005F _"
  ]

  Enum.each(breakdown, fn line -> assert message =~ line end)

  # includes suggestion about what to change
  assert message =~ """
         Hint: You could write the above in a similar way that is accepted by Elixir:
         """

  assert message =~ """
         "admin_" (code points 0x00061 0x00064 0x0006D 0x00069 0x0006E 0x0005F)
         """

  # the same check applies to keywords, atoms and calls
  mixed_sources = [
    # a is in cyrillic
    "[аdmin: 1]",
    "[{:аdmin, 1}]",
    "quote do: аdmin(1)",
    # c is Latin
    "http_cервер = 1",
    # T is in cyrillic
    "[Тシャツ: 1]"
  ]

  for source <- mixed_sources do
    assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!(source) end
  end
end

test "allows legitimate script mixing" do
  # Evaluates `source` (an assignment of 1) and checks that it yields 1 together
  # with exactly the expected variable binding.
  assert_binds = fn source, binding ->
    assert Code.eval_string(source) == {1, binding}
  end

  # writing systems that legitimately mix multiple scripts, and Common chars like _
  assert_binds.("幻ㄒㄧㄤ = 1", [幻ㄒㄧㄤ: 1])
  assert_binds.("幻ㄒㄧㄤ1 = 1", [幻ㄒㄧㄤ1: 1])
  assert_binds.("__सवव_1? = 1", [__सवव_1?: 1])

  # works with atoms too
  assert Code.eval_string(":Tシャツ") == {:Tシャツ, []}

  # elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
  # ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all
  assert_binds.("μs = 1", [μs: 1])

  # allows mixed scripts if the chunks are all single-script or highly restrictive
  assert_binds.("http_сервер = 1", [http_сервер: 1])
  assert_binds.("сервер_http = 1", [сервер_http: 1])
end

test "bidi" do
  # Checks that String.Tokenizer.Security.unbidify/1 agrees with the Unicode
  # Bidi Algorithm (UAX9) for these identifier-specific, no-bracket examples.
  #
  # Row format: identifier;A;code points in hex;expected display order as
  # indices into those code points.
  #
  # New examples can be created with:
  # https://util.unicode.org/UnicodeJsps/bidic.jsp?s=foo_%D9%84%D8%A7%D9%85%D8%AF%D8%A7_baz&b=0&u=140&d=2
  # The format is inspired by (none of these are directly usable for our idents):
  # https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
  #
  # The ;A; after the identifier is spurious: the semicolon is dir-neutral, and
  # deleting it makes these examples hard to read in many/most editors!
  rows =
    """
    foo;A;0066 006F 006F;0 1 2
    _foo_ ;A;005F 0066 006F 006F 005F;0 1 2 3 4
    __foo__ ;A;005F 005F 0066 006F 006F 005F 005F;0 1 2 3 4 5 6
    لامدا_foo ;A;0644 0627 0645 062F 0627 005F 0066 006F 006F;4 3 2 1 0 5 6 7 8
    foo_لامدا_baz ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 0062 0061 007A;0 1 2 3 8 7 6 5 4 9 10 11 12
    foo_لامدا ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627;0 1 2 3 8 7 6 5 4
    foo_لامدا1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 0031;0 1 2 3 9 8 7 6 5 4
    foo_لامدا_حدد ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F;0 1 2 3 12 11 10 9 8 7 6 5 4
    foo_لامدا_حدد1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4
    foo_لامدا_حدد1_bar ;A; 0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17
    foo_لامدا_حدد1_bar1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 18
    """
    |> String.split("\n", trim: true)
    |> Enum.map(&String.split(&1, ";", trim: true))

  # Parses a space-separated field of integers written in the given base.
  parse_ints = fn field, base ->
    field
    |> String.split(" ", trim: true)
    |> Enum.map(&String.to_integer(&1, base))
  end

  Enum.each(rows, fn [ident, _, hex_field, index_field | _rest] ->
    codepoints = parse_ints.(hex_field, 16)
    positions = parse_ints.(index_field, 10)
    expected = Enum.map(positions, &Enum.at(codepoints, &1))
    actual = String.Tokenizer.Security.unbidify(codepoints)

    # the report is only built when a case actually fails
    if expected != actual do
      hex = Enum.map_join(codepoints, " ", &Integer.to_string(&1, 16))

      raise """
      Failing String.Tokenizer.Security.unbidify/1 case for: '#{ident}'
      bytes : #{hex}
      byte order : #{Enum.intersperse(codepoints, 32)}
      uax9 order : #{Enum.intersperse(expected, 32)}
      uax9 indices : #{Enum.join(positions, " ")}
      unbidify/1 : #{Enum.intersperse(actual, 32)}
      """
    end
  end)
end
end
end
112 changes: 0 additions & 112 deletions lib/elixir/test/elixir/kernel/warning_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,6 @@ defmodule Kernel.WarningTest do
end)
end

# Parses `source` with column metadata inside `capture_err/1` and returns
# whatever `capture_err/1` returns (presumably the captured stderr output;
# `capture_err/1` is defined outside this view, so confirm there).
defp capture_quoted(source) do
  parse = fn -> Code.string_to_quoted!(source, columns: true) end
  capture_err(parse)
end

defp capture_compile(source) do
capture_err(fn ->
quoted = Code.string_to_quoted!(source, columns: true)
Expand Down Expand Up @@ -93,13 +87,6 @@ defmodule Kernel.WarningTest do
end

describe "unicode identifier security" do
test "prevents Restricted codepoints in identifiers" do
  # The identifier ends with code point U+3164, which sits at column 6 right
  # after "_shib"; parsing it must raise a SyntaxError that points at it.
  source = "_shibㅤ = 1"

  message =
    SyntaxError
    |> assert_raise(fn -> Code.string_to_quoted!(source) end)
    |> Exception.message()

  assert message =~ "unexpected token: \"\" (column 6, code point U+3164)"
end

test "warns on confusables" do
assert_warn_quoted(
["nofile:1:6", "confusable identifier: 'a' looks like 'а' on line 1"],
Expand Down Expand Up @@ -161,105 +148,6 @@ defmodule Kernel.WarningTest do
],
"a_א1 or a_1א"
)

# test that the implementation of String.Tokenizer.Security.unbidify/1 agrees
# w/Unicode Bidi Algo (UAX9) for these (identifier-specific, no-bracket) examples
#
# you can create new examples with: https://util.unicode.org/UnicodeJsps/bidic.jsp?s=foo_%D9%84%D8%A7%D9%85%D8%AF%D8%A7_baz&b=0&u=140&d=2
# inspired by (none of these are directly usable for our idents): https://www.unicode.org/Public/UCD/latest/ucd/BidiCharacterTest.txt
#
# there's a spurious ;A; after the identifier, because the semicolon is dir-neutral, and
# deleting it makes these examples hard to read in many/most editors!
"""
foo;A;0066 006F 006F;0 1 2
_foo_ ;A;005F 0066 006F 006F 005F;0 1 2 3 4
__foo__ ;A;005F 005F 0066 006F 006F 005F 005F;0 1 2 3 4 5 6
لامدا_foo ;A;0644 0627 0645 062F 0627 005F 0066 006F 006F;4 3 2 1 0 5 6 7 8
foo_لامدا_baz ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 0062 0061 007A;0 1 2 3 8 7 6 5 4 9 10 11 12
foo_لامدا ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627;0 1 2 3 8 7 6 5 4
foo_لامدا1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 0031;0 1 2 3 9 8 7 6 5 4
foo_لامدا_حدد ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F;0 1 2 3 12 11 10 9 8 7 6 5 4
foo_لامدا_حدد1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4
foo_لامدا_حدد1_bar ;A; 0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17
foo_لامدا_حدد1_bar1 ;A;0066 006F 006F 005F 0644 0627 0645 062F 0627 005F 062D 062F 062F 0031 005F 0062 0061 0072 0031;0 1 2 3 13 12 11 10 9 8 7 6 5 4 14 15 16 17 18
"""
|> String.split("\n", trim: true)
|> Enum.map(&String.split(&1, ";", trim: true))
|> Enum.each(fn
[ident, _, bytes, indices | _rest] ->
bytes = String.split(bytes, " ", trim: true) |> Enum.map(&String.to_integer(&1, 16))
indices = String.split(indices, " ", trim: true) |> Enum.map(&String.to_integer/1)
display_ordered = for i <- indices, do: Enum.at(bytes, i)
unbidified = String.Tokenizer.Security.unbidify(bytes)

assert(display_ordered == unbidified, """
Failing String.Tokenizer.Security.unbidify/1 case for: '#{ident}'
bytes : #{bytes |> Enum.map(&Integer.to_string(&1, 16)) |> Enum.join(" ")}
byte order : #{bytes |> Enum.intersperse(32)}
uax9 order : #{display_ordered |> Enum.intersperse(32)}
uax9 indices : #{indices |> Enum.join(" ")}
unbidify/1 : #{unbidified |> Enum.intersperse(32)}
""")
end)
end

test "prevents unsafe script mixing in identifiers" do
  # The first letter of "аdmin_" is Cyrillic while the remaining letters are
  # Latin, so the parser must reject the identifier.
  error =
    assert_raise SyntaxError, fn ->
      Code.string_to_quoted!("if аdmin_, do: :ok, else: :err")
    end

  message = Exception.message(error)

  assert message =~ "nofile:1:9:"
  assert message =~ "invalid mixed-script identifier found: аdmin"

  # the message lists every code point of the identifier, with its script
  breakdown = [
    "\\u0430 а {Cyrillic}",
    "\\u0064 d {Latin}",
    "\\u006D m {Latin}",
    "\\u0069 i {Latin}",
    "\\u006E n {Latin}",
    "\\u005F _"
  ]

  Enum.each(breakdown, fn line -> assert message =~ line end)

  # includes suggestion about what to change
  assert message =~ """
         Hint: You could write the above in a similar way that is accepted by Elixir:
         """

  assert message =~ """
         "admin_" (code points 0x00061 0x00064 0x0006D 0x00069 0x0006E 0x0005F)
         """

  # the same check applies to keywords, atoms and calls
  mixed_sources = [
    # a is in cyrillic
    "[аdmin: 1]",
    "[{:аdmin, 1}]",
    "quote do: аdmin(1)",
    # c is Latin
    "http_cервер = 1",
    # T is in cyrillic
    "[Тシャツ: 1]"
  ]

  for source <- mixed_sources do
    assert_raise SyntaxError, ~r/mixed/, fn -> Code.string_to_quoted!(source) end
  end
end

test "allows legitimate script mixing" do
  # Every source below must make its capture helper return the empty string.

  # writing systems that legitimately mix multiple scripts, and Common chars like _
  for source <- ["幻ㄒㄧㄤ = 1", "幻ㄒㄧㄤ1 = 1", "__सवव_1? = 1"] do
    assert capture_eval(source) == ""
  end

  # uts39 5.2 allowed 'highly restrictive' script mixing, like 't-shirt' in Jpan:
  assert capture_quoted(":Tシャツ") == ""

  # elixir's normalizations combine scriptsets of the 'from' and 'to' characters,
  # ex: {Common} MICRO => {Greek} MU == {Common, Greek}; Common intersects w/all
  assert capture_quoted("μs") == ""

  # allows mixed scripts if the chunks are all single-script or highly restrictive
  for source <- ["http_сервер = 1", "сервер_http = 1"] do
    assert capture_eval(source) == ""
  end
end
end

Expand Down

0 comments on commit fcfa87f

Please sign in to comment.