diff --git a/Project.toml b/Project.toml index 9c8184f..58573b4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "WordCloud" uuid = "6385f0a0-cb03-45b6-9089-4e0acc74b26b" authors = ["guoyongzhi "] -version = "0.13.1" +version = "0.13.2" [deps] ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4" diff --git a/src/textprocessing.jl b/src/textprocessing.jl index 4a31b24..ecde049 100644 --- a/src/textprocessing.jl +++ b/src/textprocessing.jl @@ -294,22 +294,24 @@ function processtext(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}} end function html2text(content::AbstractString) patterns = [ + r"\"[\s\S]*?\"" => " ", r"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?/[\s]*?script[\s]*?>" => " ", r"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?/[\s]*?style[\s]*?>" => " ", - r"" => " ", "<br>" => "\n", r"<[\s\S]*?>" => " ", - "&nbsp;" => " ", + ] + for p in patterns + content = replace(content, p) # single pass not work + end + patterns = [ + "&nbsp;" => " ", "&quot;" => "\"", "&amp;" => "&", "&lt;" => "<", "&gt;" => ">", r"&#?\w{1,6};" => " ", ] - for p in patterns - content = replace(content, p) - end - content + replace(content, patterns...) end html2text(file::IO) = html2text(read(file, String)) end diff --git a/test/test_textprocessing.jl b/test/test_textprocessing.jl index e455098..263d694 100644 --- a/test/test_textprocessing.jl +++ b/test/test_textprocessing.jl @@ -75,4 +75,7 @@ @test abs(pm(12.5, 2-1e-8) - sqrt(12.5^2/2+1/2)) < 1e-6 @test pm(π, Inf) ≈ π @test pm(7π, -Inf) == 1. + + htstr = """&pound;abcd
""" + @test strip(html2text(htstr)) == "abcd" end \ No newline at end of file