Skip to content

Commit

Permalink
fix html2text
Browse files Browse the repository at this point in the history
  • Loading branch information
guo-yong-zhi committed May 12, 2024
1 parent 6865221 commit c4a27f8
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "WordCloud"
uuid = "6385f0a0-cb03-45b6-9089-4e0acc74b26b"
authors = ["guoyongzhi <guo-yong-zhi@outlook.com>"]
version = "0.13.1"
version = "0.13.2"

[deps]
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
Expand Down
14 changes: 8 additions & 6 deletions src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -294,22 +294,24 @@ function processtext(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}
end
"""
    html2text(content::AbstractString)

Strip HTML markup from `content` and return the remaining text.

The conversion runs in two stages:

1. Structural patterns are removed one after another, in this order:
   double-quoted spans, `<script>` elements, `<style>` elements, comments,
   `<br>` (replaced by a newline) and finally every remaining tag.
2. Character entities are decoded in one simultaneous pass. Entities without
   an explicit entry in the table are replaced by a single space.

Every double-quoted span is blanked, including quoted text located outside of
a tag.
"""
function html2text(content::AbstractString)
    # Order matters, so these rules are applied one at a time. Blanking quoted
    # spans first keeps a `>` inside an attribute value from closing its tag
    # early. A single multi-pattern `replace` would not work here: the tag rule
    # would win on the leftmost match before the quoted value was removed.
    structural = (
        r"\"[\s\S]*?\"" => " ",
        r"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?/[\s]*?script[\s]*?>" => " ",
        r"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?/[\s]*?style[\s]*?>" => " ",
        r"<!--[\s\S]*?-->" => " ",
        "<br>" => "\n",
        r"<[\s\S]*?>" => " ",
    )
    text = content
    for rule in structural
        text = replace(text, rule)
    end

    # Entities are decoded simultaneously so that the output of one rule is
    # never fed to another (`&amp;lt;` must become `&lt;`, not `<`). When two
    # rules match at the same position the earlier one wins, so the catch-all
    # regex has to stay last.
    entities = (
        "&nbsp;" => " ",
        "&quot;" => "\"",
        "&amp;" => "&",
        "&lt;" => "<",
        "&gt;" => ">",
        r"&#?\w{1,6};" => " ",
    )
    return replace(text, entities...)
end
# Read everything remaining in `file` as a `String` and convert it with the
# string method of `html2text`.
function html2text(file::IO)
    return html2text(read(file, String))
end
end
3 changes: 3 additions & 0 deletions test/test_textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,7 @@
@test abs(pm(12.5, 2-1e-8) - sqrt(12.5^2/2+1/2)) < 1e-6
@test pm(π, Inf) π
@test pm(7π, -Inf) == 1.

htstr = """&pound;abcd<div x-component-name="DisasterSokuho" x-component-data="{&quot;earthquake&quot;:&quot;&lt;!-- 地震速報のメッセージを消しました (2024-04-25 12:00:08)-->\n&quot;,&quot;tsunami&quot;:&quot;&lt;!-- 津波速報のメッセージを消しました (2024-04-25 12:05:35)-->\n&quot;}"><div class="tYQVs"><div>"""
@test strip(html2text(htstr)) == "abcd"
end

0 comments on commit c4a27f8

Please sign in to comment.