Skip to content

Commit

Permalink
use power mean to rescale weights
Browse files Browse the repository at this point in the history
  • Loading branch information
guo-yong-zhi committed Apr 10, 2023
1 parent 572a03c commit 285a90f
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "WordCloud"
uuid = "6385f0a0-cb03-45b6-9089-4e0acc74b26b"
authors = ["guoyongzhi <guo-yong-zhi@outlook.com>"]
version = "0.10.11"
version = "0.10.12"

[deps]
ColorSchemes = "35d6a980-a343-548e-a6ea-1d62b119f2f4"
Expand Down
11 changes: 7 additions & 4 deletions plutoapp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ end
# ╔═╡ b38c3ad9-7885-4af6-8394-877fde8ed83b
md"**mask outline:** $(@bind outlinewidth NumberField(-1:100, default=-1)) *-1 means random*"

# ╔═╡ 6e614caa-38dc-4028-b0a7-05f7030d5b43
md"**layout style:** $(@bind style Select([:auto, :uniform, :gathering]))"

# ╔═╡ 872f2653-303f-4b53-8e01-26bec86fc413
md"""**text density:** $(@bind density NumberField(0.1:0.01:10.0, default=0.5))  **min word spacing:** $(@bind spacing NumberField(0:100, default=2))"""

Expand Down Expand Up @@ -219,12 +222,11 @@ weightscale_funcs = [
(n->n^2) => "",
expm1 => "exp x",
]
weightscale_types = [:wordarea, :fontsize, :diagonallength]
nothing
end

# ╔═╡ 6e614caa-38dc-4028-b0a7-05f7030d5b43
md"**layout style:** $(@bind style Select([:auto, :uniform, :gathering]))  **rescale weights:** $(@bind rescale_func Select(weightscale_funcs)) $(@bind scalekeeptype Select(weightscale_types))"
# ╔═╡ dfe608b0-077c-437a-adf2-b1382a0eb4eb
md"**rescale weights:** $(@bind rescale_func Select(weightscale_funcs))  **word length balance:** $(@bind word_length_balance Slider(-8:0.1:8, default=0))"

# ╔═╡ e7ec8cd7-f60b-4eb0-88fc-76d694976f9d
begin
Expand Down Expand Up @@ -309,7 +311,7 @@ try
else
text = read_table(text_)
end
dict_process = rescaleweights(rescale_func, scalekeeptype) casemerge! lemmatize!
dict_process = rescaleweights(rescale_func, word_length_balance) casemerge! lemmatize!
if text isa AbstractString && ischinese(text)
println("检测到中文")
text = wordseg_cn(text)
Expand Down Expand Up @@ -421,6 +423,7 @@ end
# ╟─1aa632dc-b3e8-4a9d-9b9e-c13cd05cf97e
# ╟─b38c3ad9-7885-4af6-8394-877fde8ed83b
# ╟─6e614caa-38dc-4028-b0a7-05f7030d5b43
# ╟─dfe608b0-077c-437a-adf2-b1382a0eb4eb
# ╟─872f2653-303f-4b53-8e01-26bec86fc413
# ╟─26d6b795-1cc3-4548-aa07-86c2f6ee0776
# ╟─7993fd44-2fcf-488e-9280-4b4d0bf0e22c
Expand Down
17 changes: 9 additions & 8 deletions src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,16 @@ function lemmatize!(d::AbstractDict)
d
end

function _rescaleweights(dict, func=identity, keep=:wordarea)
@assert keep in [:wordarea, :fontsize, :diagonallength]
if keep == :wordarea
newdict = Dict(k => func(v) / sqrt(length(k)) for (k, v) in dict)
elseif keep == :fontsize
newdict = Dict(k => func(v) for (k, v) in dict)
function _rescaleweights(dict, func=identity, p=0) # p is the exponent of the power mean
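# The weight is modeled as the power mean (exponent p) of fontsize and wordlength*fontsize:
# ((fontsize^p + (wordlength*fontsize)^p)/2) ^ (1/p) = weight
# p=-1: harmonic mean; p=0: geometric mean; p=1: arithmetic mean; p=2: root mean square;
# p=-∞: minimum; p=∞: maximum.
# The constant factor (1/2)^(1/p) is dropped below because the weights are rescaled afterwards.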
if p == 0
wordlength_scale = l->sqrt(l)
else
newdict = Dict(k => func(v) / sqrt(length(k)^2 + 1) for (k, v) in dict)
wordlength_scale = l->(l^p+1)^(1/p)
end
newdict = Dict(k => func(v) / wordlength_scale(length(k)) for (k, v) in dict)
sc = sum(values(dict)) / sum(values(newdict))
for k in keys(newdict)
newdict[k] *= sc
Expand Down Expand Up @@ -140,7 +141,7 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
minfrequency=0,
maxnum=500,
minweight=1 / maxnum, maxweight=:auto,
process=rescaleweights(identity, :wordarea) casemerge! lemmatize!)
process=rescaleweights(identity, p=0) casemerge! lemmatize!)
stopwords isa AbstractSet || (stopwords = Set(stopwords))
counter = process(counter)
print("Total words: $(round(sum(values(counter)), digits=2)). ")
Expand Down

0 comments on commit 285a90f

Please sign in to comment.