Skip to content

Commit

Permalink
stable powermean
Browse files Browse the repository at this point in the history
  • Loading branch information
guo-yong-zhi committed Apr 11, 2023
1 parent 285a90f commit ea080dc
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 12 deletions.
4 changes: 2 additions & 2 deletions plutoapp.jl
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ nothing
end

# ╔═╡ dfe608b0-077c-437a-adf2-b1382a0eb4eb
md"**rescale weights:** $(@bind rescale_func Select(weightscale_funcs))  **word length balance:** $(@bind word_length_balance Slider(-8:0.1:8, default=0))"
md"**rescale weights:** $(@bind rescale_func Select(weightscale_funcs))  **word length balance:** $(@bind word_length_balance Slider(-1:0.01:1, default=0, show_value=true))"

# ╔═╡ e7ec8cd7-f60b-4eb0-88fc-76d694976f9d
begin
Expand Down Expand Up @@ -311,7 +311,7 @@ try
else
text = read_table(text_)
end
dict_process = rescaleweights(rescale_func, word_length_balance) ∘ casemerge! ∘ lemmatize!
dict_process = rescaleweights(rescale_func, tan(word_length_balance*π/2)) ∘ casemerge! ∘ lemmatize!
if text isa AbstractString && ischinese(text)
println("检测到中文")
text = wordseg_cn(text)
Expand Down
36 changes: 26 additions & 10 deletions src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,23 +104,39 @@ function lemmatize!(d::AbstractDict)
d
end

function _rescaleweights(dict, func=identity, p=0) # p is the exponent of the power mean
# ((fontsize^p + (wordlength*fontsize)^p)/2) ^ (1/p) = weight
# p=-1, harmonic mean; p=0, geometric mean; p=1, arithmetic mean; p=2, root mean square;
# p=-∞, minimum; p=∞, maximum;
if p == 0
wordlength_scale = l->sqrt(l)
"""
    powermeanwith1(x, p)

Return the power mean (generalized mean) with exponent `p` of the two numbers `x` and `1`,
i.e. `((x^p + 1) / 2)^(1 / p)`, evaluated in a numerically stable way.

# Arguments
- `x`: a non-negative real number; it is converted with `float` before use.
- `p`: the exponent of the power mean; may be `0`, `Inf` or `-Inf`.

# Returns
A floating-point number. When `x^p` evaluates to exactly `1` (for example `p == 0` or
`x == 1`) the geometric mean `sqrt(x)` is returned; when `x^p` overflows to `Inf` the
asymptotic form `x * 2^(-1/p)` is used.
"""
function powermeanwith1(x, p)
    xf = float(x)
    xp = xf^p
    if isone(xp)
        # p == 0, x == 1, or |p * log(x)| below machine precision:
        # the power mean coincides with (or rounds to) the geometric mean.
        return sqrt(xf)
    elseif isinf(xp)
        # x^p overflowed, so (x^p + 1) / 2 is x^p / 2 to working precision.
        return exp(log(xf) - log(2) / p)
    else
        # (x^p + 1) / 2 == 1 + (x^p - 1) / 2. Going through expm1/log1p avoids the
        # cancellation that `log(xp / 2 + 1 / 2)` suffers when x^p is close to 1.
        return exp(log1p(expm1(p * log(xf)) / 2) / p)
    end
end

# Divide every remapped weight `func(v)` by the power mean (exponent `p`) of 1 and the
# key's length, then scale all results by a common factor so that their sum equals the
# sum of the original weights. Returns a new Dict; `dict` itself is not modified.
function _rescaleweights(dict, func=identity, p=0)
    newdict = Dict(k => func(v) / powermeanwith1(length(k), p) for (k, v) in dict)
    # Nothing to normalize; this also avoids summing an empty, possibly untyped Dict.
    isempty(newdict) && return newdict
    newtotal = sum(values(newdict))
    # A zero total would make the scale factor Inf/NaN and turn every weight into NaN.
    iszero(newtotal) && return newdict
    sc = sum(values(dict)) / newtotal
    map!(w -> w * sc, values(newdict))
    return newdict
end
# Curried form of `_rescaleweights`: capture the options now and return a closure that
# applies them to a dict later, forwarding positional and keyword arguments unchanged.
function rescaleweights(args...; kwargs...)
    return dict -> _rescaleweights(dict, args...; kwargs...)
end

"""
    rescaleweights(func=identity, p=0)

Return a function `dict -> newdict` that rescales word weights while taking word length
into account, so that the rescaled weights can be used as font size coefficients.

# Arguments
- `func`: a function `func(w::Real)->Real` used to remap each weight, i.e. `weight = func(weight)`.
- `p`: the exponent of the power mean.

# Details
The weight of a word is modelled as
`weight = powermean(1*fontsize, wordlength*fontsize) = ((fontsize^p + (wordlength*fontsize)^p)/2) ^ (1/p)`,
that is, `weight = fontsize * powermean(1, wordlength)`.
Overall, that makes `fontsize = func(weight) / powermean(1, wordlength)`.

- `p=-Inf`: the power mean is the minimum (`fontsize=weight`).
- `p=-1`: the power mean is the harmonic mean.
- `p=0`: the power mean is the geometric mean (keeps the word area).
- `p=1`: the power mean is the arithmetic mean.
- `p=2`: the power mean is the root mean square (keeps the diagonal length).
- `p=Inf`: the power mean is the maximum (`fontsize=weight/wordlength`).
"""
function rescaleweights(func=identity, p=0)
    return dict -> _rescaleweights(dict, func, p)
end

"""
processtext the text, filter the words, and adjust the weights. return words vector and weights vector.
Expand All @@ -141,7 +157,7 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
minfrequency=0,
maxnum=500,
minweight=1 / maxnum, maxweight=:auto,
process=rescaleweights(identity, p=0) ∘ casemerge! ∘ lemmatize!)
process=rescaleweights(identity, 0) ∘ casemerge! ∘ lemmatize!)
stopwords isa AbstractSet || (stopwords = Set(stopwords))
counter = process(counter)
print("Total words: $(round(sum(values(counter)), digits=2)). ")
Expand Down
11 changes: 11 additions & 0 deletions test/test_textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,15 @@
@test length(processtext(["cat" => 1, "dog" => 1, "dogs" => 3, "Dogs" => 2, "Dog" => 1])[1]) == 2
@test processtext(["cat" => 3, "Dog" => 1, "dogs" => 2])[2] |> diff |> only |> iszero
@test processtext("word cloud") == processtext(["word","cloud"], [12,12]) == processtext([("word", 3), ("cloud", 3)])

# powermeanwith1(x, p): power mean of x and 1 with exponent p.
pm = WordCloud.TextProcessing.powermeanwith1
@test pm(1, -12.34) == 1
# p close to 0 approaches the geometric mean
@test abs(pm(7, 1e-8) - sqrt(7)) < 1e-6
@test pm(9, 0) == 3.
# p = 1 is the arithmetic mean, p = -1 the harmonic mean
@test pm(17, 1) ≈ 9
@test pm(2, -1) ≈ 2/(1/2+1)
@test pm(Inf, -1) ≈ 2
# p close to 2 approaches the root mean square
@test abs(pm(12.5, 2-1e-8) - sqrt(12.5^2/2+1/2)) < 1e-6
# p = Inf is the maximum, p = -Inf the minimum
@test pm(π, Inf) ≈ π
@test pm(7π, -Inf) == 1.
end

2 comments on commit ea080dc

@guo-yong-zhi
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register()

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/81431

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.10.12 -m "<description of version>" ea080dc6ccc89d8676557f285ba5d5821267b446
git push origin v0.10.12

Please sign in to comment.