Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
guo-yong-zhi committed May 12, 2024
2 parents 4da4a77 + 6865221 commit 53ef4d2
Show file tree
Hide file tree
Showing 14 changed files with 35 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Word cloud (tag cloud or wordle) is a novelty visual representation of text data

[🌐 Try the online generator 🌐](https://mybinder.org/v2/gh/guo-yong-zhi/pluto-on-binder/master?urlpath=pluto/open?url=https%3A%2F%2Fraw.githubusercontent.com%2Fguo-yong-zhi%2FWordCloud.jl%2Fmaster%2FWordCloudApp.jl)

[✨ Go to the gallery ✨](https://github.com/guo-yong-zhi/WordCloud-Gallery)
[✨ Go to the gallery ✨](https://github.com/guo-yong-zhi/WordCloud-Gallery/blob/main/README.md)

<br>

Expand Down
2 changes: 1 addition & 1 deletion docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Modules = [WordCloud, WordCloud.TextProcessing, WordCloud.Render]
```

## Gallery
* [WordCloud-Gallery](https://github.com/guo-yong-zhi/WordCloud-Gallery)
* [WordCloud-Gallery](https://github.com/guo-yong-zhi/WordCloud-Gallery/blob/main/README.md)

## Index
```@index
Expand Down
3 changes: 1 addition & 2 deletions examples/animation1.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#md# This animation shows how the initial layout is generated.
using WordCloud
stopwords_extra = ["said"]
textfile = pkgdir(WordCloud)*"/res/alice.txt"
wc = wordcloud(
processtext(open(textfile), stopwords=stopwords, maxnum=300),
processtext(open(textfile), maxnum=300),
masksize = (300, 200),
outline = 3,
angles = 0:90,
Expand Down
5 changes: 2 additions & 3 deletions examples/compare.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
#md# ### First generate the wordcloud on the left
using WordCloud

stwords = ["us"];
println("==Obama's==")
cs = WordCloud.randomscheme() # :Set1_8
as = WordCloud.randomangles() # (0,90,45,-45)
fs = WordCloud.randomfonts()
dens = 0.45 # not too high
wca = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"), stopwords_extra=stwords),
open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"),
colors=cs,
angles=as,
density=dens,
Expand All @@ -19,7 +18,7 @@ wca = wordcloud(
#md# ### Then generate the wordcloud on the right
println("==Trump's==")
wcb = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/Donald Trump's Inaugural Address.txt"), stopwords_extra=stwords),
open(pkgdir(WordCloud) * "/res/Donald Trump's Inaugural Address.txt"),
mask=getsvgmask(wca),
masksize=:original,
colors=cs,
Expand Down
5 changes: 2 additions & 3 deletions examples/compare2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
#md# ### Prepare two wordcloud objects
using WordCloud

stwords = ["us"];
cs = WordCloud.randomscheme() # :Set1_8#
as = WordCloud.randomangles() # (0,90,45,-45)#
fs = WordCloud.randomfonts()
dens = 0.45 # not too high
wca = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"), stopwords_extra=stwords),
open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"),
colors=cs,
angles=as,
density=dens,
Expand All @@ -17,7 +16,7 @@ wca = wordcloud(
state=identity, # turn off the initialize! and layout! in advance
)
wcb = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/Donald Trump's Inaugural Address.txt"), stopwords_extra=stwords),
open(pkgdir(WordCloud) * "/res/Donald Trump's Inaugural Address.txt"),
mask=getsvgmask(wca),
masksize=:original,
colors=cs,
Expand Down
2 changes: 1 addition & 1 deletion examples/custom.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using WordCloud
wc = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/alice.txt"), stopwords_extra=["said"], maxweight=1, maxnum=300),
processtext(open(pkgdir(WordCloud) * "/res/alice.txt"), maxweight=1, maxnum=300),
# mask = pad(WordCloud.tobitmap(shape(ellipse, 600, 500, color=(0.98, 0.97, 0.99), backgroundcolor=0.97)), 0.1),
mask=shape(ellipse, 600, 500, color=(0.98, 0.97, 0.99), backgroundcolor=0.97, backgroundsize=(700, 550)),
masksize=:original,
Expand Down
2 changes: 1 addition & 1 deletion examples/gathering.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#md# By setting `style=:gathering` in the `layout!` function, larger words will be positioned closer to the center.
using WordCloud
wc = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/alice.txt"), stopwords_extra=["said"]),
open(pkgdir(WordCloud) * "/res/alice.txt"),
angles=0, density=0.55,
mask=squircle, rt=2.5 * rand(),
state=initialize!)
Expand Down
4 changes: 2 additions & 2 deletions examples/highdensity.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ using WordCloud
#md# In certain scenarios, there might be a need for generating a high-density output, and you might attempt to achieve it using the following code:
#md# ```julia
#md# wc = wordcloud(
#md# processtext(open(pkgdir(WordCloud)*"/res/alice.txt"), stopwords_extra=["said"]),
#md# open(pkgdir(WordCloud)*"/res/alice.txt"),
#md# mask = shape(box, 500, 400, cornerradius=10),
#md# colors = :Dark2_3,
#md# angles = (0, 90), # spacing = 2,
Expand All @@ -13,7 +13,7 @@ using WordCloud
#md# This is mainly because the minimum gap between two words is set to 2 pixels, controlled by the `spacing` parameter of the `wordcloud` function.
#md# In cases where the image is small, the cost of 2 pixels becomes relatively higher. To address this issue, you have the option to set `spacing=0` or `spacing=1`. Alternatively, increasing the image size can also alleviate the issue.
wc = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/alice.txt"), stopwords_extra=["said"]),
open(pkgdir(WordCloud) * "/res/alice.txt"),
mask=shape(box, 500 * 2, 400 * 2, cornerradius=10 * 2),
masksize=:original,
colors=:Dark2_3,
Expand Down
12 changes: 12 additions & 0 deletions examples/japanese.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#md# This package does not come with an integrated Japanese tokenizer. You can leverage the [`TinySegmenter.jl`](https://github.com/JuliaStrings/TinySegmenter.jl) package instead.
using WordCloud
import TinySegmenter
# Register TinySegmenter's tokenizer for the language code "jpn"; `wordcloud`
# then uses it to segment Japanese text instead of the default tokenizer.
WordCloud.settokenizer!("jpn", TinySegmenter.tokenize)

wc = wordcloud("花は桜木、人は武士", language="jpn") |> generate! # the argument `language` is optional

println("results are saved to japanese.svg")
paint(wc, "japanese.svg")
wc
#eval# runexample(:japanese)
#md# ![](japanese.svg)
2 changes: 1 addition & 1 deletion examples/languages.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#md# For languages that are not processed perfectly, you can refer to [the example for Chinese](#中文) or you can input the data in the form of a "word => weight" list, as illustrated in the following example.
#md# For languages that are not processed perfectly, you can refer to [the example for Chinese](#中文) and [the example for Japanese](#japanese). Or you can directly input the data in the form of a "word => weight" list, as illustrated in the following example.
using WordCloud
words_weights = [
"普通话" => 939.0,
Expand Down
3 changes: 1 addition & 2 deletions examples/semantic.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#md# ### Words
using WordCloud
stwords = ["us"];
words_weights = processtext(open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"), stopwords_extra=stwords)
words_weights = processtext(open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"))
words_weights = Dict(zip(words_weights...))
#md# ### Embedding
#md# The positions of words can be initialized with pre-trained word vectors so that similar words will appear near each other.
Expand Down
7 changes: 6 additions & 1 deletion examples/中文.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#md# 中文需要分词,可以通过PythonCall调用python版的结巴分词
#md# 中文分词功能没有内建,可以通过PythonCall调用python版的结巴分词

using CondaPkg; CondaPkg.add("jieba")
using WordCloud
Expand All @@ -10,6 +10,11 @@ TheInternationale = "起来,饥寒交迫的奴隶!\n起来,全世界受苦

jieba.add_word("英特纳雄耐尔")

#md# 方案1:你可以使用`WordCloud.settokenizer!`为中文注册分词器。当检测到中文文本输入时该分词器会被自动调用。
WordCloud.settokenizer!("zh", t->pyconvert(Vector{String}, jieba.lcut(t)))
wc = wordcloud(TheInternationale)
@show wc
#md# 方案2:如果你只是单次使用不想注册,也可以传入手动分词之后的的词列表。
wc = wordcloud(
processtext(pyconvert(Vector{String}, jieba.lcut(TheInternationale))),
colors="#DE2910",
Expand Down
5 changes: 3 additions & 2 deletions src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
minlength=1, maxlength=30,
minfrequency=0,
maxnum=500,
minweight=1 / maxnum, maxweight=:auto,
minweight=:auto, maxweight=:auto,
process=rescaleweights(identity, 0) ∘ casemerge!)

language = detect_language(keys(counter), language)
Expand Down Expand Up @@ -257,7 +257,8 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
print("The top $(length(words)) words are kept. ")
@assert !isempty(weights)
weights = weights ./ sum(weights)
maxweight == :auto && (maxweight = max(20minweight, 20 / maxnum))
minweight == :auto && (minweight = min(0.01, 1 / length(words)))
maxweight == :auto && (maxweight = max(20minweight, 10 / length(words)))
m = weights .> maxweight
weights[m] .= log1p.(weights[m] .- maxweight) ./ 10 .+ maxweight
weights .+= minweight
Expand Down
1 change: 1 addition & 0 deletions src/wc-class.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ function wordcloud(words::AbstractVector{<:AbstractString}, weights::AbstractVec
if minfontsize == :auto
minfontsize = min(maxfontsize, 8, sqrt(volume / length(words) / 8))
# 只和单词数量有关,和单词长度无关。不管单词多长,字号小了依然看不见。
# 单词平均长度为4,volume大约为12*12*length(words),故sqrt(12*12*单词平均长度/8)约等于8.5
end
@debug "set fontsize ∈ [$minfontsize, $maxfontsize]"
params[:minfontsize] = minfontsize
Expand Down

0 comments on commit 53ef4d2

Please sign in to comment.