Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
guo-yong-zhi committed May 12, 2024
2 parents 4da4a77 + 6865221 commit 53ef4d2
Show file tree
Hide file tree
Showing 14 changed files with 35 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Word cloud (tag cloud or wordle) is a novelty visual representation of text data

[🌐 Try the online generator 🌐](https://mybinder.org/v2/gh/guo-yong-zhi/pluto-on-binder/master?urlpath=pluto/open?url=https%3A%2F%2Fraw.githubusercontent.com%2Fguo-yong-zhi%2FWordCloud.jl%2Fmaster%2FWordCloudApp.jl)

[✨ Go to the gallery ✨](https://github.com/guo-yong-zhi/WordCloud-Gallery)
[✨ Go to the gallery ✨](https://github.com/guo-yong-zhi/WordCloud-Gallery/blob/main/README.md)

<br>

Expand Down
2 changes: 1 addition & 1 deletion docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Modules = [WordCloud, WordCloud.TextProcessing, WordCloud.Render]
```

## Gallery
* [WordCloud-Gallery](https://github.com/guo-yong-zhi/WordCloud-Gallery)
* [WordCloud-Gallery](https://github.com/guo-yong-zhi/WordCloud-Gallery/blob/main/README.md)

## Index
```@index
Expand Down
3 changes: 1 addition & 2 deletions examples/animation1.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
#md# This animation shows how the initial layout is generated.
using WordCloud
stopwords_extra = ["said"]
textfile = pkgdir(WordCloud)*"/res/alice.txt"
wc = wordcloud(
processtext(open(textfile), stopwords=stopwords, maxnum=300),
processtext(open(textfile), maxnum=300),
masksize = (300, 200),
outline = 3,
angles = 0:90,
Expand Down
5 changes: 2 additions & 3 deletions examples/compare.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
#md# ### First generate the wordcloud on the left
using WordCloud

stwords = ["us"];
println("==Obama's==")
cs = WordCloud.randomscheme() # :Set1_8
as = WordCloud.randomangles() # (0,90,45,-45)
fs = WordCloud.randomfonts()
dens = 0.45 # not too high
wca = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"), stopwords_extra=stwords),
open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"),
colors=cs,
angles=as,
density=dens,
Expand All @@ -19,7 +18,7 @@ wca = wordcloud(
#md# ### Then generate the wordcloud on the right
println("==Trump's==")
wcb = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/Donald Trump's Inaugural Address.txt"), stopwords_extra=stwords),
open(pkgdir(WordCloud) * "/res/Donald Trump's Inaugural Address.txt"),
mask=getsvgmask(wca),
masksize=:original,
colors=cs,
Expand Down
5 changes: 2 additions & 3 deletions examples/compare2.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,12 @@
#md# ### Prepare two wordcloud objects
using WordCloud

stwords = ["us"];
cs = WordCloud.randomscheme() # :Set1_8#
as = WordCloud.randomangles() # (0,90,45,-45)#
fs = WordCloud.randomfonts()
dens = 0.45 # not too high
wca = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"), stopwords_extra=stwords),
open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"),
colors=cs,
angles=as,
density=dens,
Expand All @@ -17,7 +16,7 @@ wca = wordcloud(
state=identity, # turn off the initialize! and layout! in advance
)
wcb = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/Donald Trump's Inaugural Address.txt"), stopwords_extra=stwords),
open(pkgdir(WordCloud) * "/res/Donald Trump's Inaugural Address.txt"),
mask=getsvgmask(wca),
masksize=:original,
colors=cs,
Expand Down
2 changes: 1 addition & 1 deletion examples/custom.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
using WordCloud
wc = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/alice.txt"), stopwords_extra=["said"], maxweight=1, maxnum=300),
processtext(open(pkgdir(WordCloud) * "/res/alice.txt"), maxweight=1, maxnum=300),
# mask = pad(WordCloud.tobitmap(shape(ellipse, 600, 500, color=(0.98, 0.97, 0.99), backgroundcolor=0.97)), 0.1),
mask=shape(ellipse, 600, 500, color=(0.98, 0.97, 0.99), backgroundcolor=0.97, backgroundsize=(700, 550)),
masksize=:original,
Expand Down
2 changes: 1 addition & 1 deletion examples/gathering.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#md# By setting `style=:gathering` in the `layout!` function, larger words will be positioned closer to the center.
using WordCloud
wc = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/alice.txt"), stopwords_extra=["said"]),
open(pkgdir(WordCloud) * "/res/alice.txt"),
angles=0, density=0.55,
mask=squircle, rt=2.5 * rand(),
state=initialize!)
Expand Down
4 changes: 2 additions & 2 deletions examples/highdensity.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ using WordCloud
#md# In certain scenarios, there might be a need for generating a high-density output, and you might attempt to achieve it using the following code:
#md# ```julia
#md# wc = wordcloud(
#md# processtext(open(pkgdir(WordCloud)*"/res/alice.txt"), stopwords_extra=["said"]),
#md# open(pkgdir(WordCloud)*"/res/alice.txt"),
#md# mask = shape(box, 500, 400, cornerradius=10),
#md# colors = :Dark2_3,
#md# angles = (0, 90), # spacing = 2,
Expand All @@ -13,7 +13,7 @@ using WordCloud
#md# This is mainly because the minimum gap between two words is set to 2 pixels, controlled by the `spacing` parameter of the `wordcloud` function.
#md# In cases where the image is small, the cost of 2 pixels becomes relatively higher. To address this issue, you have the option to set `spacing=0` or `spacing=1`. Alternatively, increasing the image size can also alleviate the issue.
wc = wordcloud(
processtext(open(pkgdir(WordCloud) * "/res/alice.txt"), stopwords_extra=["said"]),
open(pkgdir(WordCloud) * "/res/alice.txt"),
mask=shape(box, 500 * 2, 400 * 2, cornerradius=10 * 2),
masksize=:original,
colors=:Dark2_3,
Expand Down
12 changes: 12 additions & 0 deletions examples/japanese.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#md# This package does not come with an integrated Japanese tokenizer. You can leverage the [`TinySegmenter.jl`](https://github.com/JuliaStrings/TinySegmenter.jl) package instead.
using WordCloud
import TinySegmenter
# Register TinySegmenter's tokenizer for the language code "jpn"; `wordcloud`
# then uses it to segment Japanese text instead of the default tokenizer.
WordCloud.settokenizer!("jpn", TinySegmenter.tokenize)

wc = wordcloud("花は桜木、人は武士", language="jpn") |> generate! # the argument `language` is optional

println("results are saved to japanese.svg")
paint(wc, "japanese.svg")
wc
#eval# runexample(:japanese)
#md# ![](japanese.svg)
2 changes: 1 addition & 1 deletion examples/languages.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#md# For languages that are not processed perfectly, you can refer to [the example for Chinese](#中文) or you can input the data in the form of a "word => weight" list, as illustrated in the following example.
#md# For languages that are not processed perfectly, you can refer to [the example for Chinese](#中文) and [the example for Japanese](#japanese). Or you can directly input the data in the form of a "word => weight" list, as illustrated in the following example.
using WordCloud
words_weights = [
"普通话" => 939.0,
Expand Down
3 changes: 1 addition & 2 deletions examples/semantic.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#md# ### Words
using WordCloud
stwords = ["us"];
words_weights = processtext(open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"), stopwords_extra=stwords)
words_weights = processtext(open(pkgdir(WordCloud) * "/res/Barack Obama's First Inaugural Address.txt"))
words_weights = Dict(zip(words_weights...))
#md# ### Embedding
#md# The positions of words can be initialized with pre-trained word vectors so that similar words will appear near each other.
Expand Down
7 changes: 6 additions & 1 deletion examples/中文.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#md# 中文需要分词,可以通过PythonCall调用python版的结巴分词
#md# 中文分词功能没有内建,可以通过PythonCall调用python版的结巴分词

using CondaPkg; CondaPkg.add("jieba")
using WordCloud
Expand All @@ -10,6 +10,11 @@ TheInternationale = "起来,饥寒交迫的奴隶!\n起来,全世界受苦

jieba.add_word("英特纳雄耐尔")

#md# 方案1:你可以使用`WordCloud.settokenizer!`为中文注册分词器。当检测到中文文本输入时该分词器会被自动调用。
WordCloud.settokenizer!("zh", t->pyconvert(Vector{String}, jieba.lcut(t)))
wc = wordcloud(TheInternationale)
@show wc
#md# 方案2:如果你只是单次使用不想注册,也可以传入手动分词之后的的词列表。
wc = wordcloud(
processtext(pyconvert(Vector{String}, jieba.lcut(TheInternationale))),
colors="#DE2910",
Expand Down
5 changes: 3 additions & 2 deletions src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
minlength=1, maxlength=30,
minfrequency=0,
maxnum=500,
minweight=1 / maxnum, maxweight=:auto,
minweight=:auto, maxweight=:auto,
process=rescaleweights(identity, 0) ∘ casemerge!)

language = detect_language(keys(counter), language)
Expand Down Expand Up @@ -257,7 +257,8 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
print("The top $(length(words)) words are kept. ")
@assert !isempty(weights)
weights = weights ./ sum(weights)
maxweight == :auto && (maxweight = max(20minweight, 20 / maxnum))
minweight == :auto && (minweight = min(0.01, 1 / length(words)))
maxweight == :auto && (maxweight = max(20minweight, 10 / length(words)))
m = weights .> maxweight
weights[m] .= log1p.(weights[m] .- maxweight) ./ 10 .+ maxweight
weights .+= minweight
Expand Down
1 change: 1 addition & 0 deletions src/wc-class.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ function wordcloud(words::AbstractVector{<:AbstractString}, weights::AbstractVec
if minfontsize == :auto
minfontsize = min(maxfontsize, 8, sqrt(volume / length(words) / 8))
# 只和单词数量有关,和单词长度无关。不管单词多长,字号小了依然看不见。
# 单词平均长度为4,volume大约为12*12*length(words),故sqrt(12*12*单词平均长度/8)约等于8.5
end
@debug "set fontsize ∈ [$minfontsize, $maxfontsize]"
params[:minfontsize] = minfontsize
Expand Down

0 comments on commit 53ef4d2

Please sign in to comment.