In [1]:
# https://github.com/nusretipek/LanguageFinder
"""
Wikipedia corpus builder is designed to harvest random Wikipedia pages in a given 
language code such as es for Spanish and train the ngram weights based on these pages. 
The full list of the available languages can be found in https://en.wikipedia.org/wiki/List_of_Wikipedias
## Example
```julia 
julia> train_wikipedia_text("es", 10, 15)
julia> 
    "Successfully trained on 10 es wikipedia pages."
```
"""

using HTTP


#=
The dictionaries are stored in text files with a custom format.
This function reads that custom text format and builds a dictionary variable.
Returns the dictionary read from the text file.
=#

"""
    read_dictionary(InputFile::String)

Read a dictionary from the text file `InputFile`, where every line holds a key
and a value separated by whitespace (the format written by `write_dictionary`).

Only the first two whitespace-separated fields of a line are used. Lines with
fewer than two fields (for example blank lines) are skipped. When a key occurs
more than once, the last occurrence wins.

Returns a `Dict{String,String}`.
"""
function read_dictionary(InputFile::String)
    dictionary = Dict{String,String}()
    # The do-block form closes the file even when reading throws.
    open(InputFile) do f
        for line in eachline(f)
            fields = split(line)
            # A line without both a key and a value carries no entry.
            length(fields) < 2 && continue
            dictionary[String(fields[1])] = String(fields[2])
        end
    end
    return dictionary
end

#=
The dictionaries are written in a specific format for later use.
Specifically, each line is key + " " + value. This function writes
a given dictionary to a text file.
Returns nothing.
=#

"""
    write_dictionary(InputFile::String, DICT::Dict)

Write `DICT` to the text file `InputFile`, one entry per line formatted as
`key`, a single space, then `value`. An existing file is overwritten.

Entries are written in the dictionary's iteration order. Returns `nothing`.
"""
function write_dictionary(InputFile::String, DICT::Dict)
    # The do-block form closes (and flushes) the file even when a write throws.
    open(InputFile, "w") do f
        for (key, value) in DICT
            println(f, key, " ", value)
        end
    end
    return nothing
end

#=
The Wikipedia page request function. Each Wikipedia subdomain has a different random page url; instead of 
looking that url up on every call, the known random page urls are stored in a text file that is automatically
updated when a new language WP code is used. ("Wikipedia_Random.txt") 
The HTTP library is used to get the webpage and the body text is piped to a String.
Returns the page title (with "/" and ":" replaced by "_") and the webpage body text as a String.
=#

# Pull the href of the "random page" navigation link out of a Wikipedia homepage.
# The link is located through the "n-randompage" marker; any further attributes
# on the marked element or on the anchor itself are tolerated.
function _extract_random_href(html::AbstractString, LANG_CODE::AbstractString)
    m = match(r"n-randompage[^>]*>\s*<a\b[^>]*?\bhref=\"([^\"]*)\"", html)
    if m === nothing
        error("Could not locate the random-page link on the $LANG_CODE Wikipedia homepage")
    end
    return String(m.captures[1])
end

"""
    get_random_wikipedia_page(LANG_CODE::String)

Fetch a random page from the `LANG_CODE` Wikipedia subdomain.

The path of the subdomain's random-page link is looked up in
"Wikipedia_Random.txt". When the language has no entry yet (or the file does
not exist), the path is extracted from the subdomain's homepage and the file is
rewritten with the new entry added.

Returns a tuple `(title, body)`: `title` is the target of the request attached
to the response with every `/` and `:` replaced by `_`, and `body` is the
response body as a `String`.

Throws an `ErrorException` when the random-page link cannot be found on the
homepage; HTTP failures propagate from `HTTP.get`.
"""
function get_random_wikipedia_page(LANG_CODE::String)
    base_url = "https://" * LANG_CODE * ".wikipedia.org"
    cache_file = "Wikipedia_Random.txt"
    # A missing cache file simply means that no language has been looked up yet.
    random_urls = isfile(cache_file) ? read_dictionary(cache_file) : Dict{String,String}()
    if !haskey(random_urls, LANG_CODE)
        homepage = HTTP.get(base_url * "/wiki/")
        random_urls[LANG_CODE] = _extract_random_href(String(homepage.body), LANG_CODE)
        write_dictionary(cache_file, random_urls)
    end
    r = HTTP.get(base_url * random_urls[LANG_CODE])
    # The title later serves as a file name, so path separators and colons are replaced.
    title = replace(r.request.target, r"[/:]" => "_")
    return title, String(r.body)
end

#=
The raw HTML string is not parsed as a tree and contains a lot of standardized HTML tags.
It is necessary to take the useful strings in between these tags. The extract_elements function
crawls the raw HTML string and extracts those strings.
Returns an Array of useful strings.
=#

"""
    extract_elements(HTML::AbstractString, ELEMENT::String)

Collect every `<ELEMENT ...> ... </ELEMENT>` span found in `HTML`.

An opening tag is recognised only when the element name is followed by
whitespace, `>` or `/`, so asking for `"p"` does not pick up `<pre>`. Each
opening tag is paired with the nearest following closing tag; an opening tag
without any closing tag after it yields nothing. The element name is matched
case-sensitively.

Returns a vector of substrings, each running from the `<` of the opening tag
through the `>` of the closing tag, in document order.
"""
function extract_elements(HTML::AbstractString, ELEMENT::String)
    # \Q…\E makes the element name literal; the `s` flag lets `.` cross line breaks.
    pattern = Regex("<\\Q$(ELEMENT)\\E(?=[\\s>/]).*?</\\Q$(ELEMENT)\\E\\s*>", "s")
    # Regex matching needs a String (or a SubString of one); this is a no-op for String.
    text = String(HTML)
    return SubString{String}[m.match for m in eachmatch(pattern, text)]
end

#=
The strings extracted from the HTML tags are still contaminated with inline annotations in 
(), [], {} etc. The text or other information inside is often not useful to train ngrams. 
This function removes these special delimiter characters and the information inside them.
Returns a cleaned string.
=#

"""
    clean_inside_tags(TEXT::AbstractString, SYMBOL_START::String, SYMBOL_STOP::String)

Remove every balanced `SYMBOL_START ... SYMBOL_STOP` region from `TEXT`,
delimiters included.

Nested regions are removed together with the region that encloses them. A
closing symbol with no open region before it, and an opening symbol that is
never closed, are left in place, so one stray delimiter does not prevent the
balanced regions from being cleaned.

Returns `TEXT` itself when there is nothing to remove (including when either
symbol is empty), otherwise a new `String`.
"""
function clean_inside_tags(TEXT::AbstractString, SYMBOL_START::String, SYMBOL_STOP::String)
    (isempty(SYMBOL_START) || isempty(SYMBOL_STOP)) && return TEXT
    open_symbol = findall(SYMBOL_START, TEXT)
    close_symbol = findall(SYMBOL_STOP, TEXT)
    (isempty(open_symbol) || isempty(close_symbol)) && return TEXT

    # Merge both delimiter lists into one position-ordered stream. The stable
    # sort keeps an opener ahead of a closer found at the same position.
    events = vcat([(r, true) for r in open_symbol], [(r, false) for r in close_symbol])
    sort!(events; by = e -> first(e[1]), alg = MergeSort)

    pending = Int[]              # start indices of openers that are not closed yet
    spans = Tuple{Int,Int}[]     # (index of first char, index of last char) to drop
    for (hit, is_open) in events
        if is_open
            push!(pending, first(hit))
        elseif !isempty(pending)
            start = pop!(pending)
            # Regions recorded earlier that begin after `start` are nested in this one.
            while !isempty(spans) && spans[end][1] > start
                pop!(spans)
            end
            push!(spans, (start, last(hit)))
        end
    end
    isempty(spans) && return TEXT

    # Copy everything that lies outside the recorded regions.
    out = IOBuffer()
    pos = firstindex(TEXT)
    for (start, stop) in spans
        start > pos && print(out, SubString(TEXT, pos, prevind(TEXT, start)))
        pos = nextind(TEXT, stop)
    end
    pos <= lastindex(TEXT) && print(out, SubString(TEXT, pos, lastindex(TEXT)))
    return String(take!(out))
end

#=
This function combines extracting and cleaning the most prominent delimiter sets from a random Wikipedia
page. It extracts the text within the <p> tags (always the case in Wikipedia). Then it uses the 
clean_inside_tags function to clear <>, [], () and {}.
Returns a String with clean text.
=#

"""
    clean_text_wiki(HTML::AbstractString)

Extract every `p` element from `HTML` with `extract_elements`, strip the
`<>`, `[]`, `()` and `{}` regions from each one (in that order) with
`clean_inside_tags`, and return the cleaned pieces concatenated into one
`String`.
"""
function clean_text_wiki(HTML::AbstractString)
    # Delimiter pairs, applied to each paragraph in exactly this order.
    delimiters = (("<", ">"), ("[", "]"), ("(", ")"), ("{", "}"))
    strip_pair(txt, pair) = clean_inside_tags(txt, pair[1], pair[2])
    pieces = map(extract_elements(HTML, "p")) do paragraph
        foldl(strip_pair, delimiters; init = paragraph)
    end
    return join(pieces)
end




clean_text_wiki (generic function with 1 method)

In [2]:
# get_random_wikipedia_page("en")

In [3]:

using SHA

"""
    html2text(content::AbstractString)

Reduce an HTML fragment to plain text.

Markup is handled first, in this order: `script` elements, `style` elements
and comments become a single space; `<br>` in any spelling (`<br>`, `<br/>`,
`<br />`, any letter case) becomes a newline; every remaining tag becomes a
single space.

Character entities are then decoded in one pass: `&nbsp;`, `&quot;`, `&amp;`,
`&lt;` and `&gt;` map to their characters and any other entity of the form
`&name;` / `&#digits;` becomes a single space. Because it is a single pass,
already-decoded text is never decoded again (`&amp;lt;` yields `&lt;`, not `<`).

Returns the resulting text.
"""
function html2text(content::AbstractString)
    markup = [
        r"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?/[\s]*?script[\s]*?>" => " ",
        r"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?/[\s]*?style[\s]*?>" => " ",
        r"<!--[\s\S]*?-->" => " ",
        r"<br\s*/?>"i => "\n",
        r"<[\s\S]*?>" => " ",
    ]
    # Order matters: the catch-all tag pattern must run after the specific ones.
    for p in markup
        content = replace(content, p)
    end
    entities = Dict(
        "&nbsp;" => " ",
        "&quot;" => "\"",
        "&amp;" => "&",
        "&lt;" => "<",
        "&gt;" => ">",
    )
    # Unknown entities fall back to a space.
    return replace(content, r"&#?\w{1,6};" => e -> get(entities, e, " "))
end

"""
    download_wikipedia_text(lang_code::String, pages::Integer=1, sleep_time::Integer=10; path="./corpus")

Download random Wikipedia pages for `lang_code` (lower-cased) and store their
cleaned text as `.txt` files in `joinpath(path, lang_code)`, creating the
directory if needed.

The loop runs from the number of entries already in that directory plus one up
to `pages`, so a call does nothing when the directory already holds at least
`pages` entries. A page whose cleaned text is empty is not written, so fewer
than `pages` files may exist afterwards.

Before each download the function checks for `joinpath(path, "stop")` and
returns immediately if it exists. One progress line per page is printed to
`stdout`, prefixed with `!+ ` when the target file already exists (it is then
overwritten). After each page the function sleeps for `sleep_time` seconds.

Titles longer than 47 characters are shortened to their first 20 characters,
the first 5 hex digits of their SHA-1, and their last 20 characters.

Returns `nothing`. Errors raised while downloading or writing propagate to the
caller.
"""
function download_wikipedia_text(lang_code::String, pages::Integer=1, sleep_time::Integer = 10; path="./corpus")
    lang_code = lowercase(lang_code)
    # try HTTP.get("https://" * lang_code * ".wikipedia.org" * "/wiki/") catch x throw(ArgumentError("Invalid language code; check WP codes in https://en.wikipedia.org/wiki/List_of_Wikipedias")) end
    dirpath = joinpath(path, lang_code)
    mkpath(dirpath)
    for i in length(readdir(dirpath))+1:pages
        # A file or directory named "stop" under `path` acts as a stop request.
        ispath(joinpath(path, "stop")) && return
        title, text = get_random_wikipedia_page(lang_code)
        if length(title) > 47
            # Character-based first/last keep the slicing valid for any title.
            digest = first(bytes2hex(sha1(title)), 5)
            title = first(title, 20) * "_" * digest * "_" * last(title, 20)
        end
        text = clean_text_wiki(text)
        text = html2text(text)
        text = replace(text, r"\n\n+" => "\n")
        fn = joinpath(dirpath, title * ".txt")
        # Flag a title that was downloaded before; its file gets overwritten.
        ispath(fn) && print("!+ ")
        print(i, ". ", abspath(fn))
        if !isempty(text)
            open(fn, "w") do f
                write(f, text)
            end
        end
        println(" ✓")
        flush(stdout)
        sleep(sleep_time)
    end
    return nothing
end


download_wikipedia_text (generic function with 3 methods)

In [4]:
# download_wikipedia_text("en", path=raw"./corpus/train")

In [5]:

# Driver cell: build the test corpus under ./corpus/wikipedia/test.
# Wikipedia language codes to download, passed as-is to download_wikipedia_text.
# NOTE(review): "jp" is not the ISO 639-1 code for Japanese ("ja") — confirm that
# jp.wikipedia.org resolves to the intended wiki.
langs = ["ar", "cs", "da", "de", "el", "en", "es", "fa", "fi", "fr", "he", "hi", "hu", "it", "jp", "ko", "nl", "no", "pl", "pt", "ru", "sv", "tr", "uk", "zh"]

@show length(langs)
# Page-count targets grow by `step` up to `total`.
step = 10
total = 200
path=raw"./corpus/wikipedia/test"
using Dates

# The log file name combines the last component of `path` with the start time.
time_str = Dates.format(Dates.now(), "yyyy-mm-dd_HH-MM-SS")
log_file = "log_" * basename(path) * "_$time_str.txt"
@show log_file
# Everything printed inside the do-block goes to `log_file` instead of stdout.
redirect_stdio(stdout=log_file) do
	# Targets are step, 2*step, …, total, followed by `total` once more.
	for i in [step:step:total; total]
		time_str = Dates.format(Dates.now(), "yyyy-mm-dd_HH-MM-SS")
		println("### $i ### [$time_str]")
		# Every language is brought up to the current target before moving on.
		for lang in langs
			download_wikipedia_text(lang, i, path=path)
		end
	end
end
# Shell commands to control the run: download_wikipedia_text returns early
# while a `stop` entry exists under `path`.
# touch corpus/wikipedia/test/stop
# rm corpus/wikipedia/test/stop

length(langs) = 25
log_file = 

"log_test_2023-08-10_18-19-09.txt"


In [None]:

# Driver cell: build the training corpus under ./corpus/wikipedia/train.
# Wikipedia language codes to download, passed as-is to download_wikipedia_text.
# NOTE(review): "jp" is not the ISO 639-1 code for Japanese ("ja") — confirm that
# jp.wikipedia.org resolves to the intended wiki.
langs = ["ar", "cs", "da", "de", "el", "en", "es", "fa", "fi", "fr", "he", "hi", "hu", "it", "jp", "ko", "nl", "no", "pl", "pt", "ru", "sv", "tr", "uk", "zh"]

@show length(langs)
# Page-count targets grow by `step` up to `total`.
step = 50
total = 5000
path=raw"./corpus/wikipedia/train"
using Dates

# The log file name combines the last component of `path` with the start time.
time_str = Dates.format(Dates.now(), "yyyy-mm-dd_HH-MM-SS")
log_file = "log_" * basename(path) * "_$time_str.txt"
@show log_file
# Everything printed inside the do-block goes to `log_file` instead of stdout.
redirect_stdio(stdout=log_file) do
	# Targets are step, 2*step, …, total, followed by `total` once more.
	for i in [step:step:total; total]
		time_str = Dates.format(Dates.now(), "yyyy-mm-dd_HH-MM-SS")
		println("### $i ### [$time_str]")
		# Every language is brought up to the current target before moving on.
		for lang in langs
			download_wikipedia_text(lang, i, path=path)
		end
	end
end
# Shell commands to control the run: download_wikipedia_text returns early
# while a `stop` entry exists under `path`.
# touch corpus/wikipedia/train/stop
# rm corpus/wikipedia/train/stop

In [51]:
# Walk the training corpus and rename every file whose name contains ':' so that
# each ':' becomes '_'; one line is printed per renamed file.
for (root, _, files) in walkdir(raw"./corpus/wikipedia/train"), file in files
    # Names without a colon are left alone.
    occursin(":", file) || continue
    newname = replace(file, ":" => "_")
    source, target = joinpath(root, file), joinpath(root, newname)
    mv(source, target)
    println("Renamed $file to $newname")
end

Renamed _wiki_Artemis_Fowl:_Evighedskoden.txt to _wiki_Artemis_Fowl__Evighedskoden.txt
Renamed _wiki_B%C3%A5nd_1:_Ernst.txt to _wiki_B%C3%A5nd_1__Ernst.txt
Renamed _wiki_Rush_ABC:_Live_d8cb9__Agora_Ballroom_1974.txt to _wiki_Rush_ABC__Live_d8cb9__Agora_Ballroom_1974.txt
Renamed _wiki_Tales_of_Aravorn:_Seasons_of_the_Wolf.txt to _wiki_Tales_of_Aravorn__Seasons_of_the_Wolf.txt
Renamed _wiki_Anexo:Abierto__62288_ndividual_masculino).txt to _wiki_Anexo_Abierto__62288_ndividual_masculino).txt
Renamed _wiki_Anexo:Medaller_d6ae9_tud_de_Singapur_2010.txt to _wiki_Anexo_Medaller_d6ae9_tud_de_Singapur_2010.txt
Renamed _wiki_Anexo:Personajes_de_Read_or_Die.txt to _wiki_Anexo_Personajes_de_Read_or_Die.txt
Renamed _wiki_Anexo:Plantill_04bd9_tem_Pro_Cycling_Team.txt to _wiki_Anexo_Plantill_04bd9_tem_Pro_Cycling_Team.txt
Renamed _wiki_Anexo:Promoci%_4473e__de_Espa%C3%B1a_2008.txt to _wiki_Anexo_Promoci%_4473e__de_Espa%C3%B1a_2008.txt
Renamed _wiki_Anexo:Sismos_en_Chile_de_2014.txt to _wiki_Anexo_Sism