In [7]:
# https://github.com/nusretipek/LanguageFinder
"""
Wikipedia corpus builder is designed to harvest random Wikipedia pages in a given 
language code such as es for Spanish and train the ngram weights based on these pages. 
The full list of the available languages can be found in https://en.wikipedia.org/wiki/List_of_Wikipedias
## Example
```julia 
julia> train_wikipedia_text("es", 10, 15)
julia> 
    "Successfully trained on 10 es wikipedia pages."
```
"""

using HTTP


#=
The dictionaries are stored in text files with a custom format.
This function reads that custom text format and builds a dictionary variable.
Returns the dictionary read from the text file.
=#

"""
    read_dictionary(InputFile::String)

Read a dictionary stored as one whitespace-separated `key value` pair per line.

Only the first two whitespace-separated fields of a line are used. Lines with
fewer than two fields (for example blank lines) are skipped instead of raising
a `BoundsError`.

# Arguments
- `InputFile`: path of the text file to read.

# Returns
An untyped `Dict` mapping the first field of each line to the second field.
Keys and values are the substrings produced by `split`.
"""
function read_dictionary(InputFile::String)
    # Kept as an untyped Dict so callers can keep storing arbitrary values in it.
    dictionary = Dict()
    # `eachline(path)` opens the file itself and closes it when iteration ends.
    for line in eachline(InputFile)
        fields = split(line)
        length(fields) >= 2 || continue
        dictionary[fields[1]] = fields[2]
    end
    return dictionary
end

#=
The dictionaries are written in a specific format for later use:
each line is key + " " + value. This function writes
a given dictionary to a text file.
Returns nothing.
=#

"""
    write_dictionary(InputFile::String, DICT::Dict)

Write `DICT` to `InputFile`, one `key value` pair per line separated by a single
space. An existing file is overwritten. Entries are written in the iteration
order of `DICT`.

# Arguments
- `InputFile`: path of the text file to create or overwrite.
- `DICT`: dictionary whose entries are written.

# Returns
`nothing`.
"""
function write_dictionary(InputFile::String, DICT::Dict)
    # The do-block form closes the file even if printing an entry throws.
    open(InputFile, "w") do f
        for (key, value) in DICT
            println(f, key, " ", value)
        end
    end
    return nothing
end

#=
The Wikipedia page request function. Each Wikipedia subdomain has a different random-page URL. Instead of
looking that URL up on every call, the known random-page URLs are stored in a text file that is automatically updated when
a new language WP code is used ("Wikipedia_Random.txt").
The HTTP library is used to get the webpage, and the body is piped to a String.
Returns the page title and the webpage body text (a String).
=#

"""
    get_random_wikipedia_page(LANG_CODE::String)

Fetch a random article from the `LANG_CODE` Wikipedia subdomain.

The path of the "random page" link is looked up in the cache file
"Wikipedia_Random.txt" in the current working directory. If the cache file does
not exist, or has no entry for `LANG_CODE`, the path is read from the sidebar
link with id `n-randompage` on the subdomain's `/wiki/` page and the cache file
is rewritten with the new entry.

# Arguments
- `LANG_CODE`: Wikipedia subdomain code, e.g. "en" or "es".

# Returns
A tuple `(title, body)`: `title` is the request target of the fetched page with
every `/` replaced by `_`, and `body` is the response body as a `String`.

# Throws
- An error if the "random page" link cannot be found in the homepage HTML.
- Whatever `HTTP.get` throws for failed requests.
"""
function get_random_wikipedia_page(LANG_CODE::String)
    cache_file = "Wikipedia_Random.txt"
    base_url = "https://" * LANG_CODE * ".wikipedia.org"

    # A missing cache file just means no language has been resolved yet.
    random_urls = isfile(cache_file) ? read_dictionary(cache_file) : Dict()

    if !haskey(random_urls, LANG_CODE)
        homepage = HTTP.get(base_url * "/wiki/")
        str = String(homepage.body)
        # Take the href of the first <a> after the `n-randompage` id. Skipping
        # everything up to that <a> keeps this working when the <li> carries
        # extra attributes (e.g. class="mw-list-item") before the link.
        m = match(r"n-randompage[^<]*<a\b[^>]*?\bhref=\"([^\"]+)\"", str)
        m === nothing &&
            error("Could not find the random page link on " * base_url * "/wiki/")
        random_urls[LANG_CODE] = String(m.captures[1])
        write_dictionary(cache_file, random_urls)
    end

    r = HTTP.get(base_url * random_urls[LANG_CODE])
    # Slashes are replaced so the title can be used as a file name.
    title = replace(r.request.target, r"/" => "_")
    return title, String(r.body)
end

#=
The raw HTML string is not parsed as a tree and contains a lot of standardized HTML tags.
It is necessary to take the useful strings in between these tags. The extract_elements function
crawls through the raw HTML string and extracts those strings.
Returns an Array of useful strings.
=#

"""
    extract_elements(HTML::AbstractString, ELEMENT::String)

Return every `<ELEMENT ...> ... </ELEMENT>` span found in `HTML`, in document
order. Each returned string includes its opening and closing tags.

Only tags whose name is exactly `ELEMENT` are matched, so asking for "p" no
longer picks up `<pre>` or `<param>`. Matching is case-sensitive and each
opening tag is paired with the nearest following closing tag, so nested
elements of the same name are not supported.

# Arguments
- `HTML`: raw HTML text.
- `ELEMENT`: tag name without angle brackets, e.g. "p".

# Returns
A vector of substrings of `HTML`; empty when no element is found.
"""
function extract_elements(HTML::AbstractString, ELEMENT::String)
    # \Q...\E quotes the tag name literally. The lookahead requires the name to
    # end right there (whitespace, '/' or '>'), which excludes longer tag names.
    # The "s" flag lets '.' span newlines inside an element.
    pattern = Regex(
        "<\\Q" * ELEMENT * "\\E(?=[\\s/>])[^>]*>.*?</\\Q" * ELEMENT * "\\E\\s*>",
        "s",
    )
    return [m.match for m in eachmatch(pattern, HTML)]
end

#=
The strings extracted from the HTML tags are still contaminated with inline annotations in
(), [], {}, etc. The text or other information inside is often not useful for training ngrams.
This function removes these delimiter characters and the information inside them.
Returns the cleaned string.
=#

"""
    clean_inside_tags(TEXT::AbstractString, SYMBOL_START::String, SYMBOL_STOP::String)

Remove every balanced `SYMBOL_START ... SYMBOL_STOP` region from `TEXT`,
including the delimiters themselves.

Regions may be nested; an outer region is removed together with everything
inside it. Delimiters without a partner (an opener that is never closed, or a
closer with no preceding opener) are left in the text untouched.

# Arguments
- `TEXT`: text to clean.
- `SYMBOL_START`: opening delimiter, e.g. "(".
- `SYMBOL_STOP`: closing delimiter, e.g. ")".

# Returns
The cleaned text. `TEXT` itself is returned when there is nothing to remove.
"""
function clean_inside_tags(TEXT::AbstractString, SYMBOL_START::String, SYMBOL_STOP::String)
    (isempty(SYMBOL_START) || isempty(SYMBOL_STOP)) && return TEXT

    open_stack = Int[]              # start indices of openers still waiting for a closer
    spans = Tuple{Int,Int}[]        # (start index, index just past the closer)
    i = firstindex(TEXT)
    stop_at = lastindex(TEXT)
    while i <= stop_at
        rest = SubString(TEXT, i)
        if !isempty(open_stack) && startswith(rest, SYMBOL_STOP)
            next_i = i + ncodeunits(SYMBOL_STOP)
            push!(spans, (pop!(open_stack), next_i))
            i = next_i
        elseif startswith(rest, SYMBOL_START)
            push!(open_stack, i)
            i += ncodeunits(SYMBOL_START)
        else
            i = nextind(TEXT, i)
        end
    end
    isempty(spans) && return TEXT

    # Spans are either disjoint or nested. After sorting by start, any span that
    # begins before the current position lies inside one that was already dropped.
    sort!(spans; by=first)
    out = IOBuffer()
    pos = firstindex(TEXT)
    for (span_start, span_next) in spans
        span_start < pos && continue
        print(out, SubString(TEXT, pos, prevind(TEXT, span_start)))
        pos = span_next
    end
    pos <= stop_at && print(out, SubString(TEXT, pos))
    return String(take!(out))
end

#=
This function combines extraction and cleaning of the most prominent text from a random Wikipedia
page. It extracts the text within the <p> tags (always the case in Wikipedia), then uses the
clean_inside_tags function to clear <>, (), [] and {}.
Returns a String with clean text.
=#

"""
    clean_text_wiki(HTML::AbstractString)

Collect the text of every `<p>` element in `HTML`. For each paragraph the
regions delimited by `<>`, `[]`, `()` and `{}` are stripped, in that order, with
`clean_inside_tags`. The cleaned paragraphs are concatenated without a
separator.

Returns the concatenated text as a `String` (empty when no paragraph is found).
"""
function clean_text_wiki(HTML::AbstractString)
    delimiter_pairs = (("<", ">"), ("[", "]"), ("(", ")"), ("{", "}"))
    cleaned = map(extract_elements(HTML, "p")) do paragraph
        for (opener, closer) in delimiter_pairs
            paragraph = clean_inside_tags(paragraph, opener, closer)
        end
        paragraph
    end
    return join(cleaned)
end




clean_text_wiki (generic function with 1 method)

In [16]:
# Smoke test: fetch one random English article; the result is a (title, html) tuple.
get_random_wikipedia_page("en")

("_wiki_Megachile_cinctiventris", "<!DOCTYPE html>\n<html class=\"client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled\" lang=\"en\" dir=\"ltr\">\n<head>\n<meta charset=\"UTF-8\">\n<title>Megachile cinctiventris - Wikipedia</title>\n<script>document.documentElement.className=\"client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled vector-feature-zebra-design-disabled\";(function(){v

In [37]:

using SHA

"""
    html2text(content::AbstractString)

Convert an HTML fragment to plain text.

Processing order:
1. `<script>` and `<style>` elements (with their content) and HTML comments are
   each replaced by a single space.
2. Line-break tags (`<br>`, `<br/>`, `<br />`, any letter case) become a newline.
3. Every remaining tag is replaced by a single space.
4. Character references are decoded in one pass: `&nbsp;`, `&quot;`, `&amp;`,
   `&lt;` and `&gt;` map to their characters; any other reference becomes a
   space. Decoding in one pass means `&amp;lt;` yields the text `&lt;` rather
   than being decoded a second time into `<`.

Returns the resulting text as a `String`.
"""
function html2text(content::AbstractString)
    markup_patterns = [
        r"<[\s]*?script[^>]*?>[\s\S]*?<[\s]*?/[\s]*?script[\s]*?>" => " ",
        r"<[\s]*?style[^>]*?>[\s\S]*?<[\s]*?/[\s]*?style[\s]*?>" => " ",
        r"<!--[\s\S]*?-->" => " ",
        r"<br\s*/?>"i => "\n",
        r"<[\s\S]*?>" => " ",
    ]
    for p in markup_patterns
        content = replace(content, p)
    end

    named_entities = Dict(
        "&nbsp;" => " ",
        "&quot;" => "\"",
        "&amp;" => "&",
        "&lt;" => "<",
        "&gt;" => ">",
    )
    # One pass over all references: text produced by decoding is never rescanned.
    content = replace(content, r"&#?\w{1,6};" => entity -> get(named_entities, entity, " "))
    return content
end

"""
    download_wikipedia_text(lang_code, pages=1, sleep_time=10; path="./corpus")

Download random Wikipedia pages for `lang_code` and store their cleaned text as
`.txt` files under `joinpath(path, lowercase(lang_code))`.

The loop starts at the number of entries already in that directory plus one and
runs up to `pages`, so `pages` is a target total rather than a number of new
downloads. Before each download the function returns early if a file or
directory named `stop` exists directly under `path`.

Titles longer than 47 characters are shortened to their first 20 characters, the
first 5 hex digits of the title's SHA-1, and their last 20 characters. A page
whose cleaned text is empty is reported but no file is written for it.

# Arguments
- `lang_code`: Wikipedia subdomain code; lower-cased before use.
- `pages`: target number of entries in the language directory.
- `sleep_time`: seconds to sleep after each page.

# Keywords
- `path`: root directory of the corpus.

# Returns
`nothing`. Errors from the HTTP requests or the file system propagate unchanged.
"""
function download_wikipedia_text(lang_code::String, pages::Integer=1, sleep_time::Integer = 10; path="./corpus")
    lang_code = lowercase(lang_code)
    dirpath = joinpath(path, lang_code)
    mkpath(dirpath)
    for i in (length(readdir(dirpath)) + 1):pages
        ispath(joinpath(path, "stop")) && return nothing
        title, text = get_random_wikipedia_page(lang_code)
        if length(title) > 47
            # `first`/`last` count characters, matching the `length` check above,
            # so non-ASCII titles cannot hit an invalid byte index.
            title = first(title, 20) * "_" * bytes2hex(sha1(title))[1:5] * "_" * last(title, 20)
        end
        text = clean_text_wiki(text)
        text = html2text(text)
        text = replace(text, r"\n\n+" => "\n")
        fn = joinpath(dirpath, title * ".txt")
        # Flag that an existing file with the same name is about to be overwritten.
        ispath(fn) && print("!+ ")
        print(i, ". ", abspath(fn))
        if !isempty(text)
            open(fn, "w") do f
                write(f, text)
            end
        end
        println(" ✓")
        sleep(sleep_time)
    end
    return nothing
end


download_wikipedia_text (generic function with 3 methods)

In [11]:
# Download into ./corpus/train/en using the default page target (1) and delay (10 s).
download_wikipedia_text("en", path=raw"./corpus/train")

In [38]:

# Wikipedia subdomain codes to harvest. Japanese Wikipedia's code is "ja" (not "jp").
langs = ["ar", "cs", "da", "de", "el", "en", "es", "fa", "fi", "fr", "he", "hi", "hu", "it", "ja", "ko", "nl", "no", "pl", "pt", "ru", "sv", "tr", "uk", "zh"]

@show length(langs)
step = 10
total = 1000
# Grow every language corpus in rounds of `step` pages so all languages progress
# together. `total` is appended in case it is not a multiple of `step`; `unique`
# drops the duplicate final round when it is.
for i in unique([step:step:total; total])
    println("### $i ###")
    for lang in langs
        download_wikipedia_text(lang, i, path=raw"./corpus/train")
    end
end
# To stop the harvest between pages:  touch corpus/train/stop
# To allow it to run again:           rm corpus/train/stop

length(langs) = 25
### 0 ###


### 10 ###
### 20 ###
### 30 ###


23. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/en/_wiki_Carlby.txt ✓


24. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/en/_wiki_Mollkirch.txt ✓


25. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/en/_wiki_Coulter_(surname).txt ✓


26. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/en/_wiki_Entertainment__95d68_ssociation_of_Canada.txt ✓


27. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/en/_wiki_Otter_Lake_(Se_a7c29_d_District,_Ontario).txt ✓


28. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/en/_wiki_Madison_Townsh_8841a_rson_County,_Indiana.txt ✓


29. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/en/_wiki_Fernandocrambus_cuprescens.txt ✓


30. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/en/_wiki_Allobates_chalcopis.txt ✓


21. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Thomas_M%C3%BCller_(esquiador).txt ✓


22. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Discrom%C3%ADa.txt ✓


23. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Coetupo.txt ✓


24. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Anexo:Medaller_d6ae9_tud_de_Singapur_2010.txt ✓


25. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Puilboreau_(Hait%C3%AD).txt ✓


26. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Colin_Gibbons.txt ✓


27. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Hungerford_(Texas).txt ✓


28. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Prosopocoilus_dorsalis.txt ✓


29. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Stiphropus_gruberi.txt ✓


30. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/es/_wiki_Armando_Migliari.txt ✓


21. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%D9%BE%D9%86%D_077b0_7%D9%86%D9%88%DA%86).txt ✓


22. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%D9%86%D9%88%D_f1fe8_B4%D9%85%DB%8C%D8%AA.txt ✓


23. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%D8%B4%D9%88%DA%AF%D9%88%D9%86.txt ✓




24. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%D8%A8%D8%A7%D_6fac5_A7%D8%B3%DA%AF%D9%88.txt ✓


25. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%D8%A8%D8%B1%D_1f2a6_A7%D9%86%D8%AF%DB%8C.txt ✓


26. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%DA%AF%DB%8C%D_6038f_A7%D9%85%D8%AA%DB%8C.txt ✓


27. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%D8%B3%D8%B1%D_297d9_F%DA%AF%D8%A7%D9%86).txt ✓


28. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%D9%87%D8%A7%D_cfd97_85%D8%A8%D8%A7%D8%AE.txt ✓


29. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%DA%86%D8%A7%D_6848e_B9%D9%84%DB%8C%D8%A7.txt ✓


30. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fa/_wiki_%D9%86%DB%8C%D8%B4%DA%AF%D9%88%D9%86.txt ✓


21. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Pierre_Bonnard.txt ✓


22. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Dubki.txt ✓


23. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Kootut_kertomukset_(Edgar_Allan_Poe).txt ✓


24. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Luettelo_Pohjois-Makedonian_kaupungeista.txt ✓


25. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Koloman_Moser.txt ✓


26. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Arvi_Pohjanp%C3%A4%C3%A4.txt ✓


27. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Juoksj%C3%A4rvi.txt ✓


28. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Antti_Pihlstr%C3%B6m.txt ✓


29. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Agathodaimon_(yhtye).txt ✓


30. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fi/_wiki_Adi_H%C3%BCtter.txt ✓


21. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fr/_wiki_Balanso.txt ✓


22. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fr/_wiki_Korioukivka.txt ✓




23. /home/guoyongzhi/LanguageIdentification.jl/corpus/train/fr/_wiki_Bennwihr.txt ✓
