From ee0a070cc7106602c0a007aeca4f84f2931da3a4 Mon Sep 17 00:00:00 2001 From: Robert Reininger Date: Wed, 27 Nov 2019 13:43:30 +0100 Subject: [PATCH 1/2] Add new --convert-umlauts parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This new parameter adds functionality to convert the common Latin-1 umlauts ä, ö, ü and ß to ae, oe, ue and ss --- README | 12 ++++++++---- cewl.rb | 28 +++++++++++++++++++--------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/README b/README index 47aadfc..51bb9ec 100644 --- a/README +++ b/README @@ -27,6 +27,10 @@ GitHub: https://github.com/digininja/CeWL Change Log ========== +Version 5.4.4 +------------- +Added the --convert-umlauts parameter to convert Latin-1 umlauts (e.g. "ä" to "ae", "ö" to "oe", etc.) + Version 5.4.3 ------------- Added the --with-number parameter to make words include letters and numbers @@ -224,21 +228,21 @@ Usage: cewl [OPTIONS] ... -c, --count: Show the count for each word found. -v, --verbose: Verbose. --debug: Extra debug information. - + Authentication --auth_type: Digest or basic. --auth_user: Authentication username. --auth_pass: Authentication password. - + Proxy Support --proxy_host: Proxy host. --proxy_port: Proxy port, default 8080. --proxy_username: Username for proxy, if required. --proxy_password: Password for proxy, if required. - + Headers --header, -H: In format name:value - can pass multiple. - + : The site to spider. Ruby Doc diff --git a/cewl.rb b/cewl.rb index 27b21e4..f5faa08 100755 --- a/cewl.rb +++ b/cewl.rb @@ -143,7 +143,7 @@ def allowed?(a_url, parsed_url) # Lifted from the original gem to fix the case statement # which checked for Fixednum not Integer as # Fixednum has been deprecated. - # + # def on(code, p = nil, &block) f = p ? p : block case code @@ -244,7 +244,7 @@ def get_page(uri, &block) #:nodoc: end res = http.request(req) - + if res.redirect? puts "Redirect URL" if @debug base_url = uri.to_s[0, uri.to_s.rindex('/')] @@ -475,6 +475,7 @@ def push(value) ['--meta_file', GetoptLong::REQUIRED_ARGUMENT], ['--email_file', GetoptLong::REQUIRED_ARGUMENT], ['--with-numbers', GetoptLong::NO_ARGUMENT], + ['--convert-umlauts', GetoptLong::NO_ARGUMENT], ['--meta', "-a", GetoptLong::NO_ARGUMENT], ['--email', "-e", GetoptLong::NO_ARGUMENT], ['--count', '-c', GetoptLong::NO_ARGUMENT], @@ -505,6 +506,7 @@ def usage -u, --ua : User agent to send. -n, --no-words: Don't output the wordlist. --with-numbers: Accept words with numbers in as well as just letters + --convert-umlauts: Convert common ISO-8859-1 (Latin-1) umlauts (ä-ae, ö-oe, ü-ue, ß-ss) -a, --meta: include meta data. --meta_file file: Output file for meta data. -e, --email: Include email addresses. @@ -513,21 +515,21 @@ def usage -c, --count: Show the count for each word found. -v, --verbose: Verbose. --debug: Extra debug information. - + Authentication --auth_type: Digest or basic. --auth_user: Authentication username. --auth_pass: Authentication password. - + Proxy Support --proxy_host: Proxy host. --proxy_port: Proxy port, default 8080. --proxy_username: Username for proxy, if required. --proxy_password: Password for proxy, if required. - + Headers --header, -H: In format name:value - can pass multiple. - + : The site to spider. " @@ -551,6 +553,7 @@ def usage meta_temp_dir = "/tmp/" keep = false words_with_numbers = false +convert_umlauts = false show_count = false auth_type = nil auth_user = nil @@ -575,6 +578,8 @@ def usage usage when "--with-numbers" words_with_numbers = true + when "--convert-umlauts" + convert_umlauts = true when "--count" show_count = true when "--meta-temp-dir" @@ -621,9 +626,9 @@ def usage # of each element in the array tmp_exclude_array.each do |line| exc = line.strip - if exc != "" + if exc != "" exclude_array << line.strip - # puts "Excluding #{ line.strip}" + # puts "Excluding #{ line.strip}" end end when '--ua' @@ -966,13 +971,18 @@ def usage end if wordlist - # Remove any symbols + # Remove/convert any symbols if words_with_numbers then words.gsub!(/[^[[:alnum:]]]/i, " ") else words.gsub!(/[^[[:alpha:]]]/i, " ") end + if convert_umlauts then + words.gsub!(/[^a-zäöüßÄÖÜ]/i, " ") + words.gsub!(/[äöüßÄÖÜ]/, "ä" => "ae", "ö" => "oe", "ü" => "ue", "ß" => "ss", "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue") + end + # Add to the array words.split(" ").each do |word| if word.length >= min_word_length From a575a36d9ef142cf95b69c2f1464e840fabf156f Mon Sep 17 00:00:00 2001 From: 5p1n <20203510+5p1n@users.noreply.github.com> Date: Thu, 28 Nov 2019 17:26:42 +0100 Subject: [PATCH 2/2] =?UTF-8?q?Remove=20the=20substitution=20of=20characte?= =?UTF-8?q?rs=20other=20than=20=C3=A4=C3=B6=C3=BC=C3=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This part of code is a remnant of another use case in which characters not converted had to be removed. --- cewl.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/cewl.rb b/cewl.rb index 982fa08..6c30283 100755 --- a/cewl.rb +++ b/cewl.rb @@ -988,7 +988,6 @@ def usage end if convert_umlauts then - words.gsub!(/[^a-zäöüßÄÖÜ]/i, " ") words.gsub!(/[äöüßÄÖÜ]/, "ä" => "ae", "ö" => "oe", "ü" => "ue", "ß" => "ss", "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue") end