From ee0a070cc7106602c0a007aeca4f84f2931da3a4 Mon Sep 17 00:00:00 2001
From: Robert Reininger <robert.reininger@sec-research.com>
Date: Wed, 27 Nov 2019 13:43:30 +0100
Subject: [PATCH 1/2] Add new --convert-umlauts parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This new parameter converts the common Latin-1 umlauts ä, ö, ü (and their uppercase forms Ä, Ö, Ü) and the sharp s ß to ae, oe, ue (Ae, Oe, Ue) and ss.
---
 README  | 12 ++++++++----
 cewl.rb | 28 +++++++++++++++++++---------
 2 files changed, 27 insertions(+), 13 deletions(-)
diff --git a/README b/README
index 47aadfc..51bb9ec 100644
--- a/README
+++ b/README
@@ -27,6 +27,10 @@ GitHub: https://github.com/digininja/CeWL
 Change Log
 ==========
 
+Version 5.4.4
+-------------
+Added the --convert-umlauts parameter to convert Latin-1 umlauts (e.g. "ä" to "ae", "ö" to "oe", etc.)
+
 Version 5.4.3
 -------------
 Added the --with-number parameter to make words include letters and numbers
@@ -224,21 +228,21 @@ Usage: cewl [OPTIONS] ... <url>
 	-c, --count: Show the count for each word found.
 	-v, --verbose: Verbose.
 	--debug: Extra debug information.
-      
+
 	Authentication
 	--auth_type: Digest or basic.
 	--auth_user: Authentication username.
 	--auth_pass: Authentication password.
-      
+
 	Proxy Support
 	--proxy_host: Proxy host.
 	--proxy_port: Proxy port, default 8080.
 	--proxy_username: Username for proxy, if required.
 	--proxy_password: Password for proxy, if required.
-      
+
 	Headers
 	--header, -H: In format name:value - can pass multiple.
-      
+
     <url>: The site to spider.
 
 Ruby Doc
diff --git a/cewl.rb b/cewl.rb
index 27b21e4..f5faa08 100755
--- a/cewl.rb
+++ b/cewl.rb
@@ -143,7 +143,7 @@ def allowed?(a_url, parsed_url)
 	# Lifted from the original gem to fix the case statement
 	# which checked for Fixednum not Integer as
 	# Fixednum has been deprecated.
-	# 
+	#
 	def on(code, p = nil, &block)
 		f = p ? p : block
 		case code
@@ -244,7 +244,7 @@ def get_page(uri, &block) #:nodoc:
 			end
 
 			res = http.request(req)
-			
+
 			if res.redirect?
 				puts "Redirect URL" if @debug
 				base_url = uri.to_s[0, uri.to_s.rindex('/')]
@@ -475,6 +475,7 @@ def push(value)
 		['--meta_file', GetoptLong::REQUIRED_ARGUMENT],
 		['--email_file', GetoptLong::REQUIRED_ARGUMENT],
 		['--with-numbers', GetoptLong::NO_ARGUMENT],
+		['--convert-umlauts', GetoptLong::NO_ARGUMENT],
 		['--meta', "-a", GetoptLong::NO_ARGUMENT],
 		['--email', "-e", GetoptLong::NO_ARGUMENT],
 		['--count', '-c', GetoptLong::NO_ARGUMENT],
@@ -505,6 +506,7 @@ def usage
 	-u, --ua <agent>: User agent to send.
 	-n, --no-words: Don't output the wordlist.
 	--with-numbers: Accept words with numbers in as well as just letters
+	--convert-umlauts: Convert common ISO-8859-1 (Latin-1) umlauts (ä-ae, ö-oe, ü-ue, ß-ss)
 	-a, --meta: include meta data.
 	--meta_file file: Output file for meta data.
 	-e, --email: Include email addresses.
@@ -513,21 +515,21 @@ def usage
 	-c, --count: Show the count for each word found.
 	-v, --verbose: Verbose.
 	--debug: Extra debug information.
-      
+
 	Authentication
 	--auth_type: Digest or basic.
 	--auth_user: Authentication username.
 	--auth_pass: Authentication password.
-      
+
 	Proxy Support
 	--proxy_host: Proxy host.
 	--proxy_port: Proxy port, default 8080.
 	--proxy_username: Username for proxy, if required.
 	--proxy_password: Password for proxy, if required.
-      
+
 	Headers
 	--header, -H: In format name:value - can pass multiple.
-      
+
     <url>: The site to spider.
 
 "
@@ -551,6 +553,7 @@ def usage
 meta_temp_dir = "/tmp/"
 keep = false
 words_with_numbers = false
+convert_umlauts = false
 show_count = false
 auth_type = nil
 auth_user = nil
@@ -575,6 +578,8 @@ def usage
 				usage
 			when "--with-numbers"
 				words_with_numbers = true
+			when "--convert-umlauts"
+				convert_umlauts = true
 			when "--count"
 				show_count = true
 			when "--meta-temp-dir"
@@ -621,9 +626,9 @@ def usage
 				# of each element in the array
 				tmp_exclude_array.each do |line|
 					exc = line.strip
-					if exc != "" 
+					if exc != ""
 						exclude_array << line.strip
-						# puts "Excluding #{ line.strip}" 
+						# puts "Excluding #{ line.strip}"
 					end
 				end
 			when '--ua'
@@ -966,13 +971,18 @@ def usage
 						end
 
 						if wordlist
-							# Remove any symbols
+							# Remove/convert any symbols
 							if words_with_numbers then
 								words.gsub!(/[^[[:alnum:]]]/i, " ")
 							else
 								words.gsub!(/[^[[:alpha:]]]/i, " ")
 							end
 
+							if convert_umlauts then
+								words.gsub!(/[^a-zäöüßÄÖÜ]/i, " ")
+								words.gsub!(/[äöüßÄÖÜ]/, "ä" => "ae", "ö" => "oe", "ü" => "ue", "ß" => "ss", "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue")
+							end
+
 							# Add to the array
 							words.split(" ").each do |word|
 								if word.length >= min_word_length

From a575a36d9ef142cf95b69c2f1464e840fabf156f Mon Sep 17 00:00:00 2001
From: 5p1n <20203510+5p1n@users.noreply.github.com>
Date: Thu, 28 Nov 2019 17:26:42 +0100
Subject: [PATCH 2/2] =?UTF-8?q?Remove=20the=20substitution=20of=20characte?=
 =?UTF-8?q?rs=20other=20than=20=C3=A4=C3=B6=C3=BC=C3=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The removed gsub! line is a remnant of another use case in which every character that was not converted (anything other than a-z, ä, ö, ü, ß, Ä, Ö, Ü) had to be replaced with a space.
---
 cewl.rb | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cewl.rb b/cewl.rb
index 982fa08..6c30283 100755
--- a/cewl.rb
+++ b/cewl.rb
@@ -988,7 +988,6 @@ def usage
 							end
 
 							if convert_umlauts then
-								words.gsub!(/[^a-zäöüßÄÖÜ]/i, " ")
 								words.gsub!(/[äöüßÄÖÜ]/, "ä" => "ae", "ö" => "oe", "ü" => "ue", "ß" => "ss", "Ä" => "Ae", "Ö" => "Oe", "Ü" => "Ue")
 							end