Commit

Merge branch 'master' of github.com:chrismore/Domain-Name-Status-Checker
Chris More committed Jun 27, 2011
2 parents ae1cb14 + 2410117 commit d694f56
Showing 6 changed files with 93 additions and 115 deletions.
44 changes: 18 additions & 26 deletions active-websites.sh
@@ -1,32 +1,24 @@
#!/bin/bash

address=$1
input=$2
output=$3
output=$2

domain=`echo $address | sed -r "s/^(.+\/\/)([^/]+)(.*)/\2/"`
found=`grep -i $domain $input | wc -l | sed -r "s/ //g"`
domain=`echo $address | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/"`
title=`curl -s $address | grep -i "title>" | sed -E "s/^[^<]+//g" | sed -E "s/<title>//g" | sed -E "s/<\/title>//g" | sed -E "s/\n|\t//g"`
found=`grep -i $domain current-websites.txt | wc -l | sed -E "s/ //g"`

if [ "$found" == "0" ]; then

title=`curl -s $address | grep -i "title>" | sed ':a;N;$!ba;s/\n//g' | sed -r "s/^[^<]+//g" | sed -r "s/<(title|TITLE)>//g" | sed -r "s/<\/(title|TITLE)>//g" | sed -r "s/([^<]+)(.*)/\1/g" | sed 's/[^a-z0-9\-\: ]*$//g'`

if [ "$title" != "Index of " ] && [ "$title" != "" ]; then

echo "== $title ==
* Prod URL: $address
* Stage URL:
* Code Repo:
* L10N Repo:
* Code:
* Licensing:
* Product Owner:
* Dev Team:
* QA Lead:
* Team Email:
* Last reviewed:
" >> $output

fi

fi
echo "== [[Websites/Template|$title]] ==
* Prod URL: $address
* Stage URL: http://stage.example.com/
* Code Repo: http://www.code-repository-url.com/
* L10N Repo: http://www.l10n-repository-url.com/
* Code: Language / Framework
* Product Owner: Group; Person
* Dev Team: Group; Person
* QA Lead: Person
* Team Email: team-email@example.com
* Last reviewed: Person on mm/dd/yyyy
" >> $output
fi
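
The new version also swaps GNU-only sed -r for sed -E, which both GNU and BSD (macOS) sed accept. A minimal standalone sketch of the domain-extraction step above, using a hypothetical script name and example URL:

#!/bin/bash
# extract-domain.sh - hypothetical sketch of the domain extraction used above.
# -E (extended regex) is understood by both GNU and BSD sed, unlike GNU-only -r.
address="$1"                                   # e.g. http://www.mozilla.org/en-US/
# Keep only what sits between "//" and the next "/".
domain=$(echo "$address" | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/")
echo "$domain"                                 # prints www.mozilla.org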
94 changes: 53 additions & 41 deletions check-domain.sh
@@ -1,14 +1,12 @@
#!/bin/bash

address=$1
input=$2
output=$3
output=$2

# These are the domains that should not be scanned for page analytics due to their size and use of comprehensive templates
ignore_domain="allizom|\-cdn|\.stage|addons\.mozilla\.org|support\.mozilla\.com|developer\.mozilla\.org|www\.getpersonas\.com|creative\.mozilla\.org|stage\.|\-stage|stage\-|\-cdn"
ignore_domain="allizom|-cdn|\.stage"
analytics_string="webtrendslive.com"
check_analytics_coverage=1
concurrent_procs=10
concurrent_procs=20

# Determine if this is a website or an FTP server

@@ -34,32 +32,48 @@ elif [ $response == "500" ]; then
status_type="error"
elif [ $response == "301" ] || [ $response == "302" ]; then
# Check to see if a website is just redirecting from http to https
website_redirected=$(curl --write-out %{url_effective} --silent --output /dev/null -L $pro://$address)
domain=`echo $website_redirected | sed -r 's/^(.+\/\/)([^/]+)(.*)/\2/'`

if [[ "$domain" == "$address" ]]; then
# website redirected, but stayed on domain.
# Check to make sure that if the website was redirected, it did not redirect to a 404 page.
response=$(curl --write-out %{http_code} --silent --output /dev/null $website_redirected)
if [ $response == "404" ]; then
status="Error: $response Not Found"
status_type="error"
else
status="Ok"
status_type="ok"
check_html_url=$website_redirected
website_redirected=$(curl --write-out %{redirect_url} --silent --output /dev/null $pro://$address)
if [ "https://$address/" == "$website_redirected" ]; then
pro="https"
# Check redirector again in case it redirects a second time (localization)
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)
if [ "$website_redirected2" == "" ]; then
#If the website did not redirect again after switching to https, then set check_html_url to current address.
status="Ok"
status_type="ok"
check_html_url=$website_redirected
else
#If the website redirected a second time, set the check_html_url variable to the second redirected address.
status="Ok"
status_type="ok"
check_html_url=$website_redirected2
fi
elif [[ "www.$address" == $domain ]]; then
# Redirected to www
status="Ok"
status_type="ok"
check_html_url=$website_redirected
else
# website redirected to another domain.
status="Redirected: $website_redirected"
status_type="redirect"
# website stayed http
domain=`echo $website_redirected | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/"`
if [[ "$domain" == "$address" ]]; then

# website redirected, but stayed on the same domain. Probably l10n redirection.
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)

if [ "$website_redirected2" == "" ]; then
# website did not redirect to a subdirectory.
status="Ok"
status_type="ok"
check_html_url=$website_redirected
else
#If the website redirected a second time, set the check_html_url variable to the second redirected address.
status="Ok"
status_type="ok"
check_html_url=$website_redirected2
fi

else
# website redirected to another domain.
status="Redirected: $website_redirected"
status_type="redirect"
fi
fi

elif [ $response == "000" ]; then
status="Error: Unable to connect"
status_type="error"
@@ -74,6 +88,15 @@ else
status_type="error"
fi

if [ "$check_html_url" != "" ]; then
# Check to make sure that if the website was redirected, it did not redirect to a 404 page.
response=$(curl --write-out %{http_code} --silent --output /dev/null $check_html_url)
if [ $response == "404" ]; then
status="Error: $response Not Found"
status_type="error"
fi
fi

#Check to see if the website has analytics code installed

if [ "$status" == "Ok" ] && [ "$pro" != "ftp" ]; then
@@ -91,15 +114,15 @@ if [ "$status" == "Ok" ] && [ "$pro" != "ftp" ]; then
if [ $check_analytics_coverage == 1 ] && [ $ignore_domain_check == 0 ]; then

# First check to see how many wget spiders are running. This is to keep from running too many spiders and driving up the load average.
procs=`ps a | grep -i wget | wc -l | sed 's/ //g'`
procs=`ps -ax | grep -i wget | wc -l | sed 's/ //g'`
total_procs=`echo $procs-1|bc`

while [ $total_procs -gt $concurrent_procs ]
# If more than n wgets are running, sleep for a minute and try again.
do
echo "Sleeping ($total_procs waiting)...."
sleep 60
procs=`ps a | grep -i wget | wc -l | sed 's/ //g'`
procs=`ps -ax | grep -i wget | wc -l | sed 's/ //g'`
total_procs=`echo $procs-1|bc`

done
@@ -125,14 +148,3 @@ fi
echo "$address, $status_type, $analytics ($analytics_check)"
status=`echo $status | sed 's/ /\+/g'`
echo "$address,$pro,$status,$status_type,$analytics,$coverage" >> $output

## Check to see if all processes are finished to decide whether to run create-wiki.sh

input_len=`wc -l $input | sed -r 's/^([0-9]+) (.+)/\1/g'`
output_len=`wc -l $output | sed -r 's/^([0-9]+) (.+)/\1/g'`

if [ "$input_len" == "$output_len" ]; then
# Write the temp file of already-known giant websites back to output.txt before creating the wiki.
cat output-temp.txt >> output.txt
./create-wiki.sh
fi
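
The redirect handling above moves from curl's %{url_effective} with -L (which silently follows the whole chain) to %{redirect_url} without -L, which reports only the next hop and is empty when the response is not a redirect, so the script can inspect each hop (http -> https -> l10n) separately. A minimal sketch contrasting the two, under hypothetical names:

#!/bin/bash
# redirect-probe.sh - hypothetical sketch contrasting the two curl write-out variables.
address="$1"    # e.g. www.mozilla.org

# Pre-commit behavior: -L follows every redirect and %{url_effective}
# reports the final URL that was actually fetched.
final=$(curl --write-out '%{url_effective}' --silent --output /dev/null -L "http://$address")

# Post-commit behavior: without -L, %{redirect_url} reports only the next
# hop, or an empty string when the response is not a redirect.
next=$(curl --write-out '%{redirect_url}' --silent --output /dev/null "http://$address")

echo "final destination: $final"
echo "next hop: ${next:-<none>}"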
4 changes: 2 additions & 2 deletions check.sh
@@ -16,8 +16,8 @@ input=`cat $inputfile`

for address in $input; do

#sleep 2
./check-domain.sh $address $inputfile $outputfile &
sleep 2
./check-domain.sh $address $outputfile &

done

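check.sh now staggers its background workers by two seconds instead of launching them all at once, and passes only the output file through. A sketch of the fan-out pattern, with worker.sh as a hypothetical stand-in for check-domain.sh; a while/read loop is used here because for address in $input would split on any whitespace:

#!/bin/bash
# Fan-out sketch: one backgrounded worker per input line, started 2 s apart.
# worker.sh is a hypothetical stand-in for check-domain.sh.
inputfile="$1"
outputfile="$2"

while read -r address; do
    sleep 2                                   # stagger start-up
    ./worker.sh "$address" "$outputfile" &    # run in the background
done < "$inputfile"

wait    # block until every backgrounded worker has exited
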
52 changes: 13 additions & 39 deletions create-wiki.sh
@@ -2,19 +2,11 @@

inputfile="output.txt"
output="output-wiki.txt"
output_ok="output-ok.txt"
output_http="output-http.txt"
output_ftp="output-ftp.txt"
output_robots="output-robots.txt"
output_websites="active-websites.txt"
output_prod="output-prod.txt"
input_wiki="current-websites.txt"
websites_output="active-websites.txt"
exec `sort -o $inputfile $inputfile`
exec `cat /dev/null > $output`
exec `cat /dev/null > $output_http`
exec `cat /dev/null > $output_ftp`
exec `cat /dev/null > $output_ok`
exec `cat /dev/null > $output_websites`
exec `cat /dev/null > $websites_output`

total_websites=0
total_analytics=0
@@ -23,20 +15,15 @@ total_error=0
total_redirect=0
total_ftp=0
total_robots_blocked=0
total_prod=0

curl -s https://wiki.mozilla.org/Websites/Active_List > $input_wiki

dev_domains="allizom|cdn\-|\-cdn|\.stage|\-stage|stage\-|stage\."
curl -s https://wiki.mozilla.org/Webdev:WhoWorksOnWhat > current-websites.txt

today=`date +%m-%d-%Y`

echo "The following is a list of active websites that are blocked from ALL robot spidering:
" > $output_robots

echo "== Domain List ==
The following list and updates are as of $today.
echo "The following list and updates are as of $today.
{| class='wikitable sortable' border='1'
|-
@@ -67,21 +54,13 @@ for thisline in $input; do
robots=`./check-robots.sh $address`
if [ "$robots" == "1" ]; then
(( total_robots_blocked++ ))
echo "* [$pro://$address $address]" >> $output_robots
fi

ignore_domain_check=`echo $address | grep -i -E $dev_domains | wc -l | sed 's/ //g'`

if [ $ignore_domain_check == 0 ]; then
(( total_prod++ ))
echo "* $address" >> $output_prod
echo "* [$pro://$address/robots.txt $address]" >> $output_robots
fi

address_check=`./get-redirected-address.sh $address`
echo "Check: $address, $address_check"
exec `./active-websites.sh $address_check $input_wiki $output_websites > /dev/null`
echo "* $address" >> $output_ok

exec `./active-websites.sh $address_check $websites_output > /dev/null`

elif [ "$status_type" == "error" ]; then
(( total_error++ ))
elif [ "$status_type" == "redirect" ]; then
@@ -90,10 +69,8 @@ for thisline in $input; do

if [[ "$pro" == "ftp" ]]; then
(( total_ftp++ ))
echo "* $address" >> $output_ftp
else
(( total_websites++ ))
echo "* $address" >> $output_http
fi

if [ "$analytics" == "Yes" ]; then
@@ -112,15 +89,12 @@ echo "|}
== Statistics ==
* Total domains: [[Websites/Domain_List/http|$total_websites]]
* Total ok: [[Websites/Domain_List/ok|$total_ok]]
* Total prod: [[Websites/Domain_List/prod|$total_prod]]
* Total websites: [[Domain Names/all|$total_websites]]
* Total FTP servers: $total_ftp
* Total ok: $total_ok
* Total errors: $total_error
* Total redirects: $total_redirect
* Total analytics installed: $total_analytics
* Total robot blocked websites: [[Websites/Domain_List/robots-blocked|$total_robots_blocked]]
== Do you have changes to this list? ==
* Total robot blocked websites: [[Domain Names/robots-blocked|$total_robots_blocked]]
This wiki page is automatically generated by scripts. Please contact [https://ldap.mozilla.org/phonebook/tree.php#search/cmore@mozilla.com Chris More] for more
information. The source script for this page can be found [https://github.com/chrismore/Domain-Name-Status-Checker here]." >> $output
The source script for this page can be found [https://github.com/chrismore/Domain-Name-Status-Checker here]." >> $output
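
The exec `cat /dev/null > $output` lines above empty each output file before a run; the exec-plus-backticks wrapper is not required for that effect. A sketch of equivalent one-liners:

# Each of these truncates (or creates) the file:
cat /dev/null > output-wiki.txt    # the form used above
: > output-wiki.txt                # the : no-op built-in does the same
> output-wiki.txt                  # bash also accepts a bare redirection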
6 changes: 3 additions & 3 deletions find-analytics.sh
@@ -6,13 +6,13 @@

address=$1
analytics_string=$2
domain=`echo $address | sed -r "s/^(.+\/\/)([^/]+)(.*)/\2/"`
domain=`echo $address | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/"`
# Remove all spider cache
# Spider every page, which requires HTTrack + libssl.so installed
#exec `httrack "$address" -w -T5 -p1 -N3 -Q -%x -I0 -A9999999999 -%c10 -c5 -F "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" > /dev/null`
exec `wget -D $domain -R .swf,.JPG,.PNG,.GIF,.tiff,.bmp,*smartproxy*,.ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt -r -q -l 5 -nc --connect-timeout=5 -Pweb --no-check-certificate --html-extension -U "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" $address`
# Grep for the number of pages that include the analytics string - stop at the first occurrence of the string in each file
finds=`grep -lri "$analytics_string" ./web/$domain --include=*.html | wc -l | sed 's/ //g'`
finds=`grep -lri "$analytics_string" ./web/$domain | wc -l | sed 's/ //g'`
# Find how many HTML pages have been spidered
files=`find ./web/$domain -type f \( -name "*.html" -or -name "*.htm" \) | wc -l | sed 's/ //g'`
# Some mirrored files sit behind a proxy and are not part of the website, but wget still picks them up. The -E 'string' option supports regex matching
@@ -26,4 +26,4 @@ if [ $files -ge 1 ]; then
else
# Return 0 if none or only one page was returned in total.
echo "0"
fi
fi
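
Dropping --include=*.html means the analytics grep now counts hits in every mirrored file rather than only .html pages. A condensed sketch of the coverage count, assuming the ./web/<domain> mirror produced by the wget call above:

#!/bin/bash
# coverage-sketch.sh - hypothetical condensed version of the counting above.
domain="$1"
analytics_string="$2"

# -l prints each matching file once (grep stops at the first hit per file),
# so wc -l counts matching files rather than matching lines.
finds=$(grep -lri "$analytics_string" "./web/$domain" | wc -l | sed 's/ //g')

# Count how many HTML pages the spider actually fetched.
files=$(find "./web/$domain" -type f \( -name "*.html" -or -name "*.htm" \) | wc -l | sed 's/ //g')

if [ "$files" -ge 1 ]; then
    # Integer percentage of spidered pages that carry the analytics string.
    echo "coverage: $(echo "$finds*100/$files" | bc)%"
else
    echo "coverage: 0%"
fi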
8 changes: 4 additions & 4 deletions get-redirected-address.sh
@@ -6,11 +6,11 @@ address=$1
# Find redirected address

# Check to see if a website is just redirecting from http to https
website_redirected=$(curl --write-out %{url_effective} --silent --output /dev/null -L $pro://$address)
website_redirected=$(curl --write-out %{redirect_url} --silent --output /dev/null $pro://$address)
if [ "https://$address/" == "$website_redirected" ]; then
pro="https"
# Check redirector again in case it redirects a second time (localization)
website_redirected2=$(curl --write-out %{url_effective} --silent --output /dev/null -L $website_redirected)
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)
if [ "$website_redirected2" == "" ]; then
#If the website did not redirect again after switching to https, then set address_final to current address.
address_final=$website_redirected
@@ -22,7 +22,7 @@ address=$1
# website stayed http
if [[ "$website_redirected" != "" ]]; then
# website redirected, but stayed on the same domain. Probably l10n redirection.
website_redirected2=$(curl --write-out %{url_effective} --silent --output /dev/null -L $website_redirected)
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)

if [ "$website_redirected2" == "" ]; then
# website did not redirect to a subdirectory.
Expand All @@ -37,4 +37,4 @@ address=$1
fi
fi

echo $address_final
echo $address_final
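
get-redirected-address.sh follows at most two %{redirect_url} hops and prints the resolved address on stdout, so callers can capture it with command substitution. A hypothetical usage sketch:

# Resolve where a domain actually lives before spidering it (hypothetical address).
final=$(./get-redirected-address.sh www.example.org)
echo "spider target: $final"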
