Threaded model
Chris More committed May 25, 2011
1 parent 075bfb0 commit a543edc
Showing 2 changed files with 12 additions and 144 deletions.
145 changes: 6 additions & 139 deletions check.sh
@@ -4,153 +4,20 @@
# The output of the script is wiki markup (a sortable table plus summary bullets)
# Author: Chris More

## Settings ##

inputfile="input.txt"
exec `sort -f -o $inputfile $inputfile`
output="output.txt"
analytics_string="webtrendslive.com"
check_analytics_coverage=1
create_active_websites_wiki=1
websites_output="active-websites.txt"
ignore_domain="addons"
#####
outputfile="output.txt"

total_websites=0
total_analytics=0
total_ok=0
total_error=0
total_redirect=0
total_ftp=0
today=`date +%m-%d-%Y`
#####

if [ $create_active_websites_wiki == 1 ]; then
exec `cat /dev/null > $websites_output`
fi
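# Truncate the output file and remove any previous spider cache. (The exec `command` wrapping just runs each command in a subshell; exec itself receives no arguments and is effectively a no-op.)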
exec `cat /dev/null > $outputfile`
exec `rm -rf web > /dev/null`

input=`cat $inputfile`

echo "The following list and updates is as of $today
{| class='wikitable sortable' border='1'
|-
! scope='col' | Web Address
! scope='col' | Status
! scope='col' | Analytics Installed
! scope='col' | Analytics Page Coverage" > $output

for address in $input; do

coverage=0

echo "|-" >> $output

#determine if this is a website or ftp server

if echo "$address" | grep -i '^ftp'; then
pro="ftp"
(( total_ftp++ ))
else
pro="http"
(( total_websites++ ))
fi
sleep .25
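# Throttle briefly, then hand this address to check-domain.sh in the background so domains are checked in parallel (the threaded model this commit introduces).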
./check-domain.sh $address $outputfile &

#Check the status code of the address
response=$(curl --write-out %{http_code} --silent --output /dev/null $pro://$address)

#Determine a human readable status code message
if [ $response == "200" ] || [ $response == "226" ]; then
status="Ok"
check_html_url="$pro://$address"
(( total_ok++ ))
elif [ $response == "404" ]; then
status="Error: $response Not Found"
(( total_error++ ))
elif [ $response == "500" ]; then
status="Error: $response Internal Server Error"
(( total_error++ ))
elif [ $response == "301" ] || [ $response == "302" ]; then
# Check to see if a website is just redirecting from http to https
website_redirected=$(curl --write-out %{redirect_url} --silent --output /dev/null $pro://$address)
if [ "https://$address/" == "$website_redirected" ]; then
pro="https"
# Check the redirect target again in case it redirects a second time (localization)
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)
if [ "$website_redirected2" == "" ]; then
#If the website did not redirect again after switching to https, then set check_html_url to the current address.
status="Ok"
check_html_url=$website_redirected
(( total_ok++ ))
else
#If the website redirected a second time, set the check_html_url variable to the second redirected address.
status="Ok"
check_html_url=$website_redirected2
(( total_ok++ ))
fi

else
status="Redirected: $website_redirected"
(( total_redirect++ ))
fi
elif [ $response == "000" ]; then
status="Error: Unable to connect"
(( total_error++ ))
elif [ $response == "403" ]; then
status="Error: $response Forbidden"
(( total_error++ ))
elif [ $response == "502" ]; then
status="Error: $response Bad Gateway"
(( total_error++ ))
else
status="Error: $response"
(( total_error++ ))
fi

#Check to see if the website has analytics code installed

if [ "$status" == "Ok" ] && [ "$pro" != "ftp" ]; then
#Only check if the website is not redirecting or erroring out
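# Count the characters of any lines that contain the analytics string; a count of 0 means the tracking tag was not found on the page.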
analytics_check=$(curl --silent $check_html_url | grep -i $analytics_string | wc -m | sed 's/ //g')

if [ "$analytics_check" == "0" ]; then
analytics="No"
coverage="N/A"
else
analytics="Yes"
(( total_analytics++ ))
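# Only spider when coverage checking is enabled and the URL does not contain the ignored domain (the substitution below leaves the URL unchanged only when $ignore_domain is absent).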
if [ $check_analytics_coverage == 1 ] && [ "$check_html_url" == "${check_html_url/$ignore_domain/}" ]; then
#Spider every page on the website to determine the % of pages with analytics
echo "Spidering $address..."
coverage=`./find-analytics.sh $check_html_url $analytics_string`
else
coverage="N/A"
fi
if [ "$coverage" == "0" ]; then
coverage="N/A"
fi
fi

if [ $create_active_websites_wiki == 1 ]; then
exec `./active-websites.sh $check_html_url $websites_output > /dev/null`
fi
else
analytics="N/A"
analytics_check=0
fi

echo "$address, $response, $analytics ($analytics_check)"
echo "| [$pro://$address $address] || $status || $analytics || $coverage%" >> $output
done

echo "|}
== Statistics ==
* Total Websites: [[Domain Names/all|$total_websites]]
* Total FTP servers: $total_ftp
* Total Ok: $total_ok
* Total Errors: $total_error
* Total Redirects: $total_redirect
* Total Analytics Installed: $total_analytics
The source script for this page can be found [https://github.com/chrismore/Domain-Name-Status-Checker here]." >> $output
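The per-domain check is being handed off to check-domain.sh, which this diff does not include. Purely as a hedged sketch, assuming the worker mirrors the inline curl probe above (the body below is an illustrative assumption, not the repository's actual code):

#!/bin/bash
# Illustrative sketch only -- not the actual check-domain.sh from this repository.
# Usage: ./check-domain.sh <address> <outputfile>
address=$1
outputfile=$2
# Probe the address and record its HTTP status code.
response=$(curl --write-out %{http_code} --silent --output /dev/null "http://$address")
# Append a single result line to the shared output file.
echo "$address $response" >> "$outputfile"

Launched with a trailing & from check.sh, each such worker runs concurrently while the main loop moves on to the next address.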
11 changes: 6 additions & 5 deletions find-analytics.sh
@@ -8,16 +8,17 @@ address=$1
analytics_string=$2
domain=`echo $address | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/"`
# Remove all spider cache
exec `rm -rf web > /dev/null`
#exec `rm -rf hts* > /dev/null`
# Spider every page. (An HTTrack-based alternative, which requires HTTrack + libssl.so installed, is kept below but commented out; wget is used instead.)
#exec `httrack "$address" -w -T5 -p1 -N3 -Q -%x -I0 -A9999999999 -%c10 -c5 -F "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" > /dev/null`
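# Mirror the site with wget: stay on $domain, skip binary and asset extensions, recurse to full depth, and save pages under ./web with .html extensions.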
exec `wget -D $domain -R .ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt -r -l99999 -nc --connect-timeout=5 -Pweb --no-check-certificate --html-extension -U "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" $address`
exec `wget -D $domain -R *smartproxy*,.ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt -r -q -l99999 -nc --connect-timeout=5 -Pweb --no-check-certificate --html-extension -U "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" $address`
# Grep for the number of pages that include the analytics string - stop at the first occurrence of the string in each file
#finds=`find ./web -name "*.html" | xargs grep -sicl $analytics_string | wc -l | sed 's/ //g'`
finds=`grep -lri "$analytics_string" ./web | wc -l | sed 's/ //g'`
finds=`grep -lri "$analytics_string" ./web/$domain | wc -l | sed 's/ //g'`
# Find how many HTML pages have been spidered
files=`find ./web/$domain -type f \( -name "*.html" -or -name "*.htm" \) | wc -l | sed 's/ //g'`
# Some mirrored files sit behind a proxy and are not part of the website, but wget still picks them up. The -E 'string' option enables regex matching
ignore_files=`find ./web/$domain -type f \( -name "*.html" -or -name "*.htm" \) | grep -i -E 'smartproxy' | wc -l | sed 's/ //g'`
# Subtract the ignored files from files to get a final number.
files=`echo $files-$ignore_files|bc`
#echo "found $finds files with string out of $files files"
if [ $files -ge 2 ]; then
# If more than one page is returned, then calculate the percentage
