Threaded model
Chris More committed May 25, 2011
1 parent 075bfb0 commit a543edc
Showing 2 changed files with 12 additions and 144 deletions.
145 changes: 6 additions & 139 deletions check.sh
@@ -4,153 +4,20 @@
# The output of the script is wiki markup (a sortable table plus summary bullets)
# Author: Chris More

## Settings ##

inputfile="input.txt"
exec `sort -f -o $inputfile $inputfile`
output="output.txt"
analytics_string="webtrendslive.com"
check_analytics_coverage=1
create_active_websites_wiki=1
websites_output="active-websites.txt"
ignore_domain="addons"
#####
outputfile="output.txt"

total_websites=0
total_analytics=0
total_ok=0
total_error=0
total_redirect=0
total_ftp=0
today=`date +%m-%d-%Y`
#####

if [ $create_active_websites_wiki == 1 ]; then
exec `cat /dev/null > $websites_output`
fi
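# Truncate the output file and remove any previous spider cache. (The exec `command` wrapping just runs each command in a subshell; exec itself receives no arguments and is effectively a no-op.)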
exec `cat /dev/null > $outputfile`
exec `rm -rf web > /dev/null`

input=`cat $inputfile`

echo "The following list and updates is as of $today
{| class='wikitable sortable' border='1'
|-
! scope='col' | Web Address
! scope='col' | Status
! scope='col' | Analytics Installed
! scope='col' | Analytics Page Coverage" > $output

for address in $input; do

coverage=0

echo "|-" >> $output

#determine if this is a website or ftp server

if echo "$address" | grep -i '^ftp'; then
pro="ftp"
(( total_ftp++ ))
else
pro="http"
(( total_websites++ ))
fi
sleep .25
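# Throttle briefly, then hand this address to check-domain.sh in the background so domains are checked in parallel (the threaded model this commit introduces).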
./check-domain.sh $address $outputfile &

#Check the status code of the address
response=$(curl --write-out %{http_code} --silent --output /dev/null $pro://$address)

#Determine a human readable status code message
if [ $response == "200" ] || [ $response == "226" ]; then
status="Ok"
check_html_url="$pro://$address"
(( total_ok++ ))
elif [ $response == "404" ]; then
status="Error: $response Not Found"
(( total_error++ ))
elif [ $response == "500" ]; then
status="Error: $response Internal Server Error"
(( total_error++ ))
elif [ $response == "301" ] || [ $response == "302" ]; then
# Check to see if a website is just redirecting from http to https
website_redirected=$(curl --write-out %{redirect_url} --silent --output /dev/null $pro://$address)
if [ "https://$address/" == "$website_redirected" ]; then
pro="https"
# Check the redirect target again in case it redirects a second time (localization)
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)
if [ "$website_redirected2" == "" ]; then
#If the website did not redirect again after switching to https, then set check_html_url to the current address.
status="Ok"
check_html_url=$website_redirected
(( total_ok++ ))
else
#If the website redirected a second time, set the check_html_url variable to the second redirected address.
status="Ok"
check_html_url=$website_redirected2
(( total_ok++ ))
fi

else
status="Redirected: $website_redirected"
(( total_redirect++ ))
fi
elif [ $response == "000" ]; then
status="Error: Unable to connect"
(( total_error++ ))
elif [ $response == "403" ]; then
status="Error: $response Forbidden"
(( total_error++ ))
elif [ $response == "502" ]; then
status="Error: $response Bad Gateway"
(( total_error++ ))
else
status="Error: $response"
(( total_error++ ))
fi

#Check to see if the website has analytics code installed

if [ "$status" == "Ok" ] && [ "$pro" != "ftp" ]; then
#Only check if the website is not redirecting or erroring out
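# Count the characters of any lines that contain the analytics string; a count of 0 means the tracking tag was not found on the page.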
analytics_check=$(curl --silent $check_html_url | grep -i $analytics_string | wc -m | sed 's/ //g')

if [ "$analytics_check" == "0" ]; then
analytics="No"
coverage="N/A"
else
analytics="Yes"
(( total_analytics++ ))
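# Only spider when coverage checking is enabled and the URL does not contain the ignored domain (the substitution below leaves the URL unchanged only when $ignore_domain is absent).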
if [ $check_analytics_coverage == 1 ] && [ "$check_html_url" == "${check_html_url/$ignore_domain/}" ]; then
#Spider every page on the website to determine the % of pages with analytics
echo "Spidering $address..."
coverage=`./find-analytics.sh $check_html_url $analytics_string`
else
coverage="N/A"
fi
if [ "$coverage" == "0" ]; then
coverage="N/A"
fi
fi

if [ $create_active_websites_wiki == 1 ]; then
exec `./active-websites.sh $check_html_url $websites_output > /dev/null`
fi
else
analytics="N/A"
analytics_check=0
fi

echo "$address, $response, $analytics ($analytics_check)"
echo "| [$pro://$address $address] || $status || $analytics || $coverage%" >> $output
done

echo "|}
== Statistics ==
* Total Websites: [[Domain Names/all|$total_websites]]
* Total FTP servers: $total_ftp
* Total Ok: $total_ok
* Total Errors: $total_error
* Total Redirects: $total_redirect
* Total Analytics Installed: $total_analytics
The source script for this page can be found [https://github.com/chrismore/Domain-Name-Status-Checker here]." >> $output
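The per-domain check is being handed off to check-domain.sh, which this diff does not include. Purely as a hedged sketch, assuming the worker mirrors the inline curl probe above (the body below is an illustrative assumption, not the repository's actual code):

#!/bin/bash
# Illustrative sketch only -- not the actual check-domain.sh from this repository.
# Usage: ./check-domain.sh <address> <outputfile>
address=$1
outputfile=$2
# Probe the address and record its HTTP status code.
response=$(curl --write-out %{http_code} --silent --output /dev/null "http://$address")
# Append a single result line to the shared output file.
echo "$address $response" >> "$outputfile"

Launched with a trailing & from check.sh, each such worker runs concurrently while the main loop moves on to the next address.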
11 changes: 6 additions & 5 deletions find-analytics.sh
@@ -8,16 +8,17 @@ address=$1
analytics_string=$2
domain=`echo $address | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/"`
# Remove all spider cache
exec `rm -rf web > /dev/null`
#exec `rm -rf hts* > /dev/null`
# Spider every page. (An HTTrack-based alternative, which requires HTTrack + libssl.so installed, is kept below but commented out; wget is used instead.)
#exec `httrack "$address" -w -T5 -p1 -N3 -Q -%x -I0 -A9999999999 -%c10 -c5 -F "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" > /dev/null`
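# Mirror the site with wget: stay on $domain, skip binary and asset extensions, recurse to full depth, and save pages under ./web with .html extensions.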
exec `wget -D $domain -R .ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt -r -l99999 -nc --connect-timeout=5 -Pweb --no-check-certificate --html-extension -U "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" $address`
exec `wget -D $domain -R *smartproxy*,.ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt -r -q -l99999 -nc --connect-timeout=5 -Pweb --no-check-certificate --html-extension -U "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" $address`
# Grep for the number of pages that include the analytics string - stop at the first occurrence of the string in each file
#finds=`find ./web -name "*.html" | xargs grep -sicl $analytics_string | wc -l | sed 's/ //g'`
finds=`grep -lri "$analytics_string" ./web | wc -l | sed 's/ //g'`
finds=`grep -lri "$analytics_string" ./web/$domain | wc -l | sed 's/ //g'`
# Find how many HTML pages have been spidered
files=`find ./web/$domain -type f \( -name "*.html" -or -name "*.htm" \) | wc -l | sed 's/ //g'`
# Some mirrored files sit behind a proxy and are not part of the website, but wget still picks them up. The -E 'string' option enables regex matching
ignore_files=`find ./web/$domain -type f \( -name "*.html" -or -name "*.htm" \) | grep -i -E 'smartproxy' | wc -l | sed 's/ //g'`
# Subtract the ignored files from files to get a final number.
files=`echo $files-$ignore_files|bc`
#echo "found $finds files with string out of $files files"
if [ $files -ge 2 ]; then
# If more than one page is returned, then calculate the percentage
