Commit

Merge branch 'master' of github.com:chrismore/Domain-Name-Status-Checker
Chris More committed Jun 27, 2011
2 parents ae1cb14 + 2410117 commit d694f56
Showing 6 changed files with 93 additions and 115 deletions.
44 changes: 18 additions & 26 deletions active-websites.sh
@@ -1,32 +1,24 @@
#!/bin/bash

address=$1
input=$2
output=$3
output=$2

domain=`echo $address | sed -r "s/^(.+\/\/)([^/]+)(.*)/\2/"`
found=`grep -i $domain $input | wc -l | sed -r "s/ //g"`
domain=`echo $address | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/"`
title=`curl -s $address | grep -i "title>" | sed -E "s/^[^<]+//g" | sed -E "s/<title>//g" | sed -E "s/<\/title>//g" | sed -E "s/\n|\t//g"`
found=`grep -i $domain current-websites.txt | wc -l | sed -E "s/ //g"`

if [ "$found" == "0" ]; then

title=`curl -s $address | grep -i "title>" | sed ':a;N;$!ba;s/\n//g' | sed -r "s/^[^<]+//g" | sed -r "s/<(title|TITLE)>//g" | sed -r "s/<\/(title|TITLE)>//g" | sed -r "s/([^<]+)(.*)/\1/g" | sed 's/[^a-z0-9\-\: ]*$//g'`

if [ "$title" != "Index of " ] && [ "$title" != "" ]; then

echo "== $title ==
* Prod URL: $address
* Stage URL:
* Code Repo:
* L10N Repo:
* Code:
* Licensing:
* Product Owner:
* Dev Team:
* QA Lead:
* Team Email:
* Last reviewed:
" >> $output

fi

fi
echo "== [[Websites/Template|$title]] ==
* Prod URL: $address
* Stage URL: http://stage.example.com/
* Code Repo: http://www.code-repository-url.com/
* L10N Repo: http://www.l10n-repository-url.com/
* Code: Language / Framework
* Product Owner: Group; Person
* Dev Team: Group; Person
* QA Lead: Person
* Team Email: team-email@example.com
* Last reviewed: Person on mm/dd/yyyy
" >> $output
fi
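
The new version also swaps GNU-only sed -r for sed -E, which both GNU and BSD (macOS) sed accept. A minimal standalone sketch of the domain-extraction step above, using a hypothetical script name and example URL:

#!/bin/bash
# extract-domain.sh - hypothetical sketch of the domain extraction used above.
# -E (extended regex) is understood by both GNU and BSD sed, unlike GNU-only -r.
address="$1"                                   # e.g. http://www.mozilla.org/en-US/
# Keep only what sits between "//" and the next "/".
domain=$(echo "$address" | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/")
echo "$domain"                                 # prints www.mozilla.org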
94 changes: 53 additions & 41 deletions check-domain.sh
@@ -1,14 +1,12 @@
#!/bin/bash

address=$1
input=$2
output=$3
output=$2

# These are the domains that should not be scanned for page analytics due to their size and use of comprehensive templates
ignore_domain="allizom|\-cdn|\.stage|addons\.mozilla\.org|support\.mozilla\.com|developer\.mozilla\.org|www\.getpersonas\.com|creative\.mozilla\.org|stage\.|\-stage|stage\-|\-cdn"
ignore_domain="allizom|-cdn|\.stage"
analytics_string="webtrendslive.com"
check_analytics_coverage=1
concurrent_procs=10
concurrent_procs=20

# Determine if this is a website or an FTP server

@@ -34,32 +32,48 @@ elif [ $response == "500" ]; then
status_type="error"
elif [ $response == "301" ] || [ $response == "302" ]; then
# Check to see if a website is just redirecting from http to https
website_redirected=$(curl --write-out %{url_effective} --silent --output /dev/null -L $pro://$address)
domain=`echo $website_redirected | sed -r 's/^(.+\/\/)([^/]+)(.*)/\2/'`

if [[ "$domain" == "$address" ]]; then
# website redirected, but stayed on domain.
# Check to make sure that if the website was redirected, it did not redirect to a 404 page.
response=$(curl --write-out %{http_code} --silent --output /dev/null $website_redirected)
if [ $response == "404" ]; then
status="Error: $response Not Found"
status_type="error"
else
status="Ok"
status_type="ok"
check_html_url=$website_redirected
website_redirected=$(curl --write-out %{redirect_url} --silent --output /dev/null $pro://$address)
if [ "https://$address/" == "$website_redirected" ]; then
pro="https"
# Check redirector again in case it redirects a second time (localization)
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)
if [ "$website_redirected2" == "" ]; then
#If the website did not redirect again after switching to https, then set check_html_url to current address.
status="Ok"
status_type="ok"
check_html_url=$website_redirected
else
#If the website redirected a second time, set the check_html_url variable to the second redirected address.
status="Ok"
status_type="ok"
check_html_url=$website_redirected2
fi
elif [[ "www.$address" == $domain ]]; then
# Redirected to www
status="Ok"
status_type="ok"
check_html_url=$website_redirected
else
# website redirected to another domain.
status="Redirected: $website_redirected"
status_type="redirect"
# website stayed http
domain=`echo $website_redirected | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/"`
if [[ "$domain" == "$address" ]]; then

# website redirected, but stayed on the same domain. Probably l10n redirection.
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)

if [ "$website_redirected2" == "" ]; then
# website did not redirect to a subdirectory.
status="Ok"
status_type="ok"
check_html_url=$website_redirected
else
#If the website redirected a second time, set the check_html_url variable to the second redirected address.
status="Ok"
status_type="ok"
check_html_url=$website_redirected2
fi

else
# website redirected to another domain.
status="Redirected: $website_redirected"
status_type="redirect"
fi
fi

elif [ $response == "000" ]; then
status="Error: Unable to connect"
status_type="error"
@@ -74,6 +88,15 @@ else
status_type="error"
fi

if [ "$check_html_url" != "" ]; then
# Check to make sure that if the website was redirected, it did not redirect to a 404 page.
response=$(curl --write-out %{http_code} --silent --output /dev/null $check_html_url)
if [ $response == "404" ]; then
status="Error: $response Not Found"
status_type="error"
fi
fi

#Check to see if the website has analytics code installed

if [ "$status" == "Ok" ] && [ "$pro" != "ftp" ]; then
@@ -91,15 +114,15 @@ if [ "$status" == "Ok" ] && [ "$pro" != "ftp" ]; then
if [ $check_analytics_coverage == 1 ] && [ $ignore_domain_check == 0 ]; then

# First check to see how many wget spiders are running. This is to keep from running too many spiders and driving up the load average.
procs=`ps a | grep -i wget | wc -l | sed 's/ //g'`
procs=`ps -ax | grep -i wget | wc -l | sed 's/ //g'`
total_procs=`echo $procs-1|bc`

while [ $total_procs -gt $concurrent_procs ]
# If more than n wgets are running, sleep for a minute and try again.
do
echo "Sleeping ($total_procs waiting)...."
sleep 60
procs=`ps a | grep -i wget | wc -l | sed 's/ //g'`
procs=`ps -ax | grep -i wget | wc -l | sed 's/ //g'`
total_procs=`echo $procs-1|bc`

done
@@ -125,14 +148,3 @@ fi
echo "$address, $status_type, $analytics ($analytics_check)"
status=`echo $status | sed 's/ /\+/g'`
echo "$address,$pro,$status,$status_type,$analytics,$coverage" >> $output

## Check to see if all processes are finished to decide whether to run create-wiki.sh

input_len=`wc -l $input | sed -r 's/^([0-9]+) (.+)/\1/g'`
output_len=`wc -l $output | sed -r 's/^([0-9]+) (.+)/\1/g'`

if [ "$input_len" == "$output_len" ]; then
# Write the temp file of already-known giant websites back to output.txt before creating the wiki.
cat output-temp.txt >> output.txt
./create-wiki.sh
fi
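
The redirect handling above moves from curl's %{url_effective} with -L (which silently follows the whole chain) to %{redirect_url} without -L, which reports only the next hop and is empty when the response is not a redirect, so the script can inspect each hop (http -> https -> l10n) separately. A minimal sketch contrasting the two, under hypothetical names:

#!/bin/bash
# redirect-probe.sh - hypothetical sketch contrasting the two curl write-out variables.
address="$1"    # e.g. www.mozilla.org

# Pre-commit behavior: -L follows every redirect and %{url_effective}
# reports the final URL that was actually fetched.
final=$(curl --write-out '%{url_effective}' --silent --output /dev/null -L "http://$address")

# Post-commit behavior: without -L, %{redirect_url} reports only the next
# hop, or an empty string when the response is not a redirect.
next=$(curl --write-out '%{redirect_url}' --silent --output /dev/null "http://$address")

echo "final destination: $final"
echo "next hop: ${next:-<none>}"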
4 changes: 2 additions & 2 deletions check.sh
@@ -16,8 +16,8 @@ input=`cat $inputfile`

for address in $input; do

#sleep 2
./check-domain.sh $address $inputfile $outputfile &
sleep 2
./check-domain.sh $address $outputfile &

done

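check.sh now staggers its background workers by two seconds instead of launching them all at once, and passes only the output file through. A sketch of the fan-out pattern, with worker.sh as a hypothetical stand-in for check-domain.sh; a while/read loop is used here because for address in $input would split on any whitespace:

#!/bin/bash
# Fan-out sketch: one backgrounded worker per input line, started 2 s apart.
# worker.sh is a hypothetical stand-in for check-domain.sh.
inputfile="$1"
outputfile="$2"

while read -r address; do
    sleep 2                                   # stagger start-up
    ./worker.sh "$address" "$outputfile" &    # run in the background
done < "$inputfile"

wait    # block until every backgrounded worker has exited
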
52 changes: 13 additions & 39 deletions create-wiki.sh
@@ -2,19 +2,11 @@

inputfile="output.txt"
output="output-wiki.txt"
output_ok="output-ok.txt"
output_http="output-http.txt"
output_ftp="output-ftp.txt"
output_robots="output-robots.txt"
output_websites="active-websites.txt"
output_prod="output-prod.txt"
input_wiki="current-websites.txt"
websites_output="active-websites.txt"
exec `sort -o $inputfile $inputfile`
exec `cat /dev/null > $output`
exec `cat /dev/null > $output_http`
exec `cat /dev/null > $output_ftp`
exec `cat /dev/null > $output_ok`
exec `cat /dev/null > $output_websites`
exec `cat /dev/null > $websites_output`

total_websites=0
total_analytics=0
@@ -23,20 +15,15 @@ total_error=0
total_redirect=0
total_ftp=0
total_robots_blocked=0
total_prod=0

curl -s https://wiki.mozilla.org/Websites/Active_List > $input_wiki

dev_domains="allizom|cdn\-|\-cdn|\.stage|\-stage|stage\-|stage\."
curl -s https://wiki.mozilla.org/Webdev:WhoWorksOnWhat > current-websites.txt

today=`date +%m-%d-%Y`

echo "The following is a list of active websites that are blocked from ALL robot spidering:
" > $output_robots

echo "== Domain List ==
The following list and updates are as of $today.
echo "The following list and updates are as of $today.
{| class='wikitable sortable' border='1'
|-
@@ -67,21 +54,13 @@ for thisline in $input; do
robots=`./check-robots.sh $address`
if [ "$robots" == "1" ]; then
(( total_robots_blocked++ ))
echo "* [$pro://$address $address]" >> $output_robots
fi

ignore_domain_check=`echo $address | grep -i -E $dev_domains | wc -l | sed 's/ //g'`

if [ $ignore_domain_check == 0 ]; then
(( total_prod++ ))
echo "* $address" >> $output_prod
echo "* [$pro://$address/robots.txt $address]" >> $output_robots
fi

address_check=`./get-redirected-address.sh $address`
echo "Check: $address, $address_check"
exec `./active-websites.sh $address_check $input_wiki $output_websites > /dev/null`
echo "* $address" >> $output_ok

exec `./active-websites.sh $address_check $websites_output > /dev/null`

elif [ "$status_type" == "error" ]; then
(( total_error++ ))
elif [ "$status_type" == "redirect" ]; then
@@ -90,10 +69,8 @@ for thisline in $input; do

if [[ "$pro" == "ftp" ]]; then
(( total_ftp++ ))
echo "* $address" >> $output_ftp
else
(( total_websites++ ))
echo "* $address" >> $output_http
fi

if [ "$analytics" == "Yes" ]; then
@@ -112,15 +89,12 @@ echo "|}
== Statistics ==
* Total domains: [[Websites/Domain_List/http|$total_websites]]
* Total ok: [[Websites/Domain_List/ok|$total_ok]]
* Total prod: [[Websites/Domain_List/prod|$total_prod]]
* Total websites: [[Domain Names/all|$total_websites]]
* Total FTP servers: $total_ftp
* Total ok: $total_ok
* Total errors: $total_error
* Total redirects: $total_redirect
* Total analytics installed: $total_analytics
* Total robot blocked websites: [[Websites/Domain_List/robots-blocked|$total_robots_blocked]]
== Do you have changes to this list? ==
* Total robot blocked websites: [[Domain Names/robots-blocked|$total_robots_blocked]]
This wiki page is automatically generated by scripts. Please contact [https://ldap.mozilla.org/phonebook/tree.php#search/cmore@mozilla.com Chris More] for more
information. The source script for this page can be found [https://github.com/chrismore/Domain-Name-Status-Checker here]." >> $output
The source script for this page can be found [https://github.com/chrismore/Domain-Name-Status-Checker here]." >> $output
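
The exec `cat /dev/null > $output` lines above empty each output file before a run; the exec-plus-backticks wrapper is not required for that effect. A sketch of equivalent one-liners:

# Each of these truncates (or creates) the file:
cat /dev/null > output-wiki.txt    # the form used above
: > output-wiki.txt                # the : no-op built-in does the same
> output-wiki.txt                  # bash also accepts a bare redirection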
6 changes: 3 additions & 3 deletions find-analytics.sh
@@ -6,13 +6,13 @@

address=$1
analytics_string=$2
domain=`echo $address | sed -r "s/^(.+\/\/)([^/]+)(.*)/\2/"`
domain=`echo $address | sed -E "s/^(.+\/\/)([^/]+)(.*)/\2/"`
# Remove all spider cache
# Spider every page, which requires HTTrack + libssl.so installed
#exec `httrack "$address" -w -T5 -p1 -N3 -Q -%x -I0 -A9999999999 -%c10 -c5 -F "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" > /dev/null`
exec `wget -D $domain -R .swf,.JPG,.PNG,.GIF,.tiff,.bmp,*smartproxy*,.ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt -r -q -l 5 -nc --connect-timeout=5 -Pweb --no-check-certificate --html-extension -U "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" $address`
# Grep for the number of pages that include the analytics string - stop at the first occurrence of the string in each file
finds=`grep -lri "$analytics_string" ./web/$domain --include=*.html | wc -l | sed 's/ //g'`
finds=`grep -lri "$analytics_string" ./web/$domain | wc -l | sed 's/ //g'`
# Find how many HTML pages have been spidered
files=`find ./web/$domain -type f \( -name "*.html" -or -name "*.htm" \) | wc -l | sed 's/ //g'`
# Some mirrored files sit behind a proxy and are not part of the website, but wget still picks them up. The -E 'string' option supports regex matching
@@ -26,4 +26,4 @@ if [ $files -ge 1 ]; then
else
# Return 0 if none or only one page was returned in total.
echo "0"
fi
fi
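
Dropping --include=*.html means the analytics grep now counts hits in every mirrored file rather than only .html pages. A condensed sketch of the coverage count, assuming the ./web/<domain> mirror produced by the wget call above:

#!/bin/bash
# coverage-sketch.sh - hypothetical condensed version of the counting above.
domain="$1"
analytics_string="$2"

# -l prints each matching file once (grep stops at the first hit per file),
# so wc -l counts matching files rather than matching lines.
finds=$(grep -lri "$analytics_string" "./web/$domain" | wc -l | sed 's/ //g')

# Count how many HTML pages the spider actually fetched.
files=$(find "./web/$domain" -type f \( -name "*.html" -or -name "*.htm" \) | wc -l | sed 's/ //g')

if [ "$files" -ge 1 ]; then
    # Integer percentage of spidered pages that carry the analytics string.
    echo "coverage: $(echo "$finds*100/$files" | bc)%"
else
    echo "coverage: 0%"
fi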
8 changes: 4 additions & 4 deletions get-redirected-address.sh
@@ -6,11 +6,11 @@ address=$1
# Find redirected address

# Check to see if a website is just redirecting from http to https
website_redirected=$(curl --write-out %{url_effective} --silent --output /dev/null -L $pro://$address)
website_redirected=$(curl --write-out %{redirect_url} --silent --output /dev/null $pro://$address)
if [ "https://$address/" == "$website_redirected" ]; then
pro="https"
# Check redirector again in case it redirects a second time (localization)
website_redirected2=$(curl --write-out %{url_effective} --silent --output /dev/null -L $website_redirected)
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)
if [ "$website_redirected2" == "" ]; then
#If the website did not redirect again after switching to https, then set address_final to current address.
address_final=$website_redirected
@@ -22,7 +22,7 @@ address=$1
# website stayed http
if [[ "$website_redirected" != "" ]]; then
# website redirected, but stayed on the same domain. Probably l10n redirection.
website_redirected2=$(curl --write-out %{url_effective} --silent --output /dev/null -L $website_redirected)
website_redirected2=$(curl --write-out %{redirect_url} --silent --output /dev/null $website_redirected)

if [ "$website_redirected2" == "" ]; then
# website did not redirect to a subdirectory.
Expand All @@ -37,4 +37,4 @@ address=$1
fi
fi

echo $address_final
echo $address_final
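
get-redirected-address.sh follows at most two %{redirect_url} hops and prints the resolved address on stdout, so callers can capture it with command substitution. A hypothetical usage sketch:

# Resolve where a domain actually lives before spidering it (hypothetical address).
final=$(./get-redirected-address.sh www.example.org)
echo "spider target: $final"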
