Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Octocat-spinner-32-eaf2f5

Cannot retrieve contributors at this time

executable file 38 lines (25 sloc) 1.235 kb
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
#!/bin/bash
#
# Spider a list of web addresses one level deep and record every downloaded
# HTML page that references a Twitter / Facebook sharing or widget endpoint.
#
# Files (all relative to the current directory):
#   output-prod-owned.txt  read   - source address list
#   input-social.txt       write  - cleaned copy of the address list
#   output-social.txt      write  - paths of matching pages, plus one
#                                   "skipping <address>" line per ignored
#                                   address
#   websocial/             write  - scratch download area; each site's tree
#                                   is removed as soon as it has been scanned
#
# Requires: wget, grep, sed, tr.
#
# `set -e` is deliberately not used: wget returns non-zero for routine
# per-page problems, and one bad site must not stop the whole run.

set -u

readonly source_file="output-prod-owned.txt"
readonly inputfile="input-social.txt"
readonly outputdir="websocial"
readonly outputfile="output-social.txt"
# Extended regex (matched case-insensitively); addresses matching it are
# not spidered.
readonly ignore_domain="addons"
# Extended regex identifying social sharing / widget references.
readonly social_pattern='platform\.twitter\.com|connect\.facebook\.com|twitter\.com/share|facebook\.com/sharer|twitter\.com/intent/tweet'
# File-name suffixes / patterns wget must not keep. Quoted so that the shell
# never expands "*smartproxy*" against the current directory.
readonly reject_list='.swf,.JPG,.PNG,.GIF,.tiff,.bmp,*smartproxy*,.ppt,.ics,.gz,.xpi,.pdf,.exe,.rss,.js,.png,.css,.gif,.jpg,.ico,.flv,.dmg,.zip,.txt'
readonly user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/9.0.1'

# Print a diagnostic on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

#######################################
# Print the host part of an address.
# Arguments: $1 - address, with or without a "scheme://" prefix
# Outputs:   the text after the first "//" (if any) up to the first
#            "/", "?" or "#"
#######################################
extract_domain() {
  local rest=${1#*//}
  printf '%s\n' "${rest%%[/?#]*}"
}

#######################################
# Tell whether an address must be skipped.
# Globals:   ignore_domain (read)
# Arguments: $1 - address
# Returns:   0 if the address matches ignore_domain, non-zero otherwise
#######################################
is_ignored() {
  printf '%s\n' "$1" | grep -qiE -e "$ignore_domain"
}

#######################################
# Download an address and the pages it links to (one level deep) below
# $outputdir, staying on the given domain.
# Globals:   outputdir, reject_list, user_agent (read)
# Arguments: $1 - address, $2 - domain to stay on
# Returns:   always 0; wget's status is intentionally ignored because a
#            partial download is still worth scanning.
#######################################
spider_site() {
  local address=$1 domain=$2

  # NOTE: --no-check-certificate disables TLS verification; kept because the
  # original script relied on it. "--" stops an address that starts with "-"
  # from being parsed as an option.
  wget -D "$domain" -R "$reject_list" \
    -r -q -l 1 -nc --connect-timeout=5 -P "$outputdir" \
    --no-check-certificate --html-extension -Q 2m \
    -U "$user_agent" \
    -- "$address"
  return 0
}

#######################################
# Append to $outputfile the path of every *.html file under a directory
# that contains a social sharing / widget reference.
# Globals:   social_pattern (read), outputfile (appended to)
# Arguments: $1 - directory to scan
# Returns:   always 0
#######################################
scan_site() {
  local site_dir=$1
  local status=0

  if [[ ! -d "$site_dir" ]]; then
    warn "nothing was downloaded into $site_dir"
    return 0
  fi

  grep -lriE --include='*.html' -e "$social_pattern" -- "$site_dir" \
    >> "$outputfile" || status=$?
  # grep exits 1 when no file matched; only a status above 1 is an error.
  if (( status > 1 )); then
    warn "grep failed (status $status) while scanning $site_dir"
  fi
  return 0
}

#######################################
# Entry point: clean the address list, then spider and scan each address.
# Returns: 0 on success, 1 if the source list cannot be read or cleaned
#######################################
main() {
  local address domain site_dir

  if [[ ! -r "$source_file" ]]; then
    warn "error: cannot read $source_file"
    return 1
  fi

  # Drop the first run of '*' / space characters on each line (bullet
  # markers). The expression is unanchored, and because '\' is literal inside
  # a bracket expression it is stripped as part of that run as well.
  if ! sed -E 's/[\* ]+//' -- "$source_file" > "$inputfile"; then
    warn "error: could not write $inputfile"
    return 1
  fi

  # Start from an empty report and an empty scratch area.
  : > "$outputfile"
  rm -rf -- "${outputdir:?}"

  # Addresses are whitespace-separated tokens. They are read one per line on
  # descriptor 3 so that they are never glob-expanded (URLs often contain
  # "?" or "*") and so that nothing inside the loop can consume the list
  # from stdin.
  while IFS= read -r -u 3 address || [[ -n "$address" ]]; do
    [[ -n "$address" ]] || continue

    if is_ignored "$address"; then
      printf 'skipping %s\n' "$address"
      printf 'skipping %s\n' "$address" >> "$outputfile"
      continue
    fi

    domain=$(extract_domain "$address")
    # An empty or dot-only domain would make the per-site directory resolve
    # to the scratch root (or its parent); refuse to work on such a path.
    case "$domain" in
      '' | . | ..)
        warn "cannot determine a domain for $address; not spidering it"
        continue
        ;;
    esac
    site_dir="./$outputdir/$domain"

    printf 'spidering %s\n' "$address"
    spider_site "$address" "$domain"
    scan_site "$site_dir"

    printf 'deleting %s\n' "$domain"
    rm -rf -- "$site_dir"
  done 3< <(tr -s ' \t' '\n' < "$inputfile")

  printf 'Done.\n'
}

main "$@"
Something went wrong with that request. Please try again.