-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
executable file
·121 lines (88 loc) · 2.1 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#! /usr/bin/env python
import urllib, sgmllib, sys, getopt, time, re
class MyParser(sgmllib.SGMLParser):
    """SGML parser that harvests <a href> and <img src> values from a page."""

    def __init__(self, verbose=0):
        "Initialise the parser, forwarding 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        self.hyperlinks = []  # every href seen, in document order
        self.img = []         # every img src seen, in document order

    def parse(self, s):
        "Feed the whole document string 's' to the parser and finalise it."
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        "Record the href attribute of each <a> tag encountered."
        self.hyperlinks.extend(value for name, value in attributes
                               if name == "href")

    def start_img(self, attributes):
        "Record the src attribute of each <img> tag encountered."
        self.img.extend(value for name, value in attributes
                        if name == "src")

    def get_hyperlinks(self):
        "Return the list of hrefs gathered so far."
        return self.hyperlinks
class CheckLinks():
def __init__(self):
self.urls404 = []
self.invalid = []
self.start = time.time()
def _getcode(self,url):
f = urllib.urlopen(url)
code = f.getcode()
f.close()
return code
def check(self, url):
p = re.compile('http://')
match = p.findall(url)
if match.count('http://'):
test = url
else:
test = site + url
print '> %s' % test
try:
code= self._getcode(test)
if code == 404:
self.urls404.append(url)
return code
except:
self.invalid.append(url)
return 0
def looplinks(self,links):
count = 0
print 'Checking:'
for url in links:
self.check(url)
count += 1
print '---------'
timetaken = time.time() - self.start
print 'Scanned %d urls in %.2f seconds' % ( count, timetaken )
print '---------'
try:
site = sys.argv[1]
except:
site = 'http://google.com'
print 'Plese give a url'
try:
f = urllib.urlopen(site)
s = f.read()
f.close()
myparser = MyParser()
myparser.parse(s)
links = myparser.get_hyperlinks()
except:
print 'Cant find %s' % site
check = CheckLinks()
check.looplinks(links)
countinvalid = 0
for i in check.invalid:
countinvalid += 1
if countinvalid:
print 'There was %d invalid URLs' % countinvalid
print check.invalid
count404 = 0
for i in check.urls404:
count404 += 1
if count404:
print 'There were %d not found pages' % count404
print check.urls404