-
Notifications
You must be signed in to change notification settings - Fork 198
/
test_links.py
135 lines (112 loc) · 4.11 KB
/
test_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import requests
import re
import os, subprocess, signal, sys
from getopt import getopt
from time import sleep
# This test runs through all html files in /notebooks,
# and verifies that links are working.
# to do
# check anchor tags as well
# accept flag to check deployed pages
# should I be using beautifulsoup to parse html?
# Get command-line arguments
# -r --root: The root directory that files are served from
# (not yet implemented) -d --directory: The directory where files are stored locally
# Should clarify that this test currently pulls links from local files,
# but can test deployed version of files. That only makes sense
# if deployed version matches current local version.
# Should either pull links locally, and test locally, or
# pull from deployed site, and test deployed site.
root = 'http://localhost:8000/'
opts, args = getopt(sys.argv[1:], "r:", ["root=",])
for opt, arg in opts:
if opt in ('--root', '-r'):
root = arg + '/'
print("Using root: ", root)
def get_links_in_line(line):
# Returns a list of links contained in a line of code.
links = []
# Split lines so they start with links.
# ie, segments will be ~ ="hello_world.html">blah
link_re = """=(["'])(.*?)(["'].*)"""
p = re.compile(link_re)
for segment in line.split('a href'):
m = p.match(segment)
if m:
#print 'match: ', m.group(2)
links.append(m.group(2))
return links
def get_links_in_file(root_dir, filename):
# Returns a list of all the links in a file.
f = open(root_dir + filename, 'r')
lines = f.readlines()
f.close()
links_to_check = []
for line in lines:
if 'a href' in line:
for link in get_links_in_line(line):
# Ignore anchor tags for now
if 'html' in link:
links_to_check.append(link)
return links_to_check
def check_links(filename, links, bad_links, links_tested):
# Checks all links given, and adds bad links to bad_links.
print("links to check: ", links)
for link in links:
print("Checking link: %s..." % link)
# Only check links that haven't already been checked:
if link in links_tested:
continue
# External links don't need our root.
if 'http' in link:
url = link
else:
url = root + link
print('checking url: ', url)
r = requests.get(url)
print('Status code: ', r.status_code)
if r.status_code != 200:
bad_links[filename + '---' + link] = r.status_code
else:
links_tested.append(link)
# Location of html files
# Assume all files in this directory, no nesting
root_dir = '/srv/projects/intro_programming/intro_programming/notebooks/'
# Get all html filenames in this directory.
# Use os.walk if files end up nested.
filenames = []
for filename in os.listdir(root_dir):
if 'html' in filename:
filenames.append(filename)
# Start a server locally, in the notebooks directory.
print("Starting server...")
cmd = 'chdir /srv/projects/intro_programming/intro_programming/notebooks/ && '
cmd += 'python -m SimpleHTTPServer'
pro = subprocess.Popen(cmd, shell=True, preexec_fn=os.setsid)
# Make sure server has a chance to start before making request.
print("SLEEPING...")
sleep(1)
# Report on progress as test runs, but store bad links for final report.
# dict: {page---link: status_code}
bad_links = {}
# Only test each unique link once.
links_tested = []
num_links_checked = 0
# Check links in all files.
#filenames = ['var_string_num.html']
for filename in filenames:
links_to_check = get_links_in_file(root_dir, filename)
check_links(filename, links_to_check, bad_links, links_tested)
num_links_checked += len(links_to_check)
# Kill the server process
os.killpg(pro.pid, signal.SIGTERM)
# Report on bad links.
print("\n\n*** Bad Links ***")
if bad_links:
for link in bad_links:
print('\n', bad_links[link], link)
else:
print("Congratulations, all links are working.")
print("\n")
print("Checked %d links." % num_links_checked)
print("Tested %d unique links." % len(links_tested))