forked from ptwobrussell/Mining-the-Social-Web
-
Notifications
You must be signed in to change notification settings - Fork 0
/
microformats__xfn_crawl.py
108 lines (85 loc) · 2.43 KB
/
microformats__xfn_crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# -*- coding: utf-8 -*-
import HTMLParser
import os
import sys
import urllib2
import urlparse

import networkx as nx
from BeautifulSoup import BeautifulSoup
# Command line: ROOT_URL is required, MAX_DEPTH is optional and defaults to 1.
# Exit with a usage message instead of an IndexError traceback when the
# required argument is missing.
if len(sys.argv) < 2:
    sys.exit('Usage: %s ROOT_URL [MAX_DEPTH]' % sys.argv[0])

ROOT_URL = sys.argv[1]
MAX_DEPTH = int(sys.argv[2]) if len(sys.argv) > 2 else 1
# rel tokens treated as XFN relationships. The crawl follows an anchor when
# its rel attribute shares at least one whitespace-separated token with this
# set. Listed alphabetically.
XFN_TAGS = set((
    'acquaintance', 'child', 'co-resident', 'co-worker', 'colleague',
    'contact', 'crush', 'date', 'friend', 'kin', 'me', 'met', 'muse',
    'neighbor', 'parent', 'sibling', 'spouse', 'sweetheart',
))

# File name (inside the "out" directory) the graph is written to.
OUT = "graph.dot"
# Breadth-first crawl starting at ROOT_URL and going MAX_DEPTH levels deep.
# Each fetched page becomes a node of ``g``; each anchor whose rel attribute
# contains at least one XFN_TAGS token becomes an edge from the page to the
# anchor's target, labelled with the rel value. The target node is labelled
# with the anchor text.
depth = 0
g = nx.DiGraph()
next_queue = [ROOT_URL]
visited = set()  # URLs already fetched, so each page is downloaded only once

while depth < MAX_DEPTH:
    depth += 1
    (queue, next_queue) = (next_queue, [])

    for item in queue:
        if item in visited:
            continue
        visited.add(item)

        try:
            page = urllib2.urlopen(item)
        except (urllib2.URLError, ValueError):
            # urlopen raises ValueError (not URLError) for a URL that has
            # no scheme, e.g. a ROOT_URL given as "example.com".
            print('Failed to fetch ' + item)
            continue

        try:
            soup = BeautifulSoup(page)
        except HTMLParser.HTMLParseError:
            print('Failed to parse ' + item)
            continue
        finally:
            # Release the HTTP connection whether or not parsing succeeded.
            page.close()

        if not g.has_node(item):
            g.add_node(item)

        for a in soup.findAll('a'):
            rel = a.get('rel')
            href = a.get('href')
            if not rel or not href:
                # Nothing to classify, or nowhere to point the edge.
                continue
            if not set(rel.split()) & XFN_TAGS:
                continue

            # Resolve relative links against the page they were found on so
            # the node id is a fetchable absolute URL.
            friend_url = urlparse.urljoin(item, href)

            # Use the anchor's text as the node label; fall back to the URL
            # when the anchor has no text (e.g. it only wraps an image).
            text = u''.join(a.findAll(text=True))
            if text.strip():
                label = text
            else:
                label = friend_url

            g.add_edge(item, friend_url)
            g[item][friend_url]['label'] = rel.encode('utf-8')
            g.node[friend_url]['label'] = label.encode('utf-8')

            next_queue.append(friend_url)

# Further analysis of the graph could be accomplished here
# Write the graph to out/<OUT> in DOT format. networkx's writer is preferred;
# when it raises ImportError a minimal hand-written DOT file is produced.
if not os.path.isdir('out'):
    os.mkdir('out')

try:
    nx.drawing.write_dot(g, os.path.join('out', OUT))
except ImportError:

    # Help for Windows users:
    # Not a general purpose method, but representative of
    # the same output write_dot would provide for this graph
    # if installed and easy to implement

    def _dot_quote(value):
        """Return value as a double-quoted DOT string of UTF-8 bytes."""
        # Node ids may be unicode while labels are already UTF-8 bytes;
        # normalise to bytes so the two can be combined without an implicit
        # ASCII decode.
        if isinstance(value, unicode):
            value = value.encode('utf-8')
        # Escape backslashes and quotes so the value cannot end the quoted
        # string early.
        return '"%s"' % value.replace('\\', '\\\\').replace('"', '\\"')

    dot = []
    for (n1, n2) in g.edges():
        # Fall back to the node id if a target was never given a label.
        node_label = g.node[n2].get('label', n2)
        dot.append('%s [label=%s]' % (_dot_quote(n2), _dot_quote(node_label)))
        dot.append('%s -> %s [label=%s]' % (_dot_quote(n1), _dot_quote(n2),
                                            _dot_quote(g[n1][n2]['label'])))

    # The with-statement closes the file even if the write fails.
    with open(os.path.join('out', OUT), 'w') as f:
        f.write('''strict digraph {
%s
}''' % (';\n'.join(dot), ))
# *nix users can render the graph to an image file with a good layout
# by running the following from a terminal inside the "out" directory:
# $ circo -Tpng -Ograph graph.dot
# Windows users can pass the same options to circo.exe
# or open the file in the GVedit desktop application