-
Notifications
You must be signed in to change notification settings - Fork 0
/
forfatter_scraper.py
77 lines (58 loc) · 2.85 KB
/
forfatter_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Scrape member e-mail addresses from the Norwegian Authors' Union.

Crawls the alphabetical member-index pages linked from the members menu of
http://www.forfatterforeningen.no (English: /english) and extracts the
obfuscated addresses -- written as variants of ``name (a) host.no`` -- then
writes the de-duplicated list to ``DnF_epostliste.txt``, one per line.
"""
import re
import sys
from urllib.request import urlopen

from bs4 import BeautifulSoup

url_base = 'http://www.forfatterforeningen.no'
start_url = 'http://www.forfatterforeningen.no/medlemmer-b'
urls_to_scrape = []  # one index URL per letter of the member list
emails = []          # raw addresses; de-duplicated at write-out

# The site hides addresses by writing "(a)" instead of "@".
# Raw string so the backslashes reach the regex engine unmangled.
_OBFUSCATED = re.compile(r"\(.+\)")


def scrape(url):
    """Collect e-mail addresses from one member-index page into ``emails``.

    The page writes addresses with ``(a)`` for ``@`` in a few different
    spacings; the branches below dispatch on the whitespace-token count
    of the matched text node. Unrecognised layouts are printed so the
    missing pattern can be added later.
    """
    print(url)
    soup = BeautifulSoup(urlopen(url).read(), "html.parser")
    # Each member entry sits in a "field-content" div (two columns match).
    for col in soup.find_all("div", {"class": "field-content"}):
        for tag in col.find_all(text=_OBFUSCATED):
            tokens = tag.split()
            if len(tokens) == 3:
                try:
                    # Plain-space form: 'name (a) host.no'.  Splitting on a
                    # single space can yield fewer than 3 parts when the
                    # separators are other whitespace -- hence the fallback.
                    parts = tag.split(" ")
                    emails.append(parts[-3] + '@' + parts[-1])
                except IndexError:
                    # Compact form: 'name(a)host.no' inside a larger token.
                    local, host = tokens[-1].split('(a)')[0], tokens[-1].split('(a)')[1]
                    emails.append(local + '@' + host)
            elif len(tokens) in (4, 5):
                # Longer lines ('First Last name (a) host.no' etc.): the
                # address is always the last three whitespace tokens.
                emails.append(tokens[-3] + '@' + tokens[-1])
            else:
                # Unexpected layout -- report it rather than guess.
                print("pokker, her er det ugler i mosen..")
                print(tag, len(tokens))


# Collect the per-letter index URLs from the members menu, then scrape each.
soup = BeautifulSoup(urlopen(start_url).read(), "html.parser")
abc_links = soup.find(id="block-menu-menu-meny-medlemmer").find_all('a', href=True)
for link in abc_links:
    urls_to_scrape.append(url_base + link['href'])

for url in urls_to_scrape:
    scrape(url)

# Write the de-duplicated address list, one address per line.
with open('DnF_epostliste.txt', 'w') as f:
    for email in set(emails):
        f.write(email + '\n')

sys.exit("ferdig")