-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathartists_extend.py
More file actions
executable file
·102 lines (70 loc) · 2.91 KB
/
artists_extend.py
File metadata and controls
executable file
·102 lines (70 loc) · 2.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python
import csv
import lxml.html
import requests
import time
from datetime import datetime
'''
This script takes our CSV list of artists and grabs additional information
about each artist from their DBpedia resource, such as:
-movement they were associated with
-influences?
-influenced?
-dates of birth/death
-whether they have a Wikipedia record or not, in the first place
-nationality and much, much more.
'''
# Extend every artist row of ../data/artists_ld.csv with nationality,
# movement, birth/death dates and subject terms scraped from the artist's
# resource page (URL in the third column), and write the result to a new,
# timestamped CSV file under ../data/.

_MISSING = 'nA'          # placeholder used throughout the data for "no value"
_REQUEST_TIMEOUT = 30    # seconds; keeps one stalled request from hanging the run
_NEW_COLUMNS = ['ntnl', 'mvmnt', 'born', 'died', 'subj_terms']


def _local_name(text):
    """Return the part of a prefixed name that follows its first colon.

    ``'dbpedia:Cubism'`` yields ``'Cubism'``. Text without any colon (a plain
    literal) is returned unchanged instead of raising ``IndexError``.
    """
    parts = text.split(':')
    return parts[1] if len(parts) > 1 else text


def _verbatim(text):
    """Return *text* unchanged (used where the page value is kept as-is)."""
    return text


def _scrape(dom, rules):
    """Return the transformed texts for the first rule whose selector matches.

    *rules* is a sequence of ``(css_selector, transform)`` pairs tried in
    order. For the first selector that matches anything in *dom*, the text
    content of every matched element is passed through that rule's transform
    and the results are returned as a list. Returns an empty list when no
    selector matches.
    """
    for selector, transform in rules:
        elements = dom.cssselect(selector)
        if elements:
            return [transform(el.text_content()) for el in elements]
    return []


# Selector rules per output column: the dbpedia-owl form is preferred and the
# dbpprop form is the fallback.
_MOVEMENT_RULES = (
    ('a[rel="dbpedia-owl:movement"]', _local_name),
    ('span[property="dbpprop:movement"]', _local_name),
)
_NATIONALITY_RULES = (
    ('a[rel="dbpedia-owl:nationality"]', _local_name),
    ('span[property="dbpprop:nationality"]', _verbatim),
)
_BIRTH_RULES = (
    ('span[property="dbpedia-owl:birthDate"]', _verbatim),
    ('span[property="dbpprop:birthDate"]', _verbatim),
)
_DEATH_RULES = (
    ('span[property="dbpedia-owl:deathDate"]', _verbatim),
    ('span[property="dbpprop:deathDate"]', _verbatim),
)
_SUBJECT_RULES = (
    ('a[rel="dcterms:subject"]', _local_name),
)

fname = '../data/artists-full-' + datetime.now().isoformat() + '.csv'

# Both files are managed by the ``with`` statement so the output is flushed
# and closed even if a request or parse fails part-way through.
# newline='' is what the csv module expects for files it reads or writes.
# NOTE(review): assumes the input CSV is UTF-8 encoded -- confirm.
with open('../data/artists_ld.csv', 'r', newline='', encoding='utf-8') as src, \
        open(fname, 'w', newline='', encoding='utf-8') as dst:
    r = csv.reader(src)
    w = csv.writer(dst)

    header = next(r)
    header.extend(_NEW_COLUMNS)
    w.writerow(header)

    for row in r:
        if row[2] != 'nA':
            page = requests.get(row[2], timeout=_REQUEST_TIMEOUT)
            dom = lxml.html.fromstring(page.content)

            mvmnt = _scrape(dom, _MOVEMENT_RULES) or [_MISSING]
            ntnl = _scrape(dom, _NATIONALITY_RULES) or [_MISSING]
            subj = _scrape(dom, _SUBJECT_RULES) or [_MISSING]

            # When several dates are present on the page, the last one wins.
            births = _scrape(dom, _BIRTH_RULES)
            dob = births[-1] if births else _MISSING
            deaths = _scrape(dom, _DEATH_RULES)
            dod = deaths[-1] if deaths else _MISSING

            # Resource names use underscores in place of spaces; subject
            # terms are deliberately left untouched.
            ntnl = [v.replace('_', ' ') for v in ntnl]
            mvmnt = [v.replace('_', ' ') for v in mvmnt]

            print('adding: ', ntnl, mvmnt, dob, dod, subj, 'for: ', row[1])
            row.extend([', '.join(ntnl), ', '.join(mvmnt), dob, dod,
                        ', '.join(subj)])
        else:
            # NOTE(review): rows without a resource URL are still written,
            # padded with 'nA' so every row has as many columns as the
            # header -- confirm this matches the intended output.
            row.extend([_MISSING] * len(_NEW_COLUMNS))
        w.writerow(row)