-
Notifications
You must be signed in to change notification settings - Fork 1
/
scopusapi.py
206 lines (170 loc) · 7.13 KB
/
scopusapi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# -*- coding: utf-8 -*-
from data_source import DataSource, DataSourceConnection
from model import Publication, Author, Identifier, URL, Index
from htmlform import HTMLForm
from util import strip_bom, make_page_range
from throttle import ThreadingThrottler
from collections import OrderedDict
from urllib import urlencode, quote
import requests
from requests.utils import add_dict_to_cookiejar
import re
# Matches the "(including subseries ...)" suffix Scopus appends to some
# publication names; it is stripped in entries_to_publications to decide
# whether the title denotes a series rather than a plain venue.
INCLUDING_RE = r' \(including subseries [^)]+\)'
class ScopusAPI(DataSource):
    """Factory for Scopus API data-source connections.

    Holds the Elsevier API key and hands out ScopusAPIConnection
    instances bound to that key.
    """

    def __init__(self, api_key):
        # Key is forwarded verbatim to every connection we create.
        self.api_key = api_key

    def connect(self):
        """Open and return a new connection using the stored API key."""
        return ScopusAPIConnection(api_key=self.api_key)
class ScopusAPIConnection(DataSourceConnection):
    """Connection to the Elsevier Scopus Search API.

    Converts results from the Scopus JSON representation into the
    project's internal Publication/Author/Identifier model objects.
    All search methods are generators, so large result sets are
    streamed page by page instead of being materialized at once.
    """

    def __init__(self, api_key):
        # Sent as the ``apiKey`` query parameter on every request.
        self.api_key = api_key

    def find_next_url(self, links, ref='next'):
        """Return the ``@href`` of the first link whose ``@ref`` matches.

        ``links`` is a ``link`` array from a Scopus JSON payload (each
        element carries ``@ref`` and ``@href`` keys). Returns None when
        no link with the requested ``ref`` is present.
        """
        for link in links:
            if link['@ref'] == ref:
                return link['@href']
        return None

    def publications_from_query(self, query):
        """Yield Publication objects for every hit of a Scopus query.

        Follows the OpenSearch ``next`` links until the result set is
        exhausted. Yields nothing when the query has zero results.
        """
        url = 'https://api.elsevier.com/content/search/scopus'
        params = {
            'apiKey': self.api_key,
            'view': 'complete',
            'query': query
        }
        raw_json = requests.get(url, params=params).json()
        search_results = raw_json['search-results']
        total_results = int(search_results['opensearch:totalResults'])
        if total_results == 0:
            return
        entries = raw_json['search-results']['entry']
        for pub in self.entries_to_publications(entries):
            yield pub
        while True:
            next_link = self.find_next_url(raw_json['search-results']['link'])
            if next_link is None:
                break
            # Hotfix for Scopus: for some reason their paging links expect
            # HTTPS to work on port 80, so force port 443 explicitly.
            next_link = next_link.replace('api.elsevier.com:80',
                                          'api.elsevier.com:443')
            raw_json = requests.get(next_link).json()
            entries = raw_json['search-results']['entry']
            for pub in self.entries_to_publications(entries):
                yield pub

    def search_by_author(self, surname, name=None, year=None):
        """Yield publications matching an AUTHOR-NAME query.

        Only the first character of ``name`` (the initial) is used in the
        query; ``year`` optionally restricts results via PUBYEAR.
        """
        query = '{}'.format(surname)
        if name is not None and len(name) > 0:
            # Scopus author search matches on the given-name initial.
            query += ', {}'.format(name[0])
        query = 'AUTHOR-NAME({})'.format(query)
        if year is not None:
            query += ' AND PUBYEAR IS {}'.format(year)
        for pub in self.publications_from_query(query):
            yield pub

    def authors_from_json(self, json):
        """Convert the Scopus ``author`` JSON array into Author objects."""
        def none_to_emptystr(s):
            # Scopus reports a missing given name as JSON null.
            if s is None:
                return ''
            return s
        return [Author(surname=author['surname'],
                       names=[none_to_emptystr(author['given-name'])])
                for author in json]

    def entries_to_publications(self, entries):
        """Convert Scopus JSON entries into internal Publication objects.

        (Originally documented in Slovak: "Prerobi data zo SCOPUS json
        reprezentacie na internu Publication.")
        """
        def empty_to_none(s):
            # Normalize missing/blank strings to None.
            if s is None:
                return None
            s = s.strip()
            if len(s) == 0:
                return None
            return s

        def exists_to_none(d, key):
            # Fetch an optional Scopus field; list-valued fields come as
            # [{'$': value}, ...], scalars as plain strings.
            if key in d:
                if isinstance(d[key], list):
                    return [empty_to_none(x['$']) for x in d[key]]
                else:
                    return empty_to_none(d[key])
            else:
                return None

        def append_identifier(d, key, obj, id_type):
            # Append one Identifier per value found under ``key``
            # (renamed params to avoid shadowing builtins type/id).
            ids = exists_to_none(d, key)
            if ids:
                if isinstance(ids, list):
                    for value in ids:
                        obj.identifiers.append(Identifier(value, type=id_type))
                else:
                    obj.identifiers.append(Identifier(ids, type=id_type))

        for entry in entries:
            author_count = int(entry['author-count']['$'])
            if author_count == 0:
                authors = []
            else:
                authors_in_json = entry.get('author', [])
                authors = self.authors_from_json(authors_in_json)
            year = empty_to_none(entry['prism:coverDate'])
            if year:
                # coverDate is 'YYYY-MM-DD'; keep only the year.
                year = int(year.split('-')[0])
            pub = Publication(empty_to_none(entry['dc:title']), authors, year)
            pub.times_cited = empty_to_none(entry['citedby-count'])
            source_title = exists_to_none(entry, 'prism:publicationName')
            if source_title:
                # A "(including subseries ...)" suffix marks a series title;
                # strip it and route the name to the matching attribute.
                source_title, replacements = re.subn(INCLUDING_RE,
                                                     '',
                                                     source_title)
                source_title = source_title.strip()
                if replacements:
                    pub.series = source_title
                else:
                    pub.published_in = source_title
            url = self.find_next_url(entry['link'], ref='scopus')
            pub.source_urls.append(URL(url,
                                       type='SCOPUS',
                                       description='SCOPUS'))
            citedby_url = self.find_next_url(entry['link'],
                                             ref='scopus-citedby')
            if citedby_url is not None:
                pub.cite_urls.append(URL(citedby_url,
                                         type='SCOPUS',
                                         description='SCOPUS'))
            # NOTE(review): the original assigned prism:pageRange twice;
            # the duplicate assignment was removed.
            pub.pages = exists_to_none(entry, 'prism:pageRange')
            pub.volume = exists_to_none(entry, 'prism:volume')
            pub.issue = exists_to_none(entry, 'prism:issueIdentifier')
            append_identifier(entry, 'prism:doi', pub, 'DOI')
            append_identifier(entry, 'prism:isbn', pub, 'ISBN')
            append_identifier(entry, 'prism:issn', pub, 'ISSN')
            append_identifier(entry, 'eid', pub, 'SCOPUS')
            pub.indexes.append(Index('SCOPUS', type='SCOPUS'))
            yield pub

    def search_citations_by_eid(self, eid):
        """Yield publications that cite the publication with this EID."""
        query = "refeid('{}')".format(eid)
        for pub in self.publications_from_query(query):
            yield pub

    def search_citations(self, publications):
        """Yield publications that cite any publication in the list.

        Publications that carry no SCOPUS identifier (EID) are skipped;
        only the first EID of each publication is queried.
        """
        for publication in publications:
            eids = list(Identifier.find_by_type(publication.identifiers,
                                                'SCOPUS'))
            if not eids:
                continue
            for pub in self.search_citations_by_eid(eids[0].value):
                yield pub

    def assign_indexes(self, publications):
        """Determine and set which indexes the publications appear in.

        No-op here: entries_to_publications already tags every result
        with the SCOPUS index.
        """
        pass

    def close(self):
        # Stateless connection (independent HTTPS requests) — nothing to
        # release.
        pass
if __name__ == '__main__':
with ScopusAPI(api_key='').connect() as conn:
pubs = list(conn.search_by_author('Vinar', name='T'))
for pub in pubs:
print pub
print "Citations:"
for pub in conn.search_citations(pubs):
print pub