# Links

Get direct link(s) to the publisher's web page for each article (where available), by resolving the article's PubMed ID through the WTGC library link resolver.

In [1]:
import logging
from html import escape
from time import sleep

import lxml.html
import numpy as np
import pandas as pd
import requests

In [48]:
MAX_TRIES = 10

# Seconds to wait for a single HTTP response before counting the try as failed.
REQUEST_TIMEOUT = 30

# Hidden form inputs that are not forwarded to the resolver.
EXCLUDED_FORM_FIELDS = ('multilanguage_language_code',)


def _get_with_retries(url, description, params=None):
    """GET `url`, retrying until the server answers with status code 200.

    Makes at most MAX_TRIES attempts. After a failed attempt number n (a
    non-200 status, or a `requests` exception such as a timeout or a
    connection error) it logs a warning and sleeps n seconds before the
    next attempt; there is no sleep after the final attempt.

    Parameters
    ----------
    url : str
        Address to fetch.
    description : str
        Human-readable name of the page, used only in log messages.
    params : dict, optional
        Query-string parameters passed through to `requests.get`.

    Returns
    -------
    The `requests` response of the first successful attempt, or None when
    every attempt failed (an error is logged in that case).
    """
    for try_n in range(1, MAX_TRIES + 1):

        try:
            response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            # Timeouts / connection errors count as a failed try instead of
            # aborting the whole run.
            logging.warning("Network problem getting {} ({}) on try {}".format(description, error, try_n))
        else:
            if response.status_code == 200:
                return response

            logging.warning("Network problem getting {} (status code {}) on try {}".format(description, response.status_code, try_n))

        if try_n < MAX_TRIES:
            sleep(try_n)

    logging.error("Out of tries (max {})".format(MAX_TRIES))

    return None


def url_for_form(form):
    """Submit one resolver form and return the URL the request ends up at.

    Collects the form's hidden inputs as query parameters (skipping the
    names in EXCLUDED_FORM_FIELDS and inputs without a `name`), sends them
    to the SFX resolver and returns `response.url` of the successful
    response.

    Parameters
    ----------
    form : element
        Parsed form element; must support `.xpath()` and yield inputs that
        support `.get()` (as lxml elements do).

    Returns
    -------
    str or None
        Final URL of the response, or None when all tries failed.
    """
    params = {}

    for field in form.xpath('.//input[@type="hidden"]'):
        name = field.get('name')

        # An input without a name cannot be submitted; excluded names are dropped on purpose.
        if name is None or name in EXCLUDED_FORM_FIELDS:
            continue

        # A hidden input without a value attribute is sent as an empty string.
        params[name] = field.get('value', '')

    response = _get_with_retries(
        'http://wtgcsfx.hosted.exlibrisgroup.com/wtsc/cgi/core/sfxresolver.cgi',
        'publisher page',
        params=params,
    )

    if response is None:
        return None

    return response.url


def links_for_pmid(pmid):
    """Build an HTML list of links for one PubMed ID.

    Fetches the WTGC library page for `pmid`, resolves every form whose
    name contains "basic" via `url_for_form`, and renders the resolved URLs
    as `<ul>` markup with one `Link_<n>` anchor per form, where n is the
    1-based position of the form on the page. Forms whose URL could not be
    resolved are left out, so numbering may have gaps.

    Parameters
    ----------
    pmid : number or None
        PubMed ID; None and NaN mean "no ID".

    Returns
    -------
    str or None
        The `<ul>...</ul>` markup, or None when `pmid` is missing or the
        library page could not be fetched.
    """
    if pmid is None or np.isnan(pmid):
        return None

    pmid = int(pmid)

    logging.info("Starting PMID '{}'...".format(pmid))

    response = _get_with_retries(
        "http://wtgcsfx.hosted.exlibrisgroup.com/wtsc?sid=Entrez:PubMed&id=pmid:{}".format(pmid),
        'WTGC library page',
    )

    if response is None:
        return None

    dom = lxml.html.fromstring(response.text)

    forms = dom.xpath('//form[contains(@name, "basic")]')

    urls = [url_for_form(x) for x in forms]

    # The URL comes from a remote server, so escape it before placing it in
    # an attribute; skip forms that failed to resolve rather than emitting
    # href="None".
    items = ' '.join(
        '<li> <a target="_blank" href="{}">Link_{}</a></li>'.format(escape(url, quote=True), n)
        for n, url in enumerate(urls, 1)
        if url is not None
    )

    html = '<ul>' + items + '</ul>'

    logging.info("... finished PMID '{}'.".format(pmid))

    return html

In [3]:
# Load the documents table from a local pickle; its 'pubmed_id' column is used below.
# NOTE(review): unpickling can execute arbitrary code, so 'docs_0.pkl' must come
# from a trusted source (presumably written by an earlier notebook — TODO confirm).
docs = pd.read_pickle('docs_0.pkl')

# Display (rows, columns) as a quick check of what was loaded.
docs.shape

(92, 5)

In [4]:
# Number of documents that have a (non-missing) PubMed ID.
len(docs['pubmed_id'].dropna())

81

In [6]:
# Series of the non-missing PubMed IDs, indexed by the PubMed ID itself, so
# that results computed from it stay keyed by PMID.
pubmed_ids = (
    docs
    .dropna(subset=['pubmed_id'])
    .set_index('pubmed_id', drop=False)
    ['pubmed_id']
)

In [49]:
# Resolve every PubMed ID to its HTML list of links (one network round-trip
# per PMID plus one per resolver form) and collect the results in a
# one-column frame keyed by PMID.
links = pd.DataFrame({'links': pubmed_ids.apply(links_for_pmid)})

[2015/Jun/16 12:10:06 INFO    ] Starting PMID '3023614'...
[2015/Jun/16 12:10:10 INFO    ] ... finished PMID '3023614'.
[2015/Jun/16 12:10:10 INFO    ] Starting PMID '2435903'...
[2015/Jun/16 12:10:13 INFO    ] ... finished PMID '2435903'.
[2015/Jun/16 12:10:13 INFO    ] Starting PMID '2435904'...
[2015/Jun/16 12:10:16 INFO    ] ... finished PMID '2435904'.
[2015/Jun/16 12:10:16 INFO    ] Starting PMID '3184128'...
[2015/Jun/16 12:10:19 INFO    ] ... finished PMID '3184128'.
[2015/Jun/16 12:10:19 INFO    ] Starting PMID '3339603'...
[2015/Jun/16 12:10:22 INFO    ] ... finished PMID '3339603'.
[2015/Jun/16 12:10:22 INFO    ] Starting PMID '2892936'...
[2015/Jun/16 12:10:25 INFO    ] ... finished PMID '2892936'.
[2015/Jun/16 12:10:25 INFO    ] Starting PMID '2840498'...
[2015/Jun/16 12:10:28 INFO    ] ... finished PMID '2840498'.
[2015/Jun/16 12:10:28 INFO    ] Starting PMID '2840504'...
[2015/Jun/16 12:10:30 INFO    ] ... finished PMID '2840504'.
[2015/Jun/16 12:10:30 INFO    ] Starting

In [39]:
# Render the links table in the notebook output.
# escape=False is required because the 'links' column already holds HTML
# markup (<ul>/<li>/<a>) built by links_for_pmid; with to_html's default
# escape=True the characters <, > and & are converted to entities and the
# markup is shown as literal text instead of clickable links.
# NOTE(review): `HTML` is not imported in the visible cells — presumably
# IPython.display.HTML; confirm and add it to the imports cell.
# NOTE(review): long cell strings may be truncated according to the pandas
# 'display.max_colwidth' option — verify the rendered anchors are intact.
HTML(links.to_html(escape=False))

Unnamed: 0_level_0,links
pubmed_id,Unnamed: 1_level_1
3023614,Link_1 Link_2
2435903,Link_1 Link_2
2435904,Link_1 Link_2
3184128,Link_1 Link_2
3339603,Link_1 Link_2
2892936,Link_1 Link_2
2840498,Link_1 Link_2
2840504,Link_1 Link_2
2552119,Link_1 Link_2
2142737,Link_1 Link_2


In [40]:
# Persist the per-PMID link HTML to 'links.pkl' in the working directory.
links.to_pickle('links.pkl')