## Jupyter Notebook for: 1) retrieving publisher links from a list of DOIs; 2) retrieving and storing abstracts from a list of publisher links

### Import modules

In [74]:
import requests
import pandas
import time
import os
from bs4 import BeautifulSoup
import tldextract
import caffeine

### Load table of references with DOIs and parse
- Separate into DOIs without publisher links and DOIs already with publisher links
- Try to clean up some of the existing publisher links

In [32]:
# Load DOI
dois = pandas.read_csv("data/AR6_DOIs.csv")
dois = dois.loc[~dois["doi"].isna()] # Remove papers without DOIs
dois = dois.loc[dois["abstract"].isna()]  # Remove papers that already have abstracts
complete_dois = dois.loc[~dois["url"].isna()] # Papers with doi and a url
incomplete_dois = dois.loc[dois["url"].isna()] # Papers with doi and a url
complete_dois.loc[complete_dois["url"].str.contains(" year "), "url"] = complete_dois["url"].str.rsplit(' ').str[0] # fix a subset of complete_dois that have a problem with their urls (WG3 Chapter 17)
incomplete_dois

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


Unnamed: 0,WG,chapter,year,label,journal,publisher,abstract,doi,url,isbn,issn
1,WGI,AnnexI,2019,Ablain2019,Earth System Science Data,,,10.5194/essd-11-1189-2019,,,1866-3516
5,WGI,AnnexI,2017,Andersson2017,,Satellite Application Facility on Climate Moni...,,10.5676/EUM_SAF_CM/HOAPS/V002,,,
7,WGI,AnnexI,2017,Angerer2017,Atmospheric Measurement Techniques,,,10.5194/amt-10-4845-2017,,,
12,WGI,AnnexI,2016,Bakker2016,Earth System Science Data,,,10.5194/essd-8-383-2016,,,
15,WGI,AnnexI,2016,Banzon2016,Earth System Science Data,,,10.5194/essd-8-165-2016,,,
...,...,...,...,...,...,...,...,...,...,...,...
66941,WGII,Chapter18,2015,RN1017,Climate and Development,,,10.1080/17565529.2014.900603,,,
66990,WGII,Chapter18,2009,RN1066,Gender & Development,,,10.1080/13552070802696839,,,"1355-2074, 1364-9221"
66998,WGII,Chapter18,2014,RN1074,Health Place,,,10.1016/j.healthplace.2014.05.008,,,
67050,WGII,Chapter18,2018,RN1126,Journal of Science and Technology Policy Manag...,,,10.1108/JSTPM-07-2018-079,,,2053-4620


### Get publishers' links for incomplete dois

In [33]:
incomplete_dois["meta"] = None # set up a column for metadata
for index, cite in incomplete_dois.iterrows(): # for each citation....
  print(index, end='\r') # Print where we're at
  if cite["doi"] is not None: # if we have a DOI...
    doi = cite["doi"]
    if "http" in doi:
      pass # if we have a full link already
    else:
      doi = "http://doi.org/" + doi # if just the DOI number, make a link out of it
    try:
      response = requests.head(doi, allow_redirects=True, timeout=20) # Get the publisher's link - whatever the DOI redirects to
      incomplete_dois.at[index,"url"] = response.url # store the publisher link in the url column
      incomplete_dois.at[index,"meta"] = "Publisher link" # update metadata
    except:
      incomplete_dois.at[index,"meta"] = "Couldn't get publisher's link" # update metadata

incomplete_dois

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  complete_dois["meta"] = None # set up a column for metadata
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incomplete_dois["meta"] = None # set up a column for metadata


67060

Unnamed: 0,WG,chapter,year,label,journal,publisher,abstract,doi,url,isbn,issn,meta
1,WGI,AnnexI,2019,Ablain2019,Earth System Science Data,,,10.5194/essd-11-1189-2019,https://essd.copernicus.org/articles/11/1189/2...,,1866-3516,Publisher link
5,WGI,AnnexI,2017,Andersson2017,,Satellite Application Facility on Climate Moni...,,10.5676/EUM_SAF_CM/HOAPS/V002,https://wui.cmsaf.eu/safira/action/viewDoiDeta...,,,Publisher link
7,WGI,AnnexI,2017,Angerer2017,Atmospheric Measurement Techniques,,,10.5194/amt-10-4845-2017,https://amt.copernicus.org/articles/10/4845/2017/,,,Publisher link
12,WGI,AnnexI,2016,Bakker2016,Earth System Science Data,,,10.5194/essd-8-383-2016,https://essd.copernicus.org/articles/8/383/2016/,,,Publisher link
15,WGI,AnnexI,2016,Banzon2016,Earth System Science Data,,,10.5194/essd-8-165-2016,https://essd.copernicus.org/articles/8/165/2016/,,,Publisher link
...,...,...,...,...,...,...,...,...,...,...,...,...
66941,WGII,Chapter18,2015,RN1017,Climate and Development,,,10.1080/17565529.2014.900603,https://www.tandfonline.com/doi/abs/10.1080/17...,,,Publisher link
66990,WGII,Chapter18,2009,RN1066,Gender & Development,,,10.1080/13552070802696839,https://www.tandfonline.com/doi/full/10.1080/1...,,"1355-2074, 1364-9221",Publisher link
66998,WGII,Chapter18,2014,RN1074,Health Place,,,10.1016/j.healthplace.2014.05.008,https://linkinghub.elsevier.com/retrieve/pii/S...,,,Publisher link
67050,WGII,Chapter18,2018,RN1126,Journal of Science and Technology Policy Manag...,,,10.1108/JSTPM-07-2018-079,https://www.emerald.com/insight/content/doi/10...,,2053-4620,Publisher link


### Get publisher's links for complete dois
Transfer over links that aren't doi.org

In [40]:
complete_dois["meta"] = None # set up a column for metadata
for index, cite in complete_dois.iterrows(): # for each citation....
  print(index, end='\r') # Print where we're at
  if cite["url"] is not None: # if we have a url...
    url = cite["url"]
    if "doi." in url:
      try:
        response = requests.head(url, allow_redirects=True, timeout=20) # Get the publisher's link - whatever the DOI redirects to
        complete_dois.at[index,"url"] = response.url # store the publisher link in the url column
        complete_dois.at[index,"meta"] = "Publisher link" # update metadata
      except:
        complete_dois.at[index,"meta"] = "Couldn't get publisher's link" # update metadata
    else: # we have a full link already
      complete_dois.at[index,"url"] = url # store the publisher link in the url column
      complete_dois.at[index,"meta"] = "Existing link" # update metadata   

complete_dois

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  complete_dois["meta"] = None # set up a column for metadata


83412

Unnamed: 0,WG,chapter,year,label,journal,publisher,abstract,doi,url,isbn,issn,meta
6,WGI,AnnexI,2010,essd.2.215.2010,Earth System Science Data,,,10.5194/essd-2-215-2010,https://www.earth-syst-sci-data.net/2/215/2010/,,,Existing link
11,WGI,AnnexI,2011,Atlas2011,Bulletin of the American Meteorological Society,,,10.1175/2010BAMS2946.1,http://journals.ametsoc.org/doi/10.1175/2010BA...,,0003-0007,Existing link
18,WGI,AnnexI,2014,article,Oceanography,,,10.5670/oceanog.2014.16,https://doi.org/10.5670/oceanog.2014.16%20http...,,10428275,Publisher link
19,WGI,AnnexI,2017,hess.21.589.2017,Hydrology and Earth System Sciences,,,10.5194/hess-21-589-2017,https://www.hydrol-earth-syst-sci.net/21/589/2...,,,Existing link
21,WGI,AnnexI,2016,Beckley2016,,PO.DAAC,,10.5067/GMSLM-TJ142,https://podaac.jpl.nasa.gov/dataset/MERGED_TP_...,,,Publisher link
...,...,...,...,...,...,...,...,...,...,...,...,...
83408,WGIII,Chapter17,,Zhang_2018.1,Applied Energy,Elsevier {BV,,10.1016/j.apenergy.2017.07.036,https://linkinghub.elsevier.com/retrieve/pii/S...,,,Publisher link
83409,WGIII,Chapter17,,Zhang_2019,Reviews of Geophysics,American Geophysical Union ({AGU,,10.1029/2019rg000644,https://onlinelibrary.wiley.com/doi/10.1029/20...,,,Publisher link
83410,WGIII,Chapter17,,Zhao_2013,Energy Policy,Elsevier {BV,,10.1016/j.enpol.2013.08.092,https://linkinghub.elsevier.com/retrieve/pii/S...,,,Publisher link
83411,WGIII,Chapter17,,Zhenmin_2019,Nature Climate Change,Springer Science and Business Media {LLC,,10.1038/s41558-019-0519-4,https://www.nature.com/articles/s41558-019-0519-4,,,Publisher link


In [69]:
# Try to fix some links that didn't construct properly
for index, cite in complete_dois.iterrows(): # for each citation....
  print(index, end='\r') # Print where we're at
  if cite["url"].count("http") > 1: # if we somehow have multiple http addresses in the url
    doi = "https://doi.org/"+cite["doi"]
    try:
      response = requests.head(doi, allow_redirects=True, timeout=20) # Get the publisher's link based on the doi
      complete_dois.at[index,"url"] = response.url # store the publisher link in the url column
      complete_dois.at[index,"meta"] = "Publisher link" # update metadata
    except:
      complete_dois.at[index,"meta"] = "Couldn't get publisher's link" # update metadata
  else: # we have a full and properly constructed link already
    pass

complete_dois

83412

Unnamed: 0,WG,chapter,year,label,journal,publisher,abstract,doi,url,isbn,issn,meta
6,WGI,AnnexI,2010,essd.2.215.2010,Earth System Science Data,,,10.5194/essd-2-215-2010,https://www.earth-syst-sci-data.net/2/215/2010/,,,Existing link
11,WGI,AnnexI,2011,Atlas2011,Bulletin of the American Meteorological Society,,,10.1175/2010BAMS2946.1,http://journals.ametsoc.org/doi/10.1175/2010BA...,,0003-0007,Existing link
18,WGI,AnnexI,2014,article,Oceanography,,,10.5670/oceanog.2014.16,https://tos.org/oceanography/article/a-time-se...,,10428275,Publisher link
19,WGI,AnnexI,2017,hess.21.589.2017,Hydrology and Earth System Sciences,,,10.5194/hess-21-589-2017,https://www.hydrol-earth-syst-sci.net/21/589/2...,,,Existing link
21,WGI,AnnexI,2016,Beckley2016,,PO.DAAC,,10.5067/GMSLM-TJ142,https://podaac.jpl.nasa.gov/dataset/MERGED_TP_...,,,Publisher link
...,...,...,...,...,...,...,...,...,...,...,...,...
83408,WGIII,Chapter17,,Zhang_2018.1,Applied Energy,Elsevier {BV,,10.1016/j.apenergy.2017.07.036,https://linkinghub.elsevier.com/retrieve/pii/S...,,,Publisher link
83409,WGIII,Chapter17,,Zhang_2019,Reviews of Geophysics,American Geophysical Union ({AGU,,10.1029/2019rg000644,https://onlinelibrary.wiley.com/doi/10.1029/20...,,,Publisher link
83410,WGIII,Chapter17,,Zhao_2013,Energy Policy,Elsevier {BV,,10.1016/j.enpol.2013.08.092,https://linkinghub.elsevier.com/retrieve/pii/S...,,,Publisher link
83411,WGIII,Chapter17,,Zhenmin_2019,Nature Climate Change,Springer Science and Business Media {LLC,,10.1038/s41558-019-0519-4,https://www.nature.com/articles/s41558-019-0519-4,,,Publisher link


### Re-join the complete and incomplete DOI lists and prepare for getting abstracts

In [71]:
all_dois = pandas.concat([complete_dois,incomplete_dois])
all_dois.to_csv("AR6_PubLinks.csv") # Save our work so far so we don't have to do this all over again if getting the abstracts fails

### Here we begin the process of retrieving the actual text of the papers - including abstracts
We use a "headless" browser

In [76]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless') # Helps with pages actually loading
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_experimental_option('prefs', {
          "plugins.always_open_pdf_externally": True, # Disable Chrome's PDF Viewer
          "download.prompt_for_download": False, #To auto download the file
          "download.directory_upgrade": True,
          "download.default_directory": 'Test',
           })

from webdriver_manager.chrome import ChromeDriverManager # Requires a manual installation of selenium
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)




[WDM] - Current google-chrome version is 109.0.5414
[WDM] - Get LATEST chromedriver version for 109.0.5414 google-chrome
[WDM] - About to download new driver from https://chromedriver.storage.googleapis.com/109.0.5414.74/chromedriver_mac64.zip
[WDM] - Driver has been saved in cache [/Users/enost/.wdm/drivers/chromedriver/mac64/109.0.5414.74]


### Get the abstracts from the publisher's pages

In [None]:
header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0'} # This is how we identifying ourselves/the bot to the sites

import psycopg2
conn = None 
connection = 'CREDENTIALS HERE'
conn = psycopg2.connect(connection)
cur = conn.cursor()

# Function for inserting data into the database
def DBinsert(wg, chapter, year, label, journal, publisher, abstract, doi, url, isbn, issn, meta, domain, count):
  print("inserting: " + str(count), end='\r')
  sql = "INSERT INTO ipcc_abstracts(wg, chapter, year, label, journal, publisher, abstract, doi, url, isbn, issn, meta, domain) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);"
  try:
    cur.execute(sql, [wg, chapter, year, label, journal, publisher, abstract, doi, url, isbn, issn, meta, domain])
    conn.commit()
    #print("inserted", end='\r')
  except (Exception, psycopg2.DatabaseError) as error:
    print(error)
    conn.rollback()

#sample = all_dois.sample(10)
for index, cite in all_dois.iterrows():
  print(index, end='\r')
  abstract = body = None
  if cite["url"] is not None:
    try:
      ## Get text
      driver.get(cite["url"])
      res = driver.page_source.encode('utf-8')
      soup = BeautifulSoup(res, "lxml")
      body = soup.find('body')
      if "academic.oup.com" in cite["url"]:
        abstract = body.find("section", {"class": "abstract"})
      elif "tandfonline.com" in cite["url"]: 
        abstract = body.find("div", {"class": "hlFld-Abstract"}) 
      elif "annualreviews.org" in cite["url"]: 
        abstract = body.find("div", {"class": "hlFld-Abstract"})
      elif "journals.sagepub.com" in cite["url"]: 
        #WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="uc-btn-accept-banner"]'))).click()
        abstract = body.find("div", {"class": "hlFld-Abstract"})
      elif "link.springer.com" in cite["url"]: 
        abstract = body.find("div", {"class": "c-article-section__content"}) #{"id": "Abs1"})#{
      elif "onlinelibrary.wiley.com" in cite["url"]:
        #WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="uc-btn-accept-banner"]'))).click()
        abstract = body.find("section", {"class": "article-section__abstract"}) # Or h2 "Abstract"? section class article-section__abstract
      elif "jstor.org" in cite["url"]:
        abstract = body.find("div", {"class": "abstract"}) 
      elif "direct.mit.edu" in cite["url"]:
        abstract = body.find("section", {"class": "abstract"})
      elif "journals.ametsoc.org" in cite["url"]: 
        abstract = body.find("section", {"class": "abstract"})
      elif "oxfordhandbooks.com" in cite["url"]: 
        abstract = body.find("div", {"class": "abstract"}) 
      elif "linkinghub.elsevier.com" in cite["url"]:
        # delay redirect?
        abstract = body.find("div", {"class": "Abstracts"})
      elif "elibrary.worldbank.org" in cite["url"]:
        abstract = body.find("div", {"class": "abstractSection"})
      elif "ssrn.org" in cite["url"]:
        #WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="onetrust-accept-btn-handler"]'))).click()
        abstract = body.find("section", {"class": "abstract-text"})
      elif "bioone.org" in cite["url"]:
        abstract = body.find("section", {"class": "ArticleContentText"}) 
      elif "pnas.org" in cite["url"]:
        abstract = body.find("section", {"id": "abstracts"}) 
      elif "nature.com" in cite["url"]:
        abstract = body.find("section", {"class": "c-article-section__content"})
      elif "mdpi.com" in cite["url"]:
        abstract = body.find("section", {"class": "art-abstract"})        
      
      if abstract != None:
        abstract = abstract.text
      
      try:
        domain = tldextract.extract(cite["url"])
        domain = domain.domain
      except:
        domain = None
        
      # Send full record to database
      #wg, chapter, year, label, journal, publisher, abstract, doi, url, isbn, issn, meta, domain
      try:
        DBinsert(cite["WG"], cite["chapter"], cite["year"], cite["label"], cite["journal"], cite["publisher"], abstract, 
                 cite["doi"], cite["url"], cite["isbn"], cite["issn"], cite["meta"], domain, index)
      except:
        print('Error putting the paper in the database')            
        
    except:
      print('Error getting the paper')

print("DONE")
conn.close()

### Check and summarize results

In [None]:
# Query database for all results - This can be written into the notebook, but is TBD. So far I've just queried the database directly.
## Total numner
## Number with and without abstracts
## Number with abstracts from .bib files 
## Number without abstracts with no dois (can't do much about these)
## Number without abstracts with dois
## Number of these we were able to get a link for (existing or publisher's)
## Number of these we were able to get an abstract for
## Scrapped abstracts by domain/publisher (% of links we scrapped that we actually got the abstract for)
## Abstracts by WG/Chapter
## Abstracts by year
## Abstracts by journal
## Abstracts by publisher
## Abstracts by keyword search

In [None]:
# 