In [None]:
# 1. Convert the chapter to a Word document https://www.adobe.com/ca/acrobat/online/pdf-to-word.html
## Potential alternatives in case this conversion doesn't work: Open in Adobe Acrobat and Save as Text. 
## Open in Acrobat and export as PDF but only the pages with the bibliography. Then try to convert.

# 2. Copy the references into Sublime Text (or other text editor) then into Excel. Delete empty lines and extraneous ones. Sorting A-Z helps.
## This produces some errors - not all citations formatted properly after copy/pasting. e.g. multiple citations on the same line.

# 3. Then copy into Crossref's simple text query form https://apps.crossref.org/SimpleTextQuery 
## This can lookup the DOI (link) for up to 1000 references at a time. Each reference needs to be on its own line. 
## It's not perfect, but a minimal amount of time could be spent formatting things in Excel or a plain text editor like Sublime Text before running the form. 

# 4. Copy and paste the references with DOIs from Crossref into a CSV
## Insert a new first row in the CSV and add the text "citation" to label this column

# 5. Load that CSV file into this Python program. The program associates each citation with its DOI (assuming the DOI directly follows the citation). 
## I believe this captures ~400 of the ~500 references in WG2 AR5 Chapter 13, for instance. In the remaining 100 cases it didn't work or they are white papers, UN reports, etc. without DOIs. 
## The program then gets the actual URL from the publisher and then "visits" that URL and attempts to save the paper or abstract.

In [1]:
import requests
import pandas
import time
from bs4 import BeautifulSoup

In [5]:
citations = pandas.read_csv("IPCC_AR6_WGII_Chapter08_postCrossRef.csv") # Replace this with the CSV created from the CrossRef lookup
citations

Unnamed: 0,citation
0,"Abbott, D. and S. Pollard, 2004: Hardship and ..."
1,"Abbott, K. W., 2017: Orchestration: strategic ..."
2,https://doi.org/10.2139/ssrn.2983512
3,"Abdullah, A. N. M. et al., 2016: A short-term ..."
4,https://doi.org/10.1016/j.gloenvcha.2018.12.003
...,...
1771,"Zimmermann, A., J. Benda, H. Webber and Y. Jaf..."
1772,"Zougmoré, R. et al., 2016: Toward climate-smar..."
1773,https://doi.org/10.1186/s40066-016-0075-3
1774,"Zuñiga, R. A. A., G. N. Lima and A. M. G. Vill..."


In [6]:
"""
This code directly associates each reference with its DOI, assuming the DOI was placed
by the Crossref form on the line beneath the reference
"""

citations_dois = pandas.DataFrame(columns = ["ref", "doi"])

for index, row in citations.iterrows():
  if row["citation"][0:5] == "https": # assume doi
    # add to previous row's doi in citations_dois
    citations_dois.loc[index-1]["doi"] = row["citation"]
    pass
  else:
    citations_dois.loc[index] = [row["citation"], None]


citations_dois.reset_index(inplace=True, drop=True)
citations_dois

Unnamed: 0,ref,doi
0,"Abbott, D. and S. Pollard, 2004: Hardship and ...",
1,"Abbott, K. W., 2017: Orchestration: strategic ...",https://doi.org/10.2139/ssrn.2983512
2,"Abdullah, A. N. M. et al., 2016: A short-term ...",https://doi.org/10.1016/j.gloenvcha.2018.12.003
3,"Abrahamse, W. and R. Shwom, 2018: Domestic ene...",https://doi.org/10.1002/wcc.525
4,"Acharya, P., B. Boggess and K. Zhang, 2018: As...",https://doi.org/10.3390/ijerph15020247
...,...,...
993,"Ziervogel, G., 2019a: Building transformative ...",https://doi.org/10.1007/s13280-018-1141-9
994,"Ziervogel, G., 2019b: Unpacking the Cape Town ...",
995,"Zimmermann, A., J. Benda, H. Webber and Y. Jaf...",
996,"Zougmoré, R. et al., 2016: Toward climate-smar...",https://doi.org/10.1186/s40066-016-0075-3


In [7]:
"""
This cell tries to get the link to the publisher for each text
"""
citations_dois["pub_link"] = None
citations_dois["doi_short"] = None
citations_dois["meta"] = None

for index, cite in citations_dois.iterrows():
  print(index, end='\r')
  if cite["doi"] is not None:
    doi = cite["doi"][16:]
    citations_dois.at[index,"doi_short"] = doi
    try:
      response = requests.head(cite["doi"], allow_redirects=True, timeout=20) # Get the publisher's link - whatever the DOI redirects to
      citations_dois.at[index,"pub_link"] = response.url
      #pdf = pdfkit.from_url(cite["pub_link"], doi+'.pdf') # Unfortunately, publishers seem to think this is a robot so they often don't grant access
      # Go to the publisher's link
      citations_dois.at[index,"meta"] = "Publisher link"
    except:
      citations_dois.at[index,"meta"] = "Something didn't work with getting the publisher's link"

citations_dois

997

Unnamed: 0,ref,doi,pub_link,doi_short,meta
0,"Abbott, D. and S. Pollard, 2004: Hardship and ...",,,,
1,"Abbott, K. W., 2017: Orchestration: strategic ...",https://doi.org/10.2139/ssrn.2983512,http://www.ssrn.com/abstract=2983512,10.2139/ssrn.2983512,Publisher link
2,"Abdullah, A. N. M. et al., 2016: A short-term ...",https://doi.org/10.1016/j.gloenvcha.2018.12.003,https://linkinghub.elsevier.com/retrieve/pii/S...,10.1016/j.gloenvcha.2018.12.003,Publisher link
3,"Abrahamse, W. and R. Shwom, 2018: Domestic ene...",https://doi.org/10.1002/wcc.525,https://onlinelibrary.wiley.com/doi/10.1002/wc...,10.1002/wcc.525,Publisher link
4,"Acharya, P., B. Boggess and K. Zhang, 2018: As...",https://doi.org/10.3390/ijerph15020247,https://www.mdpi.com/1660-4601/15/2/247,10.3390/ijerph15020247,Publisher link
...,...,...,...,...,...
993,"Ziervogel, G., 2019a: Building transformative ...",https://doi.org/10.1007/s13280-018-1141-9,https://link.springer.com/article/10.1007/s132...,10.1007/s13280-018-1141-9,Publisher link
994,"Ziervogel, G., 2019b: Unpacking the Cape Town ...",,,,
995,"Zimmermann, A., J. Benda, H. Webber and Y. Jaf...",,,,
996,"Zougmoré, R. et al., 2016: Toward climate-smar...",https://doi.org/10.1186/s40066-016-0075-3,https://agricultureandfoodsecurity.biomedcentr...,10.1186/s40066-016-0075-3,Publisher link


In [8]:
# Here we begin the process of retrieving the actual text of the papers - including abstracts
## We use a "headless" browser

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

chrome_options = webdriver.ChromeOptions()
#chrome_options.add_argument('--headless') # Helps with pages actually loading
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_experimental_option('prefs', {
          "plugins.always_open_pdf_externally": True, # Disable Chrome's PDF Viewer
          "download.prompt_for_download": False, #To auto download the file
          "download.directory_upgrade": True,
          "download.default_directory": 'Test',
           })

from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)




[WDM] - Current google-chrome version is 105.0.5195
[WDM] - Get LATEST chromedriver version for 105.0.5195 google-chrome
[WDM] - There is no [mac64] chromedriver for browser 105.0.5195 in cache
[WDM] - About to download new driver from https://chromedriver.storage.googleapis.com/105.0.5195.52/chromedriver_mac64.zip
[WDM] - Driver has been saved in cache [/Users/enost/.wdm/drivers/chromedriver/mac64/105.0.5195.52]


In [None]:
# Now we try to go get the full-text
# As we do so, we'll count any mentions of "indigenous knowledge" or "local knowledge"

citations_dois["indigenous knowledge"] = None
citations_dois["local knowledge"] = None

for index, cite in citations_dois[0:3].iterrows():
  print(index, end='\r')
  if cite["pub_link"] is not None:
    try:
      ## Get text
      driver.get(cite["pub_link"])
      res = driver.page_source.encode('utf-8')
      soup = BeautifulSoup(res, "html.parser")
      text = soup.get_text()
      citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Got text"
      ## Analyze text
      try:
        text_analysis = text.split(" ")
        text_analysis = [word.lower() for word in text_analysis]
        citations_dois.at[index, "indigenous knowledge"] = text_analysis.count("indigenous knowledge")
        citations_dois.at[index, "local knowledge"] = text_analysis.count("local knowledge")
        citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Analyzed text"
      except:
        citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Couldn't analyze text"
      ## Export text
      try:
        with open("texts/"+cite['doi_short']+".txt", "w") as text_file:
          text_file.write(text)
        citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Exported the paper"
      except:
        citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Couldn't export the paper"
    except:
      print('Something went wrong with getting the paper')
      citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Something went wrong with getting the paper"

citations_dois

In [None]:
# As an alternative to getting full-text, here we get the abstracts from CrossRef
citations_dois["indigenous"] = None
citations_dois["local"] = None
citations_dois["knowledge"] = None

for index, cite in citations_dois.iterrows():
  print(index, end='\r')
  if cite["doi_short"] is not None:
    try:
      ## Get abstract
      time.sleep(1) # Sleep to slow things down a bit
      response = requests.get("https://api.crossref.org/works/"+cite["doi_short"]+"?mailto=enost@uoguelph.ca", timeout=20)
      data = response.json()
      try:
        abstract = data['message']['abstract']
        citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Got abstract"
        ## Analyze abstract
        try:
          abstract_analysis = abstract.split(" ")
          abstract_analysis = [word.lower() for word in abstract_analysis]
          citations_dois.at[index, "indigenous"] = abstract_analysis.count("indigenous")
          citations_dois.at[index, "local"] = abstract_analysis.count("local")
          citations_dois.at[index, "knowledge"] = abstract_analysis.count("knowledge")
          citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Analyzed abstract"
          ## Export abstract
          try:
            filename = cite['doi_short'].replace("/", "_")
            filename = filename.replace(".", "_")
            with open("abstracts/"+filename+".txt", "w") as text_file:
              text_file.write(abstract)
            citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Exported the abstract"
          except:
            citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Couldn't export the abstract"
        except:
          citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Couldn't analyze abstract"
      except:
        citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Got record but no abstract"   
    except:
      print('Something went wrong with getting the abstract')
      citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Something went wrong with getting the abstract"

citations_dois

In [9]:
# As yet another alternative (preferred), here we get the abstracts from the publisher's pages
header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0'}

citations_dois["indigenous"] = None
citations_dois["local"] = None
citations_dois["knowledge"] = None

#sample = citations_dois.sample(10)

for index, cite in citations_dois.iterrows():
  print(index, end='\r')
  abstract = body = None
  if cite["pub_link"] is not None:
    try:
      ## Get text
      #print(cite["pub_link"])
      driver.get(cite["pub_link"])
      res = driver.page_source.encode('utf-8')
      #res = requests.get(cite["pub_link"], allow_redirects=True, headers = header, timeout=120).content.decode()
      soup = BeautifulSoup(res, "lxml")
      body = soup.find('body')
      if "academic.oup.com" in cite["pub_link"]:
        abstract = body.find("section", {"class": "abstract"})
      elif "tandfonline.com" in cite["pub_link"]: 
        abstract = body.find("div", {"class": "hlFld-Abstract"}) 
      elif "annualreviews.org" in cite["pub_link"]: 
        abstract = body.find("div", {"class": "hlFld-Abstract"})
      elif "journals.sagepub.com" in cite["pub_link"]: 
        #WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="uc-btn-accept-banner"]'))).click()
        abstract = body.find("div", {"class": "hlFld-Abstract"})
      elif "link.springer.com" in cite["pub_link"]: 
        abstract = body.find("div", {"class": "c-article-section__content"}) #{"id": "Abs1"})#{
      elif "onlinelibrary.wiley.com" in cite["pub_link"]:
        #WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="uc-btn-accept-banner"]'))).click()
        abstract = body.find("section", {"class": "article-section__abstract"}) # Or h2 "Abstract"? section class article-section__abstract
      elif "jstor.org" in cite["pub_link"]:
        abstract = body.find("div", {"class": "abstract"}) 
      elif "direct.mit.edu" in cite["pub_link"]:
        abstract = body.find("section", {"class": "abstract"})
      elif "journals.ametsoc.org" in cite["pub_link"]: 
        abstract = body.find("section", {"class": "abstract"})
      elif "oxfordhandbooks.com" in cite["pub_link"]: 
        abstract = body.find("div", {"class": "abstract"}) 
      elif "linkinghub.elsevier.com" in cite["pub_link"]:
        # delay redirect?
        abstract = body.find("div", {"class": "Abstracts"})
      elif "elibrary.worldbank.org" in cite["pub_link"]:
        abstract = body.find("div", {"class": "abstractSection"})
      elif "ssrn.org" in cite["pub_link"]:
        #WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH,'//*[@id="onetrust-accept-btn-handler"]'))).click()
        abstract = body.find("section", {"class": "abstract-text"})
      elif "bioone.org" in cite["pub_link"]:
        abstract = body.find("section", {"class": "ArticleContentText"}) 
      elif "pnas.org" in cite["pub_link"]:
        abstract = body.find("section", {"id": "abstracts"}) 
      elif "nature.com" in cite["pub_link"]:
        abstract = body.find("section", {"class": "c-article-section__content"})
      elif "mdpi.com" in cite["pub_link"]:
        abstract = body.find("section", {"class": "art-abstract"})        
        
      #print(abstract.text)
      # else get whole page/paper?
      ## Analyze abstract
      try:
        abstract_analysis = abstract.text.split(" ")
        abstract_analysis = [word.lower() for word in abstract_analysis]
        citations_dois.at[index, "indigenous"] = abstract_analysis.count("indigenous")
        citations_dois.at[index, "local"] = abstract_analysis.count("local")
        citations_dois.at[index, "knowledge"] = abstract_analysis.count("knowledge")
        citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Analyzed abstract"
        ## Export abstract
        try:
          filename = cite['doi_short'].replace("/", "_")
          filename = filename.replace(".", "_")
          with open("abstracts/"+filename+".txt", "w") as text_file:
            text_file.write(abstract.text)
          citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Exported the abstract"
        except:
          citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Couldn't export the abstract"
      except:
        citations_dois.at[index, "meta"] = citations_dois.at[index, "meta"] + "; Couldn't analyze abstract"
    except:
      print('Something went wrong with getting the paper')
      
citations_dois

997

Unnamed: 0,ref,doi,pub_link,doi_short,meta,indigenous,local,knowledge
0,"Abbott, D. and S. Pollard, 2004: Hardship and ...",,,,,,,
1,"Abbott, K. W., 2017: Orchestration: strategic ...",https://doi.org/10.2139/ssrn.2983512,http://www.ssrn.com/abstract=2983512,10.2139/ssrn.2983512,Publisher link; Couldn't analyze abstract,,,
2,"Abdullah, A. N. M. et al., 2016: A short-term ...",https://doi.org/10.1016/j.gloenvcha.2018.12.003,https://linkinghub.elsevier.com/retrieve/pii/S...,10.1016/j.gloenvcha.2018.12.003,Publisher link; Analyzed abstract; Exported th...,0,0,0
3,"Abrahamse, W. and R. Shwom, 2018: Domestic ene...",https://doi.org/10.1002/wcc.525,https://onlinelibrary.wiley.com/doi/10.1002/wc...,10.1002/wcc.525,Publisher link; Analyzed abstract; Exported th...,0,0,1
4,"Acharya, P., B. Boggess and K. Zhang, 2018: As...",https://doi.org/10.3390/ijerph15020247,https://www.mdpi.com/1660-4601/15/2/247,10.3390/ijerph15020247,Publisher link; Couldn't analyze abstract,,,
...,...,...,...,...,...,...,...,...
993,"Ziervogel, G., 2019a: Building transformative ...",https://doi.org/10.1007/s13280-018-1141-9,https://link.springer.com/article/10.1007/s132...,10.1007/s13280-018-1141-9,Publisher link; Analyzed abstract; Exported th...,0,2,0
994,"Ziervogel, G., 2019b: Unpacking the Cape Town ...",,,,,,,
995,"Zimmermann, A., J. Benda, H. Webber and Y. Jaf...",,,,,,,
996,"Zougmoré, R. et al., 2016: Toward climate-smar...",https://doi.org/10.1186/s40066-016-0075-3,https://agricultureandfoodsecurity.biomedcentr...,10.1186/s40066-016-0075-3,Publisher link; Couldn't analyze abstract,,,


In [10]:
citations_dois.to_csv("doi_citations_AR6_WG2_CH8.csv")

In [11]:
citations_dois.loc[((citations_dois["indigenous"] > 0) | (citations_dois["local"] > 0)) & (citations_dois["knowledge"] > 0)]

Unnamed: 0,ref,doi,pub_link,doi_short,meta,indigenous,local,knowledge
21,"Aipira, C., A. Kidd and K. Morioka, 2017: Clim...",https://doi.org/10.1007/978-3-319-50094-2_13,https://link.springer.com/chapter/10.1007/978-...,10.1007/978-3-319-50094-2_13,Publisher link; Analyzed abstract; Exported th...,0,1,1
37,"Andersson, E. and E. C. H. Keskitalo, 2017: Te...",https://doi.org/10.1080/1088937X.2016.1261195,https://www.tandfonline.com/doi/full/10.1080/1...,10.1080/1088937X.2016.1261195,Publisher link; Analyzed abstract; Exported th...,0,1,1
54,"Axelsson-Linkowski, W. et al., 2020: Shifting ...",https://doi.org/10.1007/s10745-020-00171-3,https://link.springer.com/article/10.1007/s107...,10.1007/s10745-020-00171-3,Publisher link; Analyzed abstract; Exported th...,1,1,2
132,"Brugnach, M., M. Craps and A. Dewulf, 2017: In...",https://doi.org/10.1007/s10584-014-1280-3,https://link.springer.com/article/10.1007/s105...,10.1007/s10584-014-1280-3,Publisher link; Analyzed abstract; Exported th...,5,0,2
256,"Drenkhan, F., C. Huggel, L. Guardamino and W. ...",https://doi.org/10.1016/j.scitotenv.2019.02.070,https://linkinghub.elsevier.com/retrieve/pii/S...,10.1016/j.scitotenv.2019.02.070,Publisher link; Analyzed abstract; Exported th...,0,2,1
278,"Eriksen, S. et al., 2011: When not every respo...",https://doi.org/10.3763/cdev.2010.0060,https://www.tandfonline.com/doi/abs/10.3763/cd...,10.3763/cdev.2010.0060,Publisher link; Analyzed abstract; Exported th...,0,2,1
284,"Evers, J. and A. Pathirana, 2018: Adaptation t...",https://doi.org/10.1007/s10584-018-2242-y,https://link.springer.com/article/10.1007/s105...,10.1007/s10584-018-2242-y,Publisher link; Analyzed abstract; Exported th...,0,1,1
319,"Galappaththi, E. K., J. D. Ford and E. M. Benn...",https://doi.org/10.1007/s10584-020-02716-3,https://link.springer.com/article/10.1007/s105...,10.1007/s10584-020-02716-3,Publisher link; Analyzed abstract; Exported th...,3,2,2
352,"Granderson, A. A., 2017: The role of tradition...",https://doi.org/10.1175/WCAS-D-16-0094.1,https://journals.ametsoc.org/doi/10.1175/WCAS-...,10.1175/WCAS-D-16-0094.1,Publisher link; Analyzed abstract; Exported th...,4,2,4
385,"Hernández-Morcillo, M. et al., 2018: Scanning ...",https://doi.org/10.1016/j.envsci.2017.11.013,https://linkinghub.elsevier.com/retrieve/pii/S...,10.1016/j.envsci.2017.11.013,Publisher link; Analyzed abstract; Exported th...,0,2,1


In [12]:
len(citations_dois.loc[((citations_dois["indigenous"] > 0) | (citations_dois["local"] > 0)) & (citations_dois["knowledge"] > 0)])

23