
# Journal Data Harvester

## v. 1.1

1. Grabs all DOIs associated with the `ISSN` set below from CrossRef (CR) and saves them to a DOI file
1. Attempts to extract the metadata for those DOIs from CR as the first stage of building the data set, and saves it to a CSV file when completed
1. Attempts to download the HTML landing page that each `http://dx.doi.org/<DOI>` link resolves to
1. Attempts to screen-scrape the abstract from each page and rebuilds the CSV file with that information added

## Caveats
- The Beautiful Soup grab is domain specific; the Soup find pattern is set in the first cell


## Set the values in the next cell

In [1]:
# --- Journal to grab ---

# Folder that receives every output file; also the prefix of the CSV file name
TITLE = "SMR_test"

# ISSN used as the search term for the CrossRef journal lookup
ISSN = "1441-3523"


# --- Soup pattern ---
# Tag name and attribute filter passed to Beautiful Soup's find() to grab the
# abstract text. The pattern is different for each domain, so enable the pair
# that matches the site being scraped.

# T & F - soup.find("div",{"class":"abstractSection abstractInFull"}).text
SOUP_TAG = "div"
SOUP_DICT = {
    "class": "abstractSection abstractInFull",
}

# Human Kinetics - soup.find("section",{"class": "abstract"}).text
#SOUP_TAG = "section"
#SOUP_DICT = {"class": "abstract"}


In [2]:
# FLAGS - mostly for debugging

# Caps the DOI harvest (handed to CrossRef as cursor_max); 0 downloads everything.
# (Keep it at 50 or more so the habanero paging works. v 2 will fix this)
SUBSET_SIZE = 50

# One switch per stage; a stage only runs while its flag is True
DO_STAGE_1 = True   # harvest the DOIs from CrossRef
DO_STAGE_2 = True   # harvest article metadata into the dataframe / CSV
DO_STAGE_3 = True   # download the HTML landing pages
DO_STAGE_4 = True   # enrich the dataframe with data from the HTML

# Column names of the article dataframe, in the order the values are collected
ART_COLUMNS = [
    "DOI", "AUTHOR_NAME", "AUTHOR_ORDER", "AFFILIATION", "TITLE",
    "DATE", "VOLUME", "ISSUE", "PAGES", "KEYWORDS",
]

In [3]:
# Test to see if habanero is installed / available

import IPython

try:
    from habanero import Crossref
except:
    
    print("*******************************")
    print("Need to install habanero still!")
    print("Installation will proceed. When completed you'll need to")
    print("Re-run whole notebook again")
    print("*******************************")
    %pip install habanero
    IPython.Application.instance().kernel.do_shutdown(True)

#libraries
import os
import pandas as pd
import requests
from habanero import Crossref
import glob

#make folders
# Every output file of the later stages is written inside the TITLE folder.
try:
    os.mkdir(TITLE)
except FileExistsError:
    # Re-running the notebook is expected, so an existing folder is fine.
    # Any other failure (permissions, bad path) is left to propagate instead
    # of being misreported as "already made".
    print("Folder already made")

print("Prep Done.")


####VARIABLES
# CrossRef client shared by Stage 1 and Stage 2
cr = Crossref()


if DO_STAGE_1:

    #### Stage 1
    # Ask CrossRef for the works registered under ISSN and write their DOIs,
    # one per line, to TITLE/ISSN_DOI.txt (read again by Stage 2 and Stage 3).
    print("\nSTAGE 1: Harvesting DOIs for this title from CR")

    # Stays None if the count lookup fails, so the failure is detectable below
    max_dois = None
    try:
        j_doi_count = cr.journals(ISSN)
        max_dois = int(j_doi_count['message']['counts']['total-dois'])
        print("Cross Ref has this many DOIs: ",str(max_dois))
    except Exception:
        # Best effort: the count is only needed when SUBSET_SIZE is 0
        print("CrossRef API is having troubles... Couldn't find number of DOIs associated with title")


    if SUBSET_SIZE != 0:
        max_dois = SUBSET_SIZE

    if max_dois is None:
        # Without a count there is no value to pass as cursor_max; fail with a
        # clear message instead of an unrelated NameError further down.
        raise RuntimeError("Cannot harvest DOIs: CrossRef did not report a DOI count and SUBSET_SIZE is 0")

    #Harvest all DOIs for this journal
    try:
        res = cr.journals(ids = ISSN, works = True, cursor = "*", cursor_max = max_dois, progress_bar = True)
        # res is iterated as a sequence of result pages; flatten their items
        items = [ item for page in res for item in page['message']['items'] ]
    except Exception:
        print("CrossRef API is having troubles... Couldn't harvest DOIs of title")
        # Nothing was harvested, so stop here with the real error rather than
        # continuing with an undefined `items`.
        raise

    print("Total DOIs from CR for",TITLE,": ",len(items))
    with open(TITLE+"/"+ISSN+"_DOI.txt", "w") as doi_file:
        doi_file.writelines(a["DOI"]+"\n" for a in items)

    print("...Done.")
    
if DO_STAGE_2:

    #### Stage 2
    # For every DOI in TITLE/ISSN_DOI.txt fetch the CrossRef record and build one
    # dataframe row per author. DOIs whose record cannot be fully read are listed
    # in TITLE/ISSN_BAD_DOI.txt; the dataframe is saved to TITLE/TITLE_ISSN.csv.
    print("\nSTAGE 2: Harvest Metadata for each article and put into Dataframe")

    def _article_rows(doi):
        """Return one row per author of *doi*, with values in ART_COLUMNS order.

        Volume, issue, pages, date and keywords fall back to "NA" when absent.
        Raises if the lookup fails or if the title or any author detail is
        missing, so a DOI contributes either all of its rows or none.
        """
        message = cr.works(ids = doi)['message']
        a_title = message['title'][0]

        a_vol = message.get('volume', "NA")
        a_issue = message.get('issue', "NA")
        a_pages = message.get('page', "NA")

        try:
            # A date is only used when year, month and day are all present
            year, month, day = message['published']['date-parts'][0][:3]
            a_date = "-".join(str(part) for part in (year, month, day))
        except (KeyError, IndexError, TypeError, ValueError):
            a_date = "NA"

        try:
            a_kws = ', '.join(message['subject'])
        except (KeyError, TypeError):
            a_kws = "NA"

        return [
            [
                doi,
                author['given'] + " " + author['family'],
                author['sequence'],
                author['affiliation'][0]['name'],
                a_title,
                a_date,
                a_vol,
                a_issue,
                a_pages,
                a_kws,
            ]
            for author in message['author']
        ]

    article_list = []
    progress = 0
    problem_DOIs = []

    print("Building Dataframe...")
    with open(TITLE+"/"+ISSN+"_DOI.txt") as d_file:

        for line in d_file:
            # Drop the trailing newline so it is not sent to the API
            doi = line.strip()
            if not doi:
                continue

            #progress bar
            progress +=1

            try:
                article_list.extend(_article_rows(doi))
            except Exception:
                # Per-DOI boundary: a failed lookup or an incomplete record is
                # logged as a problem DOI instead of aborting the whole stage.
                problem_DOIs.append(doi)

            if progress % 50 == 0:
                print(progress)

    # Passing the columns here also works when no rows were collected
    j_data = pd.DataFrame(article_list, columns=ART_COLUMNS)

    print("Number of Problem DOIS: ", len(problem_DOIs))

    with open(TITLE+"/"+ISSN+"_BAD_DOI.txt","w") as b_doi:
        for d in problem_DOIs:
            b_doi.write("https://dx.doi.org/"+d+", couldn't get all metadata (stage 2)\n")

    #CSV of progress thus far
    j_data.to_csv(TITLE+"/"+TITLE+"_"+ISSN+".csv",index=False)
    print("...Done.")
    
if DO_STAGE_3:

    #### Stage 3
    # Resolve every DOI in TITLE/ISSN_DOI.txt through dx.doi.org and save the
    # returned HTML as TITLE/<DOI with "/" replaced by "_">.html.
    print("\nSTAGE 3: Download HTML Landing Pages")

    # Browser-like User-Agent sent with every request
    headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:100.0) Gecko/20100101 Firefox/100.0'
    }

    counter = 0
    with open(TITLE+"/"+ISSN+"_DOI.txt") as doi_file:
        for d in doi_file:
            # Strip the newline so it ends up in neither the URL nor the file name
            doi = d.strip()
            if not doi:
                continue

            counter += 1
            url_to_grab = "http://dx.doi.org/"+doi
            html_path = TITLE+"/"+doi.replace("/","_")+".html"

            # Only download if we didn't already. A plain existence test is used
            # because glob would treat characters such as "[" in a DOI as a pattern.
            if not os.path.exists(html_path):
                try:
                    # Fetch inside the check so existing pages are not requested
                    # again; the timeout keeps one dead server from hanging the run.
                    html = requests.get(url_to_grab,headers=headers,timeout=30).text
                    with open(html_path,"w") as html_grabbed:
                        html_grabbed.write(html)
                except (requests.RequestException, OSError, UnicodeError):
                    print("Problem downloading: ",doi)
            if counter % 50 == 0:
                print(counter)

    print("...Done.")
    
    
if DO_STAGE_4:

    #The j_data dataframe won't be here unless stage two has been completed
    #put it together again if so
    if not DO_STAGE_2:
        j_data = pd.read_csv(TITLE+"/"+TITLE+"_"+ISSN+".csv")
        # This stage writes its result back to the same CSV, so a reloaded file
        # may already hold ABSTRACT; drop it so the merge below does not produce
        # ABSTRACT_x / ABSTRACT_y columns.
        if "ABSTRACT" in j_data.columns:
            j_data = j_data.drop(columns="ABSTRACT")

    #### Stage 4
    # Scrape the abstract out of every downloaded landing page in the TITLE
    # folder, merge it into the article dataframe on DOI, and rewrite the CSV.
    # Pages that cannot be processed are listed in TITLE/ISSN_bad_html.txt.
    print("\nSTAGE 4: Enrich Dataframe with HTML Data")

    from bs4 import BeautifulSoup

    art_list_md = []
    trouble_html = []
    counter = 0

    for html_md in glob.glob(TITLE+"/*.html"):
        counter +=1

        # Rebuild the DOI from the file name: "_" back to "/", drop ".html".
        # basename() works whichever path separator glob returns.
        # NOTE(review): a DOI that itself contains "_" does not survive this
        # round trip and will not match in the merge below.
        a_doi = os.path.basename(html_md).replace("_","/")[:-5].replace("\n","")
        if not a_doi:
            a_doi = "NA"
            trouble_html.append(html_md.replace("\n","") + ", could not find DOI\n")

        try:
            with open(html_md) as h_file:
                soup = BeautifulSoup(h_file,'html.parser')
            abstract_tag = soup.find(SOUP_TAG,SOUP_DICT)
        except Exception:
            # Per-file boundary: an unreadable or unparsable page is recorded
            # as trouble instead of stopping the whole stage.
            abstract_tag = None

        if abstract_tag is None:
            trouble_html.append(html_md.replace("\n","") + ", could not find Abstract\n")
            a_abs = "NA"
        else:
            a_abs = abstract_tag.text

        art_list_md.append([a_doi, a_abs])
        if counter % 50 == 0:
            print(counter)

    with open(TITLE+"/"+ISSN+"_bad_html.txt","w") as b_html_file:
        b_html_file.writelines(trouble_html)

    # Passing the columns here also works when no HTML file was found
    j_extra_md = pd.DataFrame(art_list_md, columns=["DOI","ABSTRACT"])

    # Default (inner) merge: only DOIs present on both sides are kept
    final_df = j_data.merge(j_extra_md, left_on="DOI",right_on="DOI")
    final_df.to_csv(TITLE+"/"+TITLE+"_"+ISSN+".csv",index=False)

    print("...Done.")


print("All Done!")

Prep Done.

STAGE 1: Harvesting DOIs for this title from CR
Cross Ref has this many DOIs:  871


100%|██████████| 2/2 [00:00<00:00,  2.33it/s]


Total DOIs from CR for SMR_test :  60
...Done.

STAGE 2: Harvest Metadata for each article and put into Dataframe
Building Dataframe...
50
Number of Problem DOIS:  8
...Done.
All Done!
