
# Journal Data Harvester

Crossref to Taylor & Francis (T&F) version

|Title|ISSN|Platform|
|---|---|---|
| _Journal of Sport Management_ | 0888-4773 |Human Kinetics & Elsevier|
| _European Sport Management Quarterly_ | 1618-4742 |T&F|
| _Sport Management Review_ | 1441-3523 |T&F|


In [13]:
# Journal to grab: ISSN is what Crossref is queried with, and TITLE is the
# short label used for the output folder; both appear in output file names.
ISSN = "1618-4742"
TITLE = "ESMQ"

In [14]:
# habanero (imported in the next cell) is not part of the Python standard
# library, so it has to be installed first.
# Run this cell, then restart the runtime/kernel so the new package can be imported.
%pip install habanero


Note: you may need to restart the kernel to use updated packages.


In [34]:
#libraries
import os
import pandas as pd
import requests
from habanero import Crossref
import glob

# Create the per-journal output folder (named after TITLE) that the later
# stages write their DOI lists, CSV and HTML files into.
try:
    os.mkdir(TITLE)
except FileExistsError:
    # Re-running the notebook is expected, so an existing folder is fine.
    # Any other failure (permissions, invalid name) is allowed to surface
    # instead of being misreported as "already made".
    print("Folder already made")

print("Prep Done.")

Folder already made
Prep Done.


## Cross Ref Side

In [25]:
####VARIABLES
# Crossref API client; also used by the metadata stage further down.
cr = Crossref()


#### Stage 1
print("\nSTAGE 1: Harvesting DOIs for this title from CR")

# Ask Crossref how many DOIs it holds for the journal. The count is used as
# the upper bound (cursor_max) for the cursor-based harvest below.
max_dois = None
try:
    j_doi_count = cr.journals(ISSN)
    max_dois = int(j_doi_count['message']['counts']['total-dois'])
    print("Cross Ref has this many DOIs: ",str(max_dois))
except Exception as err:
    print("CrossRef API is having troubles... Couldn't find number of DOIs associated with title")
    print("Reason: ",repr(err))

#Harvest all DOIs for this journal
# items is reset on every run so a failed harvest can never silently reuse
# the results of an earlier execution of this cell.
items = []
harvested = False
if max_dois is not None:
    try:
        res = cr.journals(ids = ISSN, works = True, cursor = "*", cursor_max = max_dois, progress_bar = True)
        # res is a list of result pages; flatten them into one list of works.
        items = [ item for page in res for item in page['message']['items'] ]
        harvested = True
    except Exception as err:
        print("CrossRef API is having troubles... Couldn't harvest DOIs of title")
        print("Reason: ",repr(err))

if harvested:
    print("Total DOIs from CR for",TITLE,": ",len(items))
    # The with-block closes (and therefore flushes) the file, so the next
    # stage reads the complete list of DOIs, one per line.
    with open(TITLE+"/"+ISSN+"_DOI.txt", "w") as doi_file:
        for a in items:
            doi_file.write(a["DOI"]+"\n")
    print("...Done.")
else:
    # Leave any DOI file from a previous successful run untouched.
    print("No DOIs were harvested; the DOI file was not written.")


STAGE 1: Harvesting DOIs for this title from CR
Cross Ref has this many DOIs:  753


100%|██████████| 37/37 [00:18<00:00,  2.02it/s]

Total DOIs from CR for ESMQ :  754
...Done.

STAGE 2: Harvest Metadata for each article and put into Dataframe





In [None]:
#### Stage 2
print("\nSTAGE 2: Harvest Metadata for each article and put into Dataframe")


def _online_date(message):
    """Return the online publication date as "Y-M-D", or "NA" if incomplete.

    All three of year, month and day must be present under
    message['published-online']['date-parts'][0].
    """
    try:
        year, month, day = message['published-online']['date-parts'][0][:3]
    except (KeyError, IndexError, TypeError, ValueError):
        return "NA"
    return str(year) + "-" + str(month) + "-" + str(day)


def _keywords(message):
    """Return the record's 'subject' entries joined by ", ", or "NA"."""
    try:
        return ', '.join(message['subject'])
    except (KeyError, TypeError):
        return "NA"


def _author_rows(doi, message):
    """Return one row per author for the article described by message.

    Each row is [doi, author name, author sequence, first affiliation name,
    title, date, volume, issue, pages, keywords]. Volume, issue, pages, date
    and keywords fall back to "NA" when absent.

    Raises KeyError, IndexError or TypeError when the title, the author list,
    or any author's given name, family name, sequence or first affiliation
    is missing. Rows are built all-or-nothing so that a problem article never
    contributes a partial author list.
    """
    a_title = message['title'][0]
    a_date = _online_date(message)
    a_vol = message.get('volume', "NA")
    a_issue = message.get('issue', "NA")
    a_pages = message.get('page', "NA")
    a_kws = _keywords(message)
    return [
        [
            doi,
            author['given'] + " " + author['family'],
            author['sequence'],
            author['affiliation'][0]['name'],
            a_title,
            a_date,
            a_vol,
            a_issue,
            a_pages,
            a_kws,
        ]
        for author in message['author']
    ]


article_list = []   # one row per (article, author) pair
progress = 0        # number of DOIs processed so far
problem_DOIs = []   # DOIs that could not be fetched or lacked required fields

print("Building Dataframe...")
with open(TITLE+"/"+ISSN+"_DOI.txt") as d_file:

    for line in d_file:
        # Drop the trailing newline so it is neither sent to the API nor
        # stored in the DOI column / bad-DOI list.
        doi = line.strip()
        if not doi:
            continue

        #progress bar
        progress += 1

        try:
            art = cr.works(ids=doi)
        except Exception as err:
            # A failed lookup for one DOI must not abort the whole harvest.
            print("Problem fetching: ",doi,"-",repr(err))
            problem_DOIs.append(doi)
        else:
            try:
                article_list.extend(_author_rows(doi, art['message']))
            except (KeyError, IndexError, TypeError):
                problem_DOIs.append(doi)

        if progress % 50 == 0:
            print(progress)


# Build the author-level table from the rows collected above. The column
# names are passed to the constructor so that an empty article_list yields
# an empty table with the right headers instead of a length-mismatch error.
j_data = pd.DataFrame(
    article_list,
    columns=[
        "DOI",
        "AUTHOR_NAME",
        "AUTHOR_ORDER",
        "AFFILIATION",
        "TITLE",
        "DATE",
        "VOLUME",
        "ISSUE",
        "PAGES",
        "KEYWORDS",
    ],
)

print("Number of Problem DOIS: ", len(problem_DOIs))

# Record the DOIs that could not be turned into rows, one per line.
with open(TITLE+"/"+ISSN+"_BAD_DOI.txt","w") as b_doi:
    for d in problem_DOIs:
        # strip() guards against entries that still carry their own newline,
        # which would otherwise leave blank lines in the file.
        b_doi.write(d.strip()+"\n")

#CSV of progress thus far
j_data.to_csv(TITLE+"/"+TITLE+"_"+ISSN+".csv",index=False)
print("...Done.")


#### Stage 3
# Download the publisher landing page for every harvested DOI into the
# journal folder, skipping pages that are already on disk.
print("\nSTAGE 3: Download HTML Landing Pages")
counter = 0
with open(TITLE+"/"+ISSN+"_DOI.txt") as doi_file:
    for d in doi_file:
        # Remove the trailing newline so it ends up in neither the URL nor
        # the file name.
        doi = d.strip()
        if not doi:
            continue
        counter += 1

        # "/" cannot appear in a file name, so it is replaced with "_".
        html_path = TITLE+"/"+doi.replace("/","_")+".html"

        # Check before fetching so re-runs only request the missing pages.
        # os.path.exists is a literal check; glob would misread DOIs that
        # contain characters such as "[" or "*".
        if not os.path.exists(html_path): #Only download if we didn't already
            try:
                response = requests.get("https://www.tandfonline.com/doi/full/"+doi, timeout=30)
                # Treat HTTP errors as failures rather than saving the error
                # page as if it were the article's landing page.
                response.raise_for_status()
                with open(html_path,"w",encoding="utf-8") as html_grabbed:
                    html_grabbed.write(response.text)
            except (requests.RequestException, OSError) as err:
                print("Problem with: ",doi,"-",err)

        if counter % 50 == 0:
            print(counter)

print("...Done.")

In [None]:
#### Stage 4
# NOTE: this stage is a placeholder. Only the banner and the "Done" message
# are printed; none of the steps listed below are implemented yet.
print("\nSTAGE 4: Enrich Dataframe with HTML Data")

# TODO: open all html files and build into Dataframe


# TODO: match up both DF on DOI

# TODO: write out both dataframes to CSVs again

print("...Done.")