# Combine: DOI & BSC


Sticks together final CSV for title combining info from DOIs and BSC scrapes

https://habanero.readthedocs.io/en/latest/modules/crossref.html

In [124]:
# Set these values for the title in quesion

#JSM
#TITLE = "JSM" 
#ISSN = "0888-4773"

#SMR
TITLE = "SMR"
ISSN = "1441-3523"

DOI_FILE = ISSN+"_DOI.txt"
BAD_DOI_FILE = ISSN+"_BAD_DOI.txt"
FINAL_CSV = "J_DATA_"+TITLE+"/"+TITLE+".csv"
HTML_FILES_LOCATION = "J_DATA_"+TITLE



In [125]:
#This Library isn't in the core Python Libraries.
# You need to run this cell and then restart the runtime to have it function.
%pip install habanero

Note: you may need to restart the kernel to use updated packages.


In [126]:
import pandas as pd
from habanero import Crossref
import glob
print("Done importing Libraries!")

Done importing Libraries!


In [127]:
#OPEN DOI & EXTRACT Columns

cr = Crossref()
article_list = []
progress = 0
problem_DOIs = []

with open(DOI_FILE) as d_file:
    
    for doi in d_file.readlines():
        #progress bar
        progress +=1
        
        art = cr.works(doi)
        a_title = art['message']['title'][0]
        
        try:
            a_pages = art['message']['page']
        except:
            a_pages = "NA"
        
        try:
            a_vol = art['message']['volume']
        except:
            a_vol = "NA"
        try:
            a_issue = art['message']['issue']
        except:
            a_issue = "NA"
        
        try:
            a_date = str(art['message']['published-online']['date-parts'][0][0]) + "-" + \
                     str(art['message']['published-online']['date-parts'][0][1]) +"-"+ \
                     str(art['message']['published-online']['date-parts'][0][2])
        except:
            a_date = "NA"
    
        
        try:    
            a_kws = ', '.join(art['message']['subject'])

        except:
            a_kws = "NA"
            
    
    
        try:
            for author in art['message']['author']:
                a_dets = []
                a_dets.append(doi)
                a_dets.append(author['given'] + " " + author['family'])
                a_dets.append(author['sequence'])
                a_dets.append(author['affiliation'][0]['name'])
                a_dets.append(a_title)
                a_dets.append(a_date)
                a_dets.append(a_vol)
                a_dets.append(a_issue)
                a_dets.append(a_pages)
                a_dets.append(a_kws)
                article_list.append(a_dets)
        except:
            problem_DOIs.append(doi)
            
        if progress % 50 == 0:
            print(progress)


j_data = pd.DataFrame(article_list)
j_data.columns = [
    "DOI",
    "AUTHOR_NAME",
    "AUTHOR_ORDER",
    "AFFILIATION",
    "TITLE",
    "DATE",
    "VOLUME",
    "ISSUE",
    "PAGES",
    "KEYWORDS"
]

print("Number of Problem DOIS: ", len(problem_DOIs))

with open(BAD_DOI_FILE,"w") as b_doi:
    for d in problem_DOIs:
        b_doi.write(d+"\n")


50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
Number of Problem DOIS:  77


In [128]:
#This'll be all Cross Ref info
j_data

Unnamed: 0,DOI,AUTHOR_NAME,AUTHOR_ORDER,AFFILIATION,TITLE,DATE,VOLUME,ISSUE,PAGES,KEYWORDS
0,10.1016/j.smr.2009.05.002\n,Emma Sherry,first,"La Trobe University, Melbourne, Australia",A Wider Social Role for Sport: Who´s Keeping t...,2021-2-3,12,4,273-274,"Marketing, Management Science and Operations R..."
1,10.1016/j.smr.2009.05.002\n,F. Coalter,additional,"Routledge, 2 Park Square, Milton Park, Abingdo...",A Wider Social Role for Sport: Who´s Keeping t...,2021-2-3,12,4,273-274,"Marketing, Management Science and Operations R..."
2,10.1016/j.smr.2018.10.002\n,Jinming Zheng,first,"Northumbria University, United Kingdom",Interorganisational conflict between national ...,2021-2-3,22,5,667-681,"Marketing, Management Science and Operations R..."
3,10.1016/j.smr.2018.10.002\n,Patrick Wing Chung Lau,additional,"Hong Kong Baptist University, Hong Kong",Interorganisational conflict between national ...,2021-2-3,22,5,667-681,"Marketing, Management Science and Operations R..."
4,10.1016/j.smr.2018.10.002\n,Shushu Chen,additional,"University of Birmingham, United Kingdom",Interorganisational conflict between national ...,2021-2-3,22,5,667-681,"Marketing, Management Science and Operations R..."
...,...,...,...,...,...,...,...,...,...,...
2196,10.1016/j.smr.2016.09.001\n,Andrea N. Geurin,first,"New York University, Tisch Institute for Sport...",User-generated branding via social media: An e...,2021-2-3,20,3,273-284,"Marketing, Management Science and Operations R..."
2197,10.1016/j.smr.2016.09.001\n,Lauren M. Burch,additional,"Indiana University–Purdue University Columbus,...",User-generated branding via social media: An e...,2021-2-3,20,3,273-284,"Marketing, Management Science and Operations R..."
2198,10.1016/j.smr.2019.02.004\n,Luu Trong Tuan,first,"Swinburne Business School, Swinburne Universit...",Coach humility and player creativity: The role...,2021-2-3,23,2,284-301,"Marketing, Management Science and Operations R..."
2199,10.1016/j.smr.2018.07.002\n,Per G. Svensson,first,"School of Kinesiology, Louisiana State Univers...",Exploring how external stakeholders shape soci...,2021-2-3,22,4,540-552,"Marketing, Management Science and Operations R..."


## TODO

Regrab HTML from BSC for this title. Apparently it didn't work!

In [129]:
# Enrich metadata by opening up each file, grabbing abstract
# and adding to DF
#EG. 10.1016/j.smr.2009.05.002
#BSC Link: https://search.ebscohost.com/login.aspx?direct=true&db=bth&AN=44425705&site=ehost-live&scope=site

#for file in glob.glob(HTML_FILES_LOCATION+"/44425705.html"):
#    print(file)

In [130]:
# Write out final CSV file
j_data.to_csv(FINAL_CSV,index=False)