## Use multiple Scopus Author IDs to retrieve lists of articles by author

In [4]:
#pip install config

Collecting config
  Downloading config-0.5.0.post0-py2.py3-none-any.whl (20 kB)
Installing collected packages: config
Successfully installed config-0.5.0.post0
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Dependencies
import requests
import json
import pandas as pd
import numpy as np
import re
import io
from config import api_key
from collections import OrderedDict
from pandas.io.json import json_normalize  

In [7]:
#The function "load_csv_author_ids" loads a CSV you have created that has columns called: last_name, scopus_author_id.
#This CSV may also contain other information helpful to your work. The function returns a pandas data frame called 
#"multiple_authors_df".

file_path = "radiation_oncology_without_details.csv"

def load_csv_author_ids(file_path):
    # File to Load
    multiple_authors_df = file_path

    # Read the CSV file and store into Pandas DataFrame with the column Scopus Author ID as a string
    multiple_authors_df = pd.read_csv(multiple_authors_df, encoding="utf-8", dtype ={'scopus_author_id': str})

    #Change the column names to lower case with underscore for spaces
    multiple_authors_df.columns =  multiple_authors_df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace("(","").str.replace(")","")
    #radiation_oncology_df.head()
    return multiple_authors_df

multiple_authors_df = load_csv_author_ids(file_path)
load_csv_author_ids(file_path)

Unnamed: 0,last_name,first_name,mi,scopus_author_id,scopus_search,unnamed:_5
0,Donnelly,Eric,D,21233377200,AU-ID(21233377200),AU-ID(21233377200) OR
1,Gentile,Michelle,S,56018970700,AU-ID(56018970700),AU-ID(56018970700) OR
2,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR
3,Hayes,John,P,55313350000,AU-ID(55313350000),AU-ID(55313350000) OR
4,Kalapurakal,John,A,7003993738,AU-ID(7003993738),AU-ID(7003993738) OR
5,Kruser,Timothy,J,24448583300,AU-ID(24448583300),AU-ID(24448583300) OR
6,Mittal,Bharat,B,7102661470,AU-ID(7102661470),AU-ID(7102661470) OR
7,Mittal,Bharat,B,57207807061,AU-ID(57207807061),AU-ID(57207807061) OR
8,Sachdev,Sean,,56443683800,AU-ID(56443683800),AU-ID(56443683800) OR
9,Sathiaseelan,Vythialingam,,6701754514,AU-ID(6701754514),AU-ID(6701754514) OR


In [8]:
#List the column names in the dataframe
multiple_authors_df.columns

Index(['last_name', 'first_name', 'mi', 'scopus_author_id', 'scopus_search',
       'unnamed:_5'],
      dtype='object')

In [9]:
#Check the data type in the dataframe columns called scopus_author_id and scopus_search
multiple_authors_df.scopus_author_id.dtype
#multiple_authors_df.scopus_search.dtype

dtype('int64')

In [10]:
#The function "clean_author_id_list" takes in the multiple_authors_df and formats the "scopus_author_id" column 
#as a string, then uses the column to create a list, removes any of the "nan" values for authors that don't 
#have an ID, and finally returns a list called "cleaned_author_id_list"

def clean_author_id_list(multiple_authors_df):
    
    #Change the data type in the dataframe column called "scopus_author_id" from int64 to a string. 
    multiple_authors_df['scopus_author_id'] = multiple_authors_df['scopus_author_id'].astype(str)

    #Save the column called scopus_author_id to a list called Author_ID_List
    author_id_list = multiple_authors_df['scopus_author_id'].tolist()
    #print(author_ID_List)
    
    #Clean the Author_ID_List to remove nan
    cleaned_author_id_list = [x for x in author_id_list if str(x) != 'nan']
    #print(cleaned_author_id_list)
        
    return cleaned_author_id_list

cleaned_author_id_list = clean_author_id_list(multiple_authors_df)
clean_author_id_list(multiple_authors_df)

['21233377200',
 '56018970700',
 '7003610066',
 '55313350000',
 '7003993738',
 '24448583300',
 '7102661470',
 '57207807061',
 '56443683800',
 '6701754514',
 '36143455500',
 '24336584500',
 '7005165328',
 '453']

In [11]:
#The function "create_multiple_author_id_query" takes in the "cleaned_author_id_list" and adds the necessary syntax of
# "AU-ID(xxxxxxxxx)" that is required for searching Scopus Author IDs. The function returns 
#the "scopous_multiple_author_id_query".

def create_multiple_author_id_query(cleaned_author_id_list):

    #Add the necessary syntax to the cleaned_Author_ID_List
    scopus_mulitple_author_id_query = []
    for x in cleaned_author_id_list:
        authorID_string = "".join(("AU-ID(", x,")"))
        #print(authorID_string)
        scopus_mulitple_author_id_query.append(authorID_string)

    #print(scopus_mulitple_author_id_query)
    return scopus_mulitple_author_id_query

scopus_mulitple_author_id_query = create_multiple_author_id_query(cleaned_author_id_list)
create_multiple_author_id_query(cleaned_author_id_list)

['AU-ID(21233377200)',
 'AU-ID(56018970700)',
 'AU-ID(7003610066)',
 'AU-ID(55313350000)',
 'AU-ID(7003993738)',
 'AU-ID(24448583300)',
 'AU-ID(7102661470)',
 'AU-ID(57207807061)',
 'AU-ID(56443683800)',
 'AU-ID(6701754514)',
 'AU-ID(36143455500)',
 'AU-ID(24336584500)',
 'AU-ID(7005165328)',
 'AU-ID(453)']

In [13]:
def create_multiple_author_id_query(cleaned_author_id_list):

    #Add the necessary syntax to the cleaned_Author_ID_List
    scopus_mulitple_author_id_query = []
    for x in cleaned_author_id_list:
        authorID_string = "".join(("AU-ID(", x,")"))
        #print(authorID_string)
        scopus_mulitple_author_id_query.append(authorID_string)

    #print(scopus_mulitple_author_id_query)
    return scopus_mulitple_author_id_query

scopus_mulitple_author_id_query = create_multiple_author_id_query(cleaned_author_id_list)
create_multiple_author_id_query(cleaned_author_id_list)

#The function "get_scopus_articles" takes in the "scopus_multiple_author_id_query" and creates a necessary URL 
#for querying the Scopus API. The Scopus API key is passed in through the "headers" (see above Dependencies 
#"from config import api_key") and the config file is also referenced in the git ignore so it won't be exposed 
#on Github. The API is called and returns a response for each Scopus Author ID in the list Each response is 
#saved in a "single_author_article_dict". Each of the "single_author_article_dict" are then appended to the 
#"multiple_author_article_list". The function returns a list of dictionaries called 
#the "multiple_author_article_list". 

multiple_author_article_list = []
#multiple_author_article_dict = {}
single_author_article_dict = {}
date = "2002-2003"

In [31]:
#The function "get_scopus_articles" takes in the "scopus_multiple_author_id_query" and creates a necessary URL 
#for querying the Scopus API. The Scopus API key is passed in through the "headers" (see above Dependencies 
#"from config import api_key") and the config file is also referenced in the git ignore so it won't be exposed 
#on Github. The API is called and returns a response for each Scopus Author ID in the list Each response is 
#saved in a "single_author_article_dict". Each of the "single_author_article_dict" are then appended to the 
#"multiple_author_article_list". The function returns a list of dictionaries called 
#the "multiple_author_article_list". 

multiple_author_article_list = []
#multiple_author_article_dict = {}
single_author_article_dict = {}
date = "2002-2020"
view = "Complete"

def get_scopus_articles(scopus_mulitple_author_id_query):
    
    for authorid in scopus_mulitple_author_id_query:
        url = "http://api.elsevier.com/content/search/scopus?"
        fieldList = ["dc:title",
                     "prism:publicationName",
                     "prism:coverDate", "prism:doi", "authname"]
                    
        headers = {
             "X-ELS-APIKey": api_key,
             'Accept':'application/json'
        }
        parameters = {
            "query": 'nanosafety',
            "view": view,
            "date": date
        }
        
        #Make the API request 
        single_author_article_response = requests.get(url, headers=headers, params=parameters)
        #print(single_author_article_response.url)
        #print(single_author_article_response.status_code)
        
              
        #Append each single_author_article_dict response to multiple_author_article_list to create a list of dictionaries
        single_author_article_dict = single_author_article_response.json()
        #print(type(single_author_article_dict)) 
        #print(single_author_article_dict)
        multiple_author_article_list.append(single_author_article_dict.copy())
    
    return multiple_author_article_list
       
get_scopus_articles(scopus_mulitple_author_id_query)

#References
#https://dev.elsevier.com/guides/ScopusSearchViews.htm
#https://stackoverflow.com/questions/53558837/python-loop-to-pull-api-data-for-iterating-urls
#https://stackoverflow.com/questions/36410800/python-3-parse-json-from-multiple-api-requests-into-a-list-and-output-to-a-fil
#https://www.pluralsight.com/guides/web-scraping-with-request-python

[{'search-results': {'opensearch:totalResults': '4682',
   'opensearch:startIndex': '0',
   'opensearch:itemsPerPage': '25',
   'opensearch:Query': {'@role': 'request',
    '@searchTerms': 'nanosafety',
    '@startPage': '0'},
   'link': [{'@_fa': 'true',
     '@ref': 'self',
     '@href': 'https://api.elsevier.com/content/search/scopus?start=0&count=25&query=nanosafety&view=Complete&date=2002-2020',
     '@type': 'application/json'},
    {'@_fa': 'true',
     '@ref': 'first',
     '@href': 'https://api.elsevier.com/content/search/scopus?start=0&count=25&query=nanosafety&view=Complete&date=2002-2020',
     '@type': 'application/json'},
    {'@_fa': 'true',
     '@ref': 'next',
     '@href': 'https://api.elsevier.com/content/search/scopus?start=25&count=25&query=nanosafety&view=Complete&date=2002-2020',
     '@type': 'application/json'},
    {'@_fa': 'true',
     '@ref': 'last',
     '@href': 'https://api.elsevier.com/content/search/scopus?start=4657&count=25&query=nanosafety&view=Compl

In [32]:
url = "http://api.elsevier.com/content/search/scopus?" + "?format=json"
headers = {"X-ELS-APIKey": api_key, 'Accept':'application/json'}
parameters = {"query": 'nanosafety', "view": view, "date": date}
single_author_article_response = requests.get(url, headers=headers, params=parameters)
single_author_article_response.json()

{'search-results': {'opensearch:totalResults': '4682',
  'opensearch:startIndex': '0',
  'opensearch:itemsPerPage': '25',
  'opensearch:Query': {'@role': 'request',
   '@searchTerms': 'nanosafety',
   '@startPage': '0'},
  'link': [{'@_fa': 'true',
    '@ref': 'self',
    '@href': 'https://api.elsevier.com/content/search/scopus?start=0&count=25&query=nanosafety&view=Complete&date=2002-2020',
    '@type': 'application/json'},
   {'@_fa': 'true',
    '@ref': 'first',
    '@href': 'https://api.elsevier.com/content/search/scopus?start=0&count=25&query=nanosafety&view=Complete&date=2002-2020',
    '@type': 'application/json'},
   {'@_fa': 'true',
    '@ref': 'next',
    '@href': 'https://api.elsevier.com/content/search/scopus?start=25&count=25&query=nanosafety&view=Complete&date=2002-2020',
    '@type': 'application/json'},
   {'@_fa': 'true',
    '@ref': 'last',
    '@href': 'https://api.elsevier.com/content/search/scopus?start=4657&count=25&query=nanosafety&view=Complete&date=2002-2020',


In [22]:
#The function "make_scopus_articles_df" takes in the "multiple_author_article_list" and uses json_normalize to
#flatten the json contained in the "entry" field. The function returns a dataframe called the "scopus_articles_df".

def make_scopus_articles_df(multiple_author_article_list):
    #final_list = json_normalize(multiple_author_list, meta=["search-results"], record_path=["search-results", "entry"])
    scopus_articles_df = pd.DataFrame.from_dict(json_normalize(multiple_author_article_list, meta=["search-results"], record_path=["search-results", "entry"]),orient="columns")
    
    return scopus_articles_df

scopus_articles_df = make_scopus_articles_df(multiple_author_article_list)
make_scopus_articles_df(multiple_author_article_list)

#References
#https://stackoverflow.com/questions/48177934/flatten-or-unpack-list-of-nested-dicts-in-dataframe
#https://stackoverflow.com/questions/50161070/convert-list-of-dicts-of-dict-into-dataframe
#https://stackoverflow.com/questions/43984865/python-having-trouble-returning-a-pandas-data-frame-from-a-user-defined-functio
#https://stackoverflow.com/questions/37668291/flatten-double-nested-json

Unnamed: 0,@_fa,author,dc:title,error,prism:coverDate,prism:doi,prism:publicationName,prism:url,search-results
0,True,,,Result set was empty,,,,,"{'opensearch:totalResults': '0', 'opensearch:s..."
1,True,"[{'@_fa': 'true', 'authname': 'Gentile M.S.'},...",Targeting colon cancer cells with genistein-17...,,2003-01-01,10.3892/ijo.22.5.955,International journal of oncology,https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '1', 'opensearch:s..."
2,True,"[{'@_fa': 'true', 'authname': 'Bisht K.'}, {'@...",Geldanamycin and 17-Allylamino-17-demethoxygel...,,2003-12-15,,Cancer Research,https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '15', 'opensearch:..."
3,True,"[{'@_fa': 'true', 'authname': 'Bisht K.'}, {'@...",Inhibition of cyclooxygenase-2 with NS-398 and...,,2003-11-01,10.1080/09553000310001621400,International Journal of Radiation Biology,https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '15', 'opensearch:..."
4,True,"[{'@_fa': 'true', 'authname': 'Dewhirst M.W.'}...",Those in gene therapy should pay closer attent...,,2003-10-01,10.1016/S0360-3016(03)00421-8,International Journal of Radiation Oncology Bi...,https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '15', 'opensearch:..."
5,True,"[{'@_fa': 'true', 'authname': 'Lin X.'}, {'@_f...",2-Deoxy-D-glucose-induced cytotoxicity and rad...,,2003-06-15,,Cancer Research,https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '15', 'opensearch:..."
6,True,"[{'@_fa': 'true', 'authname': 'Trimble E.'}, {...",Clinical Trials Referral Resource. Clinical tr...,,2003-01-01,,"Oncology (Williston Park, N.Y.)",https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '15', 'opensearch:..."
7,True,"[{'@_fa': 'true', 'authname': 'Ohiro Y.'}, {'@...",Inhibition of stress-inducible kinase pathways...,,2003-01-01,10.1128/MCB.23.1.322-334.2003,Molecular and Cellular Biology,https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '15', 'opensearch:..."
8,True,"[{'@_fa': 'true', 'authname': 'Karimpour S.'},...",The holy grail of radiation oncology: Lessons ...,,2003-01-01,10.1016/S0360-3016(02)03861-0,International Journal of Radiation Oncology Bi...,https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '15', 'opensearch:..."
9,True,"[{'@_fa': 'true', 'authname': 'Gius D.'}, {'@_...",Treatment of nasopharyngeal cancer: Raising th...,,2002-11-06,10.1093/jnci/94.21.1594,Journal of the National Cancer Institute,https://api.elsevier.com/content/abstract/scop...,"{'opensearch:totalResults': '15', 'opensearch:..."


In [9]:
#The function "flatten_search_results" takes in the "scopus_articles_df" and flattens the json contained in the
#"search results" field and creates series then columns of the content. The funciton returns a pandas dataframe
#called "remove_opensearchQuery_nest" which is renamed "scopus_flattened_search_results_df". 

def flatten_search_results(scopus_articles_df):
    remove_searchresults_nest = pd.concat([scopus_articles_df.drop(['search-results'], axis=1), scopus_articles_df['search-results'].apply(pd.Series)], axis=1, join="outer")
    remove_opensearchQuery_nest = pd.concat([remove_searchresults_nest.drop(['opensearch:Query'], axis=1), remove_searchresults_nest['opensearch:Query'].apply(pd.Series)], axis=1, join="outer")
    remove_opensearchQuery_nest = pd.concat([remove_searchresults_nest.drop(['opensearch:Query'], axis=1), remove_searchresults_nest['opensearch:Query'].apply(pd.Series)], axis=1, join="outer")
    return remove_opensearchQuery_nest

scopus_flatten_search_results_df = flatten_search_results(scopus_articles_df)
flatten_search_results(scopus_articles_df)

#References
#https://stackoverflow.com/questions/29325458/dictionary-column-in-pandas-dataframe/29330853

Unnamed: 0,@_fa,author,citedby-count,dc:identifier,dc:title,eid,error,prism:aggregationType,prism:coverDate,prism:doi,...,subtype,subtypeDescription,opensearch:totalResults,opensearch:startIndex,opensearch:itemsPerPage,link,entry,@role,@searchTerms,@startPage
0,True,,,,,,Result set was empty,,,,...,,,0,0,0,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'error': 'Result set was emp...",request,AU-ID(21233377200),0
1,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0042128695,Targeting colon cancer cells with genistein-17...,2-s2.0-0042128695,,Journal,2003-01-01,,...,ar,Article,1,0,1,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(56018970700),0
2,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",157.0,SCOPUS_ID:9144261127,Geldanamycin and 17-Allylamino-17-demethoxygel...,2-s2.0-9144261127,,Journal,2003-12-15,,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
3,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0347949519,Inhibition of cyclooxygenase-2 with NS-398 and...,2-s2.0-0347949519,,Journal,2003-11-01,10.1080/09553000310001621400,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
4,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",26.0,SCOPUS_ID:0042383099,Those in gene therapy should pay closer attent...,2-s2.0-0042383099,,Journal,2003-10-01,10.1016/S0360-3016(03)00421-8,...,le,Letter,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
5,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",115.0,SCOPUS_ID:0038407274,2-Deoxy-D-glucose-induced cytotoxicity and rad...,2-s2.0-0038407274,,Journal,2003-06-15,,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
6,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037215656,The holy grail of radiation oncology: Lessons ...,2-s2.0-0037215656,,Journal,2003-01-01,10.1016/S0360-3016(02)03861-0,...,ed,Editorial,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
7,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",0.0,SCOPUS_ID:0037262512,Clinical Trials Referral Resource. Clinical tr...,2-s2.0-0037262512,,Journal,2003-01-01,,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
8,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",27.0,SCOPUS_ID:0037216745,Inhibition of stress-inducible kinase pathways...,2-s2.0-0037216745,,Journal,2003-01-01,10.1128/MCB.23.1.322-334.2003,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
9,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037032514,Treatment of nasopharyngeal cancer: Raising th...,2-s2.0-0037032514,,Journal,2002-11-06,,...,ed,Editorial,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0


In [10]:
#The function "fix_empty_author" takes in the "scopus_flattened_search_results_df" and uses a for loop
# to add an empty list to any column in the "author" column that is empty (i.e. contains nan). The function
#returns a pandas dataframe called the "scopus_flattened_search_results_df" which is renamed 
#to "scopus_fix_empty_author_df". 

def fix_empty_author(scopus_flatten_search_results_df):

    for row in scopus_flatten_search_results_df.loc[scopus_flatten_search_results_df.author.isnull(), 'author'].index:
        scopus_flatten_search_results_df.at[row, 'author'] = []
    
    return scopus_flatten_search_results_df

scopus_fix_empty_author_df = fix_empty_author(scopus_flatten_search_results_df)
fix_empty_author(scopus_flatten_search_results_df)

Unnamed: 0,@_fa,author,citedby-count,dc:identifier,dc:title,eid,error,prism:aggregationType,prism:coverDate,prism:doi,...,subtype,subtypeDescription,opensearch:totalResults,opensearch:startIndex,opensearch:itemsPerPage,link,entry,@role,@searchTerms,@startPage
0,True,[],,,,,Result set was empty,,,,...,,,0,0,0,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'error': 'Result set was emp...",request,AU-ID(21233377200),0
1,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0042128695,Targeting colon cancer cells with genistein-17...,2-s2.0-0042128695,,Journal,2003-01-01,,...,ar,Article,1,0,1,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(56018970700),0
2,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",157.0,SCOPUS_ID:9144261127,Geldanamycin and 17-Allylamino-17-demethoxygel...,2-s2.0-9144261127,,Journal,2003-12-15,,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
3,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0347949519,Inhibition of cyclooxygenase-2 with NS-398 and...,2-s2.0-0347949519,,Journal,2003-11-01,10.1080/09553000310001621400,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
4,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",26.0,SCOPUS_ID:0042383099,Those in gene therapy should pay closer attent...,2-s2.0-0042383099,,Journal,2003-10-01,10.1016/S0360-3016(03)00421-8,...,le,Letter,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
5,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",115.0,SCOPUS_ID:0038407274,2-Deoxy-D-glucose-induced cytotoxicity and rad...,2-s2.0-0038407274,,Journal,2003-06-15,,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
6,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037215656,The holy grail of radiation oncology: Lessons ...,2-s2.0-0037215656,,Journal,2003-01-01,10.1016/S0360-3016(02)03861-0,...,ed,Editorial,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
7,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",0.0,SCOPUS_ID:0037262512,Clinical Trials Referral Resource. Clinical tr...,2-s2.0-0037262512,,Journal,2003-01-01,,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
8,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",27.0,SCOPUS_ID:0037216745,Inhibition of stress-inducible kinase pathways...,2-s2.0-0037216745,,Journal,2003-01-01,10.1128/MCB.23.1.322-334.2003,...,ar,Article,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0
9,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037032514,Treatment of nasopharyngeal cancer: Raising th...,2-s2.0-0037032514,,Journal,2002-11-06,,...,ed,Editorial,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0


In [11]:
#The function "flatten_author" takes in the "scopus_fix_empty_author_df" and loops through the "author" column which 
#is currently a list of dictionaries which contain the authname and authid for each author listed on the article. 
#The function loops through each item in the cell of the "author" column, saves each of the authids to a list and 
#each of the the authnames to a list. The funciton then appends the authid list to an ordered dicitonary 
#called "authorid_dict" and the authname list to an ordered dictionary called "authname_dict". Once appended, the 
#function empties the lists and moves on to the next cell in the "author" column. The function
#returns the authid and authname dictionaries. 

authorid_dict = OrderedDict()
authorname_dict = OrderedDict() 

def flatten_author(scopus_fix_empty_author_df):
    authorids = []
    #authorid_dict = OrderedDict()
    authornames = []
    #authorname_dict= OrderedDict() 
    for i in range(len(scopus_fix_empty_author_df)):
        #print(scopus_fix_empty_author_df.loc[i, "author"]) 
        column = scopus_fix_empty_author_df.loc[i, "author"]
        #print(i)
        #print(column)

        for item in column:
            #print(item)
            #print(item["authid"])
            #print(column)
            authorids.append(item["authid"])
            authornames.append(item["authname"])
            #print(authorids)
         
        #print(i)
        authorid_dict[i] = authorids
        authorname_dict[i]= authornames
        authorids=[]
        authornames=[]
        
    return authorid_dict, authorname_dict

#print(authorid_dict)
#print(authorname_dict)
flatten_author(scopus_fix_empty_author_df)
    


(OrderedDict([(0, []),
              (1,
               ['56018970700',
                '6701662630',
                '7404024068',
                '6701854664',
                '7102541014',
                '7005140598',
                '7102955789',
                '6701449622']),
              (2,
               ['7005370416',
                '57207801922',
                '7102211768',
                '6603982052',
                '7003524707',
                '8610676300',
                '35830961200',
                '7003569813',
                '7202169291',
                '7005392044',
                '55615581300',
                '35400609400',
                '7003610066']),
              (3,
               ['7005370416',
                '57207801922',
                '57193119887',
                '7004715706',
                '6603982052',
                '7005433388',
                '7003610066']),
              (4, ['36046300600', '35408019300', '6506436385', '700361

In [12]:
#The "add_author_info" function takes in the "scopus_fix_empty_author_df", the "authorname_dict" 
#and the "authorid_dict". It creates two new columns in the dataframe called "author_names" and "author_ids". 
#The function adds the "authorname_dict" and the "authorid_dict" to the columns. The function returns
#the "scopus_fix_empty_author_df" which is renamed to the "scopus_added_author_info_df". 

def add_author_info(scopus_fix_empty_author_df, authorname_dict, authorid_dict):
    scopus_fix_empty_author_df["author_names"] = pd.Series(authorname_dict)
    scopus_fix_empty_author_df["author_ids"] = pd.Series(authorid_dict)
    return scopus_fix_empty_author_df

scopus_added_author_info_df = add_author_info(scopus_fix_empty_author_df, authorname_dict, authorid_dict)
add_author_info(scopus_fix_empty_author_df, authorname_dict, authorid_dict)

Unnamed: 0,@_fa,author,citedby-count,dc:identifier,dc:title,eid,error,prism:aggregationType,prism:coverDate,prism:doi,...,opensearch:totalResults,opensearch:startIndex,opensearch:itemsPerPage,link,entry,@role,@searchTerms,@startPage,author_names,author_ids
0,True,[],,,,,Result set was empty,,,,...,0,0,0,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'error': 'Result set was emp...",request,AU-ID(21233377200),0,[],[]
1,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0042128695,Targeting colon cancer cells with genistein-17...,2-s2.0-0042128695,,Journal,2003-01-01,,...,1,0,1,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(56018970700),0,"[Gentile M., Vasu C., Green A., Murillo G., Da...","[56018970700, 6701662630, 7404024068, 67018546..."
2,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",157.0,SCOPUS_ID:9144261127,Geldanamycin and 17-Allylamino-17-demethoxygel...,2-s2.0-9144261127,,Journal,2003-12-15,,...,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Bisht K., Bradbury C., Mattson D., Kaushal A....","[7005370416, 57207801922, 7102211768, 66039820..."
3,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0347949519,Inhibition of cyclooxygenase-2 with NS-398 and...,2-s2.0-0347949519,,Journal,2003-11-01,10.1080/09553000310001621400,...,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Bisht K., Bradbury C., Zoberi I., Curry H., K...","[7005370416, 57207801922, 57193119887, 7004715..."
4,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",26.0,SCOPUS_ID:0042383099,Those in gene therapy should pay closer attent...,2-s2.0-0042383099,,Journal,2003-10-01,10.1016/S0360-3016(03)00421-8,...,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Dewhirst M., Sneed P., Karimpour S., Gius D.]","[36046300600, 35408019300, 6506436385, 7003610..."
5,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",115.0,SCOPUS_ID:0038407274,2-Deoxy-D-glucose-induced cytotoxicity and rad...,2-s2.0-0038407274,,Journal,2003-06-15,,...,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Lin X., Zhang F., Bradbury C., Kaushal A., Li...","[7404513477, 57199242888, 57207801922, 6603982..."
6,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037215656,The holy grail of radiation oncology: Lessons ...,2-s2.0-0037215656,,Journal,2003-01-01,10.1016/S0360-3016(02)03861-0,...,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Karimpour S., Gius D.]","[6506436385, 7003610066]"
7,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",0.0,SCOPUS_ID:0037262512,Clinical Trials Referral Resource. Clinical tr...,2-s2.0-0037262512,,Journal,2003-01-01,,...,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Trimble E., Schoenfeldt M., Streicher H., Giu...","[7005267919, 6603616084, 7005313461, 700361006..."
8,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",27.0,SCOPUS_ID:0037216745,Inhibition of stress-inducible kinase pathways...,2-s2.0-0037216745,,Journal,2003-01-01,10.1128/MCB.23.1.322-334.2003,...,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Ohiro Y., Usheva A., Kobayashi S., Duffy S., ...","[8660958800, 56259611200, 57199809113, 5719656..."
9,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037032514,Treatment of nasopharyngeal cancer: Raising th...,2-s2.0-0037032514,,Journal,2002-11-06,,...,15,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Gius D., Coleman C.]","[7003610066, 7201507251]"


In [13]:
#The function "fix_search_term" takes in the "scopus_add_author_info_df" and creates a new column 
#called "scopus_author_id_api" which contains the same data as the "@searchTerms" column. 
#The funciton then uses regex to remove everything around the parenthesis and leave
#what was in between the parentesis in the "scopus_author_id_api" column. The function returns
#the "scopus_fix_empty_author_df" which is renamed to the "scopus_fixed_search_term_df"

def fix_search_term(scopus_added_author_info_df):
    scopus_added_author_info_df['scopus_author_id_api'] = scopus_added_author_info_df['@searchTerms']
    scopus_added_author_info_df["scopus_author_id_api"].replace(r'[^(]*\(|\)[^)]*', '', inplace=True,regex=True)
    return scopus_fix_empty_author_df

scopus_fixed_search_term_df = fix_search_term(scopus_added_author_info_df)
fix_search_term(scopus_added_author_info_df)

#References
#https://stackoverflow.com/questions/32913960/python-regex-remove-a-pattern-at-the-end-of-string
#https://stackoverflow.com/questions/16842001/copy-text-between-parentheses-in-pandas-dataframe-column-into-another-column
#https://stackoverflow.com/questions/37593550/pandas-replacing-elements-not-working


Unnamed: 0,@_fa,author,citedby-count,dc:identifier,dc:title,eid,error,prism:aggregationType,prism:coverDate,prism:doi,...,opensearch:startIndex,opensearch:itemsPerPage,link,entry,@role,@searchTerms,@startPage,author_names,author_ids,scopus_author_id_api
0,True,[],,,,,Result set was empty,,,,...,0,0,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'error': 'Result set was emp...",request,AU-ID(21233377200),0,[],[],21233377200
1,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0042128695,Targeting colon cancer cells with genistein-17...,2-s2.0-0042128695,,Journal,2003-01-01,,...,0,1,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(56018970700),0,"[Gentile M., Vasu C., Green A., Murillo G., Da...","[56018970700, 6701662630, 7404024068, 67018546...",56018970700
2,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",157.0,SCOPUS_ID:9144261127,Geldanamycin and 17-Allylamino-17-demethoxygel...,2-s2.0-9144261127,,Journal,2003-12-15,,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Bisht K., Bradbury C., Mattson D., Kaushal A....","[7005370416, 57207801922, 7102211768, 66039820...",7003610066
3,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0347949519,Inhibition of cyclooxygenase-2 with NS-398 and...,2-s2.0-0347949519,,Journal,2003-11-01,10.1080/09553000310001621400,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Bisht K., Bradbury C., Zoberi I., Curry H., K...","[7005370416, 57207801922, 57193119887, 7004715...",7003610066
4,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",26.0,SCOPUS_ID:0042383099,Those in gene therapy should pay closer attent...,2-s2.0-0042383099,,Journal,2003-10-01,10.1016/S0360-3016(03)00421-8,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Dewhirst M., Sneed P., Karimpour S., Gius D.]","[36046300600, 35408019300, 6506436385, 7003610...",7003610066
5,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",115.0,SCOPUS_ID:0038407274,2-Deoxy-D-glucose-induced cytotoxicity and rad...,2-s2.0-0038407274,,Journal,2003-06-15,,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Lin X., Zhang F., Bradbury C., Kaushal A., Li...","[7404513477, 57199242888, 57207801922, 6603982...",7003610066
6,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037215656,The holy grail of radiation oncology: Lessons ...,2-s2.0-0037215656,,Journal,2003-01-01,10.1016/S0360-3016(02)03861-0,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Karimpour S., Gius D.]","[6506436385, 7003610066]",7003610066
7,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",0.0,SCOPUS_ID:0037262512,Clinical Trials Referral Resource. Clinical tr...,2-s2.0-0037262512,,Journal,2003-01-01,,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Trimble E., Schoenfeldt M., Streicher H., Giu...","[7005267919, 6603616084, 7005313461, 700361006...",7003610066
8,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",27.0,SCOPUS_ID:0037216745,Inhibition of stress-inducible kinase pathways...,2-s2.0-0037216745,,Journal,2003-01-01,10.1128/MCB.23.1.322-334.2003,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Ohiro Y., Usheva A., Kobayashi S., Duffy S., ...","[8660958800, 56259611200, 57199809113, 5719656...",7003610066
9,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037032514,Treatment of nasopharyngeal cancer: Raising th...,2-s2.0-0037032514,,Journal,2002-11-06,,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Gius D., Coleman C.]","[7003610066, 7201507251]",7003610066


In [14]:
#The function "merge_csv_to_final" takes in the "muliple_authors_df" and the "scopus_fixed_search_term_df" and
#merges the two dataframes using an "inner" merge on the "scopus author id" which means any person without a 
# Scopus Author ID is removed from the final dataframe. The function returns the "merged_df". 

def merge_csv_to_final(multiple_authors_df, scopus_fixed_search_term_df):
    merged_df = pd.merge(multiple_authors_df, scopus_fixed_search_term_df, how='inner', left_on='scopus_author_id', right_on='scopus_author_id_api',  )    
    return merged_df

merged_df = merge_csv_to_final(multiple_authors_df, scopus_fixed_search_term_df)
merge_csv_to_final(multiple_authors_df, scopus_fixed_search_term_df)

#References
#https://stackoverflow.com/questions/20375561/joining-pandas-dataframes-by-column-names

Unnamed: 0,last_name,first_name,mi,scopus_author_id,scopus_search,unnamed:_5,@_fa,author,citedby-count,dc:identifier,...,opensearch:startIndex,opensearch:itemsPerPage,link,entry,@role,@searchTerms,@startPage,author_names,author_ids,scopus_author_id_api
0,Donnelly,Eric,D,21233377200,AU-ID(21233377200),AU-ID(21233377200) OR,True,[],,,...,0,0,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'error': 'Result set was emp...",request,AU-ID(21233377200),0,[],[],21233377200
1,Gentile,Michelle,S,56018970700,AU-ID(56018970700),AU-ID(56018970700) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0042128695,...,0,1,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(56018970700),0,"[Gentile M., Vasu C., Green A., Murillo G., Da...","[56018970700, 6701662630, 7404024068, 67018546...",56018970700
2,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",157.0,SCOPUS_ID:9144261127,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Bisht K., Bradbury C., Mattson D., Kaushal A....","[7005370416, 57207801922, 7102211768, 66039820...",7003610066
3,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",6.0,SCOPUS_ID:0347949519,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Bisht K., Bradbury C., Zoberi I., Curry H., K...","[7005370416, 57207801922, 57193119887, 7004715...",7003610066
4,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",26.0,SCOPUS_ID:0042383099,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Dewhirst M., Sneed P., Karimpour S., Gius D.]","[36046300600, 35408019300, 6506436385, 7003610...",7003610066
5,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",115.0,SCOPUS_ID:0038407274,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Lin X., Zhang F., Bradbury C., Kaushal A., Li...","[7404513477, 57199242888, 57207801922, 6603982...",7003610066
6,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037215656,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Karimpour S., Gius D.]","[6506436385, 7003610066]",7003610066
7,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",0.0,SCOPUS_ID:0037262512,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Trimble E., Schoenfeldt M., Streicher H., Giu...","[7005267919, 6603616084, 7005313461, 700361006...",7003610066
8,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",27.0,SCOPUS_ID:0037216745,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Ohiro Y., Usheva A., Kobayashi S., Duffy S., ...","[8660958800, 56259611200, 57199809113, 5719656...",7003610066
9,Gius,David,R,7003610066,AU-ID(7003610066),AU-ID(7003610066) OR,True,"[{'@_fa': 'true', 'author-url': 'https://api.e...",2.0,SCOPUS_ID:0037032514,...,0,15,"[{'@_fa': 'true', '@ref': 'self', '@href': 'ht...","[{'@_fa': 'true', 'prism:url': 'https://api.el...",request,AU-ID(7003610066),0,"[Gius D., Coleman C.]","[7003610066, 7201507251]",7003610066


In [15]:
#The function "export_to_csv" takes in the merged_df, and a save_path which indicates where the merged_df
#will be saved. Use double back slashes in the save path to escape the backslash or otherwise it will be
#interpreted as a special character and won't work. The function does not return anything. 

save_path = "C:\\Users\\keg827\\Documents\\10. WorkStuff_KEG\\scopusAPIrequests\\merged_dataframe_final.csv"

def export_to_csv(merged_df, save_path):
    merged_df.to_csv (save_path, index = None, header=True, encoding="utf-8")
    

export_to_csv(merged_df, save_path)

In [None]:
#NLM CITATION FORMAT
#THIS FEATURE IS NOT YET DONE
#Brantingham JW, Bonnefin D, Perle SM, Cassa TK, Globe G, Pribicevic M, Hicks M, Korporaal C. Manipulative therapy for lower extremity conditions: update of a literature review. J Manipulative Physiol Ther. 2012 Feb;35(2):127-66.



In [None]:
#FLAG FIRST AND LAST AUTHORS
#BOLD AUTHOR NAME ON EXPORT
#THIS DOES IS NOT YET DONE

#https://stackoverflow.com/questions/52819114/pandas-to-csv-with-some-words-as-bold
#https://stackoverflow.com/questions/41212273/pandaspython-fill-empty-cells-with-with-previous-row-value
#https://stackoverflow.com/questions/52651074/python-pandas-equivalent-to-the-excel-fill-handle
#https://stackoverflow.com/questions/51938245/display-dataframe-values-in-bold-font-in-one-row-only
#https://stackoverflow.com/questions/54512133/string-matching-of-two-pandas-series

# author_index=[]

# for i in range(len(merged_df)):
#     #print(merged_df.loc[i, "scopus_author_id_api"]) 
#     author_id = merged_df.loc[i, "scopus_author_id"]
#     #print(author_id)
#     for id_list in merged_df["author_ids"]:
#         #print(id_list)
#         if author_id == matchID:
#              print(matchID.index)
#         else:
#              print("did not match")

# def CheckDF(df1,df2):
#     for (item, Value),(item1, Value1) in 
#     zip(df1['account'].iteritems(),df2['account'].iteritems()):
#         if len(str(Value).strip()) == len(str(Value1).strip()):
#             print(True)
#         else:
#             print(False)

# CheckDF(df1,df2)

In [None]:
# new_df.reindex(columns=[*new_df.columns.tolist(), 'author_id', 'author_name'])

# for row in new_df.loc[new_df.author_id.isnull(), 'author_id'].index:
#     new_df.at[row, 'author_id'] = []
    
# new_df

#scopus_flatten_search_results_df.head()
#scopus_flatten_search_results_df.index