## Preparation

In [39]:
import os
import requests
import json
import pandas as pd # type: ignore
import time
import itertools
import datetime
from dotenv import load_dotenv # type: ignore

In [40]:
# Load variables from a local .env file, then read the Scopus API key from the environment.
# os.getenv returns None when "APIKey" is not set; getPapers sends this value
# unchanged as the "X-ELS-APIKey" request header.
load_dotenv()
apikey = os.getenv("APIKey")

In [41]:
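# Uncomment when running in Google Colab to read the key from Colab user data instead of .env
# (note: the secret is named 'apiKey' here, while the .env variable above is "APIKey").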
# from google.colab import userdata
# apikey = userdata.get('apiKey')

## Using Scopus API

Define keywords and ISSN

In [42]:
# ISSN of the journal to search; it is interpolated into the ISSN(...) clause of
# every query and into the names of the output files.
issn = '1743-2774'

In [43]:
# Search terms that generateKeywords combines into TITLE-ABS-KEY clauses.
# The longer list on the next line is kept commented out as an alternative set.
# keywords = ['bivariate', 'multivariate', 'visual*', 'carto*', 'geovis*']
keywords = ['bivariate', 'multivariate', 'visual*']

In [51]:
# Directory where getPapers writes its raw JSON and CSV files (created if missing).
outputFolder = './temp3'

## Permutations of Search Queries

In [45]:
def generateKeywords(keywords, issn):
  """
  Build the list of Scopus search strings for a set of keywords.

  Every unordered pair of keywords is AND-ed together and then OR-ed with each
  keyword that is not part of the pair. Each resulting clause is wrapped in
  TITLE-ABS-KEY(...), restricted to the given ISSN and to
  PUBYEAR AFT 2009 AND PUBYEAR BEF 2024.

  Parameters:
    keywords: list of search terms.
    issn: ISSN string inserted into the ISSN(...) clause.

  Returns:
    A list of query strings, ordered by pair and then by remaining keyword.
    The list is empty when there are fewer than three distinct keywords.
  """
  return [
      f"(TITLE-ABS-KEY(({first} AND {second}) OR {other}) AND ISSN({issn})) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)"
      for first, second in itertools.combinations(keywords, 2)
      for other in keywords
      if other not in (first, second)
  ]

In [46]:
# Build one Scopus query string per keyword combination for the chosen journal.
queryList = generateKeywords(keywords, issn)

In [47]:
# Display the generated query strings for inspection before calling the API.
queryList

['(TITLE-ABS-KEY((bivariate AND multivariate) OR visual*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((bivariate AND visual*) OR multivariate) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((multivariate AND visual*) OR bivariate) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)']

## Get the papers

In [50]:
def _extractPapers(entries):
    """Turn the 'entry' items of one Scopus result page into flat row dicts."""
    papers = []
    for entry in entries:
        # NOTE(review): Scopus appears to report an empty result set as a single
        # placeholder entry carrying an "error" key instead of an empty list;
        # skip it so it does not become an all-None row. Confirm against the API docs.
        if "error" in entry:
            continue
        papers.append({
            "doi": entry.get("prism:doi"),
            "title": entry.get("dc:title"),
            "authors": entry.get("dc:creator"),
            "year": entry.get("prism:coverDisplayDate"),
            "publicationName": entry.get("prism:publicationName"),
            "url": entry.get("prism:url"),
            "citedby-count": entry.get("citedby-count")
        })
    return papers


def getPapers(queryList, issn, outputFolder, pauseSeconds=60):
    """
    Run every query against the Scopus Search API and collect the papers found.

    For each query, result pages are requested until the reported total is
    covered or a request fails. The API key is read from the module-level
    `apikey` variable.

    Files written to outputFolder (created if missing):
      - raw_<issn>_index<i>_batch<b>.json : raw responses of query i received
        up to and including page b (cumulative, so it doubles as a checkpoint).
      - papers_<issn>_index<i>.csv        : extracted rows of query i.
      - cleanedPapers_<issn>.csv          : rows of ALL queries combined and
        de-duplicated on 'doi' (only written when at least one row exists).

    Parameters:
      queryList: iterable of Scopus query strings.
      issn: ISSN string, used only to build the output file names.
      outputFolder: directory that receives the output files.
      pauseSeconds: seconds to wait between two pages of the same query to
        stay within the API rate limit (default 60).

    Returns:
      pandas DataFrame with the combined, DOI-de-duplicated papers of all
      queries; an empty DataFrame when nothing was fetched.
    """
    os.makedirs(outputFolder, exist_ok=True)

    # Step used to advance the "start" offset. No "count" parameter is sent, so
    # this must equal the API's default page size — assumed to be 25, as in the
    # original code; TODO confirm for the API key's service level.
    pageSize = 25
    combined_papers = []  # rows of every query, for the final de-duplication

    # enumerate (rather than queryList.index) keeps the index unique even when
    # the same query string appears twice, so output files are not overwritten.
    for index, query in enumerate(queryList, start=1):
        print(f"Working on Query: {query}")
        start_index = 0  # offset of the next page to request
        all_papers = []  # extracted rows of this query, all pages
        all_data = []    # raw responses of this query, all pages
        batch = 1

        # repeat until all results are fetched from every page
        while True:
            print(f"Running batch: {batch} of index {index}")
            url = "https://api.elsevier.com/content/search/scopus"
            headers = {
                "Accept": "application/json",
                "X-ELS-APIKey": apikey
            }
            params = {
                "query": query,
                "sort": "date",
                "start": start_index
            }

            response = requests.get(url, headers=headers, params=params)
            # print(f'Response headers.. {response.headers}') # for debugging rate limits

            if response.status_code != 200:
                # Keep whatever was collected so far for this query and move on.
                print("Failed to fetch papers.")
                break

            data = json.loads(response.text)
            all_data.append(data)

            results = data['search-results']
            # 'entry' may be absent on a page without results; treat as empty.
            all_papers.extend(_extractPapers(results.get('entry', [])))

            # Checkpoint the raw responses received so far for this query.
            raw_filename = 'raw_'+issn+'_index'+str(index)+'_batch'+str(batch)+'.json'
            with open(os.path.join(outputFolder, raw_filename), 'w') as f:
                json.dump(all_data, f)

            total_results = int(results['opensearch:totalResults'])
            start_index += pageSize
            # ">=" (not ">"): when the total is an exact multiple of the page
            # size, the next offset equals the total and there is nothing left.
            if start_index >= total_results:
                break
            time.sleep(pauseSeconds)  # pause to deal with rate limits
            batch += 1

        # Save the papers of this single query
        df = pd.DataFrame(all_papers)
        csv_filename = 'papers_'+issn+'_index'+str(index)+'.csv'
        df.to_csv(os.path.join(outputFolder, csv_filename))

        combined_papers.extend(all_papers)

    # Combine the rows of every query (not just the last one) and remove duplicates.
    combined_df = pd.DataFrame(combined_papers)
    if 'doi' in combined_df.columns:
        # NOTE(review): rows without a DOI all share the value None and are
        # therefore collapsed into a single row here — confirm this is intended.
        final_df = combined_df.drop_duplicates(subset='doi')
        cleaned_csv_filename = 'cleanedPapers_'+issn+'.csv'
        final_df.to_csv(os.path.join(outputFolder, cleaned_csv_filename), index=False)
    else:
        # Nothing was fetched (empty queryList or every request failed).
        final_df = combined_df

    return final_df

In [52]:
# Run every query against the Scopus API; JSON and CSV files are written to outputFolder as a side effect.
# getPapers sleeps 60 seconds between result pages, so this cell can take several minutes.
final_dataframe = getPapers(queryList, issn, outputFolder)

Working on Query: (TITLE-ABS-KEY((bivariate AND multivariate) OR visual*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)
Running batch: 1 of index 1
Search result for this iteration:25 of 120
Running batch: 2 of index 1
Search result for this iteration:50 of 120
Running batch: 3 of index 1
Search result for this iteration:75 of 120
Running batch: 4 of index 1
Search result for this iteration:100 of 120
Running batch: 5 of index 1
Search result for this iteration:125 of 120
Working on Query: (TITLE-ABS-KEY((bivariate AND visual*) OR multivariate) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)
Running batch: 1 of index 2
Search result for this iteration:25 of 5
Working on Query: (TITLE-ABS-KEY((multivariate AND visual*) OR bivariate) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)
Running batch: 1 of index 3
Search result for this iteration:25 of 5


---

In [None]:
# NOTE: superseded single-query draft of getPapers, kept commented out for reference only.
# def getPapers(query, issn, outputFolder, now_time):
#   """
#   Get the papers and raw search data from Scopus API
#   """
#   start_index = 0
#   all_papers = []
#   all_data = []

#   # Check if the output dir exists
#   if not os.path.exists(outputFolder):
#     os.makedirs(outputFolder)

#   while True:
#       url = "https://api.elsevier.com/content/search/scopus"
#       headers = {
#           "Accept": "application/json",
#           "X-ELS-APIKey": apikey
#       }
#       params = {
#           "query": query,
#           "sort": "date",
#           "start": start_index
#       }

#       response = requests.get(url, headers=headers, params=params)

#       if response.status_code == 200:
#           data = json.loads(response.text)

#           # Extract specified attributes
#           entries = data['search-results']['entry']
#           papers = []
#           for entry in entries:
#               paper = {
#                   "prism:url": entry.get("prism:url"),
#                   "dc:title": entry.get("dc:title"),
#                   "dc:creator": entry.get("dc:creator"),
#                   "prism:publicationName": entry.get("prism:publicationName"),
#                   "prism:doi": entry.get("prism:doi"),
#                   "citedby-count": entry.get("citedby-count")
#               }
#               papers.append(paper)

#           # combine all data from paginations
#           all_papers.extend(papers)
#           all_data.append(data)
#           total_results = int(data['search-results']['opensearch:totalResults'])
#           start_index += 25  # Increase the start index for the next page of results
#           if start_index >= total_results:
#               break
#           time.sleep(60)  # Pause for 60 seconds to deal with rate limits
#       else:
#           print("Failed to fetch papers.")
#           break

#   # Save all raw data of this search to JSON file
#   raw_filename = 'raw_'+issn+'_'+now_time+'.json'
#   with open(os.path.join(outputFolder, raw_filename), 'w') as f:
#       json.dump(all_data, f)
#   # Load all papers from the single search into pandas DataFrame
#   csv_filename = 'papers_'+issn+'_'+now_time+'.csv'
#   df = pd.DataFrame(all_papers)
#   df.to_csv(os.path.join(outputFolder, csv_filename))
#   return df

In [None]:
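# NOTE: superseded driver for the commented-out draft of getPapers; its call omits the now_time argument that draft requires.
# def searchByQueries(queryList, issn, outputFolder):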
#   dataframes = []
#   for query in queryList:
#     df = getPapers(query, issn, outputFolder)
#     dataframes.append(df)
#     # pause after each query to prevent API overload
#     time.sleep(100)

#   combined_df = pd.concat(dataframes)
#   final_df = combined_df.drop_duplicates(subset='doi')

#   now = datetime.datetime.now()
#   now_time = now.strftime("%Y%m%d%H%M")
#   cleaned_csv_filename = 'cleanedPapers_'+issn+'_'+now_time+'.csv'

#   df.to_csv(os.path.join(outputFolder, cleaned_csv_filename))
#   return final_df