## Preparation

In [1]:
import os
import requests
import json
import pandas as pd # type: ignore
import time
import itertools
# import datetime
from dotenv import load_dotenv # type: ignore

In [2]:
# Load variables from a local .env file into the process environment,
# then pick up the API key from there (None when the variable is not set).
load_dotenv()
apikey = os.environ.get("APIKey")

In [3]:
# Uncomment the lines below when running in Google Colab. Set up the API key in Colab first (see README.md).
# from google.colab import userdata
# apikey = userdata.get('apiKey')

## Using Scopus API

Define keywords and ISSN

In [4]:
# ISSN of the source to search; the available values are listed in 'Database of Sources.csv'
issn = "1743-2774"

In [5]:
# Search terms that are combined into the search queries
keywords = [
    "bivariate",
    "multivariate",
    "visual*",
    "carto*",
    "geovis*",
]
# keywords = ['bivariate', 'multivariate', 'visual*'] # for testing

## Permutations of Search Queries

In [6]:
def generateKeywords(keywords, issn, pubyear_after=2009, pubyear_before=2024):
    """
    Create a list of search queries from combinations of keywords.

    Every unordered pair of keywords is joined with AND and then OR-ed with
    each of the remaining keywords in turn, producing one query per
    (pair, remaining keyword). Each query is restricted to the given ISSN and
    to a publication-year window.

    Parameters
    ----------
    keywords : iterable of str
        Search terms to combine. Fewer than three distinct terms yields an
        empty list because no keyword remains outside a pair.
    issn : str
        ISSN inserted into the `ISSN(...)` clause of every query.
    pubyear_after : int, optional
        Year placed after `PUBYEAR AFT` (default 2009).
    pubyear_before : int, optional
        Year placed after `PUBYEAR BEF` (default 2024).

    Returns
    -------
    list of str
        The query strings, ordered by pair and then by remaining keyword.
    """
    # Materialize once so a one-shot iterable is not exhausted by
    # itertools.combinations before the "remaining keyword" scan below.
    keywords = list(keywords)
    year_clause = f"(PUBYEAR AFT {pubyear_after} AND PUBYEAR BEF {pubyear_before})"

    searchQuery = []
    for first, second in itertools.combinations(keywords, 2):
        for extra in keywords:
            # Only keywords that are not already part of the pair are OR-ed in
            if extra in (first, second):
                continue
            searchQuery.append(
                f"(TITLE-ABS-KEY(({first} AND {second}) OR {extra}) AND ISSN({issn})) AND {year_clause}"
            )

    return searchQuery

In [7]:
# Build every search query for the configured keywords and ISSN
queryList = generateKeywords(keywords=keywords, issn=issn)

In [8]:
# Show the generated search queries as this cell's output
queryList

['(TITLE-ABS-KEY((bivariate AND multivariate) OR visual*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((bivariate AND multivariate) OR carto*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((bivariate AND multivariate) OR geovis*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((bivariate AND visual*) OR multivariate) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((bivariate AND visual*) OR carto*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((bivariate AND visual*) OR geovis*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((bivariate AND carto*) OR multivariate) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((bivariate AND carto*) OR visual*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)',
 '(TITLE-ABS-KEY((b

## Get the papers

In [9]:
def getPapers(queryList, issn):
    """
    get papers from Scopus API for all queries in the list
    """

    outputFolder = os.path.join(os.getcwd(), 'output', issn)

    # Check if the output dir exists
    if not os.path.exists(outputFolder):
        os.makedirs(outputFolder)

    for query in queryList:
        # for every query, search and fetch all results from every page
        print(f"\nWorking on Query: {query}")
        start_index = 0 # start index for pagination
        all_papers = [] # store all papers from every page
        all_data = [] # store all raw data from every page
        batch = 1
        index = queryList.index(query)+1

        # repeat until all results are fetched from every page
        while True:
            print(f"   Running batch: {batch} from query {index}")
            url = "https://api.elsevier.com/content/search/scopus"
            headers = {
                "Accept": "application/json",
                "X-ELS-APIKey": apikey
            }
            params = {
                "query": query,
                "sort": "date",
                "start": start_index
            }

            response = requests.get(url, headers=headers, params=params)
            # print(f'Response headers.. {response.headers}') # for debugging rate limits
            
            if response.status_code == 200:
                data = json.loads(response.text)
                all_data.append(data)

                # Extract specified attributes
                entries = data['search-results']['entry']
                papers = []
                for entry in entries:
                    paper = {
                        "doi": entry.get("prism:doi"),
                        "title": entry.get("dc:title"),
                        "authors": entry.get("dc:creator"),
                        "year": entry.get("prism:coverDisplayDate"),
                        "publicationName": entry.get("prism:publicationName"),
                        "url": entry.get("prism:url"),
                        "citedby-count": entry.get("citedby-count")
                    }
                    papers.append(paper)

                # combine all data from paginations
                all_papers.extend(papers)
                total_results = int(data['search-results']['opensearch:totalResults'])
                start_index += 25  # Increase the start index for the next page of results
                print(f"Search result for this iteration: page {start_index} from {total_results}")
                
                # Save all raw data of this search to JSON file
                raw_filename = 'raw_'+issn+'_index'+str(index)+'_batch'+str(batch)+'.json'
                with open(os.path.join(outputFolder, raw_filename), 'w') as f:
                    json.dump(all_data, f)
                if start_index > total_results:
                    break
                time.sleep(60)  # Pause for 60 seconds to deal with rate limits
            else:
                print("Failed to fetch papers.")
                break
            batch += 1

        # Load all papers from the single search into pandas DataFrame
        df = pd.DataFrame(all_papers)
        
        # Specify the file name of this search
        csv_filename = 'papers_'+issn+'_index'+str(index)+'.csv'
        csv_file_path = os.path.join(outputFolder, csv_filename)
        df.to_csv(csv_file_path)
                
    # return final_df
    dfs = []
    # Loop over all files in the directory
    for filename in os.listdir(outputFolder):
        # Check if the file is a CSV file
        if filename.endswith('.csv'):
            print(f"Reading file: {filename}")
            filepath = os.path.join(outputFolder, filename)
            df_file = pd.read_csv(filepath)
            dfs.append(df_file)

    combined_df = pd.concat(dfs, ignore_index=True)
    final_df = combined_df.drop_duplicates(subset='doi')
    cleaned_csv_filename = 'cleanedPapers_'+issn+'.csv'
    final_df.to_csv(os.path.join(outputFolder, cleaned_csv_filename), index=False)
    print(f"Total papers found: {len(final_df)}")

    return final_df

In [10]:
# Run every query against the API and collect the de-duplicated papers
final_dataframe = getPapers(queryList=queryList, issn=issn)


Working on Query: (TITLE-ABS-KEY((bivariate AND multivariate) OR visual*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)
   Running batch: 1 from query 1


Search result for this iteration: page 25 from 120
   Running batch: 2 from query 1
Search result for this iteration: page 50 from 120
   Running batch: 3 from query 1
Search result for this iteration: page 75 from 120
   Running batch: 4 from query 1
Search result for this iteration: page 100 from 120
   Running batch: 5 from query 1
Search result for this iteration: page 125 from 120

Working on Query: (TITLE-ABS-KEY((bivariate AND multivariate) OR carto*) AND ISSN(1743-2774)) AND (PUBYEAR AFT 2009 AND PUBYEAR BEF 2024)
   Running batch: 1 from query 2
Search result for this iteration: page 25 from 369
   Running batch: 2 from query 2
Search result for this iteration: page 50 from 369
   Running batch: 3 from query 2
Search result for this iteration: page 75 from 369
   Running batch: 4 from query 2
Search result for this iteration: page 100 from 369
   Running batch: 5 from query 2
Search result for this iteration: page 125 from 369
   Running batch: 6 from query 2
Search result for

---