## A notebook to collate data from Dimensions Google BigQuery database

### Last updated: 19th January 2023

#### Authors: Charlie Rahal and Saurabh Khanna

Load dependencies:

In [None]:
import pandas as pd
import os
from google.cloud import bigquery
from helper_functions import pub_ids_from_issns, chunker
from tqdm import tqdm
tqdm.pandas()
import warnings
warnings.simplefilter("ignore", UserWarning)

Authenticate with the BigQuery client and project:

In [None]:
# Google Cloud project that the BigQuery client is bound to.
MY_PROJECT_ID = "dimensionspkp"
# BigQuery client for that project. No credentials are passed explicitly here,
# so authentication presumably relies on credentials already configured in
# the environment -- NOTE(review): confirm the expected auth setup.
client = bigquery.Client(project=MY_PROJECT_ID)

In [None]:
# Load the table of ISSNs from the raw-data folder (path is relative to the
# notebook's working directory).
raw_path = os.path.join("..", "data", "raw")
issn_csv = os.path.join(raw_path, "ojs_issns_in_dimensions_2020.csv")
raw_data = pd.read_csv(issn_csv)

# Flatten the "issn" column into a plain list, one entry per row, and report
# how many ISSNs will be queried.
issns_to_query = raw_data["issn"].tolist()
print("Total ISSNs to query: ", len(issns_to_query))

In [None]:
%%time
# Query publications for every ISSN in batches and stream each batch of
# results to a single CSV file on disk.
chunk_size = 1000
file_name = 'pubs_from_all_issns.csv'
file_path = os.path.join('..', 'data', 'raw',
                         'from_dimensions', file_name)

# Make sure the output folder exists so the first write cannot fail on a
# missing directory.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Tell tqdm how many chunks to expect so it can show a percentage and ETA
# even if chunker returns an iterator without a length.
# Assumes chunker yields consecutive batches of at most chunk_size ISSNs --
# TODO confirm against helper_functions.chunker.
n_chunks = -(-len(issns_to_query) // chunk_size)  # ceiling division

for issn_chunk in tqdm(chunker(issns_to_query, chunk_size), total=n_chunks):
    results = pub_ids_from_issns(issn_chunk, client).to_dataframe()
    # Append mode creates the file on the first chunk and extends it on
    # later ones, so no existence check is needed.
    # NOTE: rows are written without a header line (readers must supply
    # column names), and re-running this cell appends to any existing file
    # rather than replacing it.
    results.to_csv(file_path, mode='a', header=False)