In [38]:
import os
import csv
import math
import random
import requests
from CitesphereConnectorTest import *
from CitesphereConnector import CitesphereConnector

In [39]:
# Build the OAuth credentials object handed to the Citesphere connector.
# NOTE(review): authObject is presumably provided by the star import of
# CitesphereConnectorTest -- confirm.
auth_object = authObject()
auth_object.authType  = 'oauth'
# Prefer a token supplied through the CITESPHERE_ACCESS_TOKEN environment
# variable so a real credential never has to be saved inside the notebook.
# When the variable is unset, the original placeholder value is used.
auth_object.access_token = os.environ.get("CITESPHERE_ACCESS_TOKEN", "authentication-key")
connector = CitesphereConnector("https://diging-dev.asu.edu/citesphere-review/api", auth_object)
# Groups fetched through the connector; the paging cell iterates over them.
groups=connector.get_groups()
#Default max number of items displayed on a collection items page
max_size=50

In [56]:
#Download the Giles files referenced by the collection items
def download_file(ids:set, citesphere_token:str,):
    """Download every ``(file_id, file_name)`` pair into ``citesphere-connector/``.

    Args:
        ids: Collection of ``(file_id, file_name)`` tuples. ``file_id`` is
            inserted into the Giles content URL and ``file_name`` is the name
            the file gets inside the download folder.
        citesphere_token: Token sent as a ``Bearer`` credential in the
            ``Authorization`` header of each request.

    Returns:
        List of local paths of the files that were actually written. A file
        whose request does not answer with HTTP 200 is skipped and is NOT
        listed, so the result never points at a file that does not exist.
    """
    folder_path = "citesphere-connector"
    path_list = []
    pairs = list(ids)
    if not pairs:
        # Nothing to fetch: leave the file system untouched.
        return path_list

    # Loop-invariant setup: the folder and the headers are the same for every file.
    os.makedirs(folder_path, exist_ok=True)
    headers = {
        "Authorization": f"Bearer {citesphere_token}",
        "Content-Type": "application/pdf;charset=UTF-8"
        }

    for (file_id, file_name) in pairs:
        giles_url = f"https://diging.asu.edu/geco-giles-staging/api/v2/resources/files/{file_id}/content"
        filename = os.path.join(folder_path, f"{file_name}")
        response = requests.get(giles_url, headers=headers)
        if response.status_code != 200:
            # Failed download: nothing is written, so nothing is reported.
            continue
        with open(filename, "wb") as file:
            file.write(response.content)
        # Record the path only once the content is on disk.
        path_list.append(filename)
    return path_list

#Returns the set of (file_id, file_name) tuples to download
def get_set(items_list:list):
    """Collect the id and filename of every file referenced by Giles uploads.

    For each upload the following references are inspected: ``uploadedFile``,
    ``extractedText`` and, for every entry of ``pages``, ``image``, ``text``,
    ``ocr`` and each element of ``additionalFiles``.

    A reference that is missing, empty/``None`` or the literal string
    ``"None"`` is skipped. This applies uniformly to every reference, so an
    upload whose page has, for example, no OCR file no longer raises.

    Args:
        items_list: Iterable of upload dictionaries.

    Returns:
        Set of ``(id, filename)`` tuples; duplicates collapse automatically.
    """
    ids=set()

    def _present(value):
        # Same presence test the code applies to "extractedText" and "pages".
        return bool(value) and value != "None"

    def _collect(file_ref):
        # Register one file reference unless it is absent.
        if _present(file_ref):
            ids.add((file_ref["id"], file_ref["filename"]))

    for upload in items_list:
        _collect(upload.get("uploadedFile"))
        _collect(upload.get("extractedText"))

        pages = upload.get("pages")
        if not _present(pages):
            continue
        for page in pages:
            for part in ("image", "text", "ocr"):
                _collect(page.get(part))
            extras = page.get("additionalFiles")
            if _present(extras):
                for extra in extras:
                    _collect(extra)
    return ids
    
# Register collection items (metadata plus downloaded file paths) in csv_dict
def add_to_csv(items:list,csv_dict:dict):
    """Add every not-yet-seen item of one result page to ``csv_dict``.

    Args:
        items: One page of collection items. Despite the ``list`` annotation
            it is indexed as a mapping: the entries are read from
            ``items["items"]``.
        csv_dict: Accumulator keyed by item ``"key"``; updated in place.

    Each new item gets a ``"path"`` field: an empty string by default, or the
    value returned by ``download_file`` when the item has Giles uploads that
    yield at least one file reference. Items whose key is already present in
    ``csv_dict`` are left untouched.
    """
    for entry in items["items"]:
        entry_key = entry["key"]
        if entry_key not in csv_dict:
            entry["path"] = ""
            uploads = entry["gilesUploads"]
            # Only look for files when the item actually has uploads.
            file_refs = get_set(uploads) if uploads else None
            if file_refs:
                entry["path"] = download_file(file_refs, auth_object.access_token)
            csv_dict[entry_key] = entry

In [57]:
#Holds the entries for the CSV file (collection items from each group), keyed by item key
csv_dict={}
for group in groups:
    group_id = group["id"]
    collections = connector.get_collections(group_id)
    for collection in collections["collections"]:
        #Each collection items page shows at most max_size items
        num_pages = math.ceil(collection["numberOfItems"] / max_size)
        #Pages are numbered from 1; a collection without items yields zero
        #pages, so the loop body is simply never entered for it
        for page in range(1, num_pages + 1):
            items = connector.get_collection_items_pg(group_id, collection["key"], page)
            add_to_csv(items,csv_dict)

500 FILEcHWoIjAxJSlV
500 FILEfItViY7iUgLM


In [58]:
#Create the CSV file with all the metadata and file path of the downloaded files.
csv_items = list(csv_dict.values())
# Header = union of the keys of all items, in first-seen order. This is
# deterministic (no random pick) and lets DictWriter place every value under
# its own column even when items do not share the same set or order of keys.
fields = list(dict.fromkeys(field for row in csv_items for field in row))
# UTF-8 is set explicitly so metadata with non-ASCII characters can be written
# regardless of the platform's default encoding.
with open('citesphereCSV.csv', 'w', newline='', encoding='utf-8') as file:
    # With no items there is no header to derive; an empty file is written.
    if fields:
        writer = csv.DictWriter(file, fieldnames=fields, restval="")
        writer.writeheader()
        writer.writerows(csv_items)