In [4]:
import os
import csv
import math
import random
import requests
from CitesphereConnectorTest import *
from CitesphereConnector import CitesphereConnector

In [5]:
# Build the OAuth credentials object handed to the Citesphere connector.
# NOTE(review): authObject is not defined in this notebook; presumably it comes
# from the `from CitesphereConnectorTest import *` star import - confirm.
auth_object = authObject()
auth_object.authType  = 'oauth'
# Read the token from the environment so a real credential never has to be
# saved inside the notebook. When CITESPHERE_ACCESS_TOKEN is not set, the
# original placeholder value is used, so existing behaviour is unchanged.
auth_object.access_token = os.environ.get("CITESPHERE_ACCESS_TOKEN", "access-token")
connector = CitesphereConnector("https://diging-dev.asu.edu/citesphere-review/api", auth_object)
# Fetched once here; passed to process_groups() in the last cell.
groups=connector.get_groups()
#default max number of items displayed on a collection items page 
max_size=50 

In [5]:
def download_file(folder_path:str,ids:set, citesphere_token:str,):
    """Download files from the collection items into ``folder_path``.

    Every ``(file_id, file_name)`` pair in ``ids`` is requested from the Giles
    ``resources/files/{file_id}/content`` endpoint and, when the response
    status is 200, written to ``folder_path/file_name``.

    Args:
        folder_path: Directory the files are written to. It is created (if
            missing) only when there is at least one file to fetch.
        ids: Set of ``(file_id, file_name)`` tuples.
        citesphere_token: Token sent as ``Authorization: Bearer ...``.

    Returns:
        list: Paths of the files that were actually written. A file whose
        request did not return HTTP 200 is skipped and is NOT listed, so the
        result never points at a file that does not exist on disk. Because
        ``ids`` is a set, the order of the paths is not defined.
    """
    if not ids:
        return []

    # Loop-invariant work: create the target folder and build the headers once.
    os.makedirs(folder_path, exist_ok=True)
    headers = {
        "Authorization": f"Bearer {citesphere_token}",
        "Content-Type": "application/pdf;charset=UTF-8"
        }

    path_list = []
    for (file_id, file_name) in list(ids):
        giles_url = f"https://diging.asu.edu/geco-giles-staging/api/v2/resources/files/{file_id}/content"
        filename = os.path.join(folder_path, f"{file_name}")
        response = requests.get(giles_url, headers=headers)
        if response.status_code == 200:
            with open(filename, "wb") as file:
                file.write(response.content)
            # Record the path only after the file has been written.
            path_list.append(filename)
    return path_list

def _file_ref(entry):
    """Return ``(id, filename)`` for one file entry, or ``None`` when it is absent."""
    # An absent entry may be None/empty or the literal string "None"; the
    # original code guarded against both forms for extractedText and pages.
    if not entry or entry == "None":
        return None
    return (entry["id"], entry["filename"])


def get_set(items_list:list):
    """Return the set of ``(file_id, file_name)`` tuples to download.

    The files of a collection item are located in the following fields of each
    upload: uploadedFile, extractedText, and for every entry of pages: image,
    text, ocr and additionalFiles.

    Args:
        items_list: List of upload dictionaries (the caller passes an item's
            ``gilesUploads`` value).

    Returns:
        set: ``{(file_id, file_name), ...}``. Fields that are missing, empty,
        ``None`` or the string ``"None"`` are skipped instead of raising, so a
        single incomplete upload does not abort the whole export.
    """
    ids = set()
    for values in items_list or []:
        entries = [values.get("uploadedFile"), values.get("extractedText")]

        pages = values.get("pages")
        if not pages or pages == "None":
            pages = []
        for page in pages:
            entries.extend(page.get(field) for field in ("image", "text", "ocr"))
            additional = page.get("additionalFiles")
            if additional and additional != "None":
                entries.extend(additional)

        for entry in entries:
            ref = _file_ref(entry)
            if ref is not None:
                ids.add(ref)
    return ids
   
def write_to_csv(csv_name:str,item:dict,flag:int):
    """Append one collection item (metadata and downloaded file paths) to the CSV file.

    Args:
        csv_name: Path of the CSV file; it is opened in append mode, so the
            file is created if missing and never truncated.
        item: Dictionary of field name to value for one collection item.
        flag: ``0`` the first time the function is called, which makes it write
            the field names as a header row before the data row; any other
            value writes the data row only.

    Note:
        Rows are written positionally from ``item.values()``. The header comes
        from the first item only, so items with different keys or key order
        will not line up with it.
    """
    # Pin the encoding: the platform default may be unable to encode
    # non-ASCII metadata and differs between machines.
    with open(csv_name, 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if flag==0:
            writer.writerow(list(item.keys()))
        writer.writerow(list(item.values()))
        
def add_to_csv(csv_name:str,folder_name:str, items:list,csv_dict:dict,flag:int):
    """Record every not-yet-seen collection item in ``csv_dict`` and in the CSV file.

    ``csv_dict`` maps a collection item's unique key to the item dictionary,
    which holds the fields and values (metadata) written to the CSV file. Each
    new item also gets a ``path`` property: the value returned by
    ``download_file`` when the item has Giles uploads with files to fetch,
    otherwise an empty string.

    Args:
        csv_name: CSV file passed on to ``write_to_csv``.
        folder_name: Folder passed on to ``download_file``.
        items: Page of results; the items are read from ``items["items"]``.
        csv_dict: Items recorded so far, keyed by item key. Updated in place;
            items whose key is already present are skipped.
        flag: Header flag passed on to ``write_to_csv``.

    Returns:
        The flag to use for the next call: ``1`` once at least one row has been
        written, otherwise the ``flag`` that was passed in.

    Note:
        The access token is read from the notebook-level ``auth_object``, not
        from a parameter.
    """
    header_state = flag
    for record in items["items"]:
        record_key = record["key"]
        if record_key not in csv_dict:
            record["path"] = ""
            uploads = record["gilesUploads"]
            file_refs = get_set(uploads) if uploads else None
            if file_refs:
                record["path"] = download_file(folder_name, file_refs, auth_object.access_token)
            csv_dict[record_key] = record
            write_to_csv(csv_name, record, header_state)
            # The header row is only ever needed for the first written item.
            header_state = 1
    return header_state

In [13]:
def process_groups(csv_name:str,folder_path:str,groups:list, connector, max_size:int):
    """Download the files of all group items and generate a CSV file describing them.

    For every group, each of its collections is read page by page and every
    page is handed to ``add_to_csv``.

    Args:
        csv_name: CSV file the item rows are appended to.
        folder_path: Folder the downloaded files are stored in.
        groups: Iterable of group dictionaries; only ``group["id"]`` is read.
        connector: Object providing ``get_collections(group_id)`` and
            ``get_collection_items_pg(group_id, collection_key, page)``.
        max_size: Number of items per page, used to derive the page count
            from a collection's ``numberOfItems``.

    Returns:
        dict: The dictionary filled by ``add_to_csv`` across all pages.
    """
    collected = {}
    header_flag = 0
    for group in groups:
        group_id = group["id"]
        for collection in connector.get_collections(group_id)["collections"]:
            page_count = math.ceil(collection["numberOfItems"] / max_size)
            # Pages are 1-based; an empty collection yields no requests at all.
            for page_number in range(1, page_count + 1):
                page_items = connector.get_collection_items_pg(group_id, collection["key"], page_number)
                header_flag = add_to_csv(csv_name, folder_path, page_items, collected, header_flag)

    return collected

In [None]:
# Output locations: the CSV file the item rows are appended to and the folder
# that receives the downloaded files.
csv_filename = "citesphere_csv.csv"

folder_path  = "Files"

# Keep the returned dictionary in a variable instead of leaving the call as the
# cell's last expression, which would render every item's metadata as output.
collection_items = process_groups(csv_filename,folder_path,groups,connector,max_size)
# Show only how many distinct items were exported.
len(collection_items)