# Download Files From Citesphere Group

In [None]:
import os, sys
import requests
import json
import time
import traceback

## Specify Properties

The following properties need to be set before continuing:
- `FOLDER_NAME`: path to the folder in which files should be downloaded, can be relative or absolute. Default value downloads files into a folder "download" located next to this notebook.
- `GROUP_ID`: id of the Zotero group that should be downloaded (can be retrieved from the url of a group in Citesphere).
- `CITESPHERE_API_URL`: API endpoint of Citesphere (should end in `/api`).
- `TOKEN`: Citesphere access token.
- `GILES_ROOT`: Base url of Giles.

In [None]:
# Folder the downloaded files are written to (relative or absolute path).
# Keep the trailing path separator.
FOLDER_NAME = 'download/'
# Id of the group whose items should be downloaded; it is inserted into the
# items URL of the Citesphere API.
GROUP_ID = ''
# Base URL of the Citesphere API; "/v1/groups/<GROUP_ID>/items" is appended to it.
CITESPHERE_API_URL = ''
# Access token, sent as "Authorization: Bearer <TOKEN>" with every request.
TOKEN = ''
# Base URL of Giles; paths beginning with "/api/v2/..." are appended to it,
# so it presumably should not end in a slash -- TODO confirm.
GILES_ROOT = ''

In [None]:
# the following should only be changed if the Citesphere API changes
# Endpoint listing the items of the configured group; get_items() appends a
# "?page=<n>" query string to it.
ITEMS_API_URL = f"{CITESPHERE_API_URL}/v1/groups/{GROUP_ID}/items"
# Giles endpoint that get_file_ids_from_progress() queries with a progress id
# appended.
UPLOAD_BY_PROGRESS = f"{GILES_ROOT}/api/v2/files/upload/check/"
# Giles endpoint that get_file_ids_from_progress() queries with an upload id
# appended. NOTE(review): despite the name it is only used for GET requests here.
UPLOAD_ENDPOINT = f"{GILES_ROOT}/api/v2/resources/files/upload/"

## Functions
The following functions do the main work of downloading files.

In [None]:
def get_items(page):
    """Fetch one page of items of the configured group.

    Sends an authenticated GET request to ITEMS_API_URL with the given page
    number as the "page" query parameter and returns the decoded JSON body.
    """
    page_url = f"{ITEMS_API_URL}?page={page}"
    auth_header = {'Authorization': f'Bearer {TOKEN}'}
    return requests.get(page_url, headers=auth_header).json()

In [None]:
def get_filename_from_response(response):
    """Return the filename announced in the Content-Disposition header.

    Args:
        response: object with a ``headers`` mapping (e.g. a requests response).

    Returns:
        The value of the ``filename=`` parameter without surrounding quotes,
        or None if the header is missing or has no ``filename=`` parameter.
        Parameters that follow the filename (``; size=...``) are not part of
        the result.
    """
    content_disposition = response.headers.get('Content-Disposition')
    if not content_disposition or 'filename=' not in content_disposition:
        return None

    value = content_disposition.split('filename=', 1)[1].strip()
    if value.startswith('"'):
        # Quoted value: the name is everything up to the closing quote, so a
        # ';' inside the quotes is kept and trailing parameters are dropped.
        closing = value.find('"', 1)
        return value[1:closing] if closing != -1 else value[1:]
    # Unquoted value: the name ends at the next parameter separator.
    return value.split(';', 1)[0].strip().strip('"')

In [None]:
def download_ocr(file_ids):
    """Download the OCR files of one document and append them to one .txt file.

    Each id in ``file_ids`` is fetched from Giles. The target file name is
    derived from the first response that announces a filename: everything
    before the first "." of that name, plus ".txt". All pages are appended to
    that file inside FOLDER_NAME.

    Args:
        file_ids: list of Giles file ids belonging to one document; nothing
            happens if it is empty or None.

    Returns:
        None.
    """
    if not file_ids:
        return
    headers = {'Authorization': f'Bearer {TOKEN}'}

    # Make sure the target folder exists; open() does not create it.
    if FOLDER_NAME:
        os.makedirs(FOLDER_NAME, exist_ok=True)

    filename = None
    for file_id in file_ids:
        endpoint = f"{GILES_ROOT}/api/v2/resources/files/{file_id}/content"
        response = requests.get(endpoint, headers=headers)
        if not response.ok:
            # Don't append the body of an error response to the text file.
            print(f"Can't download file {file_id} (HTTP {response.status_code}).")
            continue

        if not filename:
            original_name = get_filename_from_response(response)
            if original_name:
                # basename() keeps a server-supplied name from pointing
                # outside FOLDER_NAME; partition() keeps the whole name when
                # it contains no "." (find() would return -1 and cut off the
                # last character).
                stem = os.path.basename(original_name).partition(".")[0]
                filename = stem + ".txt"

        # If we have a filename, we'll write the content.
        # The file is opened in append mode: content is added to an existing
        # file with the same name in FOLDER_NAME, it is NOT overwritten!
        if filename:
            print(f"Downloading {filename}.")
            with open(os.path.join(FOLDER_NAME, filename), 'ab') as file:
                file.write(response.content)
        else:
            print("Can't download file.")

    

In [None]:
def get_ocr_ids(document):
    """Collect the OCR file ids of all pages of a document.

    Prints a "." as a progress marker on every call.

    Returns:
        A list with the "id" of each page's "ocr" entry, in page order.
        None if the document has extracted text with an id (OCR is then not
        wanted), if it has no "pages" entry, if any page lacks an OCR id, or
        if an exception occurs while inspecting the document (the traceback
        is printed).
    """
    sys.stdout.write(".")
    try:
        # OCR is only of interest when no text was extracted from the file.
        extracted = document["extractedText"] if "extractedText" in document else None
        if extracted and "id" in extracted:
            return None

        if "pages" not in document:
            print("No pages found.")
            return None

        page_ocr_ids = []
        for page in document["pages"]:
            # A single page without OCR means the document is incomplete.
            if "ocr" not in page or "id" not in page["ocr"]:
                print("\n")
                print("OCR incomplete, so won't download OCR.")
                return None
            page_ocr_ids.append(page["ocr"]["id"])
        return page_ocr_ids
    except Exception:
        traceback.print_exc()
    return None

In [None]:
def get_file_ids_from_progress(progress_id):
    """Return the OCR file ids of every document behind a progress id.

    Queries UPLOAD_BY_PROGRESS for the given progress id. If the answer
    carries both "msg" and "uploadId" (treated as "processing in progress"),
    the documents are fetched from UPLOAD_ENDPOINT using that upload id;
    otherwise the answer itself is iterated as the documents.

    Returns:
        A list of lists: one non-empty list of OCR file ids (as returned by
        get_ocr_ids) per document that has them.
    """
    auth_header = {'Authorization': f'Bearer {TOKEN}'}
    status = requests.get(UPLOAD_BY_PROGRESS + progress_id, headers=auth_header).json()

    in_progress = 'msg' in status and "uploadId" in status
    if not in_progress:
        return [ids for ids in map(get_ocr_ids, status) if ids]

    collected = []
    try:
        documents = requests.get(UPLOAD_ENDPOINT + status["uploadId"], headers=auth_header).json()
        for doc in documents:
            doc_ids = get_ocr_ids(doc)
            if doc_ids:
                collected.append(doc_ids)
    except Exception:
        # Best effort: report the problem and return what was gathered so far.
        traceback.print_exc()
    return collected
    

## Download files
The following code uses the functions above to download the files.

In [None]:
# get info about files
# file_ids becomes a list of lists: one list of OCR file ids per document.
file_ids = []
page = 0
while True:
    page = page + 1
    items = get_items(page)
    # Check for an API error before reading 'items': an error payload may not
    # contain that key, and requesting further pages would not help.
    if "error" in items:
        print(items)
        break
    if not items.get('items'):
        print("no more items, done.")
        break
    print("Page " + str(page))

    # get file ids to download
    for item in items['items']:
        # short pause per item before querying Giles for its uploads
        time.sleep(0.5)
        # tolerate items whose 'gilesUploads' entry is missing or null
        uploads = item.get('gilesUploads') or []
        for upload in uploads:
            try:
                if 'progressId' in upload and upload['progressId']:
                    ocr_ids = get_file_ids_from_progress(upload['progressId'])
                    if ocr_ids:
                        file_ids.extend(ocr_ids)
                else:
                    print("Could not download file for " + item['key'])
            except Exception:
                # keep going with the remaining uploads
                print("Encountered an error!", upload)
                traceback.print_exc()
print(file_ids)

In [None]:
# download all files of a document and append
# Each entry of file_ids is handed to download_ocr() as one list of file ids.
for ocr_file_ids in file_ids:
    # short pause before each document's downloads (presumably to go easy on
    # the server -- TODO confirm)
    time.sleep(0.5)
    download_ocr(ocr_file_ids)