# Download Files From Citesphere Group

In [None]:
import os
import requests
import json
import time
import traceback

## Specify Properties

The following properties need to be set before continuing:
- `FOLDER_NAME`: path to the folder in which files should be downloaded, can be relative or absolute. Default value downloads files into a folder "download" located next to this notebook.
- `GROUP_ID`: id of the Zotero group that should be downloaded (can be retrieved from the url of a group in Citesphere).
- `CITESPHERE_API_URL`: API endpoint of Citesphere (should end in `/api`).
- `TOKEN`: Citesphere access token.
- `GILES_ROOT`: Base url of Giles.

In [None]:
# Folder the downloaded files are written to (relative or absolute path).
FOLDER_NAME = 'download/'
# Id of the group whose files should be downloaded (taken from the group's
# URL in Citesphere).
GROUP_ID = ''
# Citesphere API endpoint; expected to end in "/api".
CITESPHERE_API_URL = ''
# Citesphere access token, sent as a Bearer token with every request.
TOKEN = ''
# Base URL of Giles.
GILES_ROOT = ''

In [None]:
# The following should only be changed if the Citesphere or Giles API changes.
# Trailing slashes are stripped from the configured base URLs so that a value
# such as "https://example.org/api/" does not produce "//" in the endpoints.
ITEMS_API_URL = f"{CITESPHERE_API_URL.rstrip('/')}/v1/groups/{GROUP_ID}/items"
UPLOAD_BY_PROGRESS = f"{GILES_ROOT.rstrip('/')}/api/v2/files/upload/check/"
UPLOAD_ENDPOINT = f"{GILES_ROOT.rstrip('/')}/api/v2/resources/files/upload/"

## Functions
The following functions do the main work of downloading files.

In [None]:
# get one page of items of the configured group
def get_items(page):
    """Fetch one page of items of the configured group from Citesphere.

    Args:
        page: Page number to request; sent as the ``page`` query parameter.

    Returns:
        The decoded JSON body of the response.
    """
    headers = {'Authorization': f'Bearer {TOKEN}'}
    # Let requests build the query string instead of concatenating it by hand.
    response = requests.get(ITEMS_API_URL, params={'page': page}, headers=headers)
    return response.json()

In [None]:
def get_filename_from_response(response):
    """Extract the filename from a response's Content-Disposition header.

    Handles both quoted (``filename="a.pdf"``) and bare (``filename=a.pdf``)
    values and ignores any parameters that follow the filename
    (e.g. ``; size=123``).

    Args:
        response: Object exposing a ``headers`` mapping with a ``get`` method.

    Returns:
        The filename as a string, or None if the header is missing, has no
        ``filename=`` parameter, or the filename is empty.
    """
    content_disposition = response.headers.get('Content-Disposition')
    if not content_disposition or 'filename=' not in content_disposition:
        return None

    # Everything after the first "filename=" starts with the filename value.
    value = content_disposition.partition('filename=')[2].strip()
    if value.startswith('"'):
        # Quoted value: the filename ends at the closing quote, so anything
        # after it (further parameters) is dropped.
        filename = value[1:].partition('"')[0]
    else:
        # Bare value: the filename ends at the next parameter separator.
        filename = value.partition(';')[0].strip().strip('"')
    return filename or None

In [None]:
def download_file(file_id):
    """Download the content of one Giles file into FOLDER_NAME.

    The file is stored under the name announced in the response's
    Content-Disposition header. Existing files with the same name in
    FOLDER_NAME are overwritten. Failures are reported with a printed
    message and the file is skipped; nothing is raised for them.

    Args:
        file_id: Id of the Giles file whose content should be downloaded.
    """
    endpoint = f"{GILES_ROOT.rstrip('/')}/api/v2/resources/files/{file_id}/content"
    headers = {'Authorization': f'Bearer {TOKEN}'}
    response = requests.get(endpoint, headers=headers)
    if not response.ok:
        # Don't write an error page to disk as if it were the file.
        print(f"Could not download file {file_id}: HTTP {response.status_code}.")
        return

    filename = get_filename_from_response(response)
    # The name comes from the server: keep only its last path component so it
    # cannot point outside FOLDER_NAME (e.g. "../../something").
    safe_name = os.path.basename(filename.replace('\\', '/')) if filename else ''
    if not safe_name or safe_name in ('.', '..'):
        print(f"No usable filename for file {file_id}, skipping.")
        return

    print(f"Trying to download {safe_name}.")
    if FOLDER_NAME:
        # Create the target folder on first use; an empty FOLDER_NAME means
        # the current working directory, which needs no creation.
        os.makedirs(FOLDER_NAME, exist_ok=True)
    # os.path.join works with or without a trailing slash on FOLDER_NAME.
    with open(os.path.join(FOLDER_NAME, safe_name), 'wb') as file:
        file.write(response.content)

In [None]:
def _collect_extracted_text_ids(uploads, progress_id):
    """Return the extracted-text file ids found in a list of upload entries.

    Entries without a usable ``extractedText`` id are reported and skipped;
    an entry that cannot be inspected is printed together with the traceback
    and does not stop the remaining entries from being processed.
    """
    file_ids = []
    for upload in uploads:
        try:
            extracted_text = upload["extractedText"] if "extractedText" in upload else None
            if extracted_text and extracted_text["id"]:
                file_ids.append(extracted_text["id"])
            else:
                print(f"Can't download file for {progress_id}.")
        except Exception:
            # Show the offending entry to make the traceback easier to read.
            print(upload)
            traceback.print_exc()
    return file_ids


def get_file_id_from_progress(progress_id):
    """Resolve a Giles progress id to the ids of its extracted-text files.

    The upload check endpoint is queried first. If its answer contains both
    ``msg`` and ``uploadId`` (treated as "processing in progress"), the
    upload entries are fetched from the upload endpoint instead; otherwise
    the answer itself is treated as the list of upload entries.

    Args:
        progress_id: Progress id of the upload, as a string.

    Returns:
        A list of extracted-text file ids; empty if none could be determined.
    """
    headers = {'Authorization': f'Bearer {TOKEN}'}
    uploads = requests.get(UPLOAD_BY_PROGRESS + progress_id, headers=headers).json()

    # if processing in progress
    if 'msg' in uploads and "uploadId" in uploads:
        try:
            uploads = requests.get(UPLOAD_ENDPOINT + uploads["uploadId"], headers=headers).json()
        except Exception:
            # The lookup itself failed, so there are no entries to inspect.
            print(f"Can't download file for {progress_id}.")
            traceback.print_exc()
            return []

    return _collect_extracted_text_ids(uploads, progress_id)
    

## Download files
The following code uses the functions above to download the files.

In [None]:
# get info about files: walk through all pages of items and collect the ids
# of the extracted-text files that should be downloaded
file_ids = []
page = 0
while True:
    page += 1
    items = get_items(page)

    # Look for an API error before relying on the 'items' key, which an error
    # response may not contain.
    if "error" in items:
        print(items)
        if not items.get('items'):
            break
    elif not items.get('items'):
        print("no more items, done.")
        break
    print("Page " + str(page))

    # get file ids to download
    for item in items['items']:
        time.sleep(0.5)
        # an item without uploads (missing or null) contributes no files
        for upload in item.get('gilesUploads') or []:
            try:
                if upload.get('extractedText'):
                    file_ids.append(upload['extractedText']['id'])
                elif upload.get('progressId'):
                    # text not attached to the item yet: ask Giles for it
                    file_ids.extend(get_file_id_from_progress(upload['progressId']))
                else:
                    print("Could not download file for " + item['key'])
            except Exception:
                # keep going with the remaining uploads
                print("Encountered an error!", upload)
                traceback.print_exc()
print(file_ids)

In [None]:
# download every collected file, pausing briefly between requests
for file_id in file_ids:
    time.sleep(0.5)
    try:
        download_file(file_id)
    except Exception:
        # one failed download should not abort the remaining ones
        print(f"Could not download file {file_id}.")
        traceback.print_exc()